diff --git a/sglang/.github/workflows/open-pr-copy-from-oss.yml b/sglang/.github/workflows/open-pr-copy-from-oss.yml
new file mode 100644
index 0000000000000000000000000000000000000000..40592f5e0bf29a498d7b83e2aa4a69a2ec282425
--- /dev/null
+++ b/sglang/.github/workflows/open-pr-copy-from-oss.yml
@@ -0,0 +1,28 @@
+name: Open A PR to Copy Code From OSS
+
+on:
+  workflow_dispatch:
+  # schedule:
+  #   - cron: '0 10 * * *'
+
+permissions:
+  contents: write
+
+jobs:
+  copy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: 'main'
+
+      - name: Install GitHub CLI (if not present)
+        run: |
+          bash scripts/code_sync/install_github_cli.sh
+
+      - name: Copy from OSS code
+        env:
+          GH_TOKEN: ${{ secrets.GH_PAT_FOR_OPEN_PR_TO_PRIVATE }}
+        run: |
+          python3 scripts/code_sync/copy_from_oss.py
diff --git a/sglang/.github/workflows/release-branch-cut.yml b/sglang/.github/workflows/release-branch-cut.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d5d796d092921188ad04b978a565c675380b65ea
--- /dev/null
+++ b/sglang/.github/workflows/release-branch-cut.yml
@@ -0,0 +1,213 @@
+name: Release Branch Cut
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch_name:
+        description: 'Branch name to create (e.g., release/v0.5.7)'
+        required: true
+        type: string
+      commit_sha:
+        description: 'Commit SHA from main to cut the release branch from (defaults to latest main)'
+        required: false
+        type: string
+        default: ''
+
+permissions:
+  actions: write
+  contents: write
+  pull-requests: read
+
+jobs:
+  cut-release-branch:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-latest
+    environment: 'prod'
+    outputs:
+      branch_name: ${{ steps.set_output.outputs.branch_name }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          fetch-depth: 0
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Validate branch name
+        run: |
+          BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+
+          if [ -z "$BRANCH_NAME" ]; then
+            echo "::error::Branch name is required"
+            exit 1
+          fi
+
+          # Validate branch name format (should start with release/)
+          if [[ ! "$BRANCH_NAME" =~ ^release/ ]]; then
+            echo "::warning::Branch name '$BRANCH_NAME' does not follow convention 'release/vX.Y.Z'"
+          fi
+
+          echo "Branch name: $BRANCH_NAME"
+
+      - name: Validate commit SHA
+        id: validate
+        run: |
+          COMMIT_SHA="${{ github.event.inputs.commit_sha }}"
+
+          # If no commit SHA provided, use latest main
+          if [ -z "$COMMIT_SHA" ]; then
+            COMMIT_SHA=$(git rev-parse HEAD)
+            echo "No commit SHA provided, using latest main: $COMMIT_SHA"
+          fi
+
+          # Verify the commit exists and is on main
+          if ! git cat-file -t "$COMMIT_SHA" > /dev/null 2>&1; then
+            echo "::error::Commit SHA '$COMMIT_SHA' does not exist"
+            exit 1
+          fi
+
+          # Check if commit is an ancestor of main (i.e., is on main branch)
+          if ! git merge-base --is-ancestor "$COMMIT_SHA" main; then
+            echo "::error::Commit SHA '$COMMIT_SHA' is not on the main branch"
+            exit 1
+          fi
+
+          echo "COMMIT_SHA=$COMMIT_SHA" >> $GITHUB_OUTPUT
+          echo "Validated commit SHA: $COMMIT_SHA"
+
+      - name: Check if branch already exists
+        run: |
+          BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+
+          if git ls-remote --heads origin "$BRANCH_NAME" | grep -q "$BRANCH_NAME"; then
+            echo "::error::Branch '$BRANCH_NAME' already exists"
+            exit 1
+          fi
+
+          echo "Branch '$BRANCH_NAME' does not exist, proceeding with creation"
+
+      - name: Create release branch
+        id: set_output
+        run: |
+          COMMIT_SHA="${{ steps.validate.outputs.COMMIT_SHA }}"
+          BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+
+          git config user.name "sglang-bot"
+          git config user.email "sglang-bot@users.noreply.github.com"
+
+          # Create branch from the specified commit
+          git checkout -b "$BRANCH_NAME" "$COMMIT_SHA"
+
+          echo "branch_name=$BRANCH_NAME" >> $GITHUB_OUTPUT
+          echo "Successfully created branch '$BRANCH_NAME' from commit '$COMMIT_SHA'"
+
+      - name: Update version references in documentation
+        run: |
+          BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+          # Extract version from branch name (e.g., release/v0.5.8 -> v0.5.8)
+          VERSION=$(echo "$BRANCH_NAME" | sed 's/release\///')
+
+          # Update git clone version refs in docs; the group makes ".postN" optional
+          sed -i "s/git clone -b v[0-9]\+\.[0-9]\+\.[0-9]\+\(\.post[0-9]\+\)\?/git clone -b $VERSION/" docs/get_started/install.md
+          sed -i "s/git clone -b v[0-9]\+\.[0-9]\+\.[0-9]\+\(\.post[0-9]\+\)\?/git clone -b $VERSION/" docs/platforms/amd_gpu.md
+
+          # Check if any changes were made
+          if git diff --quiet; then
+            echo "No version references needed updating"
+          else
+            git add docs/get_started/install.md docs/platforms/amd_gpu.md
+            git commit -m "docs: update version references to $VERSION"
+            echo "Updated version references to $VERSION"
+          fi
+
+      - name: Push release branch
+        run: |
+          BRANCH_NAME="${{ steps.set_output.outputs.branch_name }}"
+          git push origin "$BRANCH_NAME"
+          echo "Successfully pushed branch '$BRANCH_NAME'"
+
+      - name: Summary
+        run: |
+          COMMIT_SHA="${{ steps.validate.outputs.COMMIT_SHA }}"
+          BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+
+          echo "## Release Branch Cut Summary" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "| Property | Value |" >> $GITHUB_STEP_SUMMARY
+          echo "|----------|-------|" >> $GITHUB_STEP_SUMMARY
+          echo "| Branch | \`$BRANCH_NAME\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| Commit | \`$COMMIT_SHA\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| Triggered by | @${{ github.actor }} |" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "### Next Steps" >> $GITHUB_STEP_SUMMARY
+          echo "1. Tests are automatically triggered on the release branch" >> $GITHUB_STEP_SUMMARY
+          echo "2. Apply any hotfixes if needed" >> $GITHUB_STEP_SUMMARY
+          echo "3. Create a tag to trigger release: \`gh workflow run release-tag.yml -f version=X.Y.Z -f ref=$BRANCH_NAME\`" >> $GITHUB_STEP_SUMMARY
+
+  run-pr-tests-nvidia:
+    needs: cut-release-branch
+    uses: ./.github/workflows/pr-test.yml
+    with:
+      ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+      run_all_tests: true
+    secrets: inherit
+
+  run-pr-tests-amd:
+    needs: cut-release-branch
+    uses: ./.github/workflows/pr-test-amd.yml
+    with:
+      ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+      run_all_tests: true
+    secrets: inherit
+
+  run-pr-test-npu:
+    needs: cut-release-branch
+    uses: ./.github/workflows/pr-test-npu.yml
+    with:
+      ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+      run_all_tests: true
+    secrets: inherit
+
+  run-pr-tests-xeon:
+    needs: cut-release-branch
+    uses: ./.github/workflows/pr-test-xeon.yml
+    with:
+      ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+      run_all_tests: true
+    secrets: inherit
+
+  run-pr-tests-xpu:
+    needs: cut-release-branch
+    uses: ./.github/workflows/pr-test-xpu.yml
+    with:
+      ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+      run_all_tests: true
+    secrets: inherit
+
+  run-nightly-tests-nvidia:
+    needs: cut-release-branch
+    uses: ./.github/workflows/nightly-test-nvidia.yml
+    with:
+      ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+    secrets: inherit
+
+  run-nightly-tests-amd:
+    needs: cut-release-branch
+    uses: ./.github/workflows/nightly-test-amd.yml
+    with:
+      ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+    secrets: inherit
+
+  run-nightly-tests-npu:
+    needs: cut-release-branch
+    uses: ./.github/workflows/nightly-test-npu.yml
+    with:
+      ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+    secrets: inherit
+
+  run-nightly-tests-intel:
+    needs: cut-release-branch
+    uses: ./.github/workflows/nightly-test-intel.yml
+    with:
+      ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+    secrets: inherit
diff --git a/sglang/.github/workflows/rerun-ut.yml b/sglang/.github/workflows/rerun-ut.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0e1cf7379da7ba3fd33da771c28234da7690d9eb
--- /dev/null
+++ b/sglang/.github/workflows/rerun-ut.yml
@@ -0,0 +1,71 @@
+name: Rerun UT
+run-name: ${{ inputs.pr_head_sha && format('[rerun-ut] {0}', inputs.pr_head_sha) || '[rerun-ut]' }}
+
+on:
+  workflow_dispatch:
+    inputs:
+      test_command:
+        description: "Test command to run (e.g. 'registered/core/test_srt_endpoint.py TestSRTEndpoint.test_simple_decode')"
+        required: true
+        type: string
+      runner_label:
+        description: "Runner label (e.g. '1-gpu-runner', '1-gpu-5090', '4-gpu-h100')"
+        required: true
+        type: string
+      pr_head_sha:
+        description: "PR head SHA to checkout (for /rerun-ut on fork PRs)"
+        required: false
+        type: string
+        default: ""
+      use_deepep:
+        description: "Use ci_install_deepep.sh instead of ci_install_dependency.sh"
+        required: false
+        type: string
+        default: "false"
+
+env:
+  SGLANG_IS_IN_CI: true
+  SGLANG_CUDA_COREDUMP: "1"
+  SGLANG_JIT_DEEPGEMM_FAST_WARMUP: true
+
+permissions:
+  actions: write
+  contents: read
+
+jobs:
+  rerun-ut-cuda:
+    runs-on: ${{ inputs.runner_label }}
+    timeout-minutes: 120
+    env:
+      RUNNER_LABELS: ${{ inputs.runner_label }}
+      IS_BLACKWELL: ${{ (inputs.runner_label == '1-gpu-5090' || contains(inputs.runner_label, 'b200')) && '1' || '' }}
+      SGLANG_CI_RDMA_ALL_DEVICES: ${{ inputs.runner_label == '8-gpu-h20' && 'mlx5_1,mlx5_2,mlx5_3,mlx5_4' || '' }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || github.sha }}
+
+      - name: Install dependencies
+        timeout-minutes: 20
+        run: |
+          if [[ "${{ inputs.runner_label }}" == "1-gpu-5090" ]]; then
+            source /etc/profile.d/sglang-ci.sh
+          fi
+          if [[ "${{ inputs.use_deepep }}" == "true" ]]; then
+            bash scripts/ci/cuda/ci_install_deepep.sh
+          else
+            bash scripts/ci/cuda/ci_install_dependency.sh
+          fi
+
+      - name: Run test
+        timeout-minutes: 60
+        run: |
+          if [[ "${{ inputs.runner_label }}" == "1-gpu-5090" ]]; then
+            source /etc/profile.d/sglang-ci.sh
+          fi
+          cd test/
+          python3 ${{ inputs.test_command }}
+
+      - uses: ./.github/actions/upload-cuda-coredumps
+        if: always()
diff --git a/sglang/docs/_static/css/custom_log.css b/sglang/docs/_static/css/custom_log.css
new file mode 100644
index 0000000000000000000000000000000000000000..61f65d0199df9e97886560f7f97c6c9b026bd34e
--- /dev/null
+++ b/sglang/docs/_static/css/custom_log.css
@@ -0,0 +1,29 @@
+.output_area {
+  color: #615656;
+}
+
+table.autosummary td {
+  width: 50%;
+}
+
+img.align-center {
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+.output_area.stderr {
+  color: #d3d3d3 !important;
+}
+
+.output_area.stdout {
+  color: #d3d3d3 !important;
+}
+
+div.output_area.stderr {
+  color: #d3d3d3 !important;
+}
+
+div.output_area.stdout {
+  color: #d3d3d3 !important;
+}
diff --git a/sglang/docs/_static/css/readthedocs.css b/sglang/docs/_static/css/readthedocs.css
new file mode 100644
index 0000000000000000000000000000000000000000..aca6649b436a35cf39b2c924ce2f74ed2cdc8b90
--- /dev/null
+++ b/sglang/docs/_static/css/readthedocs.css
@@ -0,0 +1,9 @@
+table.autosummary td {
+  width: 50%;
+}
+
+img.align-center {
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+}
diff --git a/sglang/docs/_static/image/logo.ico b/sglang/docs/_static/image/logo.ico
new file mode 100644
index 0000000000000000000000000000000000000000..7a0b5b2f00e4b626e250e5c4a03de4136d96943a
Binary files /dev/null and b/sglang/docs/_static/image/logo.ico differ
diff --git a/sglang/docs/advanced_features/checkpoint_engine.md b/sglang/docs/advanced_features/checkpoint_engine.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e39a7ee227474c67e683e4d51b8906bd69b9f47
--- /dev/null
+++ b/sglang/docs/advanced_features/checkpoint_engine.md
@@ -0,0 +1,254 @@
+# Checkpoint Engine Integration
+
+The SGLang checkpoint engine integration provides an efficient way to load model weights using a distributed checkpoint loading system. This feature significantly reduces model loading time, especially for large models and multi-node setups, by parallelizing the weight loading process across multiple processes and nodes.
+
+## Overview
+
+The checkpoint engine integration allows SGLang to:
+- Load model weights in parallel using multiple processes
+- Distribute weight loading across multiple nodes to increase effective disk bandwidth
+- Overlap weight loading with other initialization tasks like CUDA graph capture
+- Support both single-node and multi-node deployments
+
+## Installation
+
+First, install the checkpoint engine package:
+
+```bash
+pip install 'checkpoint-engine[p2p]'
+```
+
+## Architecture
+
+The system consists of two main components:
+
+1. **SGLang Server**: Runs with `--wait-for-initial-weights` flag to wait for weights before becoming ready
+2. **Checkpoint Engine Workers**: Separate processes (managed by torchrun) that load and distribute model weights
+
+The checkpoint engine uses a parameter server architecture with support for:
+- **Broadcast mode**: Weights are broadcast from loading processes to inference processes
+- **P2P mode**: Direct peer-to-peer weight transfer between processes
+- **All mode**: Combination of both broadcast and P2P methods
+
+## Usage Examples
+
+### Single Node Setup
+
+**Terminal 1 - Launch SGLang Server:**
+```bash
+python -m sglang.launch_server \
+ --model-path Qwen/Qwen3-8B \
+ --tp 8 \
+ --load-format dummy \
+ --wait-for-initial-weights
+```
+
+**Terminal 2 - Run Checkpoint Engine:**
+
+Using sglang entrypoint:
+```bash
+python -m sglang.srt.checkpoint_engine.update \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 8
+```
+
+Using torchrun directly:
+```bash
+torchrun --nproc-per-node 8 \
+ examples/checkpoint_engine/update.py \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 8
+```
+
+### Multi-Node Setup (2 Nodes)
+
+**Node 0:**
+
+Launch SGLang server:
+```bash
+python -m sglang.launch_server \
+ --model-path Qwen/Qwen3-8B \
+ --tp 8 \
+ --load-format dummy \
+ --wait-for-initial-weights \
+ --host [IP]
+```
+
+Run checkpoint engine:
+
+Using sglang entrypoint (recommended):
+```bash
+python -m sglang.srt.checkpoint_engine.update \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 8
+```
+
+Using torchrun directly:
+```bash
+torchrun --nproc-per-node 8 \
+ --nnodes 2 \
+ --node-rank 0 \
+ --master-addr [IP] \
+ --master-port 29500 \
+ examples/checkpoint_engine/update.py \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 8
+```
+
+**Node 1:**
+
+Launch SGLang server:
+```bash
+python -m sglang.launch_server \
+ --model-path Qwen/Qwen3-8B \
+ --tp 8 \
+ --load-format dummy \
+ --wait-for-initial-weights \
+ --host [IP]
+```
+
+Run checkpoint engine:
+
+Using sglang entrypoint (recommended):
+```bash
+python -m sglang.srt.checkpoint_engine.update \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 8
+```
+
+Using torchrun directly:
+```bash
+torchrun --nproc-per-node 8 \
+ --nnodes 2 \
+ --node-rank 1 \
+ --master-addr [IP] \
+ --master-port 29500 \
+ examples/checkpoint_engine/update.py \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 8
+```
+
+### Multi-Node Setup with Tensor Parallelism (TP=16)
+
+**Node 0:**
+
+Launch SGLang server:
+```bash
+python -m sglang.launch_server \
+ --model-path Qwen/Qwen3-8B \
+    --tp 16 \
+ --load-format dummy \
+ --wait-for-initial-weights \
+ --host [IP] \
+ --dist-init-addr [IP]:9120 \
+ --nnodes 2 \
+ --node-rank 0
+```
+
+Run checkpoint engine:
+
+Using sglang entrypoint (recommended):
+```bash
+python -m sglang.srt.checkpoint_engine.update \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 16
+```
+
+Using torchrun directly:
+```bash
+torchrun --nproc-per-node 8 \
+ --nnodes 2 \
+ --node-rank 0 \
+ --master-addr [IP] \
+ --master-port 29500 \
+ examples/checkpoint_engine/update.py \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 16
+```
+
+**Node 1:**
+
+Launch SGLang server:
+```bash
+python -m sglang.launch_server \
+ --model-path Qwen/Qwen3-8B \
+    --tp 16 \
+ --load-format dummy \
+ --wait-for-initial-weights \
+ --host [IP] \
+ --dist-init-addr [IP]:9120 \
+ --nnodes 2 \
+ --node-rank 1
+```
+
+Run checkpoint engine:
+
+Using sglang entrypoint (recommended):
+```bash
+python -m sglang.srt.checkpoint_engine.update \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 16
+```
+
+Using torchrun directly:
+```bash
+torchrun --nproc-per-node 8 \
+ --nnodes 2 \
+ --node-rank 1 \
+ --master-addr [IP] \
+ --master-port 29500 \
+ examples/checkpoint_engine/update.py \
+ --update-method broadcast \
+ --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+ --inference-parallel-size 16
+```
+
+## Configuration Options
+
+### SGLang Server Options
+
+- `--load-format dummy`: Use dummy format for initial loading (allows overlapping with other tasks)
+- `--wait-for-initial-weights`: Wait for checkpoint engine to provide weights before becoming ready
+- `--host`: Host address for multi-node setups
+- `--dist-init-addr`: Distributed initialization address for tensor parallelism
+
+### Checkpoint Engine Options
+
+- `--update-method`: Weight update method (`broadcast`, `p2p`, or `all`)
+- `--checkpoint-path`: Path to model checkpoint directory
+- `--inference-parallel-size`: Number of inference parallel processes
+- `--endpoint`: SGLang server endpoint (default: `http://localhost:19730`)
+- `--checkpoint-name`: Name for the checkpoint (default: `my-checkpoint-iter-0`)
+- `--save-metas-file`: File to save checkpoint metadata
+- `--load-metas-file`: File to load checkpoint metadata from
+- `--uds`: Unix domain socket path for communication
+- `--weight-version`: Version identifier for weights
+
+## Performance Benefits
+
+The checkpoint engine provides significant time savings in two main aspects:
+
+1. **Multi-node Loading**: Each node only loads a portion of weights from disk, effectively increasing disk bandwidth. More participating nodes provide greater acceleration. Preliminary tests show 20-second acceleration when loading DeepSeek-R1 on H20-3e with two nodes.
+
+2. **Single Process Optimization**: Using dummy format allows overlapping disk-to-CPU transfer with CUDA graph capture and other initialization tasks, providing additional time savings.
+
+## Troubleshooting
+
+- Ensure checkpoint engine package is installed: `pip install 'checkpoint-engine[p2p]'`
+- Verify network connectivity between nodes in multi-node setups
+- Check that the checkpoint path contains valid model files
+- Monitor logs for connection errors between SGLang server and checkpoint engine
+- Use `--sleep-time` parameter to add delays if needed for debugging
+
+## References
+
+- [Checkpoint Engine Repository](https://github.com/MoonshotAI/checkpoint-engine)
diff --git a/sglang/docs/advanced_features/structured_outputs.ipynb b/sglang/docs/advanced_features/structured_outputs.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..ec603e4e62bf5db3554e5f8577654b8c9aed3b8c
--- /dev/null
+++ b/sglang/docs/advanced_features/structured_outputs.ipynb
@@ -0,0 +1,997 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Structured Outputs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can specify a JSON schema, [regular expression](https://en.wikipedia.org/wiki/Regular_expression) or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.\n",
+ "\n",
+ "SGLang supports three grammar backends:\n",
+ "\n",
+ "- [XGrammar](https://github.com/mlc-ai/xgrammar)(default): Supports JSON schema, regular expression, and EBNF constraints.\n",
+ "- [Outlines](https://github.com/dottxt-ai/outlines): Supports JSON schema and regular expression constraints.\n",
+ "- [Llguidance](https://github.com/guidance-ai/llguidance): Supports JSON schema, regular expression, and EBNF constraints.\n",
+ "\n",
+ "We suggest using XGrammar for its better performance and utility. XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md). For more details, see [XGrammar technical overview](https://blog.mlc.ai/2024/11/22/achieving-efficient-flexible-portable-structured-generation-with-xgrammar).\n",
+ "\n",
+ "To use Outlines, simply add `--grammar-backend outlines` when launching the server.\n",
+ "To use llguidance, add `--grammar-backend llguidance` when launching the server.\n",
+ "If no backend is specified, XGrammar will be used as the default.\n",
+ "\n",
+ "For better output quality, **It's advisable to explicitly include instructions in the prompt to guide the model to generate the desired format.** For example, you can specify, 'Please generate the output in the following JSON format: ...'.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## OpenAI Compatible API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import openai\n",
+ "import os\n",
+ "\n",
+ "from sglang.test.doc_patch import launch_server_cmd\n",
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+ "\n",
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+ "\n",
+ "\n",
+ "server_process, port = launch_server_cmd(\n",
+ " \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning\"\n",
+ ")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n",
+ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### JSON\n",
+ "\n",
+ "you can directly define a JSON schema or use [Pydantic](https://docs.pydantic.dev/latest/) to define and validate the response."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Using Pydantic**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pydantic import BaseModel, Field\n",
+ "\n",
+ "\n",
+ "# Define the schema using Pydantic\n",
+ "class CapitalInfo(BaseModel):\n",
+ " name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+ " population: int = Field(..., description=\"Population of the capital city\")\n",
+ "\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": \"Please generate the information of the capital of France in the JSON format.\",\n",
+ " },\n",
+ " ],\n",
+ " temperature=0,\n",
+ " max_tokens=128,\n",
+ " response_format={\n",
+ " \"type\": \"json_schema\",\n",
+ " \"json_schema\": {\n",
+ " \"name\": \"foo\",\n",
+ " # convert the pydantic model to json schema\n",
+ " \"schema\": CapitalInfo.model_json_schema(),\n",
+ " },\n",
+ " },\n",
+ ")\n",
+ "\n",
+ "response_content = response.choices[0].message.content\n",
+ "# validate the JSON response by the pydantic model\n",
+ "capital_info = CapitalInfo.model_validate_json(response_content)\n",
+ "print_highlight(f\"Validated response: {capital_info.model_dump_json()}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**JSON Schema Directly**\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "json_schema = json.dumps(\n",
+ " {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+ " \"population\": {\"type\": \"integer\"},\n",
+ " },\n",
+ " \"required\": [\"name\", \"population\"],\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": \"Give me the information of the capital of France in the JSON format.\",\n",
+ " },\n",
+ " ],\n",
+ " temperature=0,\n",
+ " max_tokens=128,\n",
+ " response_format={\n",
+ " \"type\": \"json_schema\",\n",
+ " \"json_schema\": {\"name\": \"foo\", \"schema\": json.loads(json_schema)},\n",
+ " },\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### EBNF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ebnf_grammar = \"\"\"\n",
+ "root ::= city | description\n",
+ "city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\n",
+ "description ::= city \" is \" status\n",
+ "status ::= \"the capital of \" country\n",
+ "country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"\n",
+ "\"\"\"\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+ " messages=[\n",
+ " {\"role\": \"system\", \"content\": \"You are a helpful geography bot.\"},\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": \"Give me the information of the capital of France.\",\n",
+ " },\n",
+ " ],\n",
+ " temperature=0,\n",
+ " max_tokens=32,\n",
+ " extra_body={\"ebnf\": ebnf_grammar},\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Regular expression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+ " messages=[\n",
+ " {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
+ " ],\n",
+ " temperature=0,\n",
+ " max_tokens=128,\n",
+ " extra_body={\"regex\": \"(Paris|London)\"},\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Structural Tag"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tool_get_current_weather = {\n",
+ " \"type\": \"function\",\n",
+ " \"function\": {\n",
+ " \"name\": \"get_current_weather\",\n",
+ " \"description\": \"Get the current weather in a given location\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"city\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
+ " },\n",
+ " \"state\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"the two-letter abbreviation for the state that the city is\"\n",
+ " \" in, e.g. 'CA' which would mean 'California'\",\n",
+ " },\n",
+ " \"unit\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The unit to fetch the temperature in\",\n",
+ " \"enum\": [\"celsius\", \"fahrenheit\"],\n",
+ " },\n",
+ " },\n",
+ " \"required\": [\"city\", \"state\", \"unit\"],\n",
+ " },\n",
+ " },\n",
+ "}\n",
+ "\n",
+ "tool_get_current_date = {\n",
+ " \"type\": \"function\",\n",
+ " \"function\": {\n",
+ " \"name\": \"get_current_date\",\n",
+ " \"description\": \"Get the current date and time for a given timezone\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"timezone\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The timezone to fetch the current date and time for, e.g. 'America/New_York'\",\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\"timezone\"],\n",
+ " },\n",
+ " },\n",
+ "}\n",
+ "\n",
+ "schema_get_current_weather = tool_get_current_weather[\"function\"][\"parameters\"]\n",
+ "schema_get_current_date = tool_get_current_date[\"function\"][\"parameters\"]\n",
+ "\n",
+ "\n",
+ "def get_messages():\n",
+ " return [\n",
+ " {\n",
+ " \"role\": \"system\",\n",
+ " \"content\": f\"\"\"\n",
+ "# Tool Instructions\n",
+ "- Always execute python code in messages that you share.\n",
+ "- When looking for real time information use relevant functions if available else fallback to brave_search\n",
+ "You have access to the following functions:\n",
+ "Use the function 'get_current_weather' to: Get the current weather in a given location\n",
+ "{tool_get_current_weather[\"function\"]}\n",
+ "Use the function 'get_current_date' to: Get the current date and time for a given timezone\n",
+ "{tool_get_current_date[\"function\"]}\n",
+ "If a you choose to call a function ONLY reply in the following format:\n",
+ "<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}\n",
+ "where\n",
+    "start_tag => `<function`\n",
+    "parameters => a JSON dict with the function argument name as key and function argument value as value.\n",
+    "end_tag => `</function>`\n",
+ "Here is an example,\n",
+    "<function=example_function_name>{{\"example_name\": \"example_value\"}}</function>\n",
+ "Reminder:\n",
+ "- Function calls MUST follow the specified format\n",
+ "- Required parameters MUST be specified\n",
+ "- Only call one function at a time\n",
+ "- Put the entire function call reply on one line\n",
+ "- Always add your sources when using search results to answer the user query\n",
+ "You are a helpful assistant.\"\"\",\n",
+ " },\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": \"You are in New York. Please get the current date and time, and the weather.\",\n",
+ " },\n",
+ " ]\n",
+ "\n",
+ "\n",
+ "messages = get_messages()\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+ " messages=messages,\n",
+    "    response_format={\n",
+    "        \"type\": \"structural_tag\",\n",
+    "        \"format\": {\n",
+    "            \"type\": \"triggered_tags\",\n",
+    "            \"triggers\": [\"<function=\"],\n",
+    "            \"tags\": [\n",
+    "                {\n",
+    "                    \"begin\": \"<function=get_current_weather>\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"json_schema\",\n",
+    "                        \"json_schema\": schema_get_current_weather,\n",
+    "                    },\n",
+    "                    \"end\": \"</function>\",\n",
+    "                },\n",
+    "                {\n",
+    "                    \"begin\": \"<function=get_current_date>\",\n",
+    "                    \"content\": {\n",
+    "                        \"type\": \"json_schema\",\n",
+    "                        \"json_schema\": schema_get_current_date,\n",
+    "                    },\n",
+    "                    \"end\": \"</function>\",\n",
+    "                },\n",
+    "            ],\n",
+    "            \"at_least_one\": False,\n",
+    "            \"stop_after_first\": False,\n",
+    "        },\n",
+    "    },\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Native API and SGLang Runtime (SRT)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### JSON"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Using Pydantic**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "import json\n",
+ "from pydantic import BaseModel, Field\n",
+ "\n",
+ "from transformers import AutoTokenizer\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
+ "\n",
+ "\n",
+ "# Define the schema using Pydantic\n",
+ "class CapitalInfo(BaseModel):\n",
+ " name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+ " population: int = Field(..., description=\"Population of the capital city\")\n",
+ "\n",
+ "\n",
+ "# Make API request\n",
+ "messages = [\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": \"Here is the information of the capital of France in the JSON format.\\n\",\n",
+ " }\n",
+ "]\n",
+ "text = tokenizer.apply_chat_template(\n",
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
+ ")\n",
+ "response = requests.post(\n",
+ " f\"http://localhost:{port}/generate\",\n",
+ " json={\n",
+ " \"text\": text,\n",
+ " \"sampling_params\": {\n",
+ " \"temperature\": 0,\n",
+ " \"max_new_tokens\": 64,\n",
+ " \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
+ " },\n",
+ " },\n",
+ ")\n",
+ "print_highlight(response.json())\n",
+ "\n",
+ "\n",
+ "response_data = json.loads(response.json()[\"text\"])\n",
+ "# validate the response by the pydantic model\n",
+ "capital_info = CapitalInfo.model_validate(response_data)\n",
+ "print_highlight(f\"Validated response: {capital_info.model_dump_json()}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**JSON Schema Directly**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "json_schema = json.dumps(\n",
+ " {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+ " \"population\": {\"type\": \"integer\"},\n",
+ " },\n",
+ " \"required\": [\"name\", \"population\"],\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "# JSON\n",
+ "response = requests.post(\n",
+ " f\"http://localhost:{port}/generate\",\n",
+ " json={\n",
+ " \"text\": text,\n",
+ " \"sampling_params\": {\n",
+ " \"temperature\": 0,\n",
+ " \"max_new_tokens\": 64,\n",
+ " \"json_schema\": json_schema,\n",
+ " },\n",
+ " },\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.json())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### EBNF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "messages = [\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": \"Give me the information of the capital of France.\",\n",
+ " }\n",
+ "]\n",
+ "text = tokenizer.apply_chat_template(\n",
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
+ ")\n",
+ "response = requests.post(\n",
+ " f\"http://localhost:{port}/generate\",\n",
+ " json={\n",
+ " \"text\": text,\n",
+ " \"sampling_params\": {\n",
+ " \"max_new_tokens\": 128,\n",
+ " \"temperature\": 0,\n",
+ " \"n\": 3,\n",
+ " \"ebnf\": (\n",
+ " \"root ::= city | description\\n\"\n",
+ " 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
+ " 'description ::= city \" is \" status\\n'\n",
+ " 'status ::= \"the capital of \" country\\n'\n",
+ " 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
+ " ),\n",
+ " },\n",
+ " \"stream\": False,\n",
+ " \"return_logprob\": False,\n",
+ " },\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.json())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Regular expression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "messages = [\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": \"Paris is the capital of\",\n",
+ " }\n",
+ "]\n",
+ "text = tokenizer.apply_chat_template(\n",
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
+ ")\n",
+ "response = requests.post(\n",
+ " f\"http://localhost:{port}/generate\",\n",
+ " json={\n",
+ " \"text\": text,\n",
+ " \"sampling_params\": {\n",
+ " \"temperature\": 0,\n",
+ " \"max_new_tokens\": 64,\n",
+ " \"regex\": \"(France|England)\",\n",
+ " },\n",
+ " },\n",
+ ")\n",
+ "print_highlight(response.json())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Structural Tag"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer\n",
+ "\n",
+ "# generate an answer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
+ "\n",
+ "text = tokenizer.apply_chat_template(\n",
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
+ ")\n",
+ "payload = {\n",
+ " \"text\": text,\n",
+ " \"sampling_params\": {\n",
+ " \"structural_tag\": json.dumps(\n",
+ " {\n",
+ " \"type\": \"structural_tag\",\n",
+ " \"structures\": [\n",
+ " {\n",
+ " \"begin\": \"\",\n",
+ " \"schema\": schema_get_current_weather,\n",
+ " \"end\": \"\",\n",
+ " },\n",
+ " {\n",
+ " \"begin\": \"\",\n",
+ " \"schema\": schema_get_current_date,\n",
+ " \"end\": \"\",\n",
+ " },\n",
+ " ],\n",
+ " \"triggers\": [\"\",\n",
+ " \"content\": {\n",
+ " \"type\": \"json_schema\",\n",
+ " \"json_schema\": schema_get_current_weather,\n",
+ " },\n",
+ " \"end\": \"\",\n",
+ " },\n",
+ " {\n",
+ " \"begin\": \"\",\n",
+ " \"content\": {\n",
+ " \"type\": \"json_schema\",\n",
+ " \"json_schema\": schema_get_current_date,\n",
+ " },\n",
+ " \"end\": \"\",\n",
+ " },\n",
+ " ],\n",
+ " \"at_least_one\": False,\n",
+ " \"stop_after_first\": False,\n",
+ " },\n",
+ " }\n",
+ " )\n",
+ " },\n",
+ "}\n",
+ "\n",
+ "\n",
+ "# Send POST request to the API endpoint\n",
+ "response = requests.post(f\"http://localhost:{port}/generate\", json=payload)\n",
+ "print_highlight(response.json())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(server_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Offline Engine API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sglang as sgl\n",
+ "\n",
+ "llm = sgl.Engine(\n",
+ " model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", grammar_backend=\"xgrammar\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### JSON"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Using Pydantic**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "from pydantic import BaseModel, Field\n",
+ "\n",
+ "prompts = [\n",
+ " \"Give me the information of the capital of China in the JSON format.\",\n",
+ " \"Give me the information of the capital of France in the JSON format.\",\n",
+ " \"Give me the information of the capital of Ireland in the JSON format.\",\n",
+ "]\n",
+ "\n",
+ "\n",
+ "# Define the schema using Pydantic\n",
+ "class CapitalInfo(BaseModel):\n",
+ " name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+ " population: int = Field(..., description=\"Population of the capital city\")\n",
+ "\n",
+ "\n",
+ "sampling_params = {\n",
+ " \"temperature\": 0.1,\n",
+ " \"top_p\": 0.95,\n",
+ " \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
+ "}\n",
+ "\n",
+ "outputs = llm.generate(prompts, sampling_params)\n",
+ "for prompt, output in zip(prompts, outputs):\n",
+ " print_highlight(\"===============================\")\n",
+ " print_highlight(f\"Prompt: {prompt}\") # validate the output by the pydantic model\n",
+ " capital_info = CapitalInfo.model_validate_json(output[\"text\"])\n",
+ " print_highlight(f\"Validated output: {capital_info.model_dump_json()}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**JSON Schema Directly**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompts = [\n",
+ " \"Give me the information of the capital of China in the JSON format.\",\n",
+ " \"Give me the information of the capital of France in the JSON format.\",\n",
+ " \"Give me the information of the capital of Ireland in the JSON format.\",\n",
+ "]\n",
+ "\n",
+ "json_schema = json.dumps(\n",
+ " {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+ " \"population\": {\"type\": \"integer\"},\n",
+ " },\n",
+ " \"required\": [\"name\", \"population\"],\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "sampling_params = {\"temperature\": 0.1, \"top_p\": 0.95, \"json_schema\": json_schema}\n",
+ "\n",
+ "outputs = llm.generate(prompts, sampling_params)\n",
+ "for prompt, output in zip(prompts, outputs):\n",
+ " print_highlight(\"===============================\")\n",
+ " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### EBNF\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompts = [\n",
+ " \"Give me the information of the capital of France.\",\n",
+ " \"Give me the information of the capital of Germany.\",\n",
+ " \"Give me the information of the capital of Italy.\",\n",
+ "]\n",
+ "\n",
+ "sampling_params = {\n",
+ " \"temperature\": 0.8,\n",
+ " \"top_p\": 0.95,\n",
+ " \"ebnf\": (\n",
+ " \"root ::= city | description\\n\"\n",
+ " 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
+ " 'description ::= city \" is \" status\\n'\n",
+ " 'status ::= \"the capital of \" country\\n'\n",
+ " 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
+ " ),\n",
+ "}\n",
+ "\n",
+ "outputs = llm.generate(prompts, sampling_params)\n",
+ "for prompt, output in zip(prompts, outputs):\n",
+ " print_highlight(\"===============================\")\n",
+ " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Regular expression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompts = [\n",
+ " \"Please provide information about London as a major global city:\",\n",
+ " \"Please provide information about Paris as a major global city:\",\n",
+ "]\n",
+ "\n",
+ "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95, \"regex\": \"(France|England)\"}\n",
+ "\n",
+ "outputs = llm.generate(prompts, sampling_params)\n",
+ "for prompt, output in zip(prompts, outputs):\n",
+ " print_highlight(\"===============================\")\n",
+ " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Structural Tag"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text = tokenizer.apply_chat_template(\n",
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
+ ")\n",
+ "prompts = [text]\n",
+ "\n",
+ "\n",
+ "sampling_params = {\n",
+ " \"temperature\": 0.8,\n",
+ " \"top_p\": 0.95,\n",
+ " \"structural_tag\": json.dumps(\n",
+ " {\n",
+ " \"type\": \"structural_tag\",\n",
+ " \"structures\": [\n",
+ " {\n",
+ " \"begin\": \"\",\n",
+ " \"schema\": schema_get_current_weather,\n",
+ " \"end\": \"\",\n",
+ " },\n",
+ " {\n",
+ " \"begin\": \"\",\n",
+ " \"schema\": schema_get_current_date,\n",
+ " \"end\": \"\",\n",
+ " },\n",
+ " ],\n",
+ " \"triggers\": [\"\",\n",
+ " \"content\": {\n",
+ " \"type\": \"json_schema\",\n",
+ " \"json_schema\": schema_get_current_weather,\n",
+ " },\n",
+ " \"end\": \"\",\n",
+ " },\n",
+ " {\n",
+ " \"begin\": \"\",\n",
+ " \"content\": {\n",
+ " \"type\": \"json_schema\",\n",
+ " \"json_schema\": schema_get_current_date,\n",
+ " },\n",
+ " \"end\": \"\",\n",
+ " },\n",
+ " ],\n",
+ " \"at_least_one\": False,\n",
+ " \"stop_after_first\": False,\n",
+ " },\n",
+ " }\n",
+ " ),\n",
+ "}\n",
+ "\n",
+ "\n",
+ "# Send POST request to the API endpoint\n",
+ "outputs = llm.generate(prompts, sampling_params)\n",
+ "for prompt, output in zip(prompts, outputs):\n",
+ " print_highlight(\"===============================\")\n",
+ " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "llm.shutdown()"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sglang/docs/advanced_features/tool_parser.ipynb b/sglang/docs/advanced_features/tool_parser.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..9afc9663e64f6462da3639f204cc62590e00967d
--- /dev/null
+++ b/sglang/docs/advanced_features/tool_parser.ipynb
@@ -0,0 +1,856 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Tool Parser\n",
+ "\n",
+ "This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Currently supported parsers:\n",
+ "\n",
+ "| Parser | Supported Models | Notes |\n",
+ "|---|---|---|\n",
+ "| `deepseekv3` | DeepSeek-v3 (e.g., `deepseek-ai/DeepSeek-V3-0324`) | Recommend adding `--chat-template ./examples/chat_template/tool_chat_template_deepseekv3.jinja` to launch command. |\n",
+ "| `deepseekv31` | DeepSeek-V3.1 and DeepSeek-V3.2-Exp (e.g. `deepseek-ai/DeepSeek-V3.1`, `deepseek-ai/DeepSeek-V3.2-Exp`) | Recommend adding `--chat-template ./examples/chat_template/tool_chat_template_deepseekv31.jinja` (Or ..deepseekv32.jinja for DeepSeek-V3.2) to launch command. |\n",
+ "| `deepseekv32` | DeepSeek-V3.2 (`deepseek-ai/DeepSeek-V3.2`) | |\n",
+ "| `glm` | GLM series (e.g. `zai-org/GLM-4.6`) | |\n",
+ "| `gpt-oss` | GPT-OSS (e.g., `openai/gpt-oss-120b`, `openai/gpt-oss-20b`, `lmsys/gpt-oss-120b-bf16`, `lmsys/gpt-oss-20b-bf16`) | The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as `role=\"tool\"` messages, which enables the model to generate the final content. |\n",
+ "| `kimi_k2` | `moonshotai/Kimi-K2-Instruct` | |\n",
+ "| `llama3` | Llama 3.1 / 3.2 / 3.3 (e.g. `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`, `meta-llama/Llama-3.3-70B-Instruct`) | |\n",
+ "| `llama4` | Llama 4 (e.g. `meta-llama/Llama-4-Scout-17B-16E-Instruct`) | |\n",
+ "| `mistral` | Mistral (e.g. `mistralai/Mistral-7B-Instruct-v0.3`, `mistralai/Mistral-Nemo-Instruct-2407`, `mistralai/Mistral-7B-v0.3`) | |\n",
+ "| `pythonic` | Llama-3.2 / Llama-3.3 / Llama-4 | Model outputs function calls as Python code. Requires `--tool-call-parser pythonic` and is recommended to use with a specific chat template. |\n",
+ "| `qwen` | Qwen series (e.g. `Qwen/Qwen3-Next-80B-A3B-Instruct`, `Qwen/Qwen3-VL-30B-A3B-Thinking`) except Qwen3-Coder| |\n",
+ "| `qwen3_coder` | Qwen3-Coder (e.g. `Qwen/Qwen3-Coder-30B-A3B-Instruct`) | |\n",
+ "| `step3` | Step-3 | |\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## OpenAI Compatible API"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Launching the Server"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "from sglang.test.doc_patch import launch_server_cmd\n",
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+ "from openai import OpenAI\n",
+ "\n",
+ "server_process, port = launch_server_cmd(\n",
+ " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\" # qwen25\n",
+ ")\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that `--tool-call-parser` defines the parser used to interpret responses."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Define Tools for Function Call\n",
+ "Below is a Python snippet that shows how to define a tool as a dictionary. The dictionary includes a tool name, a description, and property defined Parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define tools\n",
+ "tools = [\n",
+ " {\n",
+ " \"type\": \"function\",\n",
+ " \"function\": {\n",
+ " \"name\": \"get_current_weather\",\n",
+ " \"description\": \"Get the current weather in a given location\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"city\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
+ " },\n",
+ " \"state\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"the two-letter abbreviation for the state that the city is\"\n",
+ " \" in, e.g. 'CA' which would mean 'California'\",\n",
+ " },\n",
+ " \"unit\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The unit to fetch the temperature in\",\n",
+ " \"enum\": [\"celsius\", \"fahrenheit\"],\n",
+ " },\n",
+ " },\n",
+ " \"required\": [\"city\", \"state\", \"unit\"],\n",
+ " },\n",
+ " },\n",
+ " }\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Define Messages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_messages():\n",
+ " return [\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": \"What's the weather like in Boston today? Output a reasoning before act, then use the tools to help you.\",\n",
+ " }\n",
+ " ]\n",
+ "\n",
+ "\n",
+ "messages = get_messages()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Initialize the Client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize OpenAI-like client\n",
+ "client = OpenAI(api_key=\"None\", base_url=f\"http://0.0.0.0:{port}/v1\")\n",
+ "model_name = client.models.list().data[0].id"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Non-Streaming Request"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Non-streaming mode test\n",
+ "response_non_stream = client.chat.completions.create(\n",
+ " model=model_name,\n",
+ " messages=messages,\n",
+ " temperature=0,\n",
+ " top_p=0.95,\n",
+ " max_tokens=1024,\n",
+ " stream=False, # Non-streaming\n",
+ " tools=tools,\n",
+ ")\n",
+ "print_highlight(\"Non-stream response:\")\n",
+ "print_highlight(response_non_stream)\n",
+ "print_highlight(\"==== content ====\")\n",
+ "print_highlight(response_non_stream.choices[0].message.content)\n",
+ "print_highlight(\"==== tool_calls ====\")\n",
+ "print_highlight(response_non_stream.choices[0].message.tool_calls)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Handle Tools\n",
+ "When the engine determines it should call a particular tool, it will return arguments or partial arguments through the response. You can parse these arguments and later invoke the tool accordingly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "name_non_stream = response_non_stream.choices[0].message.tool_calls[0].function.name\n",
+ "arguments_non_stream = (\n",
+ " response_non_stream.choices[0].message.tool_calls[0].function.arguments\n",
+ ")\n",
+ "\n",
+ "print_highlight(f\"Final streamed function call name: {name_non_stream}\")\n",
+ "print_highlight(f\"Final streamed function call arguments: {arguments_non_stream}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Streaming Request"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Streaming mode test\n",
+ "print_highlight(\"Streaming response:\")\n",
+ "response_stream = client.chat.completions.create(\n",
+ " model=model_name,\n",
+ " messages=messages,\n",
+ " temperature=0,\n",
+ " top_p=0.95,\n",
+ " max_tokens=1024,\n",
+ " stream=True, # Enable streaming\n",
+ " tools=tools,\n",
+ ")\n",
+ "\n",
+ "texts = \"\"\n",
+ "tool_calls = []\n",
+ "name = \"\"\n",
+ "arguments = \"\"\n",
+ "for chunk in response_stream:\n",
+ " if chunk.choices[0].delta.content:\n",
+ " texts += chunk.choices[0].delta.content\n",
+ " if chunk.choices[0].delta.tool_calls:\n",
+ " tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
+ "print_highlight(\"==== Text ====\")\n",
+ "print_highlight(texts)\n",
+ "\n",
+ "print_highlight(\"==== Tool Call ====\")\n",
+ "for tool_call in tool_calls:\n",
+ " print_highlight(tool_call)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Handle Tools\n",
+ "When the engine determines it should call a particular tool, it will return arguments or partial arguments through the response. You can parse these arguments and later invoke the tool accordingly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Parse and combine function call arguments\n",
+ "arguments = []\n",
+ "for tool_call in tool_calls:\n",
+ " if tool_call.function.name:\n",
+ " print_highlight(f\"Streamed function call name: {tool_call.function.name}\")\n",
+ "\n",
+ " if tool_call.function.arguments:\n",
+ " arguments.append(tool_call.function.arguments)\n",
+ "\n",
+ "# Combine all fragments into a single JSON string\n",
+ "full_arguments = \"\".join(arguments)\n",
+ "print_highlight(f\"streamed function call arguments: {full_arguments}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Define a Tool Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This is a demonstration, define real function according to your usage.\n",
+ "def get_current_weather(city: str, state: str, unit: \"str\"):\n",
+ " return (\n",
+ " f\"The weather in {city}, {state} is 85 degrees {unit}. It is \"\n",
+ " \"partly cloudly, with highs in the 90's.\"\n",
+ " )\n",
+ "\n",
+ "\n",
+ "available_tools = {\"get_current_weather\": get_current_weather}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### Execute the Tool"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "messages.append(response_non_stream.choices[0].message)\n",
+ "\n",
+ "# Call the corresponding tool function\n",
+ "tool_call = messages[-1].tool_calls[0]\n",
+ "tool_name = tool_call.function.name\n",
+ "tool_to_call = available_tools[tool_name]\n",
+ "result = tool_to_call(**(json.loads(tool_call.function.arguments)))\n",
+ "print_highlight(f\"Function call result: {result}\")\n",
+ "# messages.append({\"role\": \"tool\", \"content\": result, \"name\": tool_name})\n",
+ "messages.append(\n",
+ " {\n",
+ " \"role\": \"tool\",\n",
+ " \"tool_call_id\": tool_call.id,\n",
+ " \"content\": str(result),\n",
+ " \"name\": tool_name,\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "print_highlight(f\"Updated message history: {messages}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Send Results Back to Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "final_response = client.chat.completions.create(\n",
+ " model=model_name,\n",
+ " messages=messages,\n",
+ " temperature=0,\n",
+ " top_p=0.95,\n",
+ " stream=False,\n",
+ " tools=tools,\n",
+ ")\n",
+ "print_highlight(\"Non-stream response:\")\n",
+ "print_highlight(final_response)\n",
+ "\n",
+ "print_highlight(\"==== Text ====\")\n",
+ "print_highlight(final_response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Native API and SGLang Runtime (SRT)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer\n",
+ "import requests\n",
+ "\n",
+ "# generate an answer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-7B-Instruct\")\n",
+ "\n",
+ "messages = get_messages()\n",
+ "\n",
+ "input = tokenizer.apply_chat_template(\n",
+ " messages, tokenize=False, add_generation_prompt=True, tools=tools, return_dict=False\n",
+ ")\n",
+ "\n",
+ "gen_url = f\"http://localhost:{port}/generate\"\n",
+ "gen_data = {\n",
+ " \"text\": input,\n",
+ " \"sampling_params\": {\n",
+ " \"skip_special_tokens\": False,\n",
+ " \"max_new_tokens\": 1024,\n",
+ " \"temperature\": 0,\n",
+ " \"top_p\": 0.95,\n",
+ " },\n",
+ "}\n",
+ "gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
+ "print_highlight(\"==== Response ====\")\n",
+ "print_highlight(gen_response)\n",
+ "\n",
+ "# parse the response\n",
+ "parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
+ "\n",
+ "function_call_input = {\n",
+ " \"text\": gen_response,\n",
+ " \"tool_call_parser\": \"qwen25\",\n",
+ " \"tools\": tools,\n",
+ "}\n",
+ "\n",
+ "function_call_response = requests.post(parse_url, json=function_call_input)\n",
+ "function_call_response_json = function_call_response.json()\n",
+ "\n",
+ "print_highlight(\"==== Text ====\")\n",
+ "print(function_call_response_json[\"normal_text\"])\n",
+ "print_highlight(\"==== Calls ====\")\n",
+ "print(\"function name: \", function_call_response_json[\"calls\"][0][\"name\"])\n",
+ "print(\"function arguments: \", function_call_response_json[\"calls\"][0][\"parameters\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(server_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Offline Engine API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sglang as sgl\n",
+ "from sglang.srt.function_call.function_call_parser import FunctionCallParser\n",
+ "from sglang.srt.managers.io_struct import Tool, Function\n",
+ "\n",
+ "llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n",
+ "tokenizer = llm.tokenizer_manager.tokenizer\n",
+ "input_ids = tokenizer.apply_chat_template(\n",
+ " messages, tokenize=True, add_generation_prompt=True, tools=tools, return_dict=False\n",
+ ")\n",
+ "\n",
+ "# Note that for gpt-oss tool parser, adding \"no_stop_trim\": True\n",
+ "# to make sure the tool call token is not trimmed.\n",
+ "\n",
+ "sampling_params = {\n",
+ " \"max_new_tokens\": 1024,\n",
+ " \"temperature\": 0,\n",
+ " \"top_p\": 0.95,\n",
+ " \"skip_special_tokens\": False,\n",
+ "}\n",
+ "\n",
+ "# 1) Offline generation\n",
+ "result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n",
+ "generated_text = result[\"text\"] # Assume there is only one prompt\n",
+ "\n",
+ "print_highlight(\"=== Offline Engine Output Text ===\")\n",
+ "print_highlight(generated_text)\n",
+ "\n",
+ "\n",
+ "# 2) Parse using FunctionCallParser\n",
+ "def convert_dict_to_tool(tool_dict: dict) -> Tool:\n",
+ " function_dict = tool_dict.get(\"function\", {})\n",
+ " return Tool(\n",
+ " type=tool_dict.get(\"type\", \"function\"),\n",
+ " function=Function(\n",
+ " name=function_dict.get(\"name\"),\n",
+ " description=function_dict.get(\"description\"),\n",
+ " parameters=function_dict.get(\"parameters\"),\n",
+ " ),\n",
+ " )\n",
+ "\n",
+ "\n",
+ "tools = [convert_dict_to_tool(raw_tool) for raw_tool in tools]\n",
+ "\n",
+ "parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n",
+ "normal_text, calls = parser.parse_non_stream(generated_text)\n",
+ "\n",
+ "print_highlight(\"=== Parsing Result ===\")\n",
+ "print(\"Normal text portion:\", normal_text)\n",
+ "print_highlight(\"Function call portion:\")\n",
+ "for call in calls:\n",
+ " # call: ToolCallItem\n",
+ " print_highlight(f\" - tool name: {call.name}\")\n",
+ " print_highlight(f\" parameters: {call.parameters}\")\n",
+ "\n",
+ "# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "llm.shutdown()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Tool Choice Mode\n",
+ "\n",
+ "SGLang supports OpenAI's `tool_choice` parameter to control when and which tools the model should call. This feature is implemented using EBNF (Extended Backus-Naur Form) grammar to ensure reliable tool calling behavior.\n",
+ "\n",
+ "### Supported Tool Choice Options\n",
+ "\n",
+ "- **`tool_choice=\"required\"`**: Forces the model to call at least one tool\n",
+ "- **`tool_choice={\"type\": \"function\", \"function\": {\"name\": \"specific_function\"}}`**: Forces the model to call a specific function\n",
+ "\n",
+ "### Backend Compatibility\n",
+ "\n",
+ "Tool choice is fully supported with the **Xgrammar backend**, which is the default grammar backend (`--grammar-backend xgrammar`). However, it may not be fully supported with other backends such as `outlines`.\n",
+ "\n",
+ "### Example: Required Tool Choice"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from openai import OpenAI\n",
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+ "from sglang.test.doc_patch import launch_server_cmd\n",
+ "\n",
+ "# Start a new server session for tool choice examples\n",
+ "server_process_tool_choice, port_tool_choice = launch_server_cmd(\n",
+ " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\"\n",
+ ")\n",
+ "wait_for_server(\n",
+ " f\"http://localhost:{port_tool_choice}\", process=server_process_tool_choice\n",
+ ")\n",
+ "\n",
+ "# Initialize client for tool choice examples\n",
+ "client_tool_choice = OpenAI(\n",
+ " api_key=\"None\", base_url=f\"http://0.0.0.0:{port_tool_choice}/v1\"\n",
+ ")\n",
+ "model_name_tool_choice = client_tool_choice.models.list().data[0].id\n",
+ "\n",
+ "# Example with tool_choice=\"required\" - forces the model to call a tool\n",
+ "messages_required = [\n",
+ " {\"role\": \"user\", \"content\": \"Hello, what is the capital of France?\"}\n",
+ "]\n",
+ "\n",
+ "# Define tools\n",
+ "tools = [\n",
+ " {\n",
+ " \"type\": \"function\",\n",
+ " \"function\": {\n",
+ " \"name\": \"get_current_weather\",\n",
+ " \"description\": \"Get the current weather in a given location\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"city\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
+ " },\n",
+ " \"unit\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The unit to fetch the temperature in\",\n",
+ " \"enum\": [\"celsius\", \"fahrenheit\"],\n",
+ " },\n",
+ " },\n",
+ " \"required\": [\"city\", \"unit\"],\n",
+ " },\n",
+ " },\n",
+ " }\n",
+ "]\n",
+ "\n",
+ "response_required = client_tool_choice.chat.completions.create(\n",
+ " model=model_name_tool_choice,\n",
+ " messages=messages_required,\n",
+ " temperature=0,\n",
+ " max_tokens=1024,\n",
+ " tools=tools,\n",
+ " tool_choice=\"required\", # Force the model to call a tool\n",
+ ")\n",
+ "\n",
+ "print_highlight(\"Response with tool_choice='required':\")\n",
+ "print(\"Content:\", response_required.choices[0].message.content)\n",
+ "print(\"Tool calls:\", response_required.choices[0].message.tool_calls)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Example: Specific Function Choice\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Example with specific function choice - forces the model to call a specific function\n",
+ "messages_specific = [\n",
+ "    {\"role\": \"user\", \"content\": \"What are the most attractive places in France?\"}\n",
+ "]\n",
+ "\n",
+ "response_specific = client_tool_choice.chat.completions.create(\n",
+ " model=model_name_tool_choice,\n",
+ " messages=messages_specific,\n",
+ " temperature=0,\n",
+ " max_tokens=1024,\n",
+ " tools=tools,\n",
+ " tool_choice={\n",
+ " \"type\": \"function\",\n",
+ " \"function\": {\"name\": \"get_current_weather\"},\n",
+ " }, # Force the model to call the specific get_current_weather function\n",
+ ")\n",
+ "\n",
+ "print_highlight(\"Response with specific function choice:\")\n",
+ "print(\"Content:\", response_specific.choices[0].message.content)\n",
+ "print(\"Tool calls:\", response_specific.choices[0].message.tool_calls)\n",
+ "\n",
+ "if response_specific.choices[0].message.tool_calls:\n",
+ " tool_call = response_specific.choices[0].message.tool_calls[0]\n",
+ " print_highlight(f\"Called function: {tool_call.function.name}\")\n",
+ " print_highlight(f\"Arguments: {tool_call.function.arguments}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(server_process_tool_choice)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Pythonic Tool Call Format (Llama-3.2 / Llama-3.3 / Llama-4)\n",
+ "\n",
+ "Some Llama models (such as Llama-3.2-1B, Llama-3.2-3B, Llama-3.3-70B, and Llama-4) support a \"pythonic\" tool call format, where the model outputs function calls as Python code, e.g.:\n",
+ "\n",
+ "```python\n",
+ "[get_current_weather(city=\"San Francisco\", state=\"CA\", unit=\"celsius\")]\n",
+ "```\n",
+ "\n",
+ "- The output is a Python list of function calls, with arguments as Python literals (not JSON).\n",
+ "- Multiple tool calls can be returned in the same list:\n",
+ "```python\n",
+ "[get_current_weather(city=\"San Francisco\", state=\"CA\", unit=\"celsius\"),\n",
+ " get_current_weather(city=\"New York\", state=\"NY\", unit=\"fahrenheit\")]\n",
+ "```\n",
+ "\n",
+ "For more information, refer to Meta’s documentation on [Zero shot function calling](https://github.com/meta-llama/llama-models/blob/main/models/llama4/prompt_format.md#zero-shot-function-calling---system-message).\n",
+ "\n",
+ "Note that this feature is still under development on Blackwell.\n",
+ "\n",
+ "### How to enable\n",
+ "- Launch the server with `--tool-call-parser pythonic`\n",
+ "- You may also specify `--chat-template` with the improved template for the model (e.g., `--chat-template=examples/chat_template/tool_chat_template_llama4_pythonic.jinja`).\n",
+ "This is recommended because the model expects a special prompt format to reliably produce valid pythonic tool call outputs. The template ensures that the prompt structure (e.g., special tokens, message boundaries like `<|eom|>`, and function call delimiters) matches what the model was trained or fine-tuned on. If you do not use the correct chat template, tool calling may fail or produce inconsistent results.\n",
+ "\n",
+ "#### Forcing Pythonic Tool Call Output Without a Chat Template\n",
+ "If you don't want to specify a chat template, you must give the model extremely explicit instructions in your messages to enforce pythonic output. For example, for `Llama-3.2-1B-Instruct`, you need:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import openai\n",
+ "\n",
+ "server_process, port = launch_server_cmd(\n",
+ " \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1 --log-level warning\" # llama-3.2-1b-instruct\n",
+ ")\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n",
+ "\n",
+ "tools = [\n",
+ " {\n",
+ " \"type\": \"function\",\n",
+ " \"function\": {\n",
+ " \"name\": \"get_weather\",\n",
+ " \"description\": \"Get the current weather for a given location.\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"location\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The name of the city or location.\",\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\"location\"],\n",
+ " },\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"function\",\n",
+ " \"function\": {\n",
+ " \"name\": \"get_tourist_attractions\",\n",
+ " \"description\": \"Get a list of top tourist attractions for a given city.\",\n",
+ " \"parameters\": {\n",
+ " \"type\": \"object\",\n",
+ " \"properties\": {\n",
+ " \"city\": {\n",
+ " \"type\": \"string\",\n",
+ " \"description\": \"The name of the city to find attractions for.\",\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\"city\"],\n",
+ " },\n",
+ " },\n",
+ " },\n",
+ "]\n",
+ "\n",
+ "\n",
+ "def get_messages():\n",
+ " return [\n",
+ " {\n",
+ " \"role\": \"system\",\n",
+ " \"content\": (\n",
+ " \"You are a travel assistant. \"\n",
+ " \"When asked to call functions, ALWAYS respond ONLY with a python list of function calls, \"\n",
+ " \"using this format: [func_name1(param1=value1, param2=value2), func_name2(param=value)]. \"\n",
+ " \"Do NOT use JSON, do NOT use variables, do NOT use any other format. \"\n",
+ " \"Here is an example:\\n\"\n",
+ " '[get_weather(location=\"Paris\"), get_tourist_attractions(city=\"Paris\")]'\n",
+ " ),\n",
+ " },\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": (\n",
+ " \"I'm planning a trip to Tokyo next week. What's the weather like and what are some top tourist attractions? \"\n",
+ " \"Propose parallel tool calls at once, using the python list of function calls format as shown above.\"\n",
+ " ),\n",
+ " },\n",
+ " ]\n",
+ "\n",
+ "\n",
+ "messages = get_messages()\n",
+ "\n",
+ "client = openai.Client(base_url=f\"http://localhost:{port}/v1\", api_key=\"xxxxxx\")\n",
+ "model_name = client.models.list().data[0].id\n",
+ "\n",
+ "\n",
+ "response_non_stream = client.chat.completions.create(\n",
+ " model=model_name,\n",
+ " messages=messages,\n",
+ " temperature=0,\n",
+ " top_p=0.9,\n",
+ " stream=False, # Non-streaming\n",
+ " tools=tools,\n",
+ ")\n",
+ "print_highlight(\"Non-stream response:\")\n",
+ "print_highlight(response_non_stream)\n",
+ "\n",
+ "response_stream = client.chat.completions.create(\n",
+ " model=model_name,\n",
+ " messages=messages,\n",
+ " temperature=0,\n",
+ " top_p=0.9,\n",
+ " stream=True,\n",
+ " tools=tools,\n",
+ ")\n",
+ "texts = \"\"\n",
+ "tool_calls = []\n",
+ "name = \"\"\n",
+ "arguments = \"\"\n",
+ "\n",
+ "for chunk in response_stream:\n",
+ " if chunk.choices[0].delta.content:\n",
+ " texts += chunk.choices[0].delta.content\n",
+ " if chunk.choices[0].delta.tool_calls:\n",
+ " tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
+ "\n",
+ "print_highlight(\"Streaming Response:\")\n",
+ "print_highlight(\"==== Text ====\")\n",
+ "print_highlight(texts)\n",
+ "\n",
+ "print_highlight(\"==== Tool Call ====\")\n",
+ "for tool_call in tool_calls:\n",
+ " print_highlight(tool_call)\n",
+ "\n",
+ "terminate_process(server_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> **Note:** \n",
+ "> The model may still default to JSON if it was heavily finetuned on that format. Prompt engineering (including examples) is the only way to increase the chance of pythonic output if you are not using a chat template."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How to support a new model?\n",
+ "1. Update the TOOLS_TAG_LIST in sglang/srt/function_call_parser.py with the model’s tool tags. Currently supported tags include:\n",
+ "```\n",
+ "\tTOOLS_TAG_LIST = [\n",
+ "\t \"<|plugin|>\",\n",
+ "\t \"<function=\",\n",
+ "\t \"<tool_call>\",\n",
+ "\t \"<|python_tag|>\",\n",
+ "\t \"[TOOL_CALLS]\"\n",
+ "\t]\n",
+ "```\n",
+ "2. Create a new detector class in sglang/srt/function_call_parser.py that inherits from BaseFormatDetector. The detector should handle the model’s specific function call format. For example:\n",
+ "```\n",
+ " class NewModelDetector(BaseFormatDetector):\n",
+ "```\n",
+ "3. Add the new detector to the MultiFormatParser class that manages all the format detectors."
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/sglang/docs/advanced_features/vlm_query.ipynb b/sglang/docs/advanced_features/vlm_query.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..13491d1b95a39b4cb454b864aa7fba929ca18252
--- /dev/null
+++ b/sglang/docs/advanced_features/vlm_query.ipynb
@@ -0,0 +1,388 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0",
+ "metadata": {},
+ "source": [
+ "# Query VLM with Offline Engine\n",
+ "\n",
+ "This tutorial demonstrates how to use SGLang's **offline Engine API** to query VLMs. We will demonstrate usage with Qwen2.5-VL and Llama 4. This section demonstrates three different calling approaches:\n",
+ "\n",
+ "1. **Basic Call**: Directly pass images and text.\n",
+ "2. **Processor Output**: Use HuggingFace processor for data preprocessing.\n",
+ "3. **Precomputed Embeddings**: Pre-calculate image features to improve inference efficiency."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1",
+ "metadata": {},
+ "source": [
+ "## Understanding the Three Input Formats\n",
+ "\n",
+ "SGLang supports three ways to pass visual data, each optimized for different scenarios:\n",
+ "\n",
+ "### 1. **Raw Images** - Simplest approach\n",
+ "- Pass PIL Images, file paths, URLs, or base64 strings directly\n",
+ "- SGLang handles all preprocessing automatically\n",
+ "- Best for: Quick prototyping, simple applications\n",
+ "\n",
+ "### 2. **Processor Output** - For custom preprocessing\n",
+ "- Pre-process images with HuggingFace processor\n",
+ "- Pass the complete processor output dict with `format: \"processor_output\"`\n",
+ "- Best for: Custom image transformations, integration with existing pipelines\n",
+ "- Requirement: Must use `input_ids` instead of text prompt\n",
+ "\n",
+ "### 3. **Precomputed Embeddings** - For maximum performance\n",
+ "- Pre-calculate visual embeddings using the vision encoder\n",
+ "- Pass embeddings with `format: \"precomputed_embedding\"`\n",
+ "- Best for: Repeated queries on same images, caching, high-throughput serving\n",
+ "- Performance gain: Avoids redundant vision encoder computation (30-50% speedup)\n",
+ "\n",
+ "**Key Rule**: Within a single request, use only one format for all images. Don't mix formats.\n",
+ "\n",
+ "The examples below demonstrate all three approaches with both Qwen2.5-VL and Llama 4 models."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2",
+ "metadata": {},
+ "source": [
+ "## Querying Qwen2.5-VL Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import nest_asyncio\n",
+ "\n",
+ "nest_asyncio.apply()\n",
+ "\n",
+ "model_path = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
+ "chat_template = \"qwen2-vl\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from io import BytesIO\n",
+ "import requests\n",
+ "from PIL import Image\n",
+ "\n",
+ "from sglang.srt.parser.conversation import chat_templates\n",
+ "\n",
+ "image = Image.open(\n",
+ " BytesIO(\n",
+ " requests.get(\n",
+ " \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
+ " ).content\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "conv = chat_templates[chat_template].copy()\n",
+ "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
+ "conv.append_message(conv.roles[1], \"\")\n",
+ "conv.image_data = [image]\n",
+ "\n",
+ "print(\"Generated prompt text:\")\n",
+ "print(conv.get_prompt())\n",
+ "print(f\"\\nImage size: {image.size}\")\n",
+ "image"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5",
+ "metadata": {},
+ "source": [
+ "### Basic Offline Engine API Call"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sglang import Engine\n",
+ "\n",
+ "llm = Engine(model_path=model_path, chat_template=chat_template, log_level=\"warning\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
+ "print(\"Model response:\")\n",
+ "print(out[\"text\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8",
+ "metadata": {},
+ "source": [
+ "### Call with Processor Output\n",
+ "\n",
+ "Using a HuggingFace processor to preprocess text and images, and passing the `processor_output` directly into `Engine.generate`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoProcessor\n",
+ "\n",
+ "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
+ "processor_output = processor(\n",
+ " images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
+ ")\n",
+ "\n",
+ "out = llm.generate(\n",
+ " input_ids=processor_output[\"input_ids\"][0].detach().cpu().tolist(),\n",
+ " image_data=[dict(processor_output, format=\"processor_output\")],\n",
+ ")\n",
+ "print(\"Response using processor output:\")\n",
+ "print(out[\"text\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10",
+ "metadata": {},
+ "source": [
+ "### Call with Precomputed Embeddings\n",
+ "\n",
+ "You can pre-calculate image features to avoid repeated visual encoding processes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoProcessor\n",
+ "from transformers import Qwen2_5_VLForConditionalGeneration\n",
+ "\n",
+ "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
+ "vision = (\n",
+ " Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path).eval().visual.cuda()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "processor_output = processor(\n",
+ " images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
+ ")\n",
+ "\n",
+ "input_ids = processor_output[\"input_ids\"][0].detach().cpu().tolist()\n",
+ "\n",
+ "precomputed_embeddings = vision(\n",
+ " processor_output[\"pixel_values\"].cuda(), processor_output[\"image_grid_thw\"].cuda()\n",
+ ")\n",
+ "\n",
+ "multi_modal_item = dict(\n",
+ " processor_output,\n",
+ " format=\"precomputed_embedding\",\n",
+ " feature=precomputed_embeddings,\n",
+ ")\n",
+ "\n",
+ "out = llm.generate(input_ids=input_ids, image_data=[multi_modal_item])\n",
+ "print(\"Response using precomputed embeddings:\")\n",
+ "print(out[\"text\"])\n",
+ "\n",
+ "llm.shutdown()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "13",
+ "metadata": {},
+ "source": [
+ "## Querying Llama 4 Vision Model\n",
+ "\n",
+ "```python\n",
+ "model_path = \"meta-llama/Llama-4-Scout-17B-16E-Instruct\"\n",
+ "chat_template = \"llama-4\"\n",
+ "\n",
+ "from io import BytesIO\n",
+ "import requests\n",
+ "from PIL import Image\n",
+ "\n",
+ "from sglang.srt.parser.conversation import chat_templates\n",
+ "\n",
+ "# Download the same example image\n",
+ "image = Image.open(\n",
+ " BytesIO(\n",
+ " requests.get(\n",
+ " \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
+ " ).content\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "conv = chat_templates[chat_template].copy()\n",
+ "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
+ "conv.append_message(conv.roles[1], \"\")\n",
+ "conv.image_data = [image]\n",
+ "\n",
+ "print(\"Llama 4 generated prompt text:\")\n",
+ "print(conv.get_prompt())\n",
+ "print(f\"Image size: {image.size}\")\n",
+ "\n",
+ "image\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "14",
+ "metadata": {},
+ "source": [
+ "### Llama 4 Basic Call\n",
+ "\n",
+ "Llama 4 requires more computational resources, so it's configured with multi-GPU parallelism (tp_size=4) and larger context length.\n",
+ "\n",
+ "```python\n",
+ "llm = Engine(\n",
+ " model_path=model_path,\n",
+ " enable_multimodal=True,\n",
+ " attention_backend=\"fa3\",\n",
+ " tp_size=4,\n",
+ " context_length=65536,\n",
+ ")\n",
+ "\n",
+ "out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
+ "print(\"Llama 4 response:\")\n",
+ "print(out[\"text\"])\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15",
+ "metadata": {},
+ "source": [
+ "### Call with Processor Output\n",
+ "\n",
+ "Using HuggingFace processor to preprocess data can reduce computational overhead during inference.\n",
+ "\n",
+ "```python\n",
+ "from transformers import AutoProcessor\n",
+ "\n",
+ "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
+ "processor_output = processor(\n",
+ " images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
+ ")\n",
+ "\n",
+ "out = llm.generate(\n",
+ " input_ids=processor_output[\"input_ids\"][0].detach().cpu().tolist(),\n",
+ " image_data=[dict(processor_output, format=\"processor_output\")],\n",
+ ")\n",
+ "print(\"Response using processor output:\")\n",
+ "print(out)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16",
+ "metadata": {},
+ "source": [
+ "### Call with Precomputed Embeddings\n",
+ "\n",
+ "```python\n",
+ "from transformers import AutoProcessor\n",
+ "from transformers import Llama4ForConditionalGeneration\n",
+ "\n",
+ "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
+ "model = Llama4ForConditionalGeneration.from_pretrained(\n",
+ " model_path, torch_dtype=\"auto\"\n",
+ ").eval()\n",
+ "\n",
+ "vision = model.vision_model.cuda()\n",
+ "multi_modal_projector = model.multi_modal_projector.cuda()\n",
+ "\n",
+ "print(f'Image pixel values shape: {processor_output[\"pixel_values\"].shape}')\n",
+ "input_ids = processor_output[\"input_ids\"][0].detach().cpu().tolist()\n",
+ "\n",
+ "# Process image through vision encoder\n",
+ "image_outputs = vision(\n",
+ " processor_output[\"pixel_values\"].to(\"cuda\"), \n",
+ " aspect_ratio_ids=processor_output[\"aspect_ratio_ids\"].to(\"cuda\"),\n",
+ " aspect_ratio_mask=processor_output[\"aspect_ratio_mask\"].to(\"cuda\"),\n",
+ " output_hidden_states=False\n",
+ ")\n",
+ "image_features = image_outputs.last_hidden_state\n",
+ "\n",
+ "# Flatten image features and pass through multimodal projector\n",
+ "vision_flat = image_features.view(-1, image_features.size(-1))\n",
+ "precomputed_embeddings = multi_modal_projector(vision_flat)\n",
+ "\n",
+ "# Build precomputed embedding data item\n",
+ "mm_item = dict(\n",
+ " processor_output, \n",
+ " format=\"precomputed_embedding\", \n",
+ " feature=precomputed_embeddings\n",
+ ")\n",
+ "\n",
+ "# Use precomputed embeddings for efficient inference\n",
+ "out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
+ "print(\"Llama 4 precomputed embedding response:\")\n",
+ "print(out[\"text\"])\n",
+ "```"
+ ]
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "cell_metadata_filter": "-all",
+ "custom_cell_magics": "kql",
+ "encoding": "# -*- coding: utf-8 -*-",
+ "text_representation": {
+ "extension": ".py",
+ "format_name": "light",
+ "format_version": "1.5",
+ "jupytext_version": "1.16.1"
+ }
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/sglang/docs/basic_usage/deepseek_ocr.md b/sglang/docs/basic_usage/deepseek_ocr.md
new file mode 100644
index 0000000000000000000000000000000000000000..6f62713ebab4923e37fa35acd5ee4b3b08e35b87
--- /dev/null
+++ b/sglang/docs/basic_usage/deepseek_ocr.md
@@ -0,0 +1,54 @@
+# DeepSeek OCR (OCR-1 / OCR-2)
+
+DeepSeek OCR models are multimodal (image + text) models for OCR and document understanding.
+
+## Launch server
+
+```shell
+python -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-OCR-2 \
+ --trust-remote-code \
+ --host 0.0.0.0 \
+ --port 30000
+```
+
+> You can replace `deepseek-ai/DeepSeek-OCR-2` with `deepseek-ai/DeepSeek-OCR`.
+
+## Prompt examples
+
+Recommended prompts from the model card:
+
+```
+
+<|grounding|>Convert the document to markdown.
+```
+
+```
+
+Free OCR.
+```
+
+## OpenAI-compatible request example
+
+```python
+import requests
+
+url = "http://localhost:30000/v1/chat/completions"
+
+data = {
+ "model": "deepseek-ai/DeepSeek-OCR-2",
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "\n<|grounding|>Convert the document to markdown."},
+ {"type": "image_url", "image_url": {"url": "https://example.com/your_image.jpg"}},
+ ],
+ }
+ ],
+ "max_tokens": 512,
+}
+
+response = requests.post(url, json=data)
+print(response.text)
+```
diff --git a/sglang/docs/basic_usage/deepseek_v32.md b/sglang/docs/basic_usage/deepseek_v32.md
new file mode 100644
index 0000000000000000000000000000000000000000..4894954e70569a09cc70fbe8bb07ba0829e7c9e0
--- /dev/null
+++ b/sglang/docs/basic_usage/deepseek_v32.md
@@ -0,0 +1,459 @@
+# DeepSeek V3.2 Usage
+
+DeepSeek-V3.2 model family equips DeepSeek-V3.1-Terminus with DeepSeek Sparse Attention (DSA) through continued training. With DSA, a fine-grained sparse attention mechanism powered by a lightning indexer, DeepSeek-V3.2 achieves efficiency improvements in long-context scenarios.
+
+For reporting issues or tracking upcoming features, please refer to this [Roadmap](https://github.com/sgl-project/sglang/issues/11060).
+
+Note: This document is originally written for the usage of [DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp) model. The usage of [DeepSeek-V3.2](https://huggingface.co/deepseek-ai/DeepSeek-V3.2) or [DeepSeek-V3.2-Speciale](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale) is the same as DeepSeek-V3.2-Exp except for the tool call parser.
+
+
+## Installation
+
+### Docker
+
+```bash
+# H200/B200
+docker pull lmsysorg/sglang:latest
+
+# MI350/MI355
+docker pull lmsysorg/sglang:v0.5.8-rocm700-mi35x
+
+# MI300
+# v0.5.8-rocm700-mi30x does not include PR #17504. Prefer the newest MI30x ROCm
+# image tag from Docker Hub when available, or build from source (below).
+docker pull lmsysorg/sglang:v0.5.8-rocm700-mi30x
+
+
+# NPUs
+docker pull lmsysorg/sglang:dsv32-a2
+docker pull lmsysorg/sglang:dsv32-a3
+```
+
+### Build From Source
+
+```bash
+# Install SGLang
+git clone https://github.com/sgl-project/sglang
+cd sglang
+pip3 install pip --upgrade
+pip3 install -e "python"
+```
+## Launch DeepSeek V3.2 with SGLang
+
+To serve [DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp) on 8xH200/B200 GPUs:
+
+```bash
+# Launch with TP + DP (Recommended)
+python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --dp 8 --enable-dp-attention
+
+# Launch with EP + DP
+python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --ep 8 --dp 8 --enable-dp-attention
+
+# Launch with Pure TP
+python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8
+
+# Launch with TP on MI30x/MI35x
+python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --nsa-prefill-backend tilelang --nsa-decode-backend tilelang
+```
+
+### Configuration Tips
+- **DP Attention (Recommended)**: For DeepSeek V3.2 model, the kernels are customized for the use case of `dp_size=8`, so DP attention (`--dp 8 --enable-dp-attention`) is the recommended configuration for better stability and performance. All test cases use this configuration by default.
+- **Pure TP Mode**: Launching with pure TP (without `--dp` and `--enable-dp-attention`) is also supported. Note that this mode has not been fully validated in PD disaggregation scenarios.
+- **Short-sequence MHA prefill (adaptive)**: For short prefill sequences (default threshold: **2048 tokens**), the NSA backend uses standard MHA automatically (no extra flags). On H200 (SM90) this path uses the FlashAttention variable-length kernel; on B200 (SM100) it uses TRT-LLM ragged MHA. MHA uses `MHA_ONE_SHOT` for best performance. `MHA_ONE_SHOT` computes multi-head attention over all tokens (both cached prefix and newly extended tokens) in a single kernel invocation, avoiding the overhead of chunked KV cache processing. This achieves optimal throughput for short sequences where total sequence length fits within the chunk capacity limit.
+- **Choices of Attention Kernels**: The attention backend is automatically set to `nsa` attention backend for DeepSeek V3.2 model. In this backend, different kernels for sparse prefilling/decoding are implemented, which can be specified by `--nsa-prefill-backend` and `--nsa-decode-backend` server arguments. The choices of nsa prefill/decode attention kernels include:
+ - `flashmla_sparse`: `flash_mla_sparse_fwd` kernel from `flash_mla` library. Can run on both Hopper and Blackwell GPUs. It requires bf16 q, kv inputs.
+ - `flashmla_kv`: `flash_mla_with_kvcache` kernel from `flash_mla` library. Can run on both Hopper and Blackwell GPUs. It requires bf16 q, fp8 k_cache inputs.
+ - `fa3`: `flash_attn_with_kvcache` kernel from `flash_attn` library. Can only run on Hopper GPUs. It requires bf16 q, kv inputs.
+ - `tilelang`: `tilelang` implementation that can run on GPU, HPU and NPU.
+ - `aiter`: Aiter kernel on AMD GPUs. Can only be used as a decode kernel.
+ - `trtllm`: `trtllm-mla` sparse kernel from flashinfer library. Only runs on Blackwell GPUs. It requires QKV bf16 or QKV fp8.
+- On the basis of performance benchmarks, the default configurations on H200 and B200 are set as follows:
+ - H200: `flashmla_sparse` prefill attention (short-seq prefill uses MHA via FlashAttention varlen), `fa3` decode attention, `bf16` kv cache dtype.
+ - B200: `flashmla_auto` prefill attention (short-seq prefill uses MHA via TRT-LLM ragged), `flashmla_kv` decode attention, `fp8_e4m3` kv cache dtype. `flashmla_auto` enables automatic selection of either `flashmla_sparse` or `flashmla_kv` kernel for prefill based on KV cache dtype, hardware, and heuristics. When FP8 KV cache is enabled and `total_kv_tokens < total_q_tokens * 512`, it uses the `flashmla_sparse` kernel; otherwise, it falls back to the `flashmla_kv` kernel. The heuristics may need to be tuned if the performance of either the `flashmla_sparse` or `flashmla_kv` kernel changes significantly.
+- On the Blackwell platform, with a slight accuracy drop, performance can be boosted by up to 3x-5x
+ - B200: by choosing `trtllm` for both `--nsa-prefill-backend` and `--nsa-decode-backend`, the prefill attention uses MHA via TRT-LLM ragged for both short and long sequences (**accuracy impact**). Combining `trtllm` with the `fp8_e4m3` kv cache, the kv cache dim is `576` (kv_lora_rank + qk_rope_head_dim) (**accuracy impact**), compared to the combination of `flashmla_auto` and `fp8_e4m3`, whose kv cache dim is `656` (kv_lora_rank + scale storage (kv_lora_rank // quant_block_size * 4 bytes) + rope dimension storage).
+
+
+## Multi-token Prediction
+SGLang implements Multi-Token Prediction (MTP) for DeepSeek V3.2 based on [EAGLE speculative decoding](https://docs.sglang.io/advanced_features/speculative_decoding.html#EAGLE-Decoding). With this optimization, the decoding speed can be improved significantly on small batch sizes. Please look at [this PR](https://github.com/sgl-project/sglang/pull/11652) for more information.
+
+Example usage with DP Attention:
+```bash
+python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --dp 8 --enable-dp-attention --speculative-algorithm EAGLE --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4
+```
+
+Example usage with Pure TP:
+```bash
+python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --speculative-algorithm EAGLE --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4
+```
+
+- The best configuration for `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` can be searched with [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py) script for given batch size. The minimum configuration is `--speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2`, which can achieve speedup for larger batch sizes.
+- The default value of `--max-running-requests` is set to `48` for MTP. For larger batch sizes, this value should be increased beyond the default value.
+
+```{tip}
+To enable the experimental overlap scheduler for EAGLE speculative decoding, set the environment variable `SGLANG_ENABLE_SPEC_V2=1`. This can improve performance by enabling overlap scheduling between draft and verification stages.
+```
+
+
+## Function Calling and Reasoning Parser
+The usage of function calling and reasoning parser is the same as DeepSeek V3.1. Please refer to [Reasoning Parser](https://docs.sglang.io/advanced_features/separate_reasoning.html) and [Tool Parser](https://docs.sglang.io/advanced_features/tool_parser.html) documents.
+
+To launch `DeepSeek-V3.2-Exp` with function calling and reasoning parser:
+> Note: It is recommended to specify the chat-template, ensuring that you are within the sglang's root directory.
+```bash
+python3 -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
+ --trust-remote-code \
+ --tp-size 8 --dp-size 8 --enable-dp-attention \
+ --tool-call-parser deepseekv31 \
+ --reasoning-parser deepseek-v3 \
+ --chat-template ./examples/chat_template/tool_chat_template_deepseekv32.jinja
+```
+
+To launch `DeepSeek-V3.2` with function calling and reasoning parser:
+```bash
+python3 -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-V3.2 \
+ --trust-remote-code \
+ --tp-size 8 --dp-size 8 --enable-dp-attention \
+ --tool-call-parser deepseekv32 \
+ --reasoning-parser deepseek-v3
+```
+
+`DeepSeek-V3.2-Speciale` doesn't support tool calling, so can only be launched with reasoning parser:
+```bash
+python3 -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-V3.2-Speciale \
+ --trust-remote-code \
+ --tp-size 8 --dp-size 8 --enable-dp-attention \
+ --reasoning-parser deepseek-v3
+```
+
+## NVFP4 Checkpoint
+
+To launch the DeepSeek V3.2 [NVFP4 checkpoint](https://huggingface.co/nvidia/DeepSeek-V3.2-NVFP4) on Blackwell devices, the user needs to specify the quantization method as `modelopt_fp4`, and the moe runner backend as one of `flashinfer_trtllm` (recommended), `flashinfer_cutlass` and `flashinfer_cutedsl`. Any other usage (parallelism, reasoning parser, ...) is the same as the FP8 checkpoint.
+
+An example launching command can be:
+```bash
+python -m sglang.launch_server --model nvidia/DeepSeek-V3.2-NVFP4 --tp 4 --quantization modelopt_fp4 --moe-runner-backend flashinfer_trtllm --tool-call-parser deepseekv32 --reasoning-parser deepseek-v3
+```
+
+## PD Disaggregation
+
+Prefill Command:
+```bash
+python -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
+ --disaggregation-mode prefill \
+ --host $LOCAL_IP \
+ --port $PORT \
+ --tp 8 \
+ --dp 8 \
+ --enable-dp-attention \
+ --dist-init-addr ${HOST}:${DIST_PORT} \
+ --trust-remote-code \
+ --disaggregation-bootstrap-port 8998 \
+ --mem-fraction-static 0.9 \
+```
+
+Decode command:
+```bash
+python -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
+ --disaggregation-mode decode \
+ --host $LOCAL_IP \
+ --port $PORT \
+ --tp 8 \
+ --dp 8 \
+ --enable-dp-attention \
+ --dist-init-addr ${HOST}:${DIST_PORT} \
+ --trust-remote-code \
+ --mem-fraction-static 0.9 \
+```
+
+Router command:
+```bash
+python -m sglang_router.launch_router --pd-disaggregation \
+ --prefill $PREFILL_ADDR 8998 \
+ --decode $DECODE_ADDR \
+ --host 127.0.0.1 \
+ --port 8000 \
+```
+
+If you need more advanced deployment methods or production-ready deployment methods, such as RBG or LWS-based deployment, please refer to [references/multi_node_deployment/rbg_pd/deepseekv32_pd.md](../references/multi_node_deployment/rbg_pd/deepseekv32_pd.md). Additionally, you can also find startup commands for DeepEP-based EP parallelism in the aforementioned documentation.
+
+
+## Benchmarking Results
+
+### Accuracy Test with `gsm8k`
+A simple accuracy benchmark can be tested with `gsm8k` dataset:
+```bash
+python3 benchmark/gsm8k/bench_sglang.py --num-shots 8 --num-questions 1319 --parallel 1319
+```
+
+The result is 0.956, which matches our expectation:
+```bash
+Accuracy: 0.956
+Invalid: 0.000
+Latency: 25.109 s
+Output throughput: 5226.235 token/s
+```
+
+To test long-context accuracy, run gsm8k with `--num-shots 20`. The results are very close to the 8 shots results:
+```
+Accuracy: 0.956
+Invalid: 0.000
+Latency: 29.545 s
+Output throughput: 4418.617 token/s
+```
+
+
+### Accuracy Test with `gpqa-diamond`
+
+Accuracy benchmark on long context can be tested on GPQA-diamond dataset with long output tokens and thinking enabled:
+```bash
+python3 -m sglang.test.run_eval --port 30000 --eval-name gpqa --num-examples 198 --max-tokens 128000 --repeat 8 --thinking-mode deepseek-v3
+```
+
+The mean accuracy over 8 runs is 0.797, which matches the number 0.799 in the official tech report.
+```bash
+Repeat: 8, mean: 0.797
+Scores: ['0.808', '0.798', '0.808', '0.798', '0.783', '0.788', '0.803', '0.793']
+```
+
+For DeepSeek V3.2, DeepSeek recommends setting the sampling parameters to temperature = 1.0, top_p = 0.95:
+
+```bash
+python3 -m sglang.test.run_eval --port 30000 --eval-name gpqa --num-examples 198 --max-tokens 128000 --repeat 8 --top-p 0.95 --temperature 1.0 --thinking-mode deepseek-v3
+
+Repeat: 8, mean: 0.840
+Scores: ['0.848', '0.808', '0.848', '0.838', '0.879', '0.813', '0.838', '0.848']
+```
+which matches the official score, 0.824, as reported in the [Deepseek-V3.2 technical report](https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/assets/paper.pdf).
+
+### Accuracy Test with `aime 2025`
+
+Prepare the environment by installing NeMo-Skills in the docker or your own virtual environment:
+
+ ```
+ pip install git+https://github.com/NVIDIA/NeMo-Skills.git --ignore-installed blinker
+ ```
+
+Then launch the SGLang server:
+```
+python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --dp 8 --enable-dp-attention
+```
+
+**For `DeepSeek-V3.2` and `DeepSeek-V3.2-Speciale`**:
+
+```
+python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3.2 --trust-remote-code --tp-size 8 --dp-size 8 --enable-dp-attention --tool-call-parser deepseekv32 --reasoning-parser deepseek-v3
+```
+
+Run the following script to evaluate AIME 2025:
+```
+#! /bin/bash
+export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1
+
+ns prepare_data aime25
+
+PORT=30000
+BACKEND=sglang
+MODEL="deepseek-ai/DeepSeek-V3.2-Exp" # Should be changed to the model name
+MODEL_NAME="dsv32-fp8"
+
+echo "Starting AIME25 evaluation with model $MODEL on port $PORT using backend $BACKEND..."
+ns eval \
+ --benchmarks=aime25:4 \
+ --server_type=$BACKEND \
+ --model=$MODEL \
+ --server_address=http://localhost:${PORT}/v1 \
+ --output_dir=nemo_skills_aime25_${MODEL_NAME}_output_${BACKEND}_$(date +%Y%m%d_%H%M%S) \
+ ++chat_template_kwargs.thinking=true \
+ ++inference.temperature=1.0 \
+ ++inference.top_p=0.95 \
+ ++inference.tokens_to_generate=64000
+ # ++inference.tokens_to_generate=120000 for Speciale model
+```
+
+Test results (8*B200):
+
+DeepSeek-V3.2-Exp:
+
+| evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer |
+|--------------------|-------------|------------|-------------|-----------------------|-----------|
+| pass@1[avg-of-4] | 30 | 15040 | 1673 | 87.50% ± 1.67% | 0.00% |
+| majority@4 | 30 | 15040 | 1673 | 90.00% | 0.00% |
+| pass@4 | 30 | 15040 | 1673 | 90.00% | 0.00% |
+
+
+DeepSeek-V3.2:
+| evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer |
+|--------------------|-------------|------------|-------------|-----------------------|-----------|
+| pass@1[avg-of-4] | 30 | 13550 | 1632 | 92.50% ± 1.67% | 0.00% |
+| majority@4 | 30 | 13550 | 1632 | 94.71% | 0.00% |
+| pass@4 | 30 | 13550 | 1632 | 96.67% | 0.00% |
+
+
+DeepSeek-V3.2-Speciale:
+| evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer |
+|--------------------|-------------|------------|-------------|-----------------------|-----------|
+| pass@1[avg-of-4] | 30 | 24155 | 3583 | 95.00% ± 1.92% | 0.00% |
+| majority@4 | 30 | 24155 | 3583 | 95.83% | 0.00% |
+| pass@4 | 30 | 24155 | 3583 | 100.00% | 0.00% |
+
+
+
+## DSA long sequence context parallel optimization (experimental)
+
+**Note: This feature is only verified on Hopper machines**
+
+For context parallel in DeepSeek V3.2 model, we provide two different modes of splitting tokens, which can be controlled with argument `--nsa-prefill-cp-mode`.
+
+### In sequence splitting
+
+The first mode can be enabled by `--nsa-prefill-cp-mode in-seq-split`. This mode implements context parallel for DSA by splitting the sequence uniformly between context parallel ranks. At the attention stage, each cp rank computes the indexer results of the sharded sequence, and collects the whole kv cache through an all-gather operator. Set `attn_cp_size` to configure the communication group for context parallel.
+
+Note that the in-sequence splitting mode has the following restrictions:
+- The batch size is restricted to 1 for prefill batches
+- `moe_dense_tp_size=1`, `moe_a2a_backend = "deepep"`
+- To ensure `cp_size > 1`, the passed in `tp_size` must be larger than `dp_size`
+
+For more details, please refer to PR https://github.com/sgl-project/sglang/pull/12065.
+
+Example:
+```bash
+# In-seq splitting mode launched with EP + DP
+python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --ep 8 --dp 2 --enable-dp-attention --enable-nsa-prefill-context-parallel --attn-cp-size 4 --nsa-prefill-cp-mode in-seq-split --max-running-requests 32
+```
+
+### Round robin splitting (default setting)
+
+This mode can be enabled by specifying the parameter `--nsa-prefill-cp-mode round-robin-split`, which distributes tokens across ranks based on `token_idx % cp_size`.
+
+In this scenario, compared with the aforementioned method, it additionally supports the fused MoE backend (the fused MoE backend may deliver better performance than DeepEP in single-machine scenarios), FP8 KV-cache, and multi-batch prefill inference. However, it cannot be enabled together with DP attention.
+
+For more details, please refer to PR https://github.com/sgl-project/sglang/pull/13959.
+
+Example usage:
+```bash
+# Launch with FusedMoe + CP8
+python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --enable-nsa-prefill-context-parallel --attn-cp-size 8 --nsa-prefill-cp-mode round-robin-split --max-running-requests 32
+```
+### Pipeline Parallel + Context Parallel (PP + CP)
+
+This mode combines Pipeline Parallelism (PP) and Context Parallelism (CP) to scale across multiple nodes, which can achieve better throughput and Time To First Token (TTFT). Note that this method has only been tested on H20 96G.
+
+#### Standard Usage
+
+To launch with PP=2 and CP (via `round-robin-split` mode) on 2 nodes. This configuration uses the fused MoE kernel by default, which generally provides better performance.
+
+For related development details, please refer to:
+- Fused MoE + CP support: [PR #13959](https://github.com/sgl-project/sglang/pull/13959)
+- PP + CP support: [Issue #15358](https://github.com/sgl-project/sglang/issues/15358) and [PR #16380](https://github.com/sgl-project/sglang/pull/16380)
+
+Node 0:
+```bash
+export SGLANG_PP_LAYER_PARTITION=30,31
+python3 -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
+ --nnodes 2 --node-rank 0 \
+ --dist-init-addr :62001 \
+ --tp 8 --pp-size 2 \
+ --dp-size 1 --moe-dense-tp-size 1 \
+ --enable-nsa-prefill-context-parallel \
+ --attn-cp-size 8 \
+ --nsa-prefill-cp-mode round-robin-split \
+ --trust-remote-code \
+ --disable-radix-cache \
+ --mem-fraction-static 0.8 \
+ --max-running-requests 128 \
+ --chunked-prefill-size 16384 \
+ --cuda-graph-max-bs 8 \
+ --page-size 64 \
+ --watchdog-timeout 3600 \
+ --host 0.0.0.0 --port 8000 \
+ --tool-call-parser deepseekv32
+```
+
+Node 1:
+```bash
+export SGLANG_PP_LAYER_PARTITION=30,31
+python3 -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
+ --nnodes 2 --node-rank 1 \
+ --dist-init-addr :62001 \
+ --tp 8 --pp-size 2 \
+ --dp-size 1 --moe-dense-tp-size 1 \
+ --enable-nsa-prefill-context-parallel \
+ --attn-cp-size 8 \
+ --nsa-prefill-cp-mode round-robin-split \
+ --trust-remote-code \
+ --disable-radix-cache \
+ --mem-fraction-static 0.8 \
+ --max-running-requests 128 \
+ --chunked-prefill-size 16384 \
+ --cuda-graph-max-bs 8 \
+ --page-size 64 \
+ --watchdog-timeout 3600 \
+ --host 0.0.0.0 --port 8000 \
+ --tool-call-parser deepseekv32
+```
+
+#### PD Disaggregation with PP + CP
+
+If using PD (Prefill-Decode) Disaggregation, the Prefill nodes can be configured with PP + CP as follows.
+
+Prefill Node 0:
+```bash
+python -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
+ --served-model-name deepseek-v32 \
+ --nnodes 2 --node-rank 0 \
+ --dist-init-addr :20102 \
+ --tp 8 --pp-size 2 \
+ --dp-size 1 --moe-dense-tp-size 1 \
+ --enable-nsa-prefill-context-parallel \
+ --attn-cp-size 8 \
+ --nsa-prefill-cp-mode round-robin-split \
+ --disaggregation-ib-device mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 \
+ --trust-remote-code \
+ --disable-radix-cache \
+ --max-running-requests 512 \
+ --chunked-prefill-size 4096 \
+ --context-length 131072 \
+ --mem-fraction-static 0.9 \
+ --page-size 64 \
+ --enable-metrics \
+ --collect-tokens-histogram \
+ --tokenizer-worker-num 8 \
+ --host 0.0.0.0 --port 30000
+```
+
+Prefill Node 1:
+```bash
+python -m sglang.launch_server \
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
+ --served-model-name deepseek-v32-prefill \
+ --nnodes 2 --node-rank 1 \
+ --dist-init-addr :20102 \
+ --tp 8 --pp-size 2 \
+ --dp-size 1 --moe-dense-tp-size 1 \
+ --enable-nsa-prefill-context-parallel \
+ --attn-cp-size 8 \
+ --nsa-prefill-cp-mode round-robin-split \
+ --disaggregation-ib-device mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 \
+ --trust-remote-code \
+ --disable-radix-cache \
+ --max-running-requests 512 \
+ --chunked-prefill-size 4096 \
+ --context-length 131072 \
+ --mem-fraction-static 0.9 \
+ --page-size 64 \
+ --enable-metrics \
+ --collect-tokens-histogram \
+ --tokenizer-worker-num 8 \
+ --host 0.0.0.0 --port 30000
+```
+
+For the Decode nodes, it is recommended to use the **EP mode**.
diff --git a/sglang/docs/basic_usage/glm45.md b/sglang/docs/basic_usage/glm45.md
new file mode 100644
index 0000000000000000000000000000000000000000..aaf8e13b456f644ee2bb40f9e95d9f2a9af3f7d7
--- /dev/null
+++ b/sglang/docs/basic_usage/glm45.md
@@ -0,0 +1,70 @@
+## Launch GLM-4.5 / GLM-4.6 / GLM-4.7 with SGLang
+
+To serve GLM-4.5 / GLM-4.6 FP8 models on 8xH100/H200 GPUs:
+
+```bash
+python3 -m sglang.launch_server --model zai-org/GLM-4.6-FP8 --tp 8
+```
+
+### EAGLE Speculative Decoding
+
+**Description**: SGLang supports GLM-4.5 / GLM-4.6 models
+with [EAGLE speculative decoding](https://docs.sglang.io/advanced_features/speculative_decoding.html#EAGLE-Decoding).
+
+**Usage**:
+Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and
+`--speculative-num-draft-tokens` to enable this feature. For example:
+
+``` bash
+python3 -m sglang.launch_server \
+ --model-path zai-org/GLM-4.6-FP8 \
+ --tp-size 8 \
+ --tool-call-parser glm45 \
+ --reasoning-parser glm45 \
+ --speculative-algorithm EAGLE \
+ --speculative-num-steps 3 \
+ --speculative-eagle-topk 1 \
+ --speculative-num-draft-tokens 4 \
+ --mem-fraction-static 0.9 \
+ --served-model-name glm-4.6-fp8 \
+ --enable-custom-logit-processor
+```
+
+```{tip}
+To enable the experimental overlap scheduler for EAGLE speculative decoding, set the environment variable `SGLANG_ENABLE_SPEC_V2=1`. This can improve performance by enabling overlap scheduling between draft and verification stages.
+```
+
+### Thinking Budget for GLM-4.5 / GLM-4.6
+**Note**: For GLM-4.7, `--tool-call-parser` should be set to `glm47`, for GLM-4.5 and GLM-4.6, it should be set to `glm45`.
+
+In SGLang, we can implement thinking budget with `CustomLogitProcessor`.
+
+Launch a server with `--enable-custom-logit-processor` flag on.
+
+Sample Request:
+
+```python
+import openai
+from rich.pretty import pprint
+from sglang.srt.sampling.custom_logit_processor import Glm4MoeThinkingBudgetLogitProcessor
+
+
+client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="*")
+response = client.chat.completions.create(
+ model="zai-org/GLM-4.6",
+ messages=[
+ {
+ "role": "user",
+ "content": "Question: Is Paris the Capital of France?",
+ }
+ ],
+ max_tokens=1024,
+ extra_body={
+ "custom_logit_processor": Glm4MoeThinkingBudgetLogitProcessor().to_str(),
+ "custom_params": {
+ "thinking_budget": 512,
+ },
+ },
+)
+pprint(response)
+```
diff --git a/sglang/docs/basic_usage/glmv.md b/sglang/docs/basic_usage/glmv.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad36cea26ad2f86f0f8edcb428406a45f99ca25e
--- /dev/null
+++ b/sglang/docs/basic_usage/glmv.md
@@ -0,0 +1,136 @@
+# GLM-4.6V / GLM-4.5V Usage
+
+## Launch commands for SGLang
+
+Below are suggested launch commands tailored for different hardware / precision modes
+
+### FP8 (quantised) mode
+
+For high memory-efficiency and latency optimized deployments (e.g., on H100, H200) where FP8 checkpoint is supported:
+
+```bash
+python3 -m sglang.launch_server \
+ --model-path zai-org/GLM-4.6V-FP8 \
+ --tp 2 \
+ --ep 2 \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --keep-mm-feature-on-device
+```
+
+### Non-FP8 (BF16 / full precision) mode
+For deployments on A100/H100 where BF16 is used (or FP8 snapshot not used):
+```bash
+python3 -m sglang.launch_server \
+ --model-path zai-org/GLM-4.6V \
+ --tp 4 \
+ --ep 4 \
+ --host 0.0.0.0 \
+ --port 30000
+```
+
+## Hardware-specific notes / recommendations
+
+- On H100 with FP8: Use the FP8 checkpoint for best memory efficiency.
+- On A100 / H100 with BF16 (non-FP8): It’s recommended to use `--mm-max-concurrent-calls` to control parallel throughput and GPU memory usage during image/video inference.
+- On H200 & B200: The model can be run “out of the box”, supporting full context length plus concurrent image + video processing.
+
+## Sending Image/Video Requests
+
+### Image input:
+
+```python
+import requests
+
+url = f"http://localhost:30000/v1/chat/completions"
+
+data = {
+ "model": "zai-org/GLM-4.6V",
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What’s in this image?"},
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true"
+ },
+ },
+ ],
+ }
+ ],
+ "max_tokens": 300,
+}
+
+response = requests.post(url, json=data)
+print(response.text)
+```
+
+### Video Input:
+
+```python
+import requests
+
+url = f"http://localhost:30000/v1/chat/completions"
+
+data = {
+ "model": "zai-org/GLM-4.6V",
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What’s happening in this video?"},
+ {
+ "type": "video_url",
+ "video_url": {
+ "url": "https://github.com/sgl-project/sgl-test-files/raw/refs/heads/main/videos/jobs_presenting_ipod.mp4"
+ },
+ },
+ ],
+ }
+ ],
+ "max_tokens": 300,
+}
+
+response = requests.post(url, json=data)
+print(response.text)
+```
+
+## Important Server Parameters and Flags
+
+When launching the model server for **multimodal support**, you can use the following command-line arguments to fine-tune performance and behavior:
+
+- `--mm-attention-backend`: Specify the multimodal attention backend, e.g., `fa3` (Flash Attention 3)
+- `--mm-max-concurrent-calls`: Specifies the **maximum number of concurrent asynchronous multimodal data processing calls** allowed on the server. Use this to control parallel throughput and GPU memory usage during image/video inference.
+- `--mm-per-request-timeout`: Defines the **timeout duration (in seconds)** for each multimodal request. If a request exceeds this time limit (e.g., for very large video inputs), it will be automatically terminated.
+- `--keep-mm-feature-on-device`: Instructs the server to **retain multimodal feature tensors on the GPU** after processing. This avoids device-to-host (D2H) memory copies and improves performance for repeated or high-frequency inference workloads.
+- `--mm-enable-dp-encoder`: Placing the ViT in data parallel while keeping the LLM in tensor parallel consistently lowers TTFT and boosts end-to-end throughput.
+- `SGLANG_USE_CUDA_IPC_TRANSPORT=1`: Enables shared-memory-pool-based CUDA IPC for multimodal data transport, significantly improving e2e latency.
+
+### Example usage with the above optimizations:
+```bash
+SGLANG_USE_CUDA_IPC_TRANSPORT=1 \
+SGLANG_VLM_CACHE_SIZE_MB=0 \
+python -m sglang.launch_server \
+ --model-path zai-org/GLM-4.6V \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --trust-remote-code \
+ --tp-size 8 \
+ --enable-cache-report \
+ --log-level info \
+ --max-running-requests 64 \
+ --mem-fraction-static 0.65 \
+ --chunked-prefill-size 8192 \
+ --attention-backend fa3 \
+ --mm-attention-backend fa3 \
+ --mm-enable-dp-encoder \
+ --enable-metrics
+```
+
+### Thinking Budget for GLM-4.5V / GLM-4.6V
+
+In SGLang, we can implement thinking budget with `CustomLogitProcessor`.
+
+Launch a server with the `--enable-custom-logit-processor` flag. Then, use `Glm4MoeThinkingBudgetLogitProcessor` in the request, similar to the `GLM-4.6` example in [glm45.md](./glm45.md).
diff --git a/sglang/docs/basic_usage/gpt_oss.md b/sglang/docs/basic_usage/gpt_oss.md
new file mode 100644
index 0000000000000000000000000000000000000000..9f81da4f4ee99d42f12b3ce71720c17319073f33
--- /dev/null
+++ b/sglang/docs/basic_usage/gpt_oss.md
@@ -0,0 +1,147 @@
+# GPT OSS Usage
+
+Please refer to [https://github.com/sgl-project/sglang/issues/8833](https://github.com/sgl-project/sglang/issues/8833).
+
+## Responses API & Built-in Tools
+
+### Responses API
+
+GPT‑OSS is compatible with the OpenAI Responses API. Use `client.responses.create(...)` with `model`, `instructions`, `input`, and optional `tools` to enable built‑in tool use. You can set reasoning level via `instructions`, e.g., "Reasoning: high" (also supports "medium" and "low") — levels: low (fast), medium (balanced), high (deep).
+
+### Built-in Tools
+
+GPT‑OSS can call built‑in tools for web search and Python execution. You can use the demo tool server or connect to external MCP tool servers.
+
+#### Python Tool
+
+- Executes short Python snippets for calculations, parsing, and quick scripts.
+- By default runs in a Docker-based sandbox. To run on the host, set `PYTHON_EXECUTION_BACKEND=UV` (this executes model-generated code locally; use with care).
+- Ensure Docker is available if you are not using the UV backend. It is recommended to run `docker pull python:3.11` in advance.
+
+#### Web Search Tool
+
+- Uses the Exa backend for web search.
+- Requires an Exa API key; set `EXA_API_KEY` in your environment. Create a key at `https://exa.ai`.
+
+### Tool & Reasoning Parser
+
+- We support the OpenAI Reasoning and Tool Call parsers, as well as our SGLang native API for tool calls and reasoning. Refer to [reasoning parser](../advanced_features/separate_reasoning.ipynb) and [tool call parser](../advanced_features/function_calling.ipynb) for more details.
+
+
+## Notes
+
+- Use **Python 3.12** for the demo tools. And install the required `gpt-oss` packages.
+- The default demo integrates the web search tool (Exa backend) and a demo Python interpreter via Docker.
+- For search, set `EXA_API_KEY`. For Python execution, either have Docker available or set `PYTHON_EXECUTION_BACKEND=UV`.
+
+Examples:
+```bash
+export EXA_API_KEY=YOUR_EXA_KEY
+# Optional: run Python tool locally instead of Docker (use with care)
+export PYTHON_EXECUTION_BACKEND=UV
+```
+
+Launch the server with the demo tool server:
+
+```bash
+python3 -m sglang.launch_server \
+ --model-path openai/gpt-oss-120b \
+ --tool-server demo \
+ --tp 2
+```
+
+For production usage, sglang can act as an MCP client for multiple services. An [example tool server](https://github.com/openai/gpt-oss/tree/main/gpt-oss-mcp-server) is provided. Start the servers and point sglang to them:
+```bash
+mcp run -t sse browser_server.py:mcp
+mcp run -t sse python_server.py:mcp
+
+python -m sglang.launch_server ... --tool-server ip-1:port-1,ip-2:port-2
+```
+The URLs should be MCP SSE servers that expose server information and well-documented tools. These tools are added to the system prompt so the model can use them.
+
+## Speculative Decoding
+
+SGLang supports speculative decoding for GPT-OSS models using EAGLE3 algorithm. This can significantly improve decoding speed, especially for small batch sizes.
+
+**Usage**:
+Add `--speculative-algorithm EAGLE3` along with the draft model path.
+```bash
+python3 -m sglang.launch_server \
+ --model-path openai/gpt-oss-120b \
+ --speculative-algorithm EAGLE3 \
+ --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 \
+ --tp 2
+```
+
+```{tip}
+To enable the experimental overlap scheduler for EAGLE3 speculative decoding, set the environment variable `SGLANG_ENABLE_SPEC_V2=1`. This can improve performance by enabling overlap scheduling between draft and verification stages.
+```
+
+### Quick Demo
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+ base_url="http://localhost:30000/v1",
+ api_key="sk-123456"
+)
+
+tools = [
+ {"type": "code_interpreter"},
+ {"type": "web_search_preview"},
+]
+
+# Reasoning level example
+response = client.responses.create(
+ model="openai/gpt-oss-120b",
+ instructions="You are a helpful assistant.",
+ reasoning_effort="high",  # Supports high, medium, or low
+ input="In one sentence, explain the transformer architecture.",
+)
+print("====== reasoning: high ======")
+print(response.output_text)
+
+# Test python tool
+response = client.responses.create(
+ model="openai/gpt-oss-120b",
+ instructions="You are a helpful assistant, you could use python tool to execute code.",
+ input="Use python tool to calculate the sum of 29138749187 and 29138749187", # 58,277,498,374
+ tools=tools
+)
+print("====== test python tool ======")
+print(response.output_text)
+
+# Test browser tool
+response = client.responses.create(
+ model="openai/gpt-oss-120b",
+ instructions="You are a helpful assistant, you could use browser to search the web",
+ input="Search the web for the latest news about Nvidia stock price",
+ tools=tools
+)
+print("====== test browser tool ======")
+print(response.output_text)
+```
+
+Example output:
+```
+====== test python tool ======
+The sum of 29,138,749,187 and 29,138,749,187 is **58,277,498,374**.
+====== test browser tool ======
+**Recent headlines on Nvidia (NVDA) stock**
+
+| Date (2025) | Source | Key news points | Stock‑price detail |
+|-------------|--------|----------------|--------------------|
+| **May 13** | Reuters | The market data page shows Nvidia trading “higher” at **$116.61** with no change from the previous close. | **$116.61** – latest trade (delayed ≈ 15 min)【14†L34-L38】 |
+| **Aug 18** | CNBC | Morgan Stanley kept an **overweight** rating and lifted its price target to **$206** (up from $200), implying a 14 % upside from the Friday close. The firm notes Nvidia shares have already **jumped 34 % this year**. | No exact price quoted, but the article signals strong upside expectations【9†L27-L31】 |
+| **Aug 20** | The Motley Fool | Nvidia is set to release its Q2 earnings on Aug 27. The article lists the **current price of $175.36**, down 0.16 % on the day (as of 3:58 p.m. ET). | **$175.36** – current price on Aug 20【10†L12-L15】【10†L53-L57】 |
+
+**What the news tells us**
+
+* Nvidia’s share price has risen sharply this year – up roughly a third according to Morgan Stanley – and analysts are still raising targets (now $206).
+* The most recent market quote (Reuters, May 13) was **$116.61**, but the stock has surged since then, reaching **$175.36** by mid‑August.
+* Upcoming earnings on **Aug 27** are a focal point; both the Motley Fool and Morgan Stanley expect the results could keep the rally going.
+
+**Bottom line:** Nvidia’s stock is on a strong upward trajectory in 2025, with price targets climbing toward $200‑$210 and the market price already near $175 as of late August.
+
+```
diff --git a/sglang/docs/basic_usage/llama4.md b/sglang/docs/basic_usage/llama4.md
new file mode 100644
index 0000000000000000000000000000000000000000..05ffb2c60cd8349bf8acbbea01ef1791a4504741
--- /dev/null
+++ b/sglang/docs/basic_usage/llama4.md
@@ -0,0 +1,92 @@
+# Llama4 Usage
+
+[Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/MODEL_CARD.md) is Meta's latest generation of open-source LLM model with industry-leading performance.
+
+SGLang has supported Llama 4 Scout (109B) and Llama 4 Maverick (400B) since [v0.4.5](https://github.com/sgl-project/sglang/releases/tag/v0.4.5).
+
+Ongoing optimizations are tracked in the [Roadmap](https://github.com/sgl-project/sglang/issues/5118).
+
+## Launch Llama 4 with SGLang
+
+To serve Llama 4 models on 8xH100/H200 GPUs:
+
+```bash
+python3 -m sglang.launch_server \
+ --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \
+ --tp 8 \
+ --context-length 1000000
+```
+
+### Configuration Tips
+
+- **OOM Mitigation**: Adjust `--context-length` to avoid a GPU out-of-memory issue. For the Scout model, we recommend setting this value up to 1M on 8\*H100 and up to 2.5M on 8\*H200. For the Maverick model, we don't need to set context length on 8\*H200. When hybrid kv cache is enabled, `--context-length` can be set up to 5M on 8\*H100 and up to 10M on 8\*H200 for the Scout model.
+
+- **Attention Backend Auto-Selection**: SGLang automatically selects the optimal attention backend for Llama 4 based on your hardware. You typically don't need to specify `--attention-backend` manually:
+ - **Blackwell GPUs (B200/GB200)**: `trtllm_mha`
+ - **Hopper GPUs (H100/H200)**: `fa3`
+ - **AMD GPUs**: `aiter`
+ - **Intel XPU**: `intel_xpu`
+ - **Other platforms**: `triton` (fallback)
+
+ To override the auto-selection, explicitly specify `--attention-backend` with one of the supported backends: `fa3`, `aiter`, `triton`, `trtllm_mha`, or `intel_xpu`.
+
+- **Chat Template**: Add `--chat-template llama-4` for chat completion tasks.
+- **Enable Multi-Modal**: Add `--enable-multimodal` for multi-modal capabilities.
+- **Enable Hybrid-KVCache**: Set `--swa-full-tokens-ratio` to adjust the ratio of SWA layer (for Llama4, it's local attention layer) KV tokens / full layer KV tokens. (default: 0.8, range: 0-1)
+
+
+### EAGLE Speculative Decoding
+**Description**: SGLang has supported Llama 4 Maverick (400B) with [EAGLE speculative decoding](https://docs.sglang.io/advanced_features/speculative_decoding.html#EAGLE-Decoding).
+
+**Usage**:
+Add arguments `--speculative-draft-model-path`, `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
+```
+python3 -m sglang.launch_server \
+ --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct \
+ --speculative-algorithm EAGLE3 \
+ --speculative-draft-model-path nvidia/Llama-4-Maverick-17B-128E-Eagle3 \
+ --speculative-num-steps 3 \
+ --speculative-eagle-topk 1 \
+ --speculative-num-draft-tokens 4 \
+ --trust-remote-code \
+ --tp 8 \
+ --context-length 1000000
+```
+
+- **Note** The Llama 4 draft model *nvidia/Llama-4-Maverick-17B-128E-Eagle3* can only recognize conversations in chat mode.
+
+## Benchmarking Results
+
+### Accuracy Test with `lm_eval`
+
+The accuracy on SGLang for both Llama4 Scout and Llama4 Maverick can match the [official benchmark numbers](https://ai.meta.com/blog/llama-4-multimodal-intelligence/).
+
+Benchmark results on MMLU Pro dataset with 8*H100:
+| | Llama-4-Scout-17B-16E-Instruct | Llama-4-Maverick-17B-128E-Instruct |
+|--------------------|--------------------------------|-------------------------------------|
+| Official Benchmark | 74.3 | 80.5 |
+| SGLang | 75.2 | 80.7 |
+
+Commands:
+
+```bash
+# Llama-4-Scout-17B-16E-Instruct model
+python -m sglang.launch_server \
+ --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \
+ --port 30000 \
+ --tp 8 \
+ --mem-fraction-static 0.8 \
+ --context-length 65536
+lm_eval --model local-chat-completions --model_args model=meta-llama/Llama-4-Scout-17B-16E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 --tasks mmlu_pro --batch_size 128 --apply_chat_template --num_fewshot 0
+
+# Llama-4-Maverick-17B-128E-Instruct
+python -m sglang.launch_server \
+ --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct \
+ --port 30000 \
+ --tp 8 \
+ --mem-fraction-static 0.8 \
+ --context-length 65536
+lm_eval --model local-chat-completions --model_args model=meta-llama/Llama-4-Maverick-17B-128E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 --tasks mmlu_pro --batch_size 128 --apply_chat_template --num_fewshot 0
+```
+
+Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/5092).
diff --git a/sglang/docs/basic_usage/minimax_m2.md b/sglang/docs/basic_usage/minimax_m2.md
new file mode 100644
index 0000000000000000000000000000000000000000..7ca6ed809fcba3437114badc88d7e79e1e9dba2d
--- /dev/null
+++ b/sglang/docs/basic_usage/minimax_m2.md
@@ -0,0 +1,85 @@
+# MiniMax M2.5/M2.1/M2 Usage
+
+[MiniMax-M2.5](https://huggingface.co/MiniMaxAI/MiniMax-M2.5), [MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1), and [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) are advanced large language models created by [MiniMax](https://www.minimax.io/).
+
+The MiniMax-M2 series redefines efficiency for agents. These compact, fast, and cost-effective MoE models (230 billion total parameters with 10 billion active parameters) are built for elite performance in coding and agentic tasks, all while maintaining powerful general intelligence. With just 10 billion activated parameters, the MiniMax-M2 series provides sophisticated, end-to-end tool use performance expected from today's leading models, but in a streamlined form factor that makes deployment and scaling easier than ever.
+
+## Supported Models
+
+This guide applies to the following models. You only need to update the model name during deployment. The following examples use **MiniMax-M2**:
+
+- [MiniMaxAI/MiniMax-M2.5](https://huggingface.co/MiniMaxAI/MiniMax-M2.5)
+- [MiniMaxAI/MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1)
+- [MiniMaxAI/MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2)
+
+## System Requirements
+
+The following are recommended configurations; actual requirements should be adjusted based on your use case:
+
+- 4x 96GB GPUs: Supported context length of up to 400K tokens.
+- 8x 144GB GPUs: Supported context length of up to 3M tokens.
+
+## Deployment with Python
+
+4-GPU deployment command:
+
+```bash
+python -m sglang.launch_server \
+ --model-path MiniMaxAI/MiniMax-M2 \
+ --tp-size 4 \
+ --tool-call-parser minimax-m2 \
+ --reasoning-parser minimax-append-think \
+ --host 0.0.0.0 \
+ --trust-remote-code \
+ --port 8000 \
+ --mem-fraction-static 0.85
+```
+
+8-GPU deployment command:
+
+```bash
+python -m sglang.launch_server \
+ --model-path MiniMaxAI/MiniMax-M2 \
+ --tp-size 8 \
+ --ep-size 8 \
+ --tool-call-parser minimax-m2 \
+ --reasoning-parser minimax-append-think \
+ --host 0.0.0.0 \
+ --trust-remote-code \
+ --port 8000 \
+ --mem-fraction-static 0.85
+```
+
+### AMD GPUs (MI300X/MI325X/MI355X)
+
+8-GPU deployment command:
+
+```bash
+SGLANG_USE_AITER=1 python -m sglang.launch_server \
+ --model-path MiniMaxAI/MiniMax-M2.5 \
+ --tp-size 8 \
+ --ep-size 8 \
+ --attention-backend aiter \
+ --tool-call-parser minimax-m2 \
+ --reasoning-parser minimax-append-think \
+ --host 0.0.0.0 \
+ --trust-remote-code \
+ --port 8000 \
+ --mem-fraction-static 0.85
+```
+
+## Testing Deployment
+
+After startup, you can test the SGLang OpenAI-compatible API with the following command:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "MiniMaxAI/MiniMax-M2",
+ "messages": [
+ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+ {"role": "user", "content": [{"type": "text", "text": "Who won the world series in 2020?"}]}
+ ]
+ }'
+```
diff --git a/sglang/docs/basic_usage/native_api.ipynb b/sglang/docs/basic_usage/native_api.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..05f4f368830662ab2fa1aafa1062f29f20b3f774
--- /dev/null
+++ b/sglang/docs/basic_usage/native_api.ipynb
@@ -0,0 +1,667 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# SGLang Native APIs\n",
+ "\n",
+ "Apart from the OpenAI compatible APIs, the SGLang Runtime also provides its native server APIs. We introduce the following APIs:\n",
+ "\n",
+ "- `/generate` (text generation model)\n",
+ "- `/get_model_info`\n",
+ "- `/get_server_info`\n",
+ "- `/health`\n",
+ "- `/health_generate`\n",
+ "- `/flush_cache`\n",
+ "- `/update_weights`\n",
+ "- `/encode`(embedding model)\n",
+ "- `/v1/rerank`(cross encoder rerank model)\n",
+ "- `/v1/score`(decoder-only scoring)\n",
+ "- `/classify`(reward model)\n",
+ "- `/start_expert_distribution_record`\n",
+ "- `/stop_expert_distribution_record`\n",
+ "- `/dump_expert_distribution_record`\n",
+ "- `/tokenize`\n",
+ "- `/detokenize`\n",
+ "- A full list of these APIs can be found at [http_server.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server.py)\n",
+ "\n",
+ "We mainly use `requests` to test these APIs in the following examples. You can also use `curl`.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Launch A Server"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sglang.test.doc_patch import launch_server_cmd\n",
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+ "\n",
+ "server_process, port = launch_server_cmd(\n",
+ " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
+ ")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Generate (text generation model)\n",
+ "Generate completions. This is similar to the `/v1/completions` in OpenAI API. Detailed parameters can be found in the [sampling parameters](sampling_params.md)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "\n",
+ "url = f\"http://localhost:{port}/generate\"\n",
+ "data = {\"text\": \"What is the capital of France?\"}\n",
+ "\n",
+ "response = requests.post(url, json=data)\n",
+ "print_highlight(response.json())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get Model Info\n",
+ "\n",
+ "Get the information of the model.\n",
+ "\n",
+ "- `model_path`: The path/name of the model.\n",
+ "- `is_generation`: Whether the model is used as generation model or embedding model.\n",
+ "- `tokenizer_path`: The path/name of the tokenizer.\n",
+ "- `preferred_sampling_params`: The default sampling params specified via `--preferred-sampling-params`. `None` is returned in this example as we did not explicitly configure it in server args.\n",
+ "- `weight_version`: This field contains the version of the model weights. This is often used to track changes or updates to the model’s trained parameters.\n",
+ "- `has_image_understanding`: Whether the model has image-understanding capability.\n",
+ "- `has_audio_understanding`: Whether the model has audio-understanding capability.\n",
+ "- `model_type`: The model type from the HuggingFace config (e.g., \"qwen2\", \"llama\").\n",
+ "- `architectures`: The model architectures from the HuggingFace config (e.g., [\"Qwen2ForCausalLM\"])."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = f\"http://localhost:{port}/get_model_info\"\n",
+ "\n",
+ "response = requests.get(url)\n",
+ "response_json = response.json()\n",
+ "print_highlight(response_json)\n",
+ "assert response_json[\"model_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
+ "assert response_json[\"is_generation\"] is True\n",
+ "assert response_json[\"tokenizer_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
+ "assert response_json[\"preferred_sampling_params\"] is None\n",
+ "assert response_json.keys() == {\n",
+ " \"model_path\",\n",
+ " \"is_generation\",\n",
+ " \"tokenizer_path\",\n",
+ " \"preferred_sampling_params\",\n",
+ " \"weight_version\",\n",
+ " \"has_image_understanding\",\n",
+ " \"has_audio_understanding\",\n",
+ " \"model_type\",\n",
+ " \"architectures\",\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get Server Info\n",
+ "Gets the server information including CLI arguments, token limits, and memory pool sizes.\n",
+ "- Note: `get_server_info` merges the following deprecated endpoints:\n",
+ " - `get_server_args`\n",
+ " - `get_memory_pool_size`\n",
+ " - `get_max_total_num_tokens`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = f\"http://localhost:{port}/get_server_info\"\n",
+ "\n",
+ "response = requests.get(url)\n",
+ "print_highlight(response.text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Health Check\n",
+ "- `/health`: Check the health of the server.\n",
+ "- `/health_generate`: Check the health of the server by generating one token."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = f\"http://localhost:{port}/health_generate\"\n",
+ "\n",
+ "response = requests.get(url)\n",
+ "print_highlight(response.text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = f\"http://localhost:{port}/health\"\n",
+ "\n",
+ "response = requests.get(url)\n",
+ "print_highlight(response.text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Flush Cache\n",
+ "\n",
+ "Flush the radix cache. It will be automatically triggered when the model weights are updated by the `/update_weights` API."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = f\"http://localhost:{port}/flush_cache\"\n",
+ "\n",
+ "response = requests.post(url)\n",
+ "print_highlight(response.text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Update Weights From Disk\n",
+ "\n",
+ "Update model weights from disk without restarting the server. Only applicable for models with the same architecture and parameter size.\n",
+ "\n",
+ "SGLang supports the `update_weights_from_disk` API for continuous evaluation during training (save a checkpoint to disk and update weights from disk).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# successful update with same architecture and size\n",
+ "\n",
+ "url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
+ "data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct\"}\n",
+ "\n",
+ "response = requests.post(url, json=data)\n",
+ "print_highlight(response.text)\n",
+ "assert response.json()[\"success\"] is True\n",
+ "assert response.json()[\"message\"] == \"Succeeded to update model weights.\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# failed update with different parameter size or wrong name\n",
+ "\n",
+ "url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
+ "data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct-wrong\"}\n",
+ "\n",
+ "response = requests.post(url, json=data)\n",
+ "response_json = response.json()\n",
+ "print_highlight(response_json)\n",
+ "assert response_json[\"success\"] is False\n",
+ "assert response_json[\"message\"] == (\n",
+ " \"Failed to get weights iterator: \"\n",
+ " \"qwen/qwen2.5-0.5b-instruct-wrong\"\n",
+ " \" (repository not found).\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(server_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Encode (embedding model)\n",
+ "\n",
+ "Encode text into embeddings. Note that this API is only available for [embedding models](openai_api_embeddings.ipynb) and will raise an error for generation models.\n",
+ "Therefore, we launch a new server to serve an embedding model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "embedding_process, port = launch_server_cmd(\"\"\"\n",
+ "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
+ " --host 0.0.0.0 --is-embedding --log-level warning\n",
+ "\"\"\")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=embedding_process)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# successful encode for embedding model\n",
+ "\n",
+ "url = f\"http://localhost:{port}/encode\"\n",
+ "data = {\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"text\": \"Once upon a time\"}\n",
+ "\n",
+ "response = requests.post(url, json=data)\n",
+ "response_json = response.json()\n",
+ "print_highlight(f\"Text embedding (first 10): {response_json['embedding'][:10]}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(embedding_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## v1/rerank (cross encoder rerank model)\n",
+ "Rerank a list of documents given a query using a cross-encoder model. Note that this API is only available for cross encoder model like [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) with `attention-backend` `triton` and `torch_native`.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reranker_process, port = launch_server_cmd(\"\"\"\n",
+ "python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n",
+ " --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding --log-level warning\n",
+ "\"\"\")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=reranker_process)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# compute rerank scores for query and documents\n",
+ "\n",
+ "url = f\"http://localhost:{port}/v1/rerank\"\n",
+ "data = {\n",
+ " \"model\": \"BAAI/bge-reranker-v2-m3\",\n",
+ " \"query\": \"what is panda?\",\n",
+ " \"documents\": [\n",
+ " \"hi\",\n",
+ " \"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.\",\n",
+ " ],\n",
+ "}\n",
+ "\n",
+ "response = requests.post(url, json=data)\n",
+ "response_json = response.json()\n",
+ "for item in response_json:\n",
+ " print_highlight(f\"Score: {item['score']:.2f} - Document: '{item['document']}'\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(reranker_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## v1/score (decoder-only scoring)\n",
+ "\n",
+ "Compute token probabilities for specified tokens given a query and items. This is useful for classification tasks, scoring responses, or computing log-probabilities.\n",
+ "\n",
+ "Parameters:\n",
+ "- `query`: Query text\n",
+ "- `items`: Item text(s) to score\n",
+ "- `label_token_ids`: Token IDs to compute probabilities for\n",
+ "- `apply_softmax`: Whether to apply softmax to get normalized probabilities (default: False)\n",
+ "- `item_first`: Whether items come first in concatenation order (default: False)\n",
+ "- `model`: Model name\n",
+ "\n",
+ "The response contains `scores` - a list of probability lists, one per item, each in the order of `label_token_ids`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "score_process, port = launch_server_cmd(\"\"\"\n",
+ "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
+ " --host 0.0.0.0 --log-level warning\n",
+ "\"\"\")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=score_process)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Score the probability of different completions given a query\n",
+ "query = \"The capital of France is\"\n",
+ "items = [\"Paris\", \"London\", \"Berlin\"]\n",
+ "\n",
+ "url = f\"http://localhost:{port}/v1/score\"\n",
+ "data = {\n",
+ " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
+ " \"query\": query,\n",
+ " \"items\": items,\n",
+ " \"label_token_ids\": [9454, 2753], # e.g. \"Yes\" and \"No\" token ids\n",
+ " \"apply_softmax\": True, # Normalize probabilities to sum to 1\n",
+ "}\n",
+ "\n",
+ "response = requests.post(url, json=data)\n",
+ "response_json = response.json()\n",
+ "\n",
+ "# Display scores for each item\n",
+ "for item, scores in zip(items, response_json[\"scores\"]):\n",
+ " print_highlight(f\"Item '{item}': probabilities = {[f'{s:.4f}' for s in scores]}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(score_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Classify (reward model)\n",
+ "\n",
+ "SGLang Runtime also supports reward models. Here we use a reward model to classify the quality of pairwise generations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
+ "# This will be updated in the future.\n",
+ "\n",
+ "reward_process, port = launch_server_cmd(\"\"\"\n",
+ "python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding --log-level warning\n",
+ "\"\"\")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=reward_process)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer\n",
+ "\n",
+ "PROMPT = (\n",
+ " \"What is the range of the numeric output of a sigmoid node in a neural network?\"\n",
+ ")\n",
+ "\n",
+ "RESPONSE1 = \"The output of a sigmoid node is bounded between -1 and 1.\"\n",
+ "RESPONSE2 = \"The output of a sigmoid node is bounded between 0 and 1.\"\n",
+ "\n",
+ "CONVS = [\n",
+ " [{\"role\": \"user\", \"content\": PROMPT}, {\"role\": \"assistant\", \"content\": RESPONSE1}],\n",
+ " [{\"role\": \"user\", \"content\": PROMPT}, {\"role\": \"assistant\", \"content\": RESPONSE2}],\n",
+ "]\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n",
+ "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False, return_dict=False)\n",
+ "\n",
+ "url = f\"http://localhost:{port}/classify\"\n",
+ "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
+ "\n",
+ "responses = requests.post(url, json=data).json()\n",
+ "for response in responses:\n",
+ " print_highlight(f\"reward: {response['embedding'][0]}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(reward_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Capture expert selection distribution in MoE models\n",
+ "\n",
+ "SGLang Runtime supports recording the number of times an expert is selected in a MoE model run, for each expert in the model. This is useful for analyzing the throughput of the model and planning for optimization.\n",
+ "\n",
+ "*Note: We only print out the first 10 lines of the csv below for better readability. Please adjust accordingly if you want to analyze the results more deeply.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "expert_record_server_process, port = launch_server_cmd(\n",
+ " \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat --log-level warning\"\n",
+ ")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=expert_record_server_process)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response = requests.post(f\"http://localhost:{port}/start_expert_distribution_record\")\n",
+ "print_highlight(response)\n",
+ "\n",
+ "url = f\"http://localhost:{port}/generate\"\n",
+ "data = {\"text\": \"What is the capital of France?\"}\n",
+ "\n",
+ "response = requests.post(url, json=data)\n",
+ "print_highlight(response.json())\n",
+ "\n",
+ "response = requests.post(f\"http://localhost:{port}/stop_expert_distribution_record\")\n",
+ "print_highlight(response)\n",
+ "\n",
+ "response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
+ "print_highlight(response)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(expert_record_server_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Tokenize/Detokenize Example (Round Trip)\n",
+ "\n",
+ "This example demonstrates how to use the /tokenize and /detokenize endpoints together. We first tokenize a string, then detokenize the resulting IDs to reconstruct the original text. This workflow is useful when you need to handle tokenization externally but still leverage the server for detokenization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer_free_server_process, port = launch_server_cmd(\"\"\"\n",
+ "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct\n",
+ "\"\"\")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=tokenizer_free_server_process)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "from sglang.utils import print_highlight\n",
+ "\n",
+ "base_url = f\"http://localhost:{port}\"\n",
+ "tokenize_url = f\"{base_url}/tokenize\"\n",
+ "detokenize_url = f\"{base_url}/detokenize\"\n",
+ "\n",
+ "model_name = \"qwen/qwen2.5-0.5b-instruct\"\n",
+ "input_text = \"SGLang provides efficient tokenization endpoints.\"\n",
+ "print_highlight(f\"Original Input Text:\\n'{input_text}'\")\n",
+ "\n",
+ "# --- tokenize the input text ---\n",
+ "tokenize_payload = {\n",
+ " \"model\": model_name,\n",
+ " \"prompt\": input_text,\n",
+ " \"add_special_tokens\": False,\n",
+ "}\n",
+ "try:\n",
+ " tokenize_response = requests.post(tokenize_url, json=tokenize_payload)\n",
+ " tokenize_response.raise_for_status()\n",
+ " tokenization_result = tokenize_response.json()\n",
+ " token_ids = tokenization_result.get(\"tokens\")\n",
+ "\n",
+ " if not token_ids:\n",
+ " raise ValueError(\"Tokenization returned empty tokens.\")\n",
+ "\n",
+ " print_highlight(f\"\\nTokenized Output (IDs):\\n{token_ids}\")\n",
+ " print_highlight(f\"Token Count: {tokenization_result.get('count')}\")\n",
+ " print_highlight(f\"Max Model Length: {tokenization_result.get('max_model_len')}\")\n",
+ "\n",
+ " # --- detokenize the obtained token IDs ---\n",
+ " detokenize_payload = {\n",
+ " \"model\": model_name,\n",
+ " \"tokens\": token_ids,\n",
+ " \"skip_special_tokens\": True,\n",
+ " }\n",
+ "\n",
+ " detokenize_response = requests.post(detokenize_url, json=detokenize_payload)\n",
+ " detokenize_response.raise_for_status()\n",
+ " detokenization_result = detokenize_response.json()\n",
+ " reconstructed_text = detokenization_result.get(\"text\")\n",
+ "\n",
+ " print_highlight(f\"\\nDetokenized Output (Text):\\n'{reconstructed_text}'\")\n",
+ "\n",
+ " if input_text == reconstructed_text:\n",
+ " print_highlight(\n",
+ " \"\\nRound Trip Successful: Original and reconstructed text match.\"\n",
+ " )\n",
+ " else:\n",
+ " print_highlight(\n",
+ " \"\\nRound Trip Mismatch: Original and reconstructed text differ.\"\n",
+ " )\n",
+ "\n",
+ "except requests.exceptions.RequestException as e:\n",
+ " print_highlight(f\"\\nHTTP Request Error: {e}\")\n",
+ "except Exception as e:\n",
+ " print_highlight(f\"\\nAn error occurred: {e}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(tokenizer_free_server_process)"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/sglang/docs/basic_usage/offline_engine_api.ipynb b/sglang/docs/basic_usage/offline_engine_api.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..9c03e90a79354a066dd4b9cff38012f5c3afb14f
--- /dev/null
+++ b/sglang/docs/basic_usage/offline_engine_api.ipynb
@@ -0,0 +1,235 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Offline Engine API\n",
+ "\n",
+ "SGLang provides a direct inference engine without the need for an HTTP server, especially for use cases where additional HTTP server adds unnecessary complexity or overhead. Here are two general use cases:\n",
+ "\n",
+ "- Offline Batch Inference\n",
+ "- Custom Server on Top of the Engine\n",
+ "\n",
+ "This document focuses on the offline batch inference, demonstrating four different inference modes:\n",
+ "\n",
+ "- Non-streaming synchronous generation\n",
+ "- Streaming synchronous generation\n",
+ "- Non-streaming asynchronous generation\n",
+ "- Streaming asynchronous generation\n",
+ "\n",
+ "Additionally, you can easily build a custom server on top of the SGLang offline engine. A detailed example working in a Python script can be found in [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py).\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Nest Asyncio\n",
+ "Note that if you want to use **Offline Engine** in ipython or some other nested loop code, you need to add the following code:\n",
+ "```python\n",
+ "import nest_asyncio\n",
+ "\n",
+ "nest_asyncio.apply()\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Advanced Usage\n",
+ "\n",
+ "The engine supports [vlm inference](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py) as well as [extracting hidden states](https://github.com/sgl-project/sglang/blob/main/examples/runtime/hidden_states). \n",
+ "\n",
+ "Please see [the examples](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine) for further use cases."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Offline Batch Inference\n",
+ "\n",
+ "SGLang offline engine supports batch inference with efficient scheduling."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# launch the offline engine\n",
+ "import asyncio\n",
+ "\n",
+ "import sglang as sgl\n",
+ "import sglang.test.doc_patch\n",
+ "from sglang.utils import async_stream_and_merge, stream_and_merge\n",
+ "\n",
+ "llm = sgl.Engine(model_path=\"qwen/qwen2.5-0.5b-instruct\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Non-streaming Synchronous Generation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompts = [\n",
+ " \"Hello, my name is\",\n",
+ " \"The president of the United States is\",\n",
+ " \"The capital of France is\",\n",
+ " \"The future of AI is\",\n",
+ "]\n",
+ "\n",
+ "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
+ "\n",
+ "outputs = llm.generate(prompts, sampling_params)\n",
+ "for prompt, output in zip(prompts, outputs):\n",
+ " print(\"===============================\")\n",
+ " print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Streaming Synchronous Generation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompts = [\n",
+ " \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
+ " \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
+ " \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
+ "]\n",
+ "\n",
+ "sampling_params = {\n",
+ " \"temperature\": 0.2,\n",
+ " \"top_p\": 0.9,\n",
+ "}\n",
+ "\n",
+ "print(\"\\n=== Testing synchronous streaming generation with overlap removal ===\\n\")\n",
+ "\n",
+ "for prompt in prompts:\n",
+ " print(f\"Prompt: {prompt}\")\n",
+ " merged_output = stream_and_merge(llm, prompt, sampling_params)\n",
+ " print(\"Generated text:\", merged_output)\n",
+ " print()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Non-streaming Asynchronous Generation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompts = [\n",
+ " \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
+ " \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
+ " \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
+ "]\n",
+ "\n",
+ "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
+ "\n",
+ "print(\"\\n=== Testing asynchronous batch generation ===\")\n",
+ "\n",
+ "\n",
+ "async def main():\n",
+ " outputs = await llm.async_generate(prompts, sampling_params)\n",
+ "\n",
+ " for prompt, output in zip(prompts, outputs):\n",
+ " print(f\"\\nPrompt: {prompt}\")\n",
+ " print(f\"Generated text: {output['text']}\")\n",
+ "\n",
+ "\n",
+ "asyncio.run(main())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Streaming Asynchronous Generation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompts = [\n",
+ " \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
+ " \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
+ " \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
+ "]\n",
+ "\n",
+ "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
+ "\n",
+ "print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n",
+ "\n",
+ "\n",
+ "async def main():\n",
+ " for prompt in prompts:\n",
+ " print(f\"\\nPrompt: {prompt}\")\n",
+ " print(\"Generated text: \", end=\"\", flush=True)\n",
+ "\n",
+ " # Replace direct calls to async_generate with our custom overlap-aware version\n",
+ " async for cleaned_chunk in async_stream_and_merge(llm, prompt, sampling_params):\n",
+ " print(cleaned_chunk, end=\"\", flush=True)\n",
+ "\n",
+ " print() # New line after each prompt\n",
+ "\n",
+ "\n",
+ "asyncio.run(main())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "llm.shutdown()"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sglang/docs/basic_usage/ollama_api.md b/sglang/docs/basic_usage/ollama_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..c7b302c9645826527d805a5a31fa7d83767f0660
--- /dev/null
+++ b/sglang/docs/basic_usage/ollama_api.md
@@ -0,0 +1,91 @@
+# Ollama-Compatible API
+
+SGLang provides Ollama API compatibility, allowing you to use the Ollama CLI and Python library with SGLang as the inference backend.
+
+## Prerequisites
+
+```bash
+# Install the Ollama Python library (for Python client usage)
+pip install ollama
+```
+
+> **Note**: You don't need the Ollama server installed - SGLang acts as the backend. You only need the `ollama` CLI or Python library as the client.
+
+## Endpoints
+
+| Endpoint | Method | Description |
+|----------|--------|-------------|
+| `/` | GET, HEAD | Health check for Ollama CLI |
+| `/api/tags` | GET | List available models |
+| `/api/chat` | POST | Chat completions (streaming & non-streaming) |
+| `/api/generate` | POST | Text generation (streaming & non-streaming) |
+| `/api/show` | POST | Model information |
+
+## Quick Start
+
+### 1. Launch SGLang Server
+
+```bash
+python -m sglang.launch_server \
+ --model Qwen/Qwen2.5-1.5B-Instruct \
+ --port 30001 \
+ --host 0.0.0.0
+```
+
+> **Note**: The model name used with `ollama run` must match exactly what you passed to `--model`.
+
+### 2. Use Ollama CLI
+
+```bash
+# List available models
+OLLAMA_HOST=http://localhost:30001 ollama list
+
+# Interactive chat
+OLLAMA_HOST=http://localhost:30001 ollama run "Qwen/Qwen2.5-1.5B-Instruct"
+```
+
+If connecting to a remote server behind a firewall:
+
+```bash
+# SSH tunnel
+ssh -L 30001:localhost:30001 user@gpu-server -N &
+
+# Then use Ollama CLI as above
+OLLAMA_HOST=http://localhost:30001 ollama list
+```
+
+### 3. Use Ollama Python Library
+
+```python
+import ollama
+
+client = ollama.Client(host='http://localhost:30001')
+
+# Non-streaming
+response = client.chat(
+ model='Qwen/Qwen2.5-1.5B-Instruct',
+ messages=[{'role': 'user', 'content': 'Hello!'}]
+)
+print(response['message']['content'])
+
+# Streaming
+stream = client.chat(
+ model='Qwen/Qwen2.5-1.5B-Instruct',
+ messages=[{'role': 'user', 'content': 'Tell me a story'}],
+ stream=True
+)
+for chunk in stream:
+ print(chunk['message']['content'], end='', flush=True)
+```
+
+## Smart Router
+
+For intelligent routing between local Ollama (fast) and remote SGLang (powerful) using an LLM judge, see the [Smart Router documentation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/ollama/README.md).
+
+## Summary
+
+| Component | Purpose |
+|-----------|---------|
+| **Ollama API** | Familiar CLI/API that developers already know |
+| **SGLang Backend** | High-performance inference engine |
+| **Smart Router** | Intelligent routing - fast local for simple tasks, powerful remote for complex tasks |
diff --git a/sglang/docs/basic_usage/openai_api.rst b/sglang/docs/basic_usage/openai_api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..370abe99c56796ebb1953c03e69b22eab17551cd
--- /dev/null
+++ b/sglang/docs/basic_usage/openai_api.rst
@@ -0,0 +1,9 @@
+OpenAI-Compatible APIs
+======================
+
+.. toctree::
+ :maxdepth: 1
+
+ openai_api_completions.ipynb
+ openai_api_vision.ipynb
+ openai_api_embeddings.ipynb
diff --git a/sglang/docs/basic_usage/openai_api_completions.ipynb b/sglang/docs/basic_usage/openai_api_completions.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..ffa576ae52c5fe3ce27fb627eac763bf7e58b2ce
--- /dev/null
+++ b/sglang/docs/basic_usage/openai_api_completions.ipynb
@@ -0,0 +1,552 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# OpenAI APIs - Completions\n",
+ "\n",
+ "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
+ "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n",
+ "\n",
+ "This tutorial covers the following popular APIs:\n",
+ "\n",
+ "- `chat/completions`\n",
+ "- `completions`\n",
+ "\n",
+ "Check out other tutorials to learn about [vision APIs](openai_api_vision.ipynb) for vision-language models and [embedding APIs](openai_api_embeddings.ipynb) for embedding models."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Launch A Server\n",
+ "\n",
+ "Launch the server in your terminal and wait for it to initialize."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sglang.test.doc_patch import launch_server_cmd\n",
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+ "\n",
+ "server_process, port = launch_server_cmd(\n",
+ " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
+ ")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n",
+ "print(f\"Server started on http://localhost:{port}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Chat Completions\n",
+ "\n",
+ "### Usage\n",
+ "\n",
+ "The server fully implements the OpenAI API.\n",
+ "It will automatically apply the chat template specified in the Hugging Face tokenizer, if one is available.\n",
+ "You can also specify a custom chat template with `--chat-template` when launching the server."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import openai\n",
+ "\n",
+ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " messages=[\n",
+ " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+ " ],\n",
+ " temperature=0,\n",
+ " max_tokens=64,\n",
+ ")\n",
+ "\n",
+ "print_highlight(f\"Response: {response}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Model Thinking/Reasoning Support\n",
+ "\n",
+ "Some models support internal reasoning or thinking processes that can be exposed in the API response. SGLang provides unified support for various reasoning models through the `chat_template_kwargs` parameter and compatible reasoning parsers.\n",
+ "\n",
+ "#### Supported Models and Configuration\n",
+ "\n",
+ "| Model Family | Chat Template Parameter | Reasoning Parser | Notes |\n",
+ "|--------------|------------------------|------------------|--------|\n",
+ "| DeepSeek-R1 (R1, R1-0528, R1-Distill) | `enable_thinking` | `--reasoning-parser deepseek-r1` | Standard reasoning models |\n",
+ "| DeepSeek-V3.1 | `thinking` | `--reasoning-parser deepseek-v3` | Hybrid model (thinking/non-thinking modes) |\n",
+ "| Qwen3 (standard) | `enable_thinking` | `--reasoning-parser qwen3` | Hybrid model (thinking/non-thinking modes) |\n",
+ "| Qwen3-Thinking | N/A (always enabled) | `--reasoning-parser qwen3-thinking` | Always generates reasoning |\n",
+ "| Kimi | N/A (always enabled) | `--reasoning-parser kimi` | Kimi thinking models |\n",
+ "| Gpt-Oss | N/A (always enabled) | `--reasoning-parser gpt-oss` | Gpt-Oss thinking models |\n",
+ "\n",
+ "#### Basic Usage\n",
+ "\n",
+ "To enable reasoning output, you need to:\n",
+ "1. Launch the server with the appropriate reasoning parser\n",
+ "2. Set the model-specific parameter in `chat_template_kwargs`\n",
+ "3. Optionally use `separate_reasoning: False` to not get reasoning content separately (defaults to `True`)\n",
+ "\n",
+ "**Note for Qwen3-Thinking models:** These models always generate thinking content and do not support the `enable_thinking` parameter. Use `--reasoning-parser qwen3-thinking` or `--reasoning-parser qwen3` to parse the thinking content.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Example: Qwen3 Models\n",
+ "\n",
+ "```python\n",
+ "# Launch server:\n",
+ "# python3 -m sglang.launch_server --model Qwen/Qwen3-4B --reasoning-parser qwen3\n",
+ "\n",
+ "from openai import OpenAI\n",
+ "\n",
+ "client = OpenAI(\n",
+ " api_key=\"EMPTY\",\n",
+ " base_url=f\"http://127.0.0.1:30000/v1\",\n",
+ ")\n",
+ "\n",
+ "model = \"Qwen/Qwen3-4B\"\n",
+ "messages = [{\"role\": \"user\", \"content\": \"How many r's are in 'strawberry'?\"}]\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=model,\n",
+ " messages=messages,\n",
+ " extra_body={\n",
+ " \"chat_template_kwargs\": {\"enable_thinking\": True},\n",
+ " \"separate_reasoning\": True\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "print(\"Reasoning:\", response.choices[0].message.reasoning_content)\n",
+ "print(\"-\"*100)\n",
+ "print(\"Answer:\", response.choices[0].message.content)\n",
+ "```\n",
+ "\n",
+ "**Example Output:**\n",
+ "```\n",
+ "Reasoning: Okay, so the user is asking how many 'r's are in the word 'strawberry'. Let me think. First, I need to make sure I have the word spelled correctly. Strawberry... S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me break it down.\n",
+ "\n",
+ "Starting with 'strawberry', let's write out the letters one by one. S, T, R, A, W, B, E, R, R, Y. Hmm, wait, that's 10 letters. Let me check again. S (1), T (2), R (3), A (4), W (5), B (6), E (7), R (8), R (9), Y (10). So the letters are S-T-R-A-W-B-E-R-R-Y. \n",
+ "...\n",
+ "Therefore, the answer should be three R's in 'strawberry'. But I need to make sure I'm not counting any other letters as R. Let me check again. S, T, R, A, W, B, E, R, R, Y. No other R's. So three in total. Yeah, that seems right.\n",
+ "\n",
+ "----------------------------------------------------------------------------------------------------\n",
+ "Answer: The word \"strawberry\" contains **three** letters 'r'. Here's the breakdown:\n",
+ "\n",
+ "1. **S-T-R-A-W-B-E-R-R-Y** \n",
+ " - The **third letter** is 'R'. \n",
+ " - The **eighth and ninth letters** are also 'R's. \n",
+ "\n",
+ "Thus, the total count is **3**. \n",
+ "\n",
+ "**Answer:** 3.\n",
+ "```\n",
+ "\n",
+ "**Note:** Setting `\"enable_thinking\": False` (or omitting it) will result in `reasoning_content` being `None`. Qwen3-Thinking models always generate reasoning content and don't support the `enable_thinking` parameter.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Logit Bias Support\n",
+ "\n",
+ "SGLang supports the `logit_bias` parameter for both chat completions and completions APIs. This parameter allows you to modify the likelihood of specific tokens being generated by adding bias values to their logits. The bias values can range from -100 to 100, where:\n",
+ "\n",
+ "- **Positive values** (0 to 100) increase the likelihood of the token being selected\n",
+ "- **Negative values** (-100 to 0) decrease the likelihood of the token being selected\n",
+ "- **-100** effectively prevents the token from being generated\n",
+ "\n",
+ "The `logit_bias` parameter accepts a dictionary where keys are token IDs (as strings) and values are the bias amounts (as floats).\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Getting Token IDs\n",
+ "\n",
+ "To use `logit_bias` effectively, you need to know the token IDs for the words you want to bias. Here's how to get token IDs:\n",
+ "\n",
+ "```python\n",
+ "# Get tokenizer to find token IDs\n",
+ "import tiktoken\n",
+ "\n",
+ "# For OpenAI models, use the appropriate encoding\n",
+ "tokenizer = tiktoken.encoding_for_model(\"gpt-3.5-turbo\") # or your model\n",
+ "\n",
+ "# Get token IDs for specific words\n",
+ "word = \"sunny\"\n",
+ "token_ids = tokenizer.encode(word)\n",
+ "print(f\"Token IDs for '{word}': {token_ids}\")\n",
+ "\n",
+ "# For SGLang models, you can access the tokenizer through the client\n",
+ "# and get token IDs for bias\n",
+ "```\n",
+ "\n",
+ "**Important:** The `logit_bias` parameter uses token IDs as string keys, not the actual words.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Example: DeepSeek-V3 Models\n",
+ "\n",
+ "DeepSeek-V3 models support thinking mode through the `thinking` parameter:\n",
+ "\n",
+ "```python\n",
+ "# Launch server:\n",
+ "# python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.1 --tp 8 --reasoning-parser deepseek-v3\n",
+ "\n",
+ "from openai import OpenAI\n",
+ "\n",
+ "client = OpenAI(\n",
+ " api_key=\"EMPTY\",\n",
+ " base_url=f\"http://127.0.0.1:30000/v1\",\n",
+ ")\n",
+ "\n",
+ "model = \"deepseek-ai/DeepSeek-V3.1\"\n",
+ "messages = [{\"role\": \"user\", \"content\": \"How many r's are in 'strawberry'?\"}]\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=model,\n",
+ " messages=messages,\n",
+ " extra_body={\n",
+ " \"chat_template_kwargs\": {\"thinking\": True},\n",
+ " \"separate_reasoning\": True\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "print(\"Reasoning:\", response.choices[0].message.reasoning_content)\n",
+ "print(\"-\"*100)\n",
+ "print(\"Answer:\", response.choices[0].message.content)\n",
+ "```\n",
+ "\n",
+ "**Example Output:**\n",
+ "```\n",
+ "Reasoning: First, the question is: \"How many r's are in 'strawberry'?\"\n",
+ "\n",
+ "I need to count the number of times the letter 'r' appears in the word \"strawberry\".\n",
+ "\n",
+ "Let me write out the word: S-T-R-A-W-B-E-R-R-Y.\n",
+ "\n",
+ "Now, I'll go through each letter and count the 'r's.\n",
+ "...\n",
+ "So, I have three 'r's in \"strawberry\".\n",
+ "\n",
+ "I should double-check. The word is spelled S-T-R-A-W-B-E-R-R-Y. The letters are at positions: 3, 8, and 9 are 'r's. Yes, that's correct.\n",
+ "\n",
+ "Therefore, the answer should be 3.\n",
+ "----------------------------------------------------------------------------------------------------\n",
+ "Answer: The word \"strawberry\" contains **3** instances of the letter \"r\". Here's a breakdown for clarity:\n",
+ "\n",
+ "- The word is spelled: S-T-R-A-W-B-E-R-R-Y\n",
+ "- The \"r\" appears at the 3rd, 8th, and 9th positions.\n",
+ "```\n",
+ "\n",
+ "**Note:** DeepSeek-V3 models use the `thinking` parameter (not `enable_thinking`) to control reasoning output.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Example with logit_bias parameter\n",
+ "# Note: You need to get the actual token IDs from your tokenizer\n",
+ "# For demonstration, we'll use some example token IDs\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " messages=[\n",
+ " {\"role\": \"user\", \"content\": \"Complete this sentence: The weather today is\"}\n",
+ " ],\n",
+ " temperature=0.7,\n",
+ " max_tokens=20,\n",
+ " logit_bias={\n",
+ " \"12345\": 50, # Increase likelihood of token ID 12345\n",
+ " \"67890\": -50, # Decrease likelihood of token ID 67890\n",
+ " \"11111\": 25, # Slightly increase likelihood of token ID 11111\n",
+ " },\n",
+ ")\n",
+ "\n",
+ "print_highlight(f\"Response with logit bias: {response.choices[0].message.content}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Parameters\n",
+ "\n",
+ "The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n",
+ "\n",
+ "SGLang extends the standard API with the `extra_body` parameter, allowing for additional customization. One key option within `extra_body` is `chat_template_kwargs`, which can be used to pass arguments to the chat template processor."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response = client.chat.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"system\",\n",
+ " \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n",
+ " },\n",
+ " {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n",
+ " {\n",
+ " \"role\": \"assistant\",\n",
+ " \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n",
+ " },\n",
+ " {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n",
+ " ],\n",
+ " temperature=0.3, # Lower temperature for more focused responses\n",
+ " max_tokens=128, # Reasonable length for a concise response\n",
+ " top_p=0.95, # Slightly higher for better fluency\n",
+ " presence_penalty=0.2, # Mild penalty to avoid repetition\n",
+ " frequency_penalty=0.2, # Mild penalty for more natural language\n",
+ " n=1, # Single response is usually more stable\n",
+ " seed=42, # Keep for reproducibility\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Streaming mode is also supported."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Logit Bias Support\n",
+ "\n",
+ "The completions API also supports the `logit_bias` parameter with the same functionality as described in the chat completions section above.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stream = client.chat.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " messages=[{\"role\": \"user\", \"content\": \"Say this is a test\"}],\n",
+ " stream=True,\n",
+ ")\n",
+ "for chunk in stream:\n",
+ " if chunk.choices[0].delta.content is not None:\n",
+ " print(chunk.choices[0].delta.content, end=\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Returning Routed Experts (MoE Models)\n",
+ "\n",
+ "For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Example with logit_bias parameter for completions API\n",
+ "# Note: You need to get the actual token IDs from your tokenizer\n",
+ "# For demonstration, we'll use some example token IDs\n",
+ "response = client.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " prompt=\"The best programming language for AI is\",\n",
+ " temperature=0.7,\n",
+ " max_tokens=20,\n",
+ " logit_bias={\n",
+ " \"12345\": 75, # Strongly favor token ID 12345\n",
+ " \"67890\": -100, # Completely avoid token ID 67890\n",
+ " \"11111\": -25, # Slightly discourage token ID 11111\n",
+ " },\n",
+ ")\n",
+ "\n",
+ "print_highlight(f\"Response with logit bias: {response.choices[0].text}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Completions\n",
+ "\n",
+ "### Usage\n",
+ "Completions API is similar to Chat Completions API, but without the `messages` parameter or chat templates."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response = client.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " prompt=\"List 3 countries and their capitals.\",\n",
+ " temperature=0,\n",
+ " max_tokens=64,\n",
+ " n=1,\n",
+ " stop=None,\n",
+ ")\n",
+ "\n",
+ "print_highlight(f\"Response: {response}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Parameters\n",
+ "\n",
+ "The completions API accepts OpenAI Completions API's parameters. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n",
+ "\n",
+ "Here is an example of a detailed completions request:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response = client.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " prompt=\"Write a short story about a space explorer.\",\n",
+ " temperature=0.7, # Moderate temperature for creative writing\n",
+ " max_tokens=150, # Longer response for a story\n",
+ " top_p=0.9, # Balanced diversity in word choice\n",
+ " stop=[\"\\n\\n\", \"THE END\"], # Multiple stop sequences\n",
+ " presence_penalty=0.3, # Encourage novel elements\n",
+ " frequency_penalty=0.3, # Reduce repetitive phrases\n",
+ " n=1, # Generate one completion\n",
+ " seed=123, # For reproducible results\n",
+ ")\n",
+ "\n",
+ "print_highlight(f\"Response: {response}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Returning Routed Experts (MoE Models)\n",
+ "\n",
+ "For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Structured Outputs (JSON, Regex, EBNF)\n",
+ "\n",
+ "For OpenAI compatible structured outputs API, refer to [Structured Outputs](../advanced_features/structured_outputs.ipynb) for more details.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using LoRA Adapters\n",
+ "\n",
+ "SGLang supports LoRA (Low-Rank Adaptation) adapters with OpenAI-compatible APIs. You can specify which adapter to use directly in the `model` parameter using the `base-model:adapter-name` syntax.\n",
+ "\n",
+ "**Server Setup:**\n",
+ "```bash\n",
+ "python -m sglang.launch_server \\\n",
+ " --model-path qwen/qwen2.5-0.5b-instruct \\\n",
+ " --enable-lora \\\n",
+ " --lora-paths adapter_a=/path/to/adapter_a adapter_b=/path/to/adapter_b\n",
+ "```\n",
+ "\n",
+ "For more details on LoRA serving configuration, see the [LoRA documentation](../advanced_features/lora.ipynb).\n",
+ "\n",
+ "**API Call:**\n",
+ "\n",
+ "(Recommended) Use the `model:adapter` syntax to specify which adapter to use:\n",
+ "```python\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct:adapter_a\", # ← base-model:adapter-name\n",
+ " messages=[{\"role\": \"user\", \"content\": \"Convert to SQL: show all users\"}],\n",
+ " max_tokens=50,\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "**Backward Compatible: Using `extra_body`**\n",
+ "\n",
+ "The old `extra_body` method is still supported for backward compatibility:\n",
+ "```python\n",
+ "# Backward compatible method\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " messages=[{\"role\": \"user\", \"content\": \"Convert to SQL: show all users\"}],\n",
+ " extra_body={\"lora_path\": \"adapter_a\"}, # ← old method\n",
+ " max_tokens=50,\n",
+ ")\n",
+ "```\n",
+ "**Note:** When both `model:adapter` and `extra_body[\"lora_path\"]` are specified, the `model:adapter` syntax takes precedence."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(server_process)"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sglang/docs/basic_usage/openai_api_embeddings.ipynb b/sglang/docs/basic_usage/openai_api_embeddings.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..a6c90c06b5f07e4001046f06628772a8a9233d21
--- /dev/null
+++ b/sglang/docs/basic_usage/openai_api_embeddings.ipynb
@@ -0,0 +1,193 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# OpenAI APIs - Embedding\n",
+ "\n",
+ "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
+ "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/embeddings).\n",
+ "\n",
+ "This tutorial covers the embedding APIs for embedding models. For a list of the supported models see the [corresponding overview page](../supported_models/retrieval_ranking/embedding_models.md)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Launch A Server\n",
+ "\n",
+ "Launch the server in your terminal and wait for it to initialize. Remember to add `--is-embedding` to the command."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sglang.test.doc_patch import launch_server_cmd\n",
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+ "\n",
+ "embedding_process, port = launch_server_cmd(\"\"\"\n",
+ "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
+ " --host 0.0.0.0 --is-embedding --log-level warning\n",
+ "\"\"\")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=embedding_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using cURL"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import subprocess, json\n",
+ "\n",
+ "text = \"Once upon a time\"\n",
+ "\n",
+ "curl_text = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
+ " -H \"Content-Type: application/json\" \\\n",
+ " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
+ "\n",
+ "result = subprocess.check_output(curl_text, shell=True)\n",
+ "\n",
+ "print(result)\n",
+ "\n",
+ "text_embedding = json.loads(result)[\"data\"][0][\"embedding\"]\n",
+ "\n",
+ "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using Python Requests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "\n",
+ "text = \"Once upon a time\"\n",
+ "\n",
+ "response = requests.post(\n",
+ " f\"http://localhost:{port}/v1/embeddings\",\n",
+ " json={\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": text},\n",
+ ")\n",
+ "\n",
+ "text_embedding = response.json()[\"data\"][0][\"embedding\"]\n",
+ "\n",
+ "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using OpenAI Python Client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import openai\n",
+ "\n",
+ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+ "\n",
+ "# Text embedding example\n",
+ "response = client.embeddings.create(\n",
+ " model=\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\",\n",
+ " input=text,\n",
+ ")\n",
+ "\n",
+ "embedding = response.data[0].embedding[:10]\n",
+ "print_highlight(f\"Text embedding (first 10): {embedding}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using Input IDs\n",
+ "\n",
+ "SGLang also supports `input_ids` as input to get the embedding."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import os\n",
+ "from transformers import AutoTokenizer\n",
+ "\n",
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\")\n",
+ "input_ids = tokenizer.encode(text)\n",
+ "\n",
+ "curl_ids = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
+ " -H \"Content-Type: application/json\" \\\n",
+ " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
+ "\n",
+ "input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",
+ " 0\n",
+ "][\"embedding\"]\n",
+ "\n",
+ "print_highlight(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(embedding_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multi-Modal Embedding Model\n",
+ "Please refer to [Multi-Modal Embedding Model](../supported_models/retrieval_ranking/embedding_models.md)"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sglang/docs/basic_usage/openai_api_vision.ipynb b/sglang/docs/basic_usage/openai_api_vision.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..45198bb3c63766700d8ee5845b1ea97ff35b5763
--- /dev/null
+++ b/sglang/docs/basic_usage/openai_api_vision.ipynb
@@ -0,0 +1,252 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# OpenAI APIs - Vision\n",
+ "\n",
+ "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
+ "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/vision).\n",
+ "This tutorial covers the vision APIs for vision language models.\n",
+ "\n",
+ "SGLang supports various vision language models such as Llama 3.2, LLaVA-OneVision, Qwen2.5-VL, Gemma3 and [more](../supported_models/text_generation/multimodal_language_models.md).\n",
+ "\n",
+ "As an alternative to the OpenAI API, you can also use the [SGLang offline engine](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Launch A Server\n",
+ "\n",
+ "Launch the server in your terminal and wait for it to initialize."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sglang.test.doc_patch import launch_server_cmd\n",
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+ "\n",
+ "vision_process, port = launch_server_cmd(\"\"\"\n",
+ "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n",
+ "\"\"\")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=vision_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using cURL\n",
+ "\n",
+ "Once the server is up, you can send test requests using curl or requests."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import subprocess\n",
+ "\n",
+ "curl_command = f\"\"\"\n",
+ "curl -s http://localhost:{port}/v1/chat/completions \\\\\n",
+ " -H \"Content-Type: application/json\" \\\\\n",
+ " -d '{{\n",
+ " \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
+ " \"messages\": [\n",
+ " {{\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {{\n",
+ " \"type\": \"text\",\n",
+ " \"text\": \"What’s in this image?\"\n",
+ " }},\n",
+ " {{\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {{\n",
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
+ " }}\n",
+ " }}\n",
+ " ]\n",
+ " }}\n",
+ " ],\n",
+ " \"max_tokens\": 300\n",
+ " }}'\n",
+ "\"\"\"\n",
+ "\n",
+ "response = subprocess.check_output(curl_command, shell=True).decode()\n",
+ "print_highlight(response)\n",
+ "\n",
+ "\n",
+ "response = subprocess.check_output(curl_command, shell=True).decode()\n",
+ "print_highlight(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using Python Requests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "\n",
+ "url = f\"http://localhost:{port}/v1/chat/completions\"\n",
+ "\n",
+ "data = {\n",
+ " \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
+ " \"messages\": [\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\"type\": \"text\", \"text\": \"What’s in this image?\"},\n",
+ " {\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {\n",
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
+ " },\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " ],\n",
+ " \"max_tokens\": 300,\n",
+ "}\n",
+ "\n",
+ "response = requests.post(url, json=data)\n",
+ "print_highlight(response.text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using OpenAI Python Client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from openai import OpenAI\n",
+ "\n",
+ "client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\n",
+ " \"type\": \"text\",\n",
+ " \"text\": \"What is in this image?\",\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {\n",
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
+ " },\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " ],\n",
+ " max_tokens=300,\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multiple-Image Inputs\n",
+ "\n",
+ "The server also supports multiple images and interleaved text and images if the model supports it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from openai import OpenAI\n",
+ "\n",
+ "client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
+ " messages=[\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {\n",
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\",\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {\n",
+ " \"url\": \"https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png\",\n",
+ " },\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"text\",\n",
+ " \"text\": \"I have two very different images. They are not related at all. \"\n",
+ " \"Please describe the first image in one sentence, and then describe the second image in another sentence.\",\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " ],\n",
+ " temperature=0,\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.choices[0].message.content)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(vision_process)"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sglang/docs/basic_usage/popular_model_usage.rst b/sglang/docs/basic_usage/popular_model_usage.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ec0268ed7cf2827f1627363e4d64931f1e5094da
--- /dev/null
+++ b/sglang/docs/basic_usage/popular_model_usage.rst
@@ -0,0 +1,19 @@
+Popular Model Usage (DeepSeek, GPT-OSS, GLM, Llama, MiniMax, Qwen, and more)
+============================================================================
+
+For more usage examples and recipes, visit the `SGLang Cookbook <https://cookbook.sglang.ai/>`_.
+
+.. toctree::
+ :maxdepth: 1
+
+ deepseek_v3.md
+ deepseek_v32.md
+ glm45.md
+ glmv.md
+ gpt_oss.md
+ minimax_m2.md
+ qwen3.md
+ qwen3_5.md
+ qwen3_vl.md
+ deepseek_ocr.md
+ llama4.md
diff --git a/sglang/docs/basic_usage/qwen3.md b/sglang/docs/basic_usage/qwen3.md
new file mode 100644
index 0000000000000000000000000000000000000000..c0364176c09a94f764e503f9e2a73ddf286a4db1
--- /dev/null
+++ b/sglang/docs/basic_usage/qwen3.md
@@ -0,0 +1,39 @@
+# Qwen3-Next Usage
+
+SGLang has supported Qwen3-Next-80B-A3B-Instruct and Qwen3-Next-80B-A3B-Thinking since [this PR](https://github.com/sgl-project/sglang/pull/10233).
+
+## Launch Qwen3-Next with SGLang
+
+To serve Qwen3-Next models on 4xH100/H200 GPUs:
+
+```bash
+python3 -m sglang.launch_server --model Qwen/Qwen3-Next-80B-A3B-Instruct --tp 4
+```
+
+### Configuration Tips
+- `--max-mamba-cache-size`: Adjust `--max-mamba-cache-size` to increase mamba cache space and max running requests capability. It will decrease KV cache space as a trade-off. You can adjust it according to workload.
+- `--mamba-ssm-dtype`: `bfloat16` or `float32`, use `bfloat16` to save mamba cache size and `float32` to get more accurate results. The default setting is `float32`.
+- `--mamba-full-memory-ratio`: The ratio of mamba state memory to full kv cache memory. The default is 0.9.
+
+### Mamba Radix Cache
+SGLang supports prefix caching for Qwen3-Next models named `MambaRadixCache`, which improves inference speed by reusing computation results. There are two versions of `MambaRadixCache`:
+- `no_buffer`: The default version, which is also other hybrid linear models' choice. When it is enabled, SGLang will automatically close overlap schedule for compatibility reasons.
+- `extra_buffer`: An optimized version that is compatible with features like page size > 1, overlap schedule, and speculative decoding. It also supports storing mamba state in branching positions. However, it requires two extra mamba spaces for a ping-pong buffer for each request. To enable it, add the argument `--mamba-scheduler-strategy extra_buffer` when launching the server.
+
+### EAGLE Speculative Decoding
+**Description**: SGLang has supported Qwen3-Next models with [EAGLE speculative decoding](https://docs.sglang.io/advanced_features/speculative_decoding.html#EAGLE-Decoding).
+
+**Usage**:
+Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
+
+``` bash
+python3 -m sglang.launch_server \
+ --model Qwen/Qwen3-Next-80B-A3B-Instruct \
+ --tp 4 \
+ --speculative-num-steps 3 \
+ --speculative-eagle-topk 1 \
+ --speculative-num-draft-tokens 4 \
+ --speculative-algo NEXTN
+```
+
+Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/10233).
diff --git a/sglang/docs/basic_usage/qwen3_vl.md b/sglang/docs/basic_usage/qwen3_vl.md
new file mode 100644
index 0000000000000000000000000000000000000000..f05e7832a5340a2d22c9550861c706a86387b88c
--- /dev/null
+++ b/sglang/docs/basic_usage/qwen3_vl.md
@@ -0,0 +1,130 @@
+# Qwen3-VL Usage
+
+[Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl)
+is Alibaba’s latest multimodal large language model with strong text, vision, and reasoning capabilities.
+SGLang supports Qwen3-VL Family of models with Image and Video input support.
+
+## Launch commands for SGLang
+
+Below are suggested launch commands tailored for different hardware / precision modes
+
+### FP8 (quantised) mode
+For high memory-efficiency and latency optimized deployments (e.g., on H100, H200) where FP8 checkpoint is supported:
+```bash
+python3 -m sglang.launch_server \
+ --model-path Qwen/Qwen3-VL-235B-A22B-Instruct-FP8 \
+ --tp 8 \
+ --ep 8 \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --keep-mm-feature-on-device
+```
+
+### Non-FP8 (BF16 / full precision) mode
+For deployments on A100/H100 where BF16 is used (or FP8 snapshot not used):
+```bash
+python3 -m sglang.launch_server \
+ --model-path Qwen/Qwen3-VL-235B-A22B-Instruct \
+ --tp 8 \
+ --ep 8 \
+ --host 0.0.0.0 \
+    --port 30000
+```
+
+## Hardware-specific notes / recommendations
+
+- On H100 with FP8: Use the FP8 checkpoint for best memory efficiency.
+- On A100 / H100 with BF16 (non-FP8): It’s recommended to use `--mm-max-concurrent-calls` to control parallel throughput and GPU memory usage during image/video inference.
+- On H200 & B200: The model can be run “out of the box”, supporting full context length plus concurrent image + video processing.
+
+## Sending Image/Video Requests
+
+### Image input:
+
+```python
+import requests
+
+url = f"http://localhost:30000/v1/chat/completions"
+
+data = {
+ "model": "Qwen/Qwen3-VL-30B-A3B-Instruct",
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What’s in this image?"},
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true"
+ },
+ },
+ ],
+ }
+ ],
+ "max_tokens": 300,
+}
+
+response = requests.post(url, json=data)
+print(response.text)
+```
+
+### Video Input:
+
+```python
+import requests
+
+url = f"http://localhost:30000/v1/chat/completions"
+
+data = {
+ "model": "Qwen/Qwen3-VL-30B-A3B-Instruct",
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What’s happening in this video?"},
+ {
+ "type": "video_url",
+ "video_url": {
+ "url": "https://github.com/sgl-project/sgl-test-files/raw/refs/heads/main/videos/jobs_presenting_ipod.mp4"
+ },
+ },
+ ],
+ }
+ ],
+ "max_tokens": 300,
+}
+
+response = requests.post(url, json=data)
+print(response.text)
+```
+
+## Important Server Parameters and Flags
+
+When launching the model server for **multimodal support**, you can use the following command-line arguments to fine-tune performance and behavior:
+
+- `--mm-attention-backend`: Specify multimodal attention backend. Eg. `fa3`(Flash Attention 3)
+- `--mm-max-concurrent-calls <N>`: Specifies the **maximum number of concurrent asynchronous multimodal data processing calls** allowed on the server. Use this to control parallel throughput and GPU memory usage during image/video inference.
+- `--mm-per-request-timeout <seconds>`: Defines the **timeout duration (in seconds)** for each multimodal request. If a request exceeds this time limit (e.g., for very large video inputs), it will be automatically terminated.
+- `--keep-mm-feature-on-device`: Instructs the server to **retain multimodal feature tensors on the GPU** after processing. This avoids device-to-host (D2H) memory copies and improves performance for repeated or high-frequency inference workloads.
+- `SGLANG_USE_CUDA_IPC_TRANSPORT=1`: Shared memory pool based CUDA IPC for multi-modal data transport. For significantly improving e2e latency.
+
+### Example usage with the above optimizations:
+```bash
+SGLANG_USE_CUDA_IPC_TRANSPORT=1 \
+SGLANG_VLM_CACHE_SIZE_MB=0 \
+python -m sglang.launch_server \
+ --model-path Qwen/Qwen3-VL-235B-A22B-Instruct \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --trust-remote-code \
+ --tp-size 8 \
+ --enable-cache-report \
+ --log-level info \
+ --max-running-requests 64 \
+ --mem-fraction-static 0.65 \
+ --chunked-prefill-size 8192 \
+ --attention-backend fa3 \
+ --mm-attention-backend fa3 \
+ --enable-metrics
+```
diff --git a/sglang/docs/basic_usage/sampling_params.md b/sglang/docs/basic_usage/sampling_params.md
new file mode 100644
index 0000000000000000000000000000000000000000..23415f9af555646f91f2be9544e11053e700717a
--- /dev/null
+++ b/sglang/docs/basic_usage/sampling_params.md
@@ -0,0 +1,347 @@
+# Sampling Parameters
+
+This doc describes the sampling parameters of the SGLang Runtime. It is the low-level endpoint of the runtime.
+If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](openai_api_completions.ipynb).
+
+## `/generate` Endpoint
+
+The `/generate` endpoint accepts the following parameters in JSON format. For detailed usage, see the [native API doc](native_api.ipynb). The object is defined at `io_struct.py::GenerateReqInput`. You can also read the source code to find more arguments and docs.
+
+| Argument | Type/Default | Description |
+|----------------------------|------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| text | `Optional[Union[List[str], str]] = None` | The input prompt. Can be a single prompt or a batch of prompts. |
+| input_ids | `Optional[Union[List[List[int]], List[int]]] = None` | The token IDs for text; one can specify either text or input_ids. |
+| input_embeds | `Optional[Union[List[List[List[float]]], List[List[float]]]] = None` | The embeddings for input_ids; one can specify either text, input_ids, or input_embeds. |
+| image_data | `Optional[Union[List[List[ImageDataItem]], List[ImageDataItem], ImageDataItem]] = None` | The image input. Supports three formats: (1) **Raw images**: PIL Image, file path, URL, or base64 string; (2) **Processor output**: Dict with `format: "processor_output"` containing HuggingFace processor outputs; (3) **Precomputed embeddings**: Dict with `format: "precomputed_embedding"` and `feature` containing pre-calculated visual embeddings. Can be a single image, list of images, or list of lists of images. See [Multimodal Input Formats](#multimodal-input-formats) for details. |
+| audio_data | `Optional[Union[List[AudioDataItem], AudioDataItem]] = None` | The audio input. Can be a file name, URL, or base64 encoded string. |
+| sampling_params | `Optional[Union[List[Dict], Dict]] = None` | The sampling parameters as described in the sections below. |
+| rid | `Optional[Union[List[str], str]] = None` | The request ID. |
+| return_logprob | `Optional[Union[List[bool], bool]] = None` | Whether to return log probabilities for tokens. |
+| logprob_start_len | `Optional[Union[List[int], int]] = None` | If return_logprob, the start location in the prompt for returning logprobs. Default is "-1", which returns logprobs for output tokens only. |
+| top_logprobs_num | `Optional[Union[List[int], int]] = None` | If return_logprob, the number of top logprobs to return at each position. |
+| token_ids_logprob | `Optional[Union[List[List[int]], List[int]]] = None` | If return_logprob, the token IDs to return logprob for. |
+| return_text_in_logprobs | `bool = False` | Whether to detokenize tokens in text in the returned logprobs. |
+| stream | `bool = False` | Whether to stream output. |
+| lora_path | `Optional[Union[List[Optional[str]], Optional[str]]] = None` | The path to the LoRA. |
+| custom_logit_processor | `Optional[Union[List[Optional[str]], str]] = None` | Custom logit processor for advanced sampling control. Must be a serialized instance of `CustomLogitProcessor` using its `to_str()` method. For usage see below. |
+| return_hidden_states | `Union[List[bool], bool] = False` | Whether to return hidden states. |
+| return_routed_experts | `bool = False` | Whether to return routed experts for MoE models. Requires `--enable-return-routed-experts` server flag. Returns base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. |
+
+## Sampling parameters
+
+The object is defined at `sampling_params.py::SamplingParams`. You can also read the source code to find more arguments and docs.
+
+### Note on defaults
+
+By default, SGLang initializes several sampling parameters from the model's `generation_config.json` (when the server is launched with `--sampling-defaults model`, which is the default). To use SGLang/OpenAI constant defaults instead, start the server with `--sampling-defaults openai`. You can always override any parameter per request via `sampling_params`.
+
+```bash
+# Use model-provided defaults from generation_config.json (default behavior)
+python -m sglang.launch_server --model-path <model_path> --sampling-defaults model
+
+# Use SGLang/OpenAI constant defaults instead
+python -m sglang.launch_server --model-path <model_path> --sampling-defaults openai
+```
+
+### Core parameters
+
+| Argument | Type/Default | Description |
+|-----------------|----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| max_new_tokens | `int = 128` | The maximum output length measured in tokens. |
+| stop | `Optional[Union[str, List[str]]] = None` | One or multiple [stop words](https://platform.openai.com/docs/api-reference/chat/create#chat-create-stop). Generation will stop if one of these words is sampled. |
+| stop_token_ids | `Optional[List[int]] = None` | Provide stop words in the form of token IDs. Generation will stop if one of these token IDs is sampled. |
+| stop_regex | `Optional[Union[str, List[str]]] = None` | Stop when hitting any of the regex patterns in this list |
+| temperature | `float (model default; fallback 1.0)` | [Temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature) when sampling the next token. `temperature = 0` corresponds to greedy sampling, a higher temperature leads to more diversity. |
+| top_p | `float (model default; fallback 1.0)` | [Top-p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) selects tokens from the smallest sorted set whose cumulative probability exceeds `top_p`. When `top_p = 1`, this reduces to unrestricted sampling from all tokens. |
+| top_k | `int (model default; fallback -1)` | [Top-k](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#predictability_vs_creativity) randomly selects from the `k` highest-probability tokens. |
+| min_p | `float (model default; fallback 0.0)` | [Min-p](https://github.com/huggingface/transformers/issues/27670) samples from tokens with probability larger than `min_p * highest_token_probability`. |
+
+### Penalizers
+
+| Argument | Type/Default | Description |
+|--------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| frequency_penalty  | `float = 0.0`          | Penalizes tokens based on their frequency in generation so far. Must be between `-2` and `2` where negative numbers encourage repetition of tokens and positive numbers encourage sampling of new tokens. The scaling of penalization grows linearly with each appearance of a token. |
+| presence_penalty   | `float = 0.0`          | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repetition of tokens and positive numbers encourage sampling of new tokens. The scaling of the penalization is constant if a token occurred. |
+| repetition_penalty | `float = 1.0` | Scales the logits of previously generated tokens to discourage (values > 1) or encourage (values < 1) repetition. Valid range is `[0, 2]`; `1.0` leaves probabilities unchanged. |
+| min_new_tokens | `int = 0` | Forces the model to generate at least `min_new_tokens` until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. |
+
+### Constrained decoding
+
+Please refer to our dedicated guide on [constrained decoding](../advanced_features/structured_outputs.ipynb) for the following parameters.
+
+| Argument | Type/Default | Description |
+|-----------------|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| json_schema | `Optional[str] = None` | JSON schema for structured outputs. |
+| regex | `Optional[str] = None` | Regex for structured outputs. |
+| ebnf | `Optional[str] = None` | EBNF for structured outputs. |
+| structural_tag | `Optional[str] = None` | The structural tag for structured outputs. |
+
+### Other options
+
+| Argument | Type/Default | Description |
+|-------------------------------|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| n | `int = 1` | Specifies the number of output sequences to generate per request. (Generating multiple outputs in one request (n > 1) is discouraged; repeating the same prompts several times offers better control and efficiency.) |
+| ignore_eos | `bool = False` | Don't stop generation when EOS token is sampled. |
+| skip_special_tokens | `bool = True` | Remove special tokens during decoding. |
+| spaces_between_special_tokens | `bool = True` | Whether or not to add spaces between special tokens during detokenization. |
+| no_stop_trim | `bool = False` | Don't trim stop words or EOS token from the generated text. |
+| custom_params | `Optional[List[Optional[Dict[str, Any]]]] = None` | Used when employing `CustomLogitProcessor`. For usage, see below. |
+
+## Examples
+
+### Normal
+
+Launch a server:
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
+```
+
+Send a request:
+
+```python
+import requests
+
+response = requests.post(
+ "http://localhost:30000/generate",
+ json={
+ "text": "The capital of France is",
+ "sampling_params": {
+ "temperature": 0,
+ "max_new_tokens": 32,
+ },
+ },
+)
+print(response.json())
+```
+
+Detailed example in [send request](./send_request.ipynb).
+
+### Streaming
+
+Send a request and stream the output:
+
+```python
+import requests, json
+
+response = requests.post(
+ "http://localhost:30000/generate",
+ json={
+ "text": "The capital of France is",
+ "sampling_params": {
+ "temperature": 0,
+ "max_new_tokens": 32,
+ },
+ "stream": True,
+ },
+ stream=True,
+)
+
+prev = 0
+for chunk in response.iter_lines(decode_unicode=False):
+ chunk = chunk.decode("utf-8")
+ if chunk and chunk.startswith("data:"):
+ if chunk == "data: [DONE]":
+ break
+ data = json.loads(chunk[5:].strip("\n"))
+ output = data["text"].strip()
+ print(output[prev:], end="", flush=True)
+ prev = len(output)
+print("")
+```
+
+Detailed example in [openai compatible api](openai_api_completions.ipynb).
+
+### Multimodal
+
+Launch a server:
+
+```bash
+python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov
+```
+
+Download an image:
+
+```bash
+curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true
+```
+
+Send a request:
+
+```python
+import requests
+
+response = requests.post(
+ "http://localhost:30000/generate",
+ json={
+ "text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+ "<|im_start|>user\n\nDescribe this image in a very short sentence.<|im_end|>\n"
+ "<|im_start|>assistant\n",
+ "image_data": "example_image.png",
+ "sampling_params": {
+ "temperature": 0,
+ "max_new_tokens": 32,
+ },
+ },
+)
+print(response.json())
+```
+
+The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
+
+Streaming is supported in a similar manner as [above](#streaming).
+
+Detailed example in [OpenAI API Vision](openai_api_vision.ipynb).
+
+### Structured Outputs (JSON, Regex, EBNF)
+
+You can specify a JSON schema, regular expression or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.
+
+SGLang supports two grammar backends:
+
+- [XGrammar](https://github.com/mlc-ai/xgrammar) (default): Supports JSON schema, regular expression, and EBNF constraints.
+ - XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md).
+- [Outlines](https://github.com/dottxt-ai/outlines): Supports JSON schema and regular expression constraints.
+
+If you want to use the Outlines backend instead, you can specify it with the `--grammar-backend outlines` flag:
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+--port 30000 --host 0.0.0.0 --grammar-backend [xgrammar|outlines] # xgrammar or outlines (default: xgrammar)
+```
+
+```python
+import json
+import requests
+
+json_schema = json.dumps({
+ "type": "object",
+ "properties": {
+ "name": {"type": "string", "pattern": "^[\\w]+$"},
+ "population": {"type": "integer"},
+ },
+ "required": ["name", "population"],
+})
+
+# JSON (works with both Outlines and XGrammar)
+response = requests.post(
+ "http://localhost:30000/generate",
+ json={
+ "text": "Here is the information of the capital of France in the JSON format.\n",
+ "sampling_params": {
+ "temperature": 0,
+ "max_new_tokens": 64,
+ "json_schema": json_schema,
+ },
+ },
+)
+print(response.json())
+
+# Regular expression (works with both Outlines and XGrammar)
+response = requests.post(
+ "http://localhost:30000/generate",
+ json={
+ "text": "Paris is the capital of",
+ "sampling_params": {
+ "temperature": 0,
+ "max_new_tokens": 64,
+ "regex": "(France|England)",
+ },
+ },
+)
+print(response.json())
+
+# EBNF (XGrammar backend only)
+response = requests.post(
+ "http://localhost:30000/generate",
+ json={
+ "text": "Write a greeting.",
+ "sampling_params": {
+ "temperature": 0,
+ "max_new_tokens": 64,
+ "ebnf": 'root ::= "Hello" | "Hi" | "Hey"',
+ },
+ },
+)
+print(response.json())
+```
+
+Detailed example in [structured outputs](../advanced_features/structured_outputs.ipynb).
+
+### Custom logit processor
+
+Launch a server with `--enable-custom-logit-processor` flag on.
+
+```bash
+python -m sglang.launch_server \
+ --model-path meta-llama/Meta-Llama-3-8B-Instruct \
+ --port 30000 \
+ --enable-custom-logit-processor
+```
+
+Define a custom logit processor that will always sample a specific token id.
+
+```python
+from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+
+class DeterministicLogitProcessor(CustomLogitProcessor):
+ """A dummy logit processor that changes the logits to always
+ sample the given token id.
+ """
+
+ def __call__(self, logits, custom_param_list):
+ # Check that the number of logits matches the number of custom parameters
+ assert logits.shape[0] == len(custom_param_list)
+ key = "token_id"
+
+ for i, param_dict in enumerate(custom_param_list):
+ # Mask all other tokens
+ logits[i, :] = -float("inf")
+ # Assign highest probability to the specified token
+ logits[i, param_dict[key]] = 0.0
+ return logits
+```
+
+Send a request:
+
+```python
+import requests
+
+response = requests.post(
+ "http://localhost:30000/generate",
+ json={
+ "text": "The capital of France is",
+ "custom_logit_processor": DeterministicLogitProcessor().to_str(),
+ "sampling_params": {
+ "temperature": 0.0,
+ "max_new_tokens": 32,
+ "custom_params": {"token_id": 5},
+ },
+ },
+)
+print(response.json())
+```
+
+Send an OpenAI chat completion request:
+
+```python
+import openai
+from sglang.utils import print_highlight
+
+client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None")
+
+response = client.chat.completions.create(
+ model="meta-llama/Meta-Llama-3-8B-Instruct",
+ messages=[
+ {"role": "user", "content": "List 3 countries and their capitals."},
+ ],
+ temperature=0.0,
+ max_tokens=32,
+ extra_body={
+ "custom_logit_processor": DeterministicLogitProcessor().to_str(),
+ "custom_params": {"token_id": 5},
+ },
+)
+
+print_highlight(f"Response: {response}")
+```
diff --git a/sglang/docs/basic_usage/send_request.ipynb b/sglang/docs/basic_usage/send_request.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..968a23b8d632da4194b231d66d78730891d34013
--- /dev/null
+++ b/sglang/docs/basic_usage/send_request.ipynb
@@ -0,0 +1,251 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sending Requests\n",
+ "This notebook provides a quick-start guide to use SGLang in chat completions after installation. Once your server is running, API documentation is available at `http://localhost:30000/docs` (Swagger UI), `http://localhost:30000/redoc` (ReDoc), or `http://localhost:30000/openapi.json` (OpenAPI spec, useful for AI agents). Replace `30000` with your port if using a different one.\n",
+ "\n",
+ "- For Vision Language Models, see [OpenAI APIs - Vision](openai_api_vision.ipynb).\n",
+ "- For Embedding Models, see [OpenAI APIs - Embedding](openai_api_embeddings.ipynb) and [Encode (embedding model)](native_api.html#Encode-(embedding-model)).\n",
+ "- For Reward Models, see [Classify (reward model)](native_api.html#Classify-(reward-model))."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Launch A Server"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sglang.test.doc_patch import launch_server_cmd\n",
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+ "\n",
+ "# This is equivalent to running the following command in your terminal\n",
+ "# python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\n",
+ "\n",
+ "server_process, port = launch_server_cmd(\"\"\"\n",
+ "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
+ " --host 0.0.0.0 --log-level warning\n",
+ "\"\"\")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using cURL\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import subprocess, json\n",
+ "\n",
+ "curl_command = f\"\"\"\n",
+ "curl -s http://localhost:{port}/v1/chat/completions \\\n",
+ " -H \"Content-Type: application/json\" \\\n",
+ " -d '{{\"model\": \"qwen/qwen2.5-0.5b-instruct\", \"messages\": [{{\"role\": \"user\", \"content\": \"What is the capital of France?\"}}]}}'\n",
+ "\"\"\"\n",
+ "\n",
+ "response = json.loads(subprocess.check_output(curl_command, shell=True))\n",
+ "print_highlight(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using Python Requests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "\n",
+ "url = f\"http://localhost:{port}/v1/chat/completions\"\n",
+ "\n",
+ "data = {\n",
+ " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
+ " \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
+ "}\n",
+ "\n",
+ "response = requests.post(url, json=data)\n",
+ "print_highlight(response.json())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using OpenAI Python Client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import openai\n",
+ "\n",
+ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+ "\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " messages=[\n",
+ " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+ " ],\n",
+ " temperature=0,\n",
+ " max_tokens=64,\n",
+ ")\n",
+ "print_highlight(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Streaming"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import openai\n",
+ "\n",
+ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+ "\n",
+ "# Use stream=True for streaming responses\n",
+ "response = client.chat.completions.create(\n",
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+ " messages=[\n",
+ " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+ " ],\n",
+ " temperature=0,\n",
+ " max_tokens=64,\n",
+ " stream=True,\n",
+ ")\n",
+ "\n",
+ "# Handle the streaming output\n",
+ "for chunk in response:\n",
+ " if chunk.choices[0].delta.content:\n",
+ " print(chunk.choices[0].delta.content, end=\"\", flush=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using Native Generation APIs\n",
+ "\n",
+ "You can also use the native `/generate` endpoint with requests, which provides more flexibility. An API reference is available at [Sampling Parameters](sampling_params.md)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests\n",
+ "\n",
+ "response = requests.post(\n",
+ " f\"http://localhost:{port}/generate\",\n",
+ " json={\n",
+ " \"text\": \"The capital of France is\",\n",
+ " \"sampling_params\": {\n",
+ " \"temperature\": 0,\n",
+ " \"max_new_tokens\": 32,\n",
+ " },\n",
+ " },\n",
+ ")\n",
+ "\n",
+ "print_highlight(response.json())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Streaming"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import requests, json\n",
+ "\n",
+ "response = requests.post(\n",
+ " f\"http://localhost:{port}/generate\",\n",
+ " json={\n",
+ " \"text\": \"The capital of France is\",\n",
+ " \"sampling_params\": {\n",
+ " \"temperature\": 0,\n",
+ " \"max_new_tokens\": 32,\n",
+ " },\n",
+ " \"stream\": True,\n",
+ " },\n",
+ " stream=True,\n",
+ ")\n",
+ "\n",
+ "prev = 0\n",
+ "for chunk in response.iter_lines(decode_unicode=False):\n",
+ " chunk = chunk.decode(\"utf-8\")\n",
+ " if chunk and chunk.startswith(\"data:\"):\n",
+ " if chunk == \"data: [DONE]\":\n",
+ " break\n",
+ " data = json.loads(chunk[5:].strip(\"\\n\"))\n",
+ " output = data[\"text\"]\n",
+ " print(output[prev:], end=\"\", flush=True)\n",
+ " prev = len(output)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(server_process)"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sglang/docs/developer_guide/bench_serving.md b/sglang/docs/developer_guide/bench_serving.md
new file mode 100644
index 0000000000000000000000000000000000000000..5a67723c8ab7a28aec674eec613fa1aff131512a
--- /dev/null
+++ b/sglang/docs/developer_guide/bench_serving.md
@@ -0,0 +1,355 @@
+# Bench Serving Guide
+
+This guide explains how to benchmark online serving throughput and latency using `python -m sglang.bench_serving`. It supports multiple inference backends via OpenAI-compatible and native endpoints, and produces both console metrics and optional JSONL outputs.
+
+### What it does
+
+- Generates synthetic or dataset-driven prompts and submits them to a target serving endpoint
+- Measures throughput, time-to-first-token (TTFT), inter-token latency (ITL), per-request end-to-end latency, and more
+- Supports streaming or non-streaming modes, rate control, and concurrency limits
+
+### Supported backends and endpoints
+
+- `sglang` / `sglang-native`: `POST /generate`
+- `sglang-oai`, `vllm`, `lmdeploy`: `POST /v1/completions`
+- `sglang-oai-chat`, `vllm-chat`, `lmdeploy-chat`: `POST /v1/chat/completions`
+- `trt` (TensorRT-LLM): `POST /v2/models/ensemble/generate_stream`
+- `gserver`: Custom server (Not Implemented yet in this script)
+- `truss`: `POST /v1/models/model:predict`
+
+If `--base-url` is provided, requests are sent to it. Otherwise, `--host` and `--port` are used. When `--model` is not provided, the script will attempt to query `GET /v1/models` for an available model ID (OpenAI-compatible endpoints).
+
+### Prerequisites
+
+- Python 3.8+
+- Dependencies typically used by this script: `aiohttp`, `numpy`, `requests`, `tqdm`, `transformers`, and for some datasets `datasets`, `pillow`, `pybase64`. Install as needed.
+- An inference server running and reachable via the endpoints above
+- If your server requires authentication, set environment variable `OPENAI_API_KEY` (used as `Authorization: Bearer $OPENAI_API_KEY`)
+
+### Quick start
+
+Run a basic benchmark against an sglang server exposing `/generate`:
+
+```bash
+python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
+```
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --num-prompts 1000 \
+ --model meta-llama/Llama-3.1-8B-Instruct
+```
+
+Or, using an OpenAI-compatible endpoint (completions):
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend vllm \
+ --base-url http://127.0.0.1:8000 \
+ --num-prompts 1000 \
+ --model meta-llama/Llama-3.1-8B-Instruct
+```
+
+### Datasets
+
+Select with `--dataset-name`:
+
+- `sharegpt` (default): loads ShareGPT-style pairs; optionally restrict with `--sharegpt-context-len` and override outputs with `--sharegpt-output-len`
+- `random`: random text lengths; sampled from ShareGPT token space
+- `random-ids`: random token ids (can lead to gibberish)
+- `image`: generates images and wraps them in chat messages; supports custom resolutions, multiple formats, and different content types
+- `generated-shared-prefix`: synthetic dataset with shared long system prompts and short questions
+- `mmmu`: samples from MMMU (Math split) and includes images
+
+Common dataset flags:
+
+- `--num-prompts N`: number of requests
+- `--random-input-len`, `--random-output-len`, `--random-range-ratio`: for random/random-ids/image
+- `--image-count`: Number of images per request (for `image` dataset).
+
+- `--apply-chat-template`: apply tokenizer chat template when constructing prompts
+- `--dataset-path PATH`: file path for ShareGPT json; if blank and missing, it will be downloaded and cached
+
+Generated Shared Prefix flags (for `generated-shared-prefix`):
+
+- `--gsp-num-groups`
+- `--gsp-prompts-per-group`
+- `--gsp-system-prompt-len`
+- `--gsp-question-len`
+- `--gsp-output-len`
+
+Image dataset flags (for `image`):
+
+- `--image-count`: Number of images per request
+- `--image-resolution`: Image resolution; supports presets (4k, 1080p, 720p, 360p) or custom 'heightxwidth' format (e.g., 1080x1920, 512x768)
+- `--image-format`: Image format (jpeg or png)
+- `--image-content`: Image content type (random or blank)
+
+### Examples
+
+1. To benchmark image dataset with 3 images per request, 500 prompts, 512 input length, and 512 output length, you can run:
+
+```bash
+python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-3B-Instruct --disable-radix-cache
+```
+
+```bash
+python -m sglang.bench_serving \
+ --backend sglang-oai-chat \
+ --dataset-name image \
+ --num-prompts 500 \
+ --image-count 3 \
+ --image-resolution 720p \
+ --random-input-len 512 \
+ --random-output-len 512
+```
+
+2. To benchmark random dataset with 3000 prompts, 1024 input length, and 1024 output length, you can run:
+
+```bash
+python -m sglang.launch_server --model-path Qwen/Qwen2.5-3B-Instruct
+```
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --dataset-name random \
+ --num-prompts 3000 \
+ --random-input-len 1024 \
+ --random-output-len 1024 \
+ --random-range-ratio 0.5
+```
+
+### Choosing model and tokenizer
+
+- `--model` is required unless the backend exposes `GET /v1/models`, in which case the first model ID is auto-selected.
+- `--tokenizer` defaults to `--model`. Both can be HF model IDs or local paths.
+- For ModelScope workflows, setting `SGLANG_USE_MODELSCOPE=true` enables fetching via ModelScope (weights are skipped for speed).
+- If your tokenizer lacks a chat template, the script warns because token counting can be less robust for gibberish outputs.
+
+### Rate, concurrency, and streaming
+
+- `--request-rate`: requests per second. `inf` sends all immediately (burst). Non-infinite rate uses a Poisson process for arrival times.
+- `--max-concurrency`: caps concurrent in-flight requests regardless of arrival rate.
+- `--disable-stream`: switch to non-streaming mode when supported; TTFT then equals total latency for chat completions.
+
+### Other key options
+
+- `--output-file FILE.jsonl`: append JSONL results to file; auto-named if unspecified
+- `--output-details`: include per-request arrays (generated texts, errors, ttfts, itls, input/output lens)
+- `--extra-request-body '{"top_p":0.9,"temperature":0.6}'`: merged into payload (sampling params, etc.)
+- `--disable-ignore-eos`: pass through EOS behavior (varies by backend)
+- `--warmup-requests N`: run warmup requests with short output first (default 1)
+- `--flush-cache`: call `/flush_cache` (sglang) before main run
+- `--profile`: call `/start_profile` and `/stop_profile` (requires server to enable profiling, e.g., `SGLANG_TORCH_PROFILER_DIR`)
+- `--lora-name name1 name2 ...`: randomly pick one per request and pass to backend (e.g., `lora_path` for sglang)
+- `--tokenize-prompt`: send integer IDs instead of text (currently supports `--backend sglang` only)
+
+### Authentication
+
+If your target endpoint requires OpenAI-style auth, set:
+
+```bash
+export OPENAI_API_KEY=sk-...yourkey...
+```
+
+The script will add `Authorization: Bearer $OPENAI_API_KEY` automatically for OpenAI-compatible routes.
+
+### Metrics explained
+
+Printed after each run:
+
+- Request throughput (req/s)
+- Input token throughput (tok/s) - includes both text and vision tokens
+- Output token throughput (tok/s)
+- Total token throughput (tok/s) - includes both text and vision tokens
+- Total input text tokens and Total input vision tokens - per-modality breakdown
+- Concurrency: aggregate time of all requests divided by wall time
+- End-to-End Latency (ms): mean/median/std/p99 per-request total latency
+- Time to First Token (TTFT, ms): mean/median/std/p99 for streaming mode
+- Inter-Token Latency (ITL, ms): mean/median/std/p95/p99/max between tokens
+- TPOT (ms): Token processing time after first token, i.e., `(latency - ttft)/(tokens-1)`
+- Accept length (sglang-only, if available): speculative decoding accept length
+
+The script also retokenizes generated text with the configured tokenizer and reports "retokenized" counts.
+
+### JSONL output format
+
+When `--output-file` is set, one JSON object is appended per run. Base fields:
+
+- Arguments summary: backend, dataset, request_rate, max_concurrency, etc.
+- Duration and totals: completed, total_input_tokens, total_output_tokens, retokenized totals
+- Throughputs and latency statistics as printed in the console
+- `accept_length` when available (sglang)
+
+With `--output-details`, an extended object also includes arrays:
+
+- `input_lens`, `output_lens`
+- `ttfts`, `itls` (per request: ITL arrays)
+- `generated_texts`, `errors`
+
+### End-to-end examples
+
+1) sglang native `/generate` (streaming):
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model meta-llama/Llama-3.1-8B-Instruct \
+ --dataset-name random \
+ --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.5 \
+ --num-prompts 2000 \
+ --request-rate 100 \
+ --max-concurrency 512 \
+ --output-file sglang_random.jsonl --output-details
+```
+
+2) OpenAI-compatible Completions (e.g., vLLM):
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend vllm \
+ --base-url http://127.0.0.1:8000 \
+ --model meta-llama/Llama-3.1-8B-Instruct \
+ --dataset-name sharegpt \
+ --num-prompts 1000 \
+ --sharegpt-output-len 256
+```
+
+3) OpenAI-compatible Chat Completions (streaming):
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend vllm-chat \
+ --base-url http://127.0.0.1:8000 \
+ --model meta-llama/Llama-3.1-8B-Instruct \
+ --dataset-name random \
+ --num-prompts 500 \
+ --apply-chat-template
+```
+
+4) Images (VLM) with chat template:
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model your-vlm-model \
+ --dataset-name image \
+ --image-count 2 \
+ --image-resolution 720p \
+ --random-input-len 128 --random-output-len 256 \
+ --num-prompts 200 \
+ --apply-chat-template
+```
+
+4a) Images with custom resolution:
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model your-vlm-model \
+ --dataset-name image \
+ --image-count 1 \
+ --image-resolution 512x768 \
+ --random-input-len 64 --random-output-len 128 \
+ --num-prompts 100 \
+ --apply-chat-template
+```
+
+4b) 1080p images with PNG format and blank content:
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model your-vlm-model \
+ --dataset-name image \
+ --image-count 1 \
+ --image-resolution 1080p \
+ --image-format png \
+ --image-content blank \
+ --random-input-len 64 --random-output-len 128 \
+ --num-prompts 100 \
+ --apply-chat-template
+```
+
+5) Generated shared prefix (long system prompts + short questions):
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model meta-llama/Llama-3.1-8B-Instruct \
+ --dataset-name generated-shared-prefix \
+ --gsp-num-groups 64 --gsp-prompts-per-group 16 \
+ --gsp-system-prompt-len 2048 --gsp-question-len 128 --gsp-output-len 256 \
+ --num-prompts 1024
+```
+
+6) Tokenized prompts (ids) for strict length control (sglang only):
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model meta-llama/Llama-3.1-8B-Instruct \
+ --dataset-name random \
+ --tokenize-prompt \
+ --random-input-len 2048 --random-output-len 256 --random-range-ratio 0.2
+```
+
+7) Profiling and cache flush (sglang):
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model meta-llama/Llama-3.1-8B-Instruct \
+ --profile \
+ --flush-cache
+```
+
+8) TensorRT-LLM streaming endpoint:
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend trt \
+ --base-url http://127.0.0.1:8000 \
+ --model your-trt-llm-model \
+ --dataset-name random \
+ --num-prompts 100 \
+ --disable-ignore-eos
+```
+
+9) Evaluating large-scale KVCache sharing with mooncake trace (sglang only):
+
+```bash
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model model-name \
+ --dataset-name mooncake \
+ --mooncake-slowdown-factor 1.0 \
+ --mooncake-num-rounds 1000 \
+ --mooncake-workload conversation|mooncake|agent|synthetic \
+ --use-trace-timestamps true \
+ --random-output-len 256
+```
+
+### Troubleshooting
+
+- All requests failed: verify `--backend`, server URL/port, `--model`, and authentication. Check warmup errors printed by the script.
+- Throughput seems too low: adjust `--request-rate` and `--max-concurrency`; verify server batch size/scheduling; ensure streaming is enabled if appropriate.
+- Token counts look odd: prefer chat/instruct models with proper chat templates; otherwise tokenization of gibberish may be inconsistent.
+- Image/MMMU datasets: ensure you installed extra deps (`pillow`, `datasets`, `pybase64`).
+- Authentication errors (401/403): set `OPENAI_API_KEY` or disable auth on your server.
+
+### Notes
+
+- The script raises the file descriptor soft limit (`RLIMIT_NOFILE`) to help with many concurrent connections.
+- For sglang, `/get_server_info` is queried post-run to report speculative decoding accept length when available.
diff --git a/sglang/docs/developer_guide/benchmark_and_profiling.md b/sglang/docs/developer_guide/benchmark_and_profiling.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f2c826fcb6dedcbab9682d441e73598568d7ac2
--- /dev/null
+++ b/sglang/docs/developer_guide/benchmark_and_profiling.md
@@ -0,0 +1,467 @@
+# Benchmark and Profiling
+
+## Benchmark
+
+SGLang provides four benchmark tools that operate at different levels of the stack. The table below summarizes their key differences:
+
+| Tool | HTTP Server | Scheduler | Use Case |
+| -------------------------- | --------------------------------------------- | --------------------------------------- | -------------------------------------------------------------------------- |
+| `bench_serving` | Yes (async HTTP client to a running server) | Yes (indirectly, via server) | Realistic online serving benchmarks with latency metrics (TTFT, TPOT, ITL) |
+| `bench_one_batch_server` | Yes (sends HTTP requests to a running server) | Yes (indirectly, via server) | End-to-end single-batch latency including HTTP and scheduler overhead |
+| `bench_offline_throughput` | No | Yes (directly uses `Engine` in-process) | Maximum throughput measurement without HTTP overhead |
+| `bench_one_batch` | No | No (directly calls `ModelRunner`) | Kernel-level latency profiling of a single static batch |
+
+Use `bench_serving` by default unless there are specific needs.
+
+**`bench_serving`** is an async HTTP load-testing client that sends requests at controlled rates with configurable concurrency to a running server. It measures realistic online serving metrics including time-to-first-token (TTFT), time-per-output-token (TPOT), inter-token latency (ITL), and throughput. Use `num-prompts >= 5 * max-concurrency` to measure steady-state performance. Launch a server with `sglang.launch_server` first.
+
+ ```bash
+ python3 -m sglang.bench_serving --backend sglang --max-concurrency 16 --num-prompts 80 --random-input-len 256 --random-output-len 32 --dataset-name random
+ ```
+
+**`bench_one_batch_server`** sends a single batch as one HTTP request to a running server. Due to only having a single batch, the server is never in a steady-state and metrics will be biased. Launch a server with `sglang.launch_server` first.
+
+ ```bash
+ python3 -m sglang.bench_one_batch_server --base-url http://127.0.0.1:30000 --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 32 --input-len 256 --output-len 32
+ ```
+
+**`bench_offline_throughput`** directly instantiates the `Engine` object in-process (no HTTP server) and submits all requests at once via `engine.generate()`. The engine's scheduler handles batching and execution. This measures maximum achievable throughput without any network overhead.
+
+ ```bash
+ python3 -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
+ ```
+
+**`bench_one_batch`** is the lowest-level tool. It directly instantiates a `ModelRunner` and calls `extend()` / `decode()` on a fixed static batch, bypassing the scheduler entirely. The prefill and decode phases are run separately, making profiling easier but rendering the metrics unrealistic. Because there is no dynamic batching, it may run out of memory for batch sizes that a real server can handle (a real server chunks prefill into smaller batches). This is best suited for profiling individual kernel performance.
+
+ ```bash
+ python3 -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 32 --input-len 256 --output-len 32
+ ```
+
+## Profile with PyTorch Profiler
+
+[Pytorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) is a convenient basic tool to inspect kernel execution time, call stack, and kernel overlap and occupancy.
+
+### Profile a server with `sglang.bench_serving`
+
+```bash
+# set trace path
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+
+# start server
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
+
+# send profiling request from client
+python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --sharegpt-output-len 100 --profile
+```
+
+The `SGLANG_TORCH_PROFILER_DIR` environment variable must be set on both the server and client side; otherwise, the trace file will not be generated correctly. A reliable way to do this is by setting it in your shell's resource file (e.g., `~/.bashrc` for bash).
+
+For more details, please refer to [Bench Serving Guide](./bench_serving.md).
+
+### Profile In PD Disaggregation Mode
+
+When profiling in PD disaggregation mode, prefill and decode workers **must be profiled separately** due to torch profiler limitations. The `bench_serving` command provides dedicated options for this:
+
+#### Profile Prefill Workers
+
+```bash
+# set trace path
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+
+# start prefill and decode servers (see PD disaggregation docs for setup)
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1
+
+# start router
+python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
+
+# send profiling request targeting prefill workers
+python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000
+```
+
+#### Profile Decode Workers
+
+```bash
+# send profiling request targeting decode workers
+python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001
+```
+
+#### Important Notes
+
+- `--profile-prefill-url` and `--profile-decode-url` are **mutually exclusive** - you cannot profile both at the same time
+- Both options support multiple worker URLs for multi-instance setups:
+ ```bash
+ # Profile multiple prefill workers
+ python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000 http://127.0.0.1:30002
+
+ # Profile multiple decode workers
+ python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001 http://127.0.0.1:30003
+ ```
+- Make sure `SGLANG_TORCH_PROFILER_DIR` is set on all worker nodes before starting the servers
+- For more details on setting up PD disaggregation, see [PD Disaggregation Guide](../advanced_features/pd_disaggregation.md)
+
+### Profile offline with `sglang.bench_one_batch` and `sglang.bench_offline_throughput`
+```bash
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+
+# profile one batch with bench_one_batch.py
+# batch size can be controlled with --batch argument
+python3 -m sglang.bench_one_batch --model-path meta-llama/Llama-3.1-8B-Instruct --batch 32 --input-len 1024 --output-len 10 --profile
+
+# profile multiple batches with bench_offline_throughput.py
+python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
+```
+
+### Profile a server with `sglang.profiler`
+
+When the server is running (e.g., processing a decoding request), you can start live profiling immediately by sending a profile request to the server.
+
+You can do this by running `python3 -m sglang.profiler`. For example:
+
+```
+# Terminal 1: Send a generation request
+python3 -m sglang.test.send_one
+
+# Terminal 2: Before the above request finishes, quickly launch the following command in a separate terminal.
+# It will generate a profile of the above request for several decoding batches.
+python3 -m sglang.profiler
+```
+
+You can also combine the above operations into a single command
+
+```
+python3 -m sglang.test.send_one --profile
+```
+
+### Profile a server with HTTP API endpoints
+
+SGLang provides HTTP API endpoints to control profiling on a running server. This allows you to start and stop profiling programmatically, which is useful for capturing specific workload patterns.
+
+#### Using `/start_profile` endpoint
+
+The `/start_profile` endpoint starts profiling on the server. You can control when profiling begins and how long it runs using the following parameters:
+
+**Basic usage:**
+
+```bash
+# Start profiling immediately for 10 steps
+curl -X POST http://127.0.0.1:30000/start_profile \
+ -H "Content-Type: application/json" \
+ -d '{
+ "num_steps": 10
+ }'
+```
+
+**Parameters:**
+
+- `output_dir` (optional): Directory where profile traces will be saved. If not specified, uses `SGLANG_TORCH_PROFILER_DIR` environment variable, or `/tmp` as the default
+- `num_steps` (optional): Number of steps to profile. If not specified, profiling continues until manually stopped with `/end_profile`
+- `start_step` (optional): Step number at which to start profiling (inclusive). Useful for skipping warmup iterations
+- `activities` (optional): List of activities to profile, e.g., `["CPU", "GPU"]`. Default is `["CPU", "GPU"]`
+- `merge_profiles` (optional): Whether to merge distributed traces. Default is `false`
+
+**Note on step ranges:** Profiling starts at `start_step` (inclusive) and continues for `num_steps` iterations. For example, with `start_step=3` and `num_steps=10`, profiling captures steps 3, 4, 5, 6, 7, 8, 9, 10, 11, and 12 (10 steps total, starting from step 3).
+
+**Advanced usage with `start_step`:**
+
+```bash
+# Wait 5 steps (warmup), then profile for 10 steps
+curl -X POST http://127.0.0.1:30000/start_profile \
+ -H "Content-Type: application/json" \
+ -d '{
+ "output_dir": "/tmp/profiles",
+ "start_step": 5,
+ "num_steps": 10,
+ "activities": ["CPU", "GPU"]
+ }'
+```
+
+**Continuous profiling (manual stop):**
+
+```bash
+# Start profiling without num_steps - must manually stop with /end_profile
+curl -X POST http://127.0.0.1:30000/start_profile
+```
+
+#### Using `/end_profile` endpoint
+
+The `/end_profile` endpoint stops an ongoing profiling session and saves the trace file.
+
+```bash
+# Stop profiling and save traces
+curl -X POST http://127.0.0.1:30000/end_profile
+```
+
+This is only needed when you start profiling without specifying `num_steps`. If `num_steps` is specified, profiling will automatically stop after that many steps.
+
+#### Example workflow
+
+```bash
+# Terminal 1: Start the server
+export SGLANG_TORCH_PROFILER_DIR=/tmp/profiles
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
+
+# Terminal 2: Start continuous profiling
+curl -X POST http://127.0.0.1:30000/start_profile \
+ -H "Content-Type: application/json" \
+ -d '{
+ "start_step": 3
+ }'
+
+# Terminal 3: Send requests to generate load
+python -m sglang.bench_serving --backend sglang --num-prompts 100
+
+# Terminal 2: Stop profiling when done
+curl -X POST http://127.0.0.1:30000/end_profile
+```
+
+### Profiler Trace Merger for Distributed Traces
+
+SGLang now supports automatic merging of profiling traces from distributed setups with multiple parallelism types (TP, DP, PP, EP). This feature is particularly useful for analyzing performance across distributed runs.
+
+#### Multi-Node Profiling and Shared Storage Considerations
+
+Single-node profiler output merging is completely supported. When profiling in distributed environments spanning multiple nodes, shared storage (e.g., NFS, Lustre) should be accessible by all nodes for the output directory to enable merging of trace files.
+
+If there is no shared storage accessible across nodes, automatic merging of trace files during profiling is not supported directly as of now.
+
+#### HTTP API Usage
+
+```bash
+# Start profiling with automatic trace merging enabled
+curl -X POST /start_profile \
+ -H "Content-Type: application/json" \
+ -d '{
+ "output_dir": "/tmp/profiles", # where to store profile traces
+ "num_steps": 10,
+ "activities": ["CPU", "GPU"],
+ "merge_profiles": true # optional argument to merge profile traces (default=False)
+ }'
+```
+
+#### Command Line Usage
+
+```bash
+# Start profiling with merge enabled
+python -m sglang.profiler \
+ --num-steps 10 \
+ --cpu \
+ --gpu \
+ --output-dir /tmp/profiles \
+ --merge-profiles # optional argument to merge profile traces (default=False)
+```
+
+#### Output Files
+
+The profile merger generates:
+- Individual rank trace files: `{profile_id}-TP-{tp}-DP-{dp}-PP-{pp}-EP-{ep}.trace.json.gz`
+- Merged trace file: `merged-{profile_id}.trace.json.gz`
+
+### Possible PyTorch bugs
+If you encounter the following error (for example, when using Qwen 2.5 VL):
+```bash
+RuntimeError: !stack.empty() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/autograd/profiler_python.cpp":983, please report a bug to PyTorch. Python replay stack is empty.
+```
+This is likely a PyTorch Bug reported in [Bug: vLLM Profiler](https://github.com/vllm-project/vllm/issues/18240) and [Bug: torch.profiler.profile](https://github.com/pytorch/pytorch/issues/101632). As a workaround, you may disable `with_stack` with an environment variable such as follows:
+```bash
+export SGLANG_PROFILE_WITH_STACK=False
+python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
+```
+
+### View traces
+
+Trace files can be loaded and visualized from:
+
+1. https://ui.perfetto.dev/ (any browser)
+2. chrome://tracing (Chrome browser only)
+
+If the browser cannot open the trace file due to its large size,
+you can generate a small trace file (<100MB) by limiting the number of prompts and the lengths of the prompt outputs.
+For example, when profiling a server,
+
+```bash
+python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 2 --sharegpt-output-len 100 --profile
+```
+
+This command sets the number of prompts to 2 with `--num-prompts` argument and limits the length of output sequences to 100 with `--sharegpt-output-len` argument, which can generate a small trace file for browser to open smoothly.
+
+Additionally, if you want to trace from a CUDA kernel back to the corresponding SGLang Python source code in the trace view, you need to disable CUDA Graph when starting the server. This can be done by adding the `--disable-cuda-graph` flag to the server launch command.
+
+## Profile with Nsight
+
+[Nsight systems](https://docs.nvidia.com/nsight-systems/) is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions and low-level CUDA APIs and events.
+
+1. Prerequisite:
+
+ Install using apt, or run inside a [NVIDIA Docker container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) or [SGLang Docker container](https://github.com/sgl-project/sglang/tree/main/docker).
+
+ ```bash
+ # install nsys
+ # https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html
+ apt update
+ apt install -y --no-install-recommends gnupg
+ echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
+ apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
+ apt update
+ apt install nsight-systems-cli
+ ```
+
+2. To profile a single batch, use
+
+ ```bash
+ nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node python3 -m sglang.bench_one_batch --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512
+ ```
+
+3. To profile a server, e.g.
+
+ ```bash
+ # launch the server, set the delay and duration times according to needs
+ # after the duration time has been used up, server will be killed by nsys
+
+ nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+
+ # client
+ python3 -m sglang.bench_serving --backend sglang --num-prompts 1000 --dataset-name random --random-input 1024 --random-output 512
+ ```
+
+ In practice, we recommend setting the `--duration` argument to a large value. Whenever you want the server to stop profiling, first run:
+
+ ```bash
+ nsys sessions list
+ ```
+
+ to get the session id in the form of `profile-XXXXX`, then run:
+
+ ```bash
+ nsys stop --session=profile-XXXXX
+ ```
+
+ to manually kill the profiler and generate `nsys-rep` files instantly.
+
+4. Use NVTX to annotate code regions, e.g. to see their execution time.
+
+ ```bash
+ # install nvtx
+ pip install nvtx
+ ```
+
+ ```python
+ # code snippets
+ import nvtx
+ with nvtx.annotate("description", color="color"):
+ # some critical code
+ ```
+
+### Layer-wise NVTX Profiling with Nsight Systems
+
+SGLang provides built-in layerwise NVTX annotations that can be combined with the CUDA Profiler for detailed per-layer profiling in Nsight Systems. This is particularly useful for identifying performance bottlenecks at the layer level.
+
+#### Using `--enable-layerwise-nvtx-marker` with Nsight Systems and `/start_profile`
+
+The `--enable-layerwise-nvtx-marker` flag automatically adds NVTX markers to every layer in your model. This is particularly powerful when combined with Nsight Systems profiling to see detailed per-layer performance.
+
+**Method 1: Using `/start_profile` with CUDA_PROFILER (for programmatic control)**
+
+This method allows you to control exactly when profiling starts/stops via HTTP API while Nsight Systems is running.
+
+1. Launch the server with layerwise NVTX enabled under Nsight Systems:
+
+ ```bash
+ # Terminal 1: Start server with nsys and capture-range option
+ nsys profile --trace-fork-before-exec=true \
+ --cuda-graph-trace=node \
+ --capture-range=cudaProfilerApi \
+ --capture-range-end=stop \
+ -o layerwise_profile \
+ python -m sglang.launch_server \
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
+ --enable-layerwise-nvtx-marker \
+ --disable-cuda-graph
+ ```
+
+ Note: NVTX markers are not emitted for kernel launches captured by CUDA graphs. Use `--disable-cuda-graph` to ensure all layerwise NVTX markers are emitted in the trace.
+
+2. In another terminal, control profiling via `/start_profile` with `CUDA_PROFILER` activity:
+
+ ```bash
+ # Terminal 2: Wait for server to be ready, then start CUDA profiling
+ # Wait 3 steps for warmup, then profile for 10 steps
+ curl -X POST http://127.0.0.1:30000/start_profile \
+ -H "Content-Type: application/json" \
+ -d '{
+ "start_step": 3,
+ "num_steps": 10,
+ "activities": ["CUDA_PROFILER"]
+ }'
+ ```
+
+3. Send requests to generate load:
+
+ ```bash
+ # Terminal 3: Generate workload
+ python -m sglang.bench_serving --backend sglang --num-prompts 100
+ ```
+
+4. Profiling will automatically stop after 10 steps (due to `num_steps: 10`). If you hadn't specified `num_steps`, you would need to manually stop it:
+
+ ```bash
+ # Terminal 2: Only needed if num_steps was not specified
+ curl -X POST http://127.0.0.1:30000/end_profile
+ ```
+
+The `--capture-range=cudaProfilerApi` option tells Nsight Systems to only capture data between `cudaProfilerStart()` and `cudaProfilerStop()` calls (triggered by `/start_profile` and `/end_profile`), reducing overhead and file size. The `start_step` parameter skips the first 3 steps to avoid capturing warmup overhead.
+
+**Method 2: Simpler approach without `/start_profile` API**
+
+For simpler use cases where you don't need fine-grained control over profiling start/stop, you can profile with Nsight Systems capturing the entire workload:
+
+```bash
+# Terminal 1: Start server with layerwise NVTX
+# Note: --disable-cuda-graph ensures all NVTX markers are emitted
+python -m sglang.launch_server \
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
+ --enable-layerwise-nvtx-marker \
+ --disable-cuda-graph
+
+# Terminal 2: Profile the benchmarking client
+nsys profile --trace-fork-before-exec=true \
+ --cuda-graph-trace=node \
+ -o layerwise_profile \
+ python -m sglang.bench_serving --backend sglang --num-prompts 10
+```
+
+This approach profiles the entire client execution, including all server interactions. The layerwise NVTX markers will be visible in the Nsight Systems timeline.
+
+**Viewing the profiling results:**
+
+Open the generated report file (`.nsys-rep` on recent Nsight Systems versions, `.qdrep` on older ones) with Nsight Systems:
+
+```bash
+nsys-ui layerwise_profile.qdrep
+```
+
+In the Nsight Systems GUI, you'll see:
+- **NVTX ranges**: Each layer appears as a labeled range in the timeline with detailed information in the marker metadata
+- **CUDA kernels**: All GPU kernels are shown alongside the layer annotations
+- **Layer hierarchy**: The full module path (e.g., `meta-llama/Meta-Llama-3.1-8B-Instruct.model.layers.0.self_attn.qkv_proj`) helps identify specific layers. The prefix uses the full model path from `--model-path`.
+- **Tensor shapes**: Input/output dimensions and parameter shapes are included in the NVTX marker data
+
+**Benefits of layerwise NVTX profiling:**
+
+- **Granular visibility**: See exactly which layers are taking the most time
+- **Memory tracking**: Identify layers with large memory allocations
+- **Bottleneck identification**: Quickly locate inefficient operations
+- **Communication overhead**: In multi-GPU setups, see per-layer communication costs
+- **Development debugging**: Validate that model architecture changes have the expected performance impact
+
+## Other tips
+
+1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
+2. You can benchmark a model with modified configs (e.g., less layers) by using `--json-model-override-args`. For example, you can benchmark a model with only 2 layers and 2 kv heads using:
+
+ ```bash
+ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32 --load-format dummy --json-model-override-args '{"num_hidden_layers": 1, "num_key_value_heads": 1}'
+ ```
+
+3. You can use `--python-backtrace=cuda` to see python call stack for all CUDA kernels, as in PyTorch Profiler. (Caveat: this can cause inaccurately long kernel runtimes for CUDA event based timing)
+4. For more arguments see [Nsight Systems User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html).
diff --git a/sglang/docs/developer_guide/contribution_guide.md b/sglang/docs/developer_guide/contribution_guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..5f78456a25de4f116d5be27ae210246104c17d7e
--- /dev/null
+++ b/sglang/docs/developer_guide/contribution_guide.md
@@ -0,0 +1,147 @@
+# Contribution Guide
+
+Welcome to **SGLang**! We appreciate your interest in contributing. This guide provides a concise overview of how to set up your environment, run tests, build documentation, and open a Pull Request (PR). Whether you’re fixing a small bug or developing a major feature, we encourage following these steps for a smooth contribution process.
+
+## Install SGLang from Source
+
+### Fork and clone the repository
+
+**Note**: New contributors do **not** have the write permission to push to the official SGLang repo. Please fork the repository under your GitHub account, then clone your fork locally.
+
+```bash
+git clone https://github.com/<your_user_name>/sglang.git
+```
+
+### Build from source
+
+Refer to [Install SGLang from Source](../get_started/install.md#method-2-from-source).
+
+## Format code with pre-commit
+
+We use [pre-commit](https://pre-commit.com/) to maintain consistent code style checks. Before pushing your changes, please run:
+
+```bash
+pip3 install pre-commit
+pre-commit install
+pre-commit run --all-files
+```
+
+- **`pre-commit run --all-files`** manually runs all configured checks, applying fixes if possible. If it fails the first time, re-run it to ensure lint errors are fully resolved. Make sure your code passes all checks **before** creating a Pull Request.
+- **Do not commit** directly to the `main` branch. Always create a new branch (e.g., `feature/my-new-feature`), push your changes, and open a PR from that branch.
+
+## Run and add unit tests
+
+If you add a new feature or fix a bug, please add corresponding unit tests to ensure coverage and prevent regression.
+SGLang uses Python's built-in [unittest](https://docs.python.org/3/library/unittest.html) framework.
+For detailed instructions on running tests and integrating them into CI, refer to [test/README.md](https://github.com/sgl-project/sglang/tree/main/test/README.md).
+
+## Write documentations
+
+We recommend new contributors start from writing documentation, which helps you quickly understand SGLang codebase.
+For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md).
+
+## Test the accuracy
+If your code changes the model output, please run the accuracy tests. A quick sanity check is the few-shot GSM8K.
+
+```
+# Launch a server
+python3 -m sglang.launch_server --model Qwen/Qwen2-7B-Instruct
+
+# Evaluate
+python3 -m sglang.test.few_shot_gsm8k --num-questions 200
+```
+
+Please note that the above script is primarily a sanity check, not a rigorous accuracy or speed test.
+This test can have significant variance (1%–5%) in accuracy due to batching and the non-deterministic nature of the inference engine.
+Also, do not rely on the "Latency/Output throughput" from this script, as it is not a proper speed test.
+
+GSM8K is too easy for state-of-the-art models nowadays. Please try your own more challenging accuracy tests.
+You can find additional accuracy eval examples in:
+- [test_eval_accuracy_large.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_eval_accuracy_large.py)
+- [test_gpt_oss_1gpu.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_gpt_oss_1gpu.py)
+
+## Benchmark the speed
+Refer to [Benchmark and Profiling](../developer_guide/benchmark_and_profiling.md).
+
+## Requesting a review for merge
+You can follow the pull request merge process described in [MAINTAINER.md](https://github.com/sgl-project/sglang/blob/main/.github/MAINTAINER.md).
+You will need to work with the Merge Oncall, Codeowner, and other reviewers to get their approvals.
+Then your PR can be merged.
+
+## How to Trigger CI Tests
+
+We have a lot of open PRs but limited CI machines, so only top and trusted contributors have permission to trigger CI tests.
+Users with permission are listed in the [CI_PERMISSIONS.json](https://github.com/sgl-project/sglang/blob/main/.github/CI_PERMISSIONS.json).
+
+**PR authors** can always use `/rerun-failed-ci` on their own PRs, even if they are not listed in `CI_PERMISSIONS.json`.
+
+For CI to run on a pull request, it must have the "run-ci" label. Authorized users can add the label or rerun failed tests by commenting on the PR with one of these commands:
+
+- `/tag-run-ci-label`: Adds the "run-ci" label. Every future commit will trigger CI.
+- `/rerun-failed-ci`: Reruns the failed or flaky tests from the most recent commit.
+- `/tag-and-rerun-ci`: A single command that performs both `/tag-run-ci-label` and `/rerun-failed-ci`.
+- `/rerun-stage <stage-name>`: Reruns a specific test stage without waiting for its dependencies. This is useful when you want to quickly validate a fix for a specific test failure instead of waiting ~30 minutes for preceding stages to complete.
+
+If you have permission, the [Slash Command Handler](https://github.com/sgl-project/sglang/actions/workflows/slash-command-handler.yml) will run your command and react with a 👍 to your comment. It may take up to a few minutes for the reaction to appear. Here’s a usage [example](https://github.com/sgl-project/sglang/pull/14253#issuecomment-3599509302).
+
+To avoid spamming a PR with too many `/rerun-failed-ci` comments, you can also trigger the command by editing an existing comment and adding any suffix (e.g., `/rerun-failed-ci try again`).
+
+Example of rerunning a single test stage: `/rerun-stage unit-test-backend-4-gpu`.
+
+If you don’t have permission and you’re not the PR author, please ask maintainers to trigger CI for you.
+
+### CI rate limits
+
+Due to CI scheduling and limited resources, higher-priority PRs may preempt running jobs. In such cases, you may need to rerun the tests.
+
+We apply CI rate limits to prevent abuse and ensure fair usage of our CI resources.
+
+Each CI workflow has a default limit defined in its workflow configuration file. For example, in [pr-gate.yml](https://github.com/sgl-project/sglang/blob/main/.github/workflows/pr-gate.yml), the default cooldown period is 120 minutes, and each workflow can override it via the `cool-down-minutes` input parameter:
+
+```yaml
+cool-down-minutes:
+ description: "Default cooldown period in minutes; 0 disables rate limiting"
+ type: number
+ default: 120
+```
+
+Users listed in [CI_PERMISSIONS.json](https://github.com/sgl-project/sglang/blob/main/.github/CI_PERMISSIONS.json) may have a per-user cooldown interval. In practice, we use the minimum of the workflow’s default window and the user-specific interval.
+
+
+## Code style guidance
+- Avoid code duplication. If the same code snippet (more than five lines) appears multiple times, extract it into a shared function.
+- Minimize device synchronization. Reduce expensive CPU-GPU synchronization operations, such as `tensor.item()` or `tensor.cpu()`, whenever possible. Use vectorized code.
+- Prioritize extreme efficiency. SGLang is a runtime, and most of your code runs on the critical path for every request. Optimize all minor overheads as much as possible, especially in the model forward code.
+ - A common pattern is some runtime checks in the model forward pass (e.g., [this](https://github.com/sgl-project/sglang/blob/f1b0eda55c2c4838e8ab90a0fac7fb1e3d7064ab/python/sglang/srt/models/deepseek_v2.py#L486-L491)). These are very likely the same for every layer. Please cache the result as a single boolean value whenever possible.
+- Make functions as pure as possible. Avoid in-place modification of arguments.
+- Keep files concise. If a file exceeds 2,000 lines of code, split it into multiple smaller files. (e.g., `scheduler.py`, `scheduler_output_processor_mixin.py`)
+- Keep tests fast.
+  - If a single test file runs longer than 500 seconds, split it into multiple smaller files (e.g., `test_eagle_infer_a.py`, `test_eagle_infer_b.py`).
+ - If a single job in a github workflow runs longer than 30 mins, split it into smaller jobs/steps.
+ - Reuse server launches in your unit tests to make tests run faster.
+- When supporting new hardware or features, follow these guidelines:
+ - Do not drastically change existing code.
+ - Always prefer new files to introduce specific components for your new hardware (e.g., `allocator_ascend.py`).
+ - If you write multiple if/else blocks for new features, ensure the common path (e.g., NVIDIA hardware or the existing code path) is the first branch.
+
+## How to update sgl-kernel
+Since sglang and sgl-kernel are separate Python packages, our current GitHub CI infrastructure does not support updating a kernel and using it immediately within the same pull request (PR).
+To add a new kernel or modify an existing one in the sgl-kernel package, you must use multiple PRs.
+
+Follow these steps:
+
+1. Submit a PR to update the sgl-kernel source code without using it in sglang python package (e.g., [#8884](https://github.com/sgl-project/sglang/pull/8884/files)).
+2. Bump the version of sgl-kernel (e.g., [#9220](https://github.com/sgl-project/sglang/pull/9220/files)).
+ - Once merged, this will trigger an automatic release of the sgl-kernel wheel to PyPI.
+ - If not urgent, you can wait for other people to release the wheel. A new version will typically be released within one week.
+3. Apply the changes:
+ - Update the sgl-kernel version in `sglang/python/pyproject.toml` to use the modified kernels.
+ - Update the related caller code in sglang to use the new kernel.
+
+## Tips for newcomers
+
+If you want to contribute but don’t have a specific idea in mind, pick issues labeled [“good first issue” or “help wanted”](https://github.com/sgl-project/sglang/issues?q=is%3Aissue+label%3A%22good+first+issue%22%2C%22help+wanted%22). These tasks typically have lower complexity and provide an excellent introduction to the codebase. Also check out this [code walk-through](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/sglang/code-walk-through) for a deeper look into SGLang’s workflow.
+
+If you have any questions or want to start a discussion, please feel free to ask in our [Slack channel](https://slack.sglang.io).
+
+Thank you for your interest in SGLang. Happy coding!
diff --git a/sglang/docs/developer_guide/development_guide_using_docker.md b/sglang/docs/developer_guide/development_guide_using_docker.md
new file mode 100644
index 0000000000000000000000000000000000000000..a833011c62b1a34f9bca30be9eeea40505eedebf
--- /dev/null
+++ b/sglang/docs/developer_guide/development_guide_using_docker.md
@@ -0,0 +1,108 @@
+# Development Guide Using Docker
+
+## Setup VSCode on a Remote Host
+(Optional - you can skip this step if you plan to run sglang dev container locally)
+
+1. In the remote host, download the `code` CLI from [code.visualstudio.com/download](https://code.visualstudio.com/download) and run `code tunnel` in a shell.
+
+Example
+```bash
+wget https://vscode.download.prss.microsoft.com/dbazure/download/stable/fabdb6a30b49f79a7aba0f2ad9df9b399473380f/vscode_cli_alpine_x64_cli.tar.gz
+tar xf vscode_cli_alpine_x64_cli.tar.gz
+
+# https://code.visualstudio.com/docs/remote/tunnels
+./code tunnel
+```
+
+2. In your local machine, press F1 in VSCode and choose "Remote Tunnels: Connect to Tunnel".
+
+## Setup Docker Container
+
+### Option 1. Use the default dev container automatically from VSCode
+There is a `.devcontainer` folder in the sglang repository root folder to allow VSCode to automatically start up within dev container. You can read more about this VSCode extension in VSCode official document [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers).
+
+(*Figure 1: Diagram from VSCode official documentation [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers).*)
+
+To enable this, you only need to:
+1. Start Visual Studio Code and install [VSCode dev container extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers).
+2. Press F1, type and choose "Dev Container: Open Folder in Container".
+3. Input the `sglang` local repo path in your machine and press enter.
+
+The first time you open it in dev container might take longer due to docker pull and build. Once it's successful, you should see an indicator on your status bar at the bottom left displaying that you are in a dev container:
+
+
+
+Now when you run `sglang.launch_server` in the VSCode terminal or start debugging using F5, sglang server will be started in the dev container with all your local changes applied automatically:
+
+
+
+
+### Option 2. Start up containers manually (advanced)
+
+The following startup command is an example for internal development by the SGLang team. You can **modify or add directory mappings as needed**, especially for model weight downloads, to prevent repeated downloads by different Docker containers.
+
+❗️ **Note on RDMA**
+
+ 1. `--network host` and `--privileged` are required by RDMA. If you don't need RDMA, you can remove them but keeping them there does not harm. Thus, we enable these two flags by default in the commands below.
+ 2. You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`.
+
+```bash
+# Change the name to yours
+docker run -itd --shm-size 32g --gpus all --ipc=host --network=host --privileged --name sglang_dev lmsysorg/sglang:dev /bin/zsh
+docker exec -it sglang_dev /bin/zsh
+```
+Some useful volumes to mount are:
+1. **Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`.
+2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer.
+
+Example 1: Mounting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer.
+```bash
+docker run -itd --shm-size 32g --gpus all -v /opt/dlami/nvme/.cache:/root/.cache --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
+docker exec -it sglang_zhyncs /bin/zsh
+```
+Example 2: Mounting both HuggingFace cache and local SGLang repo. Local code changes are automatically synced to the devcontainer as the SGLang is installed in editable mode in the dev image.
+```bash
+docker run -itd --shm-size 32g --gpus all -v $HOME/.cache/huggingface/:/root/.cache/huggingface -v $HOME/src/sglang:/sgl-workspace/sglang --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
+docker exec -it sglang_zhyncs /bin/zsh
+```
+## Debug SGLang with VSCode Debugger
+1. (Create if not exist) open `launch.json` in VSCode.
+2. Add the following config and save. Please note that you can edit the script as needed to apply different parameters or debug a different program (e.g. benchmark script).
+ ```JSON
+ {
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Python Debugger: launch_server",
+ "type": "debugpy",
+ "request": "launch",
+ "module": "sglang.launch_server",
+ "console": "integratedTerminal",
+ "args": [
+ "--model-path", "meta-llama/Llama-3.2-1B",
+ "--host", "0.0.0.0",
+ "--port", "30000",
+ "--trust-remote-code",
+ ],
+ "justMyCode": false
+ }
+ ]
+ }
+ ```
+
+3. Press "F5" to start. VSCode debugger will ensure that the program will pause at the breakpoints even if the program is running at remote SSH/Tunnel host + dev container.
+
+## Profile
+
+```bash
+# Change batch size, input, output and add `disable-cuda-graph` (for easier analysis)
+# e.g. DeepSeek V3
+nsys profile -o deepseek_v3 python3 -m sglang.bench_one_batch --batch-size 1 --input 128 --output 256 --model deepseek-ai/DeepSeek-V3 --trust-remote-code --tp 8 --disable-cuda-graph
+```
+
+## Evaluation
+
+```bash
+# e.g. gsm8k 8 shot
+python3 benchmark/gsm8k/bench_sglang.py --num-questions 2000 --parallel 2000 --num-shots 8
+```
diff --git a/sglang/docs/developer_guide/development_jit_kernel_guide.md b/sglang/docs/developer_guide/development_jit_kernel_guide.md
new file mode 100644
index 0000000000000000000000000000000000000000..2fb3422748e975638f6c71fdd1a95f18ad6b8ba3
--- /dev/null
+++ b/sglang/docs/developer_guide/development_jit_kernel_guide.md
@@ -0,0 +1,259 @@
+# Development Guide for JIT Kernels
+
+## Environment Setup
+
+We strongly recommend using `clangd` as the language server for JIT kernel development.
+For Ubuntu/Debian, you can download clangd from [apt.llvm.org](https://apt.llvm.org/).
+If you are using VS Code, we recommend installing the `clangd` extension for better IDE integration.
+
+All JIT-related files are located in `python/sglang/jit_kernel`.
+Unlike `sgl-kernel`, which compiles CUDA/C++ binaries ahead of time (AOT), just-in-time (JIT) kernels are compiled at runtime.
+Consequently, a static `compile_commands.json` cannot be generated.
+To enable code completion with `clangd`, run `python -m sglang.jit_kernel` to generate a `.clangd` configuration file in your current directory.
+After generating the file, restart the clangd language server. It should now recognize all JIT kernel files.
+
+## Code Structure
+
+### C++ Implementation
+
+C++ source code is located in `python/sglang/jit_kernel/csrc`.
+Reusable functions should be placed in `python/sglang/jit_kernel/include`.
+
+We use [tvm-ffi](https://github.com/apache/tvm-ffi) for efficient foreign language bindings.
+Refer to the [documentation](https://tvm.apache.org/ffi/) for advanced usage, such as exporting C++ objects.
+Typically, `tvm::ffi::TensorView` is sufficient for passing PyTorch Tensors from Python.
+
+### Python Interface
+
+Python interfaces are defined in `python/sglang/jit_kernel`.
+The `load_jit` utility function in `python/sglang/jit_kernel/utils.py` loads and returns the compiled module.
+To export a C++ function (e.g., `cpp_func`), pass `cuda_wrappers=[("func", "cpp_func")]` to `load_jit`.
+The function can then be called in Python as `module.func`.
+
+For caching compiled modules, prefer `sglang.jit_kernel.utils.cache_once` over `functools.lru_cache`.
+`functools.lru_cache` is not compatible with `torch.compile`.
+
+### C++ Utilities
+
+The following C++ utilities are available:
+
+#### Integer Range
+
+Similar to PyTorch, we provide an `irange` function to represent an integer range.
+
+```C++
+#include
+
+void test() {
+ for (auto i : host::irange(100)) { // [0, 100)
+ // do something
+ }
+ for (auto i : host::irange(0, 100)) { // [0, 100)
+ // do something
+ }
+}
+
+```
+
+#### Runtime Checking
+
+`RuntimeCheck` validates conditions at runtime. It accepts optional arguments for error reporting.
+If the check fails, these arguments are output to aid debugging.
+`RuntimeDeviceCheck` verifies the status of the last kernel launch.
+
+```C++
+#include
+#include
+
+void test() {
+ host::RuntimeCheck(1 + 1 == 2, 1 + 1, " != ", 2);
+ host::RuntimeDeviceCheck();
+ // check the provided `cudaError_t`
+ host::RuntimeDeviceCheck(cudaGetLastError());
+}
+
+```
+
+#### Tensor Checking
+
+`TensorMatcher` provides a readable way to validate and extract tensor shape information.
+
+```cpp
+#include
+
+void test(const tvm::ffi::TensorView k_cache, const tvm::ffi::TensorView v_cache) {
+ using namespace host;
+
+ auto D = SymbolicSize{"D"}; // cache dimension
+ auto N = SymbolicSize{"N"}; // kvcache stride
+ auto dtype = SymbolicDType{};
+ auto device = SymbolicDevice{};
+
+ TensorMatcher({-1, D}) //
+ .with_strides({N, 1})
+ .with_dtype(dtype)
+ .with_device(device)
+ .verify(k_cache)
+ .verify(v_cache);
+}
+```
+
+Configure the `TensorMatcher` with expected stride, dtype, and device properties before verification.
+- If `with_strides` is omitted, the tensor is expected to be contiguous.
+- Template arguments in `with_dtype` restrict the allowed data types.
+- Template arguments in `with_device` restrict the allowed devices.
+- Values passed to `with_xxx` methods enforce equality checks.
+- Passing `-1` for size or stride allows matching any value.
+
+A `Symbolic` variable must resolve to the same value across all verifications.
+Use `.unwrap()` to retrieve the matched value after verification.
+
+> Note: `TensorMatcher` is a temporary expression and should not be stored in a variable.
+
+> Tip: Add `//` at the end of the `TensorMatcher` chain to enforce proper indentation.
+
+#### Kernel Launching
+
+`LaunchKernel::resolve_device` retrieves the current `cudaStream` from PyTorch.
+Kernels can also be launched directly using `LaunchKernel`.
+
+```cpp
+#include
+
+#include
+
+__global__ void kernel() {}
+
+void test() {
+ const auto num_blocks = 1;
+ const auto num_threads = 32;
+ const auto dynamic_smem = 0;
+
+ DLDevice dev; // suppose this is initialized properly
+ host::LaunchKernel(num_blocks, num_threads, dev)(kernel);
+
+ cudaStream_t stream = host::LaunchKernel::resolve_device(dev);
+ host::LaunchKernel(num_blocks, num_threads, stream, dynamic_smem)(kernel);
+}
+
+```
+
+## Add new kernels
+
+This section walks through a complete, end-to-end example of adding a new JIT kernel to the system.
+We use a simple add_constant kernel as a running example, which adds a constant integer value to every element of an input tensor.
+
+Conceptually, the Python interface looks like this:
+
+```python
+def add_constant(src: torch.Tensor, c: int):
+ return src + c
+```
+
+### STEP 1: Write the C++ kernel
+
+Write your CUDA kernel in [jit_kernel/csrc/add_constant.cuh](../../python/sglang/jit_kernel/csrc/add_constant.cuh). For demonstration purposes, we pass the constant value as a template parameter.
+
+```cpp
+#include // For TensorMatcher, SymbolicSize, SymbolicDevice
+#include // For LaunchKernel
+#include // For div_ceil, RuntimeCheck
+
+#include
+#include
+
+#include
+#include
+
+namespace {
+
+template <int32_t kConstant>
+__global__ void add_constant_kernel(int32_t* dst, const int32_t* src, size_t length) {
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < length) {
+ dst[idx] = src[idx] + kConstant;
+ }
+}
+
+constexpr size_t kBlockSize = 256;
+
+// You can also use struct with static method as an alternative
+template <int32_t kConstant>
+void add_constant(tvm::ffi::TensorView dst, tvm::ffi::TensorView src) {
+ using namespace host;
+
+ // 1. Validate input tensors
+ SymbolicSize N = {"num_elements"};
+ SymbolicDevice device_;
+ TensorMatcher({N}) // 1D tensor, must be contiguous
+      .with_dtype<int32_t>()    // must be int32
+ .with_device(device_) // must be on CUDA device
+ .verify(dst) // check tensor dst
+ .verify(src); // check tensor src
+
+ // 2. Extract required parameters, prepare for kernel launch
+ const size_t num_elements = N.unwrap();
+ const size_t grid_size = div_ceil(num_elements, kBlockSize);
+ const DLDevice device = device_.unwrap();
+ // some extra runtime checks using host::RuntimeCheck
+ RuntimeCheck(num_elements > 0, "We only support non-empty tensors, got num_elements = ", num_elements);
+
+ // 3. Launch the kernel. Error code will be automatically checked.
+ LaunchKernel(grid_size, kBlockSize, device /*, dynamic_smem*/)(
+ // kernel function
+      add_constant_kernel<kConstant>,
+ // kernel arguments
+      static_cast<int32_t*>(dst.data_ptr()),
+      static_cast<const int32_t*>(src.data_ptr()),
+ num_elements);
+}
+
+} // namespace
+
+```
+
+### STEP 2: Create Python Interfaces
+
+Next, expose the kernel through a Python wrapper.
+Create a new file at [jit_kernel/add_constant.py](../../python/sglang/jit_kernel/add_constant.py) and expose the needed interfaces.
+
+```python
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.jit_kernel.utils import cache_once, load_jit, make_cpp_args
+
+if TYPE_CHECKING:
+ from tvm_ffi.module import Module
+
+
+@cache_once
+def _jit_add_constant_module(constant: int) -> Module:
+    args = make_cpp_args(constant)  # pass all the template arguments
+ return load_jit(
+ "add_constant",
+ *args,
+ cuda_files=["add_constant.cuh"],
+ cuda_wrappers=[("add_constant", f"add_constant<{args}>")],
+ )
+
+
+def add_constant(src: torch.Tensor, constant: int) -> torch.Tensor:
+ dst = torch.empty_like(src)
+ module = _jit_add_constant_module(constant)
+ module.add_constant(dst, src)
+ return dst
+
+```
+
+### STEP 3: Use your kernel
+
+Finally, import and use the kernel like a regular Python function:
+
+```python
+from sglang.jit_kernel.add_constant import add_constant
+```
+
+For a complete, runnable example, refer to [test_add_constant.py](../../python/sglang/jit_kernel/tests/test_add_constant.py).
diff --git a/sglang/docs/developer_guide/evaluating_new_models.md b/sglang/docs/developer_guide/evaluating_new_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..f3126c9a0d88190381820d911cdbe72037f83a3b
--- /dev/null
+++ b/sglang/docs/developer_guide/evaluating_new_models.md
@@ -0,0 +1,146 @@
+# Evaluating New Models with SGLang
+
+This document provides commands for evaluating models' accuracy and performance. Before open-sourcing new models, we strongly suggest running these commands to verify whether the score matches your internal benchmark results.
+
+**For cross verification, please submit commands for installation, server launching, and benchmark running with all the scores and hardware requirements when open-sourcing your models.**
+
+[Reference: MiniMax M2](https://github.com/sgl-project/sglang/pull/12129)
+
+## Accuracy
+
+### LLMs
+
+SGLang provides built-in scripts to evaluate common benchmarks.
+
+**MMLU**
+
+```bash
+python -m sglang.test.run_eval \
+ --eval-name mmlu \
+ --port 30000 \
+ --num-examples 1000 \
+ --max-tokens 8192
+```
+
+**GSM8K**
+
+```bash
+python -m sglang.test.few_shot_gsm8k \
+ --host 127.0.0.1 \
+ --port 30000 \
+ --num-questions 200 \
+ --num-shots 5
+```
+
+**HellaSwag**
+
+```bash
+python benchmark/hellaswag/bench_sglang.py \
+ --host 127.0.0.1 \
+ --port 30000 \
+ --num-questions 200 \
+ --num-shots 20
+```
+
+**GPQA**
+
+```bash
+python -m sglang.test.run_eval \
+ --eval-name gpqa \
+ --port 30000 \
+ --num-examples 198 \
+ --max-tokens 120000 \
+ --repeat 8
+```
+
+```{tip}
+For reasoning models, add `--thinking-mode {MODE}` (e.g., `qwen3`, `deepseek-v3`). You may skip it if the model has forced thinking enabled.
+```
+
+**HumanEval**
+
+```bash
+pip install human_eval
+
+python -m sglang.test.run_eval \
+ --eval-name humaneval \
+ --num-examples 10 \
+ --port 30000
+```
+
+### VLMs
+
+**MMMU**
+
+```bash
+python benchmark/mmmu/bench_sglang.py \
+ --port 30000 \
+ --concurrency 64
+```
+
+```{tip}
+You can set max tokens by passing `--extra-request-body '{"max_tokens": 4096}'`.
+```
+
+For models capable of processing video, we recommend extending the evaluation to include `VideoMME`, `MVBench`, and other relevant benchmarks.
+
+## Performance
+
+Performance benchmarks measure **Latency** (Time To First Token - TTFT) and **Throughput** (tokens/second).
+
+### LLMs
+
+**Latency-Sensitive Benchmark**
+
+This simulates a scenario with low concurrency (e.g., single user) to measure latency.
+
+```bash
+python -m sglang.bench_serving \
+ --backend sglang \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --dataset-name random \
+ --num-prompts 10 \
+ --max-concurrency 1
+```
+
+**Throughput-Sensitive Benchmark**
+
+This simulates a high-traffic scenario to measure maximum system throughput.
+
+```bash
+python -m sglang.bench_serving \
+ --backend sglang \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --dataset-name random \
+ --num-prompts 1000 \
+ --max-concurrency 100
+```
+
+**Single Batch Performance**
+
+You can also benchmark the performance of processing a single batch offline.
+
+```bash
+python -m sglang.bench_one_batch_server \
+    --model {MODEL_PATH} \
+ --batch-size 8 \
+ --input-len 1024 \
+ --output-len 1024
+```
+
+You can run more granular benchmarks:
+
+- **Low Concurrency**: `--num-prompts 10 --max-concurrency 1`
+- **Medium Concurrency**: `--num-prompts 80 --max-concurrency 16`
+- **High Concurrency**: `--num-prompts 500 --max-concurrency 100`
+
+## Reporting Results
+
+For each evaluation, please report:
+
+1. **Metric Score**: Accuracy % (LLMs and VLMs); Latency (ms) and Throughput (tok/s) (LLMs only).
+2. **Environment settings**: GPU type/count, SGLang commit hash.
+3. **Launch configuration**: Model path, TP size, and any special flags.
+4. **Evaluation parameters**: Number of shots, examples, max tokens.
diff --git a/sglang/docs/developer_guide/release_process.md b/sglang/docs/developer_guide/release_process.md
new file mode 100644
index 0000000000000000000000000000000000000000..e817a256e4d21ba5b28c3ac58e6bd97ec23ba2ac
--- /dev/null
+++ b/sglang/docs/developer_guide/release_process.md
@@ -0,0 +1,18 @@
+# PyPI Package Release Process
+
+## Update the version in code
+Update the package version in `python/pyproject.toml` and `python/sglang/__init__.py`.
+
+## Upload the PyPI package
+
+```
+pip install build twine
+```
+
+```
+cd python
+bash upload_pypi.sh
+```
+
+## Make a release in GitHub
+Make a new release https://github.com/sgl-project/sglang/releases/new.
diff --git a/sglang/docs/developer_guide/setup_github_runner.md b/sglang/docs/developer_guide/setup_github_runner.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a0e92cdc4ecc229f0ca36e4fd93b2f3708890f3
--- /dev/null
+++ b/sglang/docs/developer_guide/setup_github_runner.md
@@ -0,0 +1,51 @@
+# Set Up Self-Hosted Runners for GitHub Actions
+
+## Add a Runner
+
+### Step 1: Start a docker container.
+
+**You can mount a folder for the shared huggingface model weights cache.**
+The command below uses `/tmp/huggingface` as an example.
+
+```
+docker pull nvidia/cuda:12.9.1-devel-ubuntu22.04
+# Nvidia
+docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.9.1-devel-ubuntu22.04 /bin/bash
+# AMD
+docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.5.8-rocm700-mi30x /bin/bash
+# AMD just the last 2 GPUs
+docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.5.8-rocm700-mi30x /bin/bash
+```
+
+### Step 2: Configure the runner by `config.sh`
+
+Run these commands inside the container.
+
+```
+apt update && apt install -y curl python3-pip git
+pip install --upgrade pip
+export RUNNER_ALLOW_RUNASROOT=1
+```
+
+Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?arch=x64&os=linux to run `config.sh`
+
+**Notes**
+- Do not need to specify the runner group
+- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be edited later in Github Settings.
+- Do not need to change the work folder.
+
+### Step 3: Run the runner by `run.sh`
+
+- Set up environment variables
+```
+export HF_HOME=/hf_home
+export SGLANG_IS_IN_CI=true
+export HF_TOKEN=hf_xxx
+export OPENAI_API_KEY=sk-xxx
+export CUDA_VISIBLE_DEVICES=0
+```
+
+- Run it forever
+```
+while true; do ./run.sh; echo "Restarting..."; sleep 2; done
+```
diff --git a/sglang/docs/diffusion/api/cli.md b/sglang/docs/diffusion/api/cli.md
new file mode 100644
index 0000000000000000000000000000000000000000..f0f8d7da457474f4dcda69bcb1b93a22e8ecc8f3
--- /dev/null
+++ b/sglang/docs/diffusion/api/cli.md
@@ -0,0 +1,332 @@
+# SGLang diffusion CLI Inference
+
+The SGLang-diffusion CLI provides a quick way to access the inference pipeline for image and video generation.
+
+## Prerequisites
+
+- A working SGLang diffusion installation and the `sglang` CLI available in `$PATH`.
+
+
+## Supported Arguments
+
+### Server Arguments
+
+- `--model-path {MODEL_PATH}`: Path to the model or model ID
+- `--lora-path {LORA_PATH}`: Path to a LoRA adapter (local path or HuggingFace model ID). If not specified, LoRA will not be applied.
+- `--lora-nickname {NAME}`: Nickname for the LoRA adapter. (default: `default`).
+- `--num-gpus {NUM_GPUS}`: Number of GPUs to use
+- `--tp-size {TP_SIZE}`: Tensor parallelism size (only for the encoder; should not be larger than 1 if text encoder offload is enabled, as layer-wise offload plus prefetch is faster)
+- `--sp-degree {SP_SIZE}`: Sequence parallelism size (typically should match the number of GPUs)
+- `--ulysses-degree {ULYSSES_DEGREE}`: The degree of DeepSpeed-Ulysses-style SP in USP
+- `--ring-degree {RING_DEGREE}`: The degree of ring attention-style SP in USP
+- `--attention-backend {BACKEND}`: Attention backend to use. For SGLang-native pipelines use `fa`, `torch_sdpa`, `sage_attn`, etc. For diffusers pipelines use diffusers backend names like `flash`, `_flash_3_hub`, `sage`, `xformers`.
+- `--attention-backend-config {CONFIG}`: Configuration for the attention backend. Can be a JSON string (e.g., '{"k": "v"}'), a path to a JSON/YAML file, or key=value pairs (e.g., "k=v,k2=v2").
+- `--cache-dit-config {PATH}`: Path to a Cache-DiT YAML/JSON config (diffusers backend only)
+- `--dit-precision {DTYPE}`: Precision for the DiT model (currently supports fp32, fp16, and bf16).
+
+
+### Sampling Parameters
+
+- `--prompt {PROMPT}`: Text description for the video you want to generate
+- `--num-inference-steps {STEPS}`: Number of denoising steps
+- `--negative-prompt {PROMPT}`: Negative prompt to guide generation away from certain concepts
+- `--seed {SEED}`: Random seed for reproducible generation
+
+
+**Image/Video Configuration**
+
+- `--height {HEIGHT}`: Height of the generated output
+- `--width {WIDTH}`: Width of the generated output
+- `--num-frames {NUM_FRAMES}`: Number of frames to generate
+- `--fps {FPS}`: Frames per second for the saved output, if this is a video-generation task
+
+
+**Frame Interpolation** (video only)
+
+Frame interpolation is a post-processing step that synthesizes new frames
+between each pair of consecutive generated frames, producing smoother
+motion without re-running the diffusion model. The `--frame-interpolation-exp`
+flag controls how many rounds of interpolation to apply: each round inserts one
+new frame into every gap between adjacent frames, so the output frame count
+follows the formula **(N − 1) × 2^exp + 1** (e.g. 5 original frames with
+`exp=1` → 4 gaps × 1 new frame + 5 originals = **9** frames; with `exp=2` →
+**17** frames).
+
+- `--enable-frame-interpolation`: Enable frame interpolation. Model weights are downloaded automatically on first use.
+- `--frame-interpolation-exp {EXP}`: Interpolation exponent — `1` = 2× temporal resolution, `2` = 4×, etc. (default: `1`)
+- `--frame-interpolation-scale {SCALE}`: RIFE inference scale; use `0.5` for high-resolution inputs to save memory (default: `1.0`)
+- `--frame-interpolation-model-path {PATH}`: Local directory or HuggingFace repo ID containing RIFE `flownet.pkl` weights (default: `elfgum/RIFE-4.22.lite`, downloaded automatically)
+
+Example — generate a 5-frame video and interpolate to 9 frames ((5 − 1) × 2¹ + 1 = 9):
+
+```bash
+sglang generate \
+ --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \
+ --prompt "A dog running through a park" \
+ --num-frames 5 \
+ --enable-frame-interpolation \
+ --frame-interpolation-exp 1 \
+ --save-output
+```
+
+**Output Options**
+
+- `--output-path {PATH}`: Directory to save the generated video
+- `--save-output`: Whether to save the image/video to disk
+- `--return-frames`: Whether to return the raw frames
+
+### Using Configuration Files
+
+Instead of specifying all parameters on the command line, you can use a configuration file:
+
+```bash
+sglang generate --config {CONFIG_FILE_PATH}
+```
+
+The configuration file should be in JSON or YAML format with the same parameter names as the CLI options. Command-line arguments take precedence over settings in the configuration file, allowing you to override specific values while keeping the rest from the configuration file.
+
+Example configuration file (config.json):
+
+```json
+{
+ "model_path": "FastVideo/FastHunyuan-diffusers",
+ "prompt": "A beautiful woman in a red dress walking down a street",
+ "output_path": "outputs/",
+ "num_gpus": 2,
+ "sp_size": 2,
+ "tp_size": 1,
+ "num_frames": 45,
+ "height": 720,
+ "width": 1280,
+ "num_inference_steps": 6,
+ "seed": 1024,
+ "fps": 24,
+ "precision": "bf16",
+ "vae_precision": "fp16",
+ "vae_tiling": true,
+ "vae_sp": true,
+ "vae_config": {
+ "load_encoder": false,
+ "load_decoder": true,
+ "tile_sample_min_height": 256,
+ "tile_sample_min_width": 256
+ },
+ "text_encoder_precisions": [
+ "fp16",
+ "fp16"
+ ],
+ "mask_strategy_file_path": null,
+ "enable_torch_compile": false
+}
+```
+
+Or using YAML format (config.yaml):
+
+```yaml
+model_path: "FastVideo/FastHunyuan-diffusers"
+prompt: "A beautiful woman in a red dress walking down a street"
+output_path: "outputs/"
+num_gpus: 2
+sp_size: 2
+tp_size: 1
+num_frames: 45
+height: 720
+width: 1280
+num_inference_steps: 6
+seed: 1024
+fps: 24
+precision: "bf16"
+vae_precision: "fp16"
+vae_tiling: true
+vae_sp: true
+vae_config:
+ load_encoder: false
+ load_decoder: true
+ tile_sample_min_height: 256
+ tile_sample_min_width: 256
+text_encoder_precisions:
+ - "fp16"
+ - "fp16"
+mask_strategy_file_path: null
+enable_torch_compile: false
+```
+
+
+To see all the options, you can use the `--help` flag:
+
+```bash
+sglang generate --help
+```
+
+## Serve
+
+Launch the SGLang diffusion HTTP server and interact with it using the OpenAI SDK and curl.
+
+### Start the server
+
+Use the following command to launch the server:
+
+```bash
+SERVER_ARGS=(
+ --model-path Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+ --text-encoder-cpu-offload
+ --pin-cpu-memory
+ --num-gpus 4
+ --ulysses-degree=2
+ --ring-degree=2
+)
+
+sglang serve "${SERVER_ARGS[@]}"
+```
+
+- **--model-path**: Which model to load. The example uses `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`.
+- **--port**: HTTP port to listen on (the default here is `30010`).
+
+For detailed API usage, including Image, Video Generation and LoRA management, please refer to the [OpenAI API Documentation](openai_api.md).
+
+### Cloud Storage Support
+
+SGLang diffusion supports automatically uploading generated images and videos to S3-compatible cloud storage (e.g., AWS S3, MinIO, Alibaba Cloud OSS, Tencent Cloud COS).
+
+When enabled, the server follows a **Generate -> Upload -> Delete** workflow:
+1. The artifact is generated to a temporary local file.
+2. The file is immediately uploaded to the configured S3 bucket in a background thread.
+3. Upon successful upload, the local file is deleted.
+4. The API response returns the public URL of the uploaded object.
+
+**Configuration**
+
+Cloud storage is enabled via environment variables. Note that `boto3` must be installed separately (`pip install boto3`) to use this feature.
+
+```bash
+# Enable S3 storage
+export SGLANG_CLOUD_STORAGE_TYPE=s3
+export SGLANG_S3_BUCKET_NAME=my-bucket
+export SGLANG_S3_ACCESS_KEY_ID=your-access-key
+export SGLANG_S3_SECRET_ACCESS_KEY=your-secret-key
+
+# Optional: Custom endpoint for MinIO/OSS/COS
+export SGLANG_S3_ENDPOINT_URL=https://minio.example.com
+```
+
+See [Environment Variables Documentation](../environment_variables.md) for more details.
+
+## Generate
+
+Run a one-off generation task without launching a persistent server.
+
+To use it, pass both server arguments and sampling parameters in one command, after the `generate` subcommand, for example:
+
+```bash
+SERVER_ARGS=(
+ --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers
+ --text-encoder-cpu-offload
+ --pin-cpu-memory
+ --num-gpus 4
+ --ulysses-degree=2
+ --ring-degree=2
+)
+
+SAMPLING_ARGS=(
+ --prompt "A curious raccoon"
+ --save-output
+ --output-path outputs
+ --output-file-name "A curious raccoon.mp4"
+)
+
+sglang generate "${SERVER_ARGS[@]}" "${SAMPLING_ARGS[@]}"
+
+# Or, users can set `SGLANG_CACHE_DIT_ENABLED` env as `true` to enable cache acceleration
+SGLANG_CACHE_DIT_ENABLED=true sglang generate "${SERVER_ARGS[@]}" "${SAMPLING_ARGS[@]}"
+```
+
+Once the generation task has finished, the server will shut down automatically.
+
+> [!NOTE]
+> The HTTP server-related arguments are ignored in this subcommand.
+
+## Component Path Overrides
+
+SGLang diffusion allows you to override any pipeline component (e.g., `vae`, `transformer`, `text_encoder`) by specifying a custom checkpoint path. This is useful for:
+
+### Example: FLUX.2-dev with Tiny AutoEncoder
+
+You can override **any** component by using `--<component>-path`, where `<component>` matches the key in the model's `model_index.json`:
+
+For example, replace the default VAE with a distilled tiny autoencoder for ~3x faster decoding:
+
+```bash
+sglang serve \
+ --model-path=black-forest-labs/FLUX.2-dev \
+ # with a Huggingface Repo ID
+ --vae-path=fal/FLUX.2-Tiny-AutoEncoder
+ # or use a local path
+ --vae-path=~/.cache/huggingface/hub/models--fal--FLUX.2-Tiny-AutoEncoder/snapshots/.../vae
+```
+
+**Important:**
+- The component key must match the one in your model's `model_index.json` (e.g., `vae`).
+- The path must:
+ - either be a Huggingface Repo ID (e.g., fal/FLUX.2-Tiny-AutoEncoder)
+ - or point to a **complete component folder**, containing `config.json` and safetensors files
+
+
+## Diffusers Backend
+
+SGLang diffusion supports a **diffusers backend** that allows you to run any diffusers-compatible model through SGLang's infrastructure using vanilla diffusers pipelines. This is useful for running models without native SGLang implementations or models with custom pipeline classes.
+
+### Arguments
+
+| Argument | Values | Description |
+|----------|--------|-------------|
+| `--backend` | `auto` (default), `sglang`, `diffusers` | `auto`: prefer native SGLang, fallback to diffusers. `sglang`: force native (fails if unavailable). `diffusers`: force vanilla diffusers pipeline. |
+| `--diffusers-attention-backend` | `flash`, `_flash_3_hub`, `sage`, `xformers`, `native` | Attention backend for diffusers pipelines. See [diffusers attention backends](https://huggingface.co/docs/diffusers/main/en/optimization/attention_backends). |
+| `--trust-remote-code` | flag | Required for models with custom pipeline classes (e.g., Ovis). |
+| `--vae-tiling` | flag | Enable VAE tiling for large image support (decodes tile-by-tile). |
+| `--vae-slicing` | flag | Enable VAE slicing for lower memory usage (decodes slice-by-slice). |
+| `--dit-precision` | `fp16`, `bf16`, `fp32` | Precision for the diffusion transformer. |
+| `--vae-precision` | `fp16`, `bf16`, `fp32` | Precision for the VAE. |
+| `--enable-torch-compile` | flag | Enable `torch.compile` for diffusers pipelines. |
+| `--cache-dit-config` | `{PATH}` | Path to a Cache-DiT YAML/JSON config file for accelerating diffusers pipelines with Cache-DiT. |
+
+### Example: Running Ovis-Image-7B
+
+[Ovis-Image-7B](https://huggingface.co/AIDC-AI/Ovis-Image-7B) is a 7B text-to-image model optimized for high-quality text rendering.
+
+```bash
+sglang generate \
+ --model-path AIDC-AI/Ovis-Image-7B \
+ --backend diffusers \
+ --trust-remote-code \
+ --diffusers-attention-backend flash \
+ --prompt "A serene Japanese garden with cherry blossoms" \
+ --height 1024 \
+ --width 1024 \
+ --num-inference-steps 30 \
+ --save-output \
+ --output-path outputs \
+ --output-file-name ovis_garden.png
+```
+
+### Extra Diffusers Arguments
+
+For pipeline-specific parameters not exposed via CLI, use `diffusers_kwargs` in a config file:
+
+```json
+{
+ "model_path": "AIDC-AI/Ovis-Image-7B",
+ "backend": "diffusers",
+ "prompt": "A beautiful landscape",
+ "diffusers_kwargs": {
+ "cross_attention_kwargs": {"scale": 0.5}
+ }
+}
+```
+
+```bash
+sglang generate --config config.json
+```
+
+### Cache-DiT Acceleration
+
+Users who use the diffusers backend can also leverage Cache-DiT acceleration and load custom cache configs from a YAML file to boost performance of diffusers pipelines. See the [Cache-DiT Acceleration](https://docs.sglang.io/diffusion/performance/cache/cache_dit.html) documentation for details.
diff --git a/sglang/docs/diffusion/api/openai_api.md b/sglang/docs/diffusion/api/openai_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..30d8d53122061d8dd476e6304423652a9e239add
--- /dev/null
+++ b/sglang/docs/diffusion/api/openai_api.md
@@ -0,0 +1,420 @@
+# SGLang Diffusion OpenAI API
+
+The SGLang diffusion HTTP server implements an OpenAI-compatible API for image and video generation, as well as LoRA adapter management.
+
+## Prerequisites
+
+- Python 3.11+ if you plan to use the OpenAI Python SDK.
+
+## Serve
+
+Launch the server using the `sglang serve` command.
+
+### Start the server
+
+```bash
+SERVER_ARGS=(
+ --model-path Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+ --text-encoder-cpu-offload
+ --pin-cpu-memory
+ --num-gpus 4
+ --ulysses-degree=2
+ --ring-degree=2
+ --port 30010
+)
+
+sglang serve "${SERVER_ARGS[@]}"
+```
+
+- **--model-path**: Path to the model or model ID.
+- **--port**: HTTP port to listen on (default: `30000`).
+
+**Get Model Information**
+
+**Endpoint:** `GET /models`
+
+Returns information about the model served by this server, including model path, task type, pipeline configuration, and precision settings.
+
+**Curl Example:**
+
+```bash
+curl -sS -X GET "http://localhost:30010/models"
+```
+
+**Response Example:**
+
+```json
+{
+ "model_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+ "task_type": "T2V",
+ "pipeline_name": "wan_pipeline",
+ "pipeline_class": "WanPipeline",
+ "num_gpus": 4,
+ "dit_precision": "bf16",
+ "vae_precision": "fp16"
+}
+```
+
+---
+
+## Endpoints
+
+### Image Generation
+
+The server implements an OpenAI-compatible Images API under the `/v1/images` namespace.
+
+**Create an image**
+
+**Endpoint:** `POST /v1/images/generations`
+
+**Python Example (b64_json response):**
+
+```python
+import base64
+from openai import OpenAI
+
+client = OpenAI(api_key="sk-proj-1234567890", base_url="http://localhost:30010/v1")
+
+img = client.images.generate(
+ prompt="A calico cat playing a piano on stage",
+ size="1024x1024",
+ n=1,
+ response_format="b64_json",
+)
+
+image_bytes = base64.b64decode(img.data[0].b64_json)
+with open("output.png", "wb") as f:
+ f.write(image_bytes)
+```
+
+**Curl Example:**
+
+```bash
+curl -sS -X POST "http://localhost:30010/v1/images/generations" \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer sk-proj-1234567890" \
+ -d '{
+ "prompt": "A calico cat playing a piano on stage",
+ "size": "1024x1024",
+ "n": 1,
+ "response_format": "b64_json"
+ }'
+```
+
+> **Note**
+> If `response_format=url` is used and cloud storage is not configured, the API returns
+> a relative URL like `/v1/images/{image_id}/content`.
+
+**Edit an image**
+
+**Endpoint:** `POST /v1/images/edits`
+
+This endpoint accepts a multipart form upload with input images and a text prompt. The server can return either a base64-encoded image or a URL to download the image.
+
+**Curl Example (b64_json response):**
+
+```bash
+curl -sS -X POST "http://localhost:30010/v1/images/edits" \
+ -H "Authorization: Bearer sk-proj-1234567890" \
+ -F "image=@local_input_image.png" \
+ -F "url=image_url.jpg" \
+ -F "prompt=A calico cat playing a piano on stage" \
+ -F "size=1024x1024" \
+ -F "response_format=b64_json"
+```
+
+**Curl Example (URL response):**
+
+```bash
+curl -sS -X POST "http://localhost:30010/v1/images/edits" \
+ -H "Authorization: Bearer sk-proj-1234567890" \
+ -F "image=@local_input_image.png" \
+ -F "url=image_url.jpg" \
+ -F "prompt=A calico cat playing a piano on stage" \
+ -F "size=1024x1024" \
+ -F "response_format=url"
+```
+
+**Download image content**
+
+When `response_format=url` is used with `POST /v1/images/generations` or `POST /v1/images/edits`,
+the API returns a relative URL like `/v1/images/{image_id}/content`.
+
+**Endpoint:** `GET /v1/images/{image_id}/content`
+
+**Curl Example:**
+
+```bash
+curl -sS -L "http://localhost:30010/v1/images/{image_id}/content" \
+ -H "Authorization: Bearer sk-proj-1234567890" \
+ -o output.png
+```
+
+### Video Generation
+
+The server implements a subset of the OpenAI Videos API under the `/v1/videos` namespace.
+
+**Create a video**
+
+**Endpoint:** `POST /v1/videos`
+
+**Python Example:**
+
+```python
+from openai import OpenAI
+
+client = OpenAI(api_key="sk-proj-1234567890", base_url="http://localhost:30010/v1")
+
+video = client.videos.create(
+ prompt="A calico cat playing a piano on stage",
+ size="1280x720"
+)
+print(f"Video ID: {video.id}, Status: {video.status}")
+```
+
+**Curl Example:**
+
+```bash
+curl -sS -X POST "http://localhost:30010/v1/videos" \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer sk-proj-1234567890" \
+ -d '{
+ "prompt": "A calico cat playing a piano on stage",
+ "size": "1280x720"
+ }'
+```
+
+**List videos**
+
+**Endpoint:** `GET /v1/videos`
+
+**Python Example:**
+
+```python
+videos = client.videos.list()
+for item in videos.data:
+ print(item.id, item.status)
+```
+
+**Curl Example:**
+
+```bash
+curl -sS -X GET "http://localhost:30010/v1/videos" \
+ -H "Authorization: Bearer sk-proj-1234567890"
+```
+
+**Download video content**
+
+**Endpoint:** `GET /v1/videos/{video_id}/content`
+
+**Python Example:**
+
+```python
+import time
+
+# Poll for completion
+while True:
+ page = client.videos.list()
+ item = next((v for v in page.data if v.id == video_id), None)
+ if item and item.status == "completed":
+ break
+ time.sleep(5)
+
+# Download content
+resp = client.videos.download_content(video_id=video_id)
+with open("output.mp4", "wb") as f:
+ f.write(resp.read())
+```
+
+**Curl Example:**
+
+```bash
+curl -sS -L "http://localhost:30010/v1/videos/{video_id}/content" \
+ -H "Authorization: Bearer sk-proj-1234567890" \
+ -o output.mp4
+```
+
+---
+
+### LoRA Management
+
+The server supports dynamic loading, merging, and unmerging of LoRA adapters.
+
+**Important Notes:**
+- Mutual Exclusion: Only one LoRA can be *merged* (active) at a time
+- Switching: To switch LoRAs, you must first `unmerge` the current one, then `set` the new one
+- Caching: The server caches loaded LoRA weights in memory. Switching back to a previously loaded LoRA (same path) has little cost
+
+**Set LoRA Adapter**
+
+Loads one or more LoRA adapters and merges their weights into the model. Supports both single LoRA (backward compatible) and multiple LoRA adapters.
+
+**Endpoint:** `POST /v1/set_lora`
+
+**Parameters:**
+- `lora_nickname` (string or list of strings, required): A unique identifier for the LoRA adapter(s). Can be a single string or a list of strings for multiple LoRAs
+- `lora_path` (string or list of strings/None, optional): Path to the `.safetensors` file(s) or Hugging Face repo ID(s). Required for the first load; optional if re-activating a cached nickname. If a list, must match the length of `lora_nickname`
+- `target` (string or list of strings, optional): Which transformer(s) to apply the LoRA to. If a list, must match the length of `lora_nickname`. Valid values:
+ - `"all"` (default): Apply to all transformers
+ - `"transformer"`: Apply only to the primary transformer (high noise for Wan2.2)
+ - `"transformer_2"`: Apply only to transformer_2 (low noise for Wan2.2)
+ - `"critic"`: Apply only to the critic model
+- `strength` (float or list of floats, optional): LoRA strength for merge, default 1.0. If a list, must match the length of `lora_nickname`. Values < 1.0 reduce the effect, values > 1.0 amplify the effect
+
+**Single LoRA Example:**
+
+```bash
+curl -X POST http://localhost:30010/v1/set_lora \
+ -H "Content-Type: application/json" \
+ -d '{
+ "lora_nickname": "lora_name",
+ "lora_path": "/path/to/lora.safetensors",
+ "target": "all",
+ "strength": 0.8
+ }'
+```
+
+**Multiple LoRA Example:**
+
+```bash
+curl -X POST http://localhost:30010/v1/set_lora \
+ -H "Content-Type: application/json" \
+ -d '{
+ "lora_nickname": ["lora_1", "lora_2"],
+ "lora_path": ["/path/to/lora1.safetensors", "/path/to/lora2.safetensors"],
+ "target": ["transformer", "transformer_2"],
+ "strength": [0.8, 1.0]
+ }'
+```
+
+**Multiple LoRA with Same Target:**
+
+```bash
+curl -X POST http://localhost:30010/v1/set_lora \
+ -H "Content-Type: application/json" \
+ -d '{
+ "lora_nickname": ["style_lora", "character_lora"],
+ "lora_path": ["/path/to/style.safetensors", "/path/to/character.safetensors"],
+ "target": "all",
+ "strength": [0.7, 0.9]
+ }'
+```
+
+> [!NOTE]
+> When using multiple LoRAs:
+> - All list parameters (`lora_nickname`, `lora_path`, `target`, `strength`) must have the same length
+> - If `target` or `strength` is a single value, it will be applied to all LoRAs
+> - Multiple LoRAs applied to the same target will be merged in order
+
+
+**Merge LoRA Weights**
+
+Manually merges the currently set LoRA weights into the base model.
+
+> [!NOTE]
+> `set_lora` automatically performs a merge, so this is typically only needed if you have manually unmerged but want to re-apply the same LoRA without calling `set_lora` again.
+
+**Endpoint:** `POST /v1/merge_lora_weights`
+
+**Parameters:**
+- `target` (string, optional): Which transformer(s) to merge. One of "all" (default), "transformer", "transformer_2", "critic"
+- `strength` (float, optional): LoRA strength for merge, default 1.0. Values < 1.0 reduce the effect, values > 1.0 amplify the effect
+
+**Curl Example:**
+
+```bash
+curl -X POST http://localhost:30010/v1/merge_lora_weights \
+ -H "Content-Type: application/json" \
+ -d '{"strength": 0.8}'
+```
+
+
+**Unmerge LoRA Weights**
+
+Unmerges the currently active LoRA weights from the base model, restoring it to its original state. This **must** be called before setting a different LoRA.
+
+**Endpoint:** `POST /v1/unmerge_lora_weights`
+
+**Curl Example:**
+
+```bash
+curl -X POST http://localhost:30010/v1/unmerge_lora_weights \
+ -H "Content-Type: application/json"
+```
+
+**List LoRA Adapters**
+
+Returns loaded LoRA adapters and current application status per module.
+
+**Endpoint:** `GET /v1/list_loras`
+
+**Curl Example:**
+
+```bash
+curl -sS -X GET "http://localhost:30010/v1/list_loras"
+```
+
+**Response Example:**
+
+```json
+{
+ "loaded_adapters": [
+ { "nickname": "lora_a", "path": "/weights/lora_a.safetensors" },
+ { "nickname": "lora_b", "path": "/weights/lora_b.safetensors" }
+ ],
+ "active": {
+ "transformer": [
+ {
+ "nickname": "lora2",
+ "path": "tarn59/pixel_art_style_lora_z_image_turbo",
+ "merged": true,
+ "strength": 1.0
+ }
+ ]
+ }
+}
+```
+
+Notes:
+- If LoRA is not enabled for the current pipeline, the server will return an error.
+- `num_lora_layers_with_weights` counts only layers that have LoRA weights applied for the active adapter.
+
+### Example: Switching LoRAs
+
+1. Set LoRA A:
+ ```bash
+ curl -X POST http://localhost:30010/v1/set_lora -d '{"lora_nickname": "lora_a", "lora_path": "path/to/A"}'
+ ```
+2. Generate with LoRA A...
+3. Unmerge LoRA A:
+ ```bash
+ curl -X POST http://localhost:30010/v1/unmerge_lora_weights
+ ```
+4. Set LoRA B:
+ ```bash
+ curl -X POST http://localhost:30010/v1/set_lora -d '{"lora_nickname": "lora_b", "lora_path": "path/to/B"}'
+ ```
+5. Generate with LoRA B...
+
+### Adjust Output Quality
+
+The server supports adjusting output quality and compression levels for both image and video generation through the `output-quality` and `output-compression` parameters.
+
+#### Parameters
+
+- **`output-quality`** (string, optional): Preset quality level that automatically sets compression. **Default is `"default"`**. Valid values:
+ - `"maximum"`: Highest quality (100)
+ - `"high"`: High quality (90)
+ - `"medium"`: Medium quality (55)
+ - `"low"`: Lower quality (35)
+ - `"default"`: Auto-adjust based on media type (50 for video, 75 for image)
+
+- **`output-compression`** (integer, optional): Direct compression level override (0-100). **Default is `None`**. When provided (not `None`), takes precedence over `output-quality`.
+ - `0`: Lowest quality, smallest file size
+ - `100`: Highest quality, largest file size
+
+#### Notes
+
+- **Precedence**: When both `output-quality` and `output-compression` are provided, `output-compression` takes precedence
+- **Format Support**: Quality settings apply to JPEG and video formats. PNG uses lossless compression and ignores these settings
+- **File Size vs Quality**: Lower compression values (or "low" quality preset) produce smaller files but may show visible artifacts
diff --git a/sglang/docs/diffusion/ci_perf.md b/sglang/docs/diffusion/ci_perf.md
new file mode 100644
index 0000000000000000000000000000000000000000..088c5be563bc7486f71bceed30516f32112d2731
--- /dev/null
+++ b/sglang/docs/diffusion/ci_perf.md
@@ -0,0 +1,29 @@
+## Perf Baseline Generation Script
+
+`python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py` starts a local diffusion server, issues requests for selected test cases, aggregates stage/denoise-step/E2E timings from the perf log, and writes the results back to the `scenarios` section of `perf_baselines.json`.
+
+### Usage
+
+Update a single case:
+
+```bash
+python python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py --case qwen_image_t2i
+```
+
+Select by regex:
+
+```bash
+python python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py --match 'qwen_image_.*'
+```
+
+Run all keys from the baseline file `scenarios`:
+
+```bash
+python python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py --all-from-baseline
+```
+
+Specify input/output paths and timeout:
+
+```bash
+python python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py --baseline python/sglang/multimodal_gen/test/server/perf_baselines.json --out /tmp/perf_baselines.json --timeout 600
+```
diff --git a/sglang/docs/diffusion/compatibility_matrix.md b/sglang/docs/diffusion/compatibility_matrix.md
new file mode 100644
index 0000000000000000000000000000000000000000..392f3d9b98fc53e099d644565eed17dba9e86666
--- /dev/null
+++ b/sglang/docs/diffusion/compatibility_matrix.md
@@ -0,0 +1,78 @@
+# Compatibility Matrix
+
+The table below shows every supported model and the optimizations supported for them.
+
+The symbols used have the following meanings:
+
+- ✅ = Full compatibility
+- ❌ = No compatibility
+- ⭕ = Does not apply to this model
+
+## Models x Optimization
+
+The `HuggingFace Model ID` can be passed directly to `from_pretrained()` methods, and sglang-diffusion will use the
+optimal default parameters when initializing and generating videos.
+
+### Video Generation Models
+
+| Model Name | Hugging Face Model ID | Resolutions | TeaCache | Sliding Tile Attn | Sage Attn | Video Sparse Attention (VSA) | Sparse Linear Attention (SLA) | Sage Sparse Linear Attention (SageSLA) | Sparse Video Gen 2 (SVG2) |
+|:-----------------------------|:--------------------------------------------------|:--------------------|:--------:|:-----------------:|:---------:|:----------------------------:|:----------------------------:|:-----------------------------------------------:|:----------------------------------:|
+| FastWan2.1 T2V 1.3B | `FastVideo/FastWan2.1-T2V-1.3B-Diffusers` | 480p | ⭕ | ⭕ | ⭕ | ✅ | ❌ | ❌ | ❌ |
+| FastWan2.2 TI2V 5B Full Attn | `FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers` | 720p | ⭕ | ⭕ | ⭕ | ✅ | ❌ | ❌ | ❌ |
+| Wan2.2 TI2V 5B | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | 720p | ⭕ | ⭕ | ✅ | ⭕ | ❌ | ❌ | ❌ |
+| Wan2.2 T2V A14B              | `Wan-AI/Wan2.2-T2V-A14B-Diffusers`                | 480p<br>720p        | ❌ | ❌ | ✅ | ⭕ | ❌ | ❌ | ❌ |
+| Wan2.2 I2V A14B              | `Wan-AI/Wan2.2-I2V-A14B-Diffusers`                | 480p<br>720p        | ❌ | ❌ | ✅ | ⭕ | ❌ | ❌ | ❌ |
+| HunyuanVideo                 | `hunyuanvideo-community/HunyuanVideo`             | 720×1280<br>544×960 | ❌ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
+| FastHunyuan                  | `FastVideo/FastHunyuan-diffusers`                 | 720×1280<br>544×960 | ❌ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
+| Wan2.1 T2V 1.3B | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | 480p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
+| Wan2.1 T2V 14B | `Wan-AI/Wan2.1-T2V-14B-Diffusers` | 480p, 720p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
+| Wan2.1 I2V 480P | `Wan-AI/Wan2.1-I2V-14B-480P-Diffusers` | 480p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
+| Wan2.1 I2V 720P | `Wan-AI/Wan2.1-I2V-14B-720P-Diffusers` | 720p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
+| TurboWan2.1 T2V 1.3B | `IPostYellow/TurboWan2.1-T2V-1.3B-Diffusers` | 480p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ |
+| TurboWan2.1 T2V 14B | `IPostYellow/TurboWan2.1-T2V-14B-Diffusers` | 480p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ |
+| TurboWan2.1 T2V 14B 720P | `IPostYellow/TurboWan2.1-T2V-14B-720P-Diffusers` | 720p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ |
+| TurboWan2.2 I2V A14B | `IPostYellow/TurboWan2.2-I2V-A14B-Diffusers` | 720p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ |
+
+**Note**:
+1. Wan2.2 TI2V 5B has some quality issues when performing I2V generation. We are working on fixing this issue.
+2. SageSLA is based on SpargeAttn. Install it first with `pip install git+https://github.com/thu-ml/SpargeAttn.git --no-build-isolation`.
+
+### Image Generation Models
+
+| Model Name | HuggingFace Model ID | Resolutions |
+|:-----------------|:----------------------------------------|:---------------|
+| FLUX.1-dev | `black-forest-labs/FLUX.1-dev` | Any resolution |
+| FLUX.2-dev | `black-forest-labs/FLUX.2-dev` | Any resolution |
+| FLUX.2-Klein | `black-forest-labs/FLUX.2-klein-4B` | Any resolution |
+| Z-Image-Turbo | `Tongyi-MAI/Z-Image-Turbo` | Any resolution |
+| GLM-Image | `zai-org/GLM-Image` | Any resolution |
+| Qwen Image | `Qwen/Qwen-Image` | Any resolution |
+| Qwen Image 2512 | `Qwen/Qwen-Image-2512` | Any resolution |
+| Qwen Image Edit | `Qwen/Qwen-Image-Edit` | Any resolution |
+
+## Verified LoRA Examples
+
+This section lists example LoRAs that have been explicitly tested and verified with each base model in the **SGLang Diffusion** pipeline.
+
+> Important:
+> LoRAs that are not listed here are not necessarily incompatible.
+> In practice, most standard LoRAs are expected to work, especially those following common Diffusers or SD-style conventions.
+> The entries below simply reflect configurations that have been manually validated by the SGLang team.
+
+### Verified LoRAs by Base Model
+
+| Base Model | Supported LoRAs |
+|:-----------------|:----------------|
+| Wan2.2           | `lightx2v/Wan2.2-Distill-Loras`<br>`Cseti/wan2.2-14B-Arcane_Jinx-lora-v1` |
+| Wan2.1           | `lightx2v/Wan2.1-Distill-Loras` |
+| Z-Image-Turbo    | `tarn59/pixel_art_style_lora_z_image_turbo`<br>`wcde/Z-Image-Turbo-DeJPEG-Lora` |
+| Qwen-Image       | `lightx2v/Qwen-Image-Lightning`<br>`flymy-ai/qwen-image-realism-lora`<br>`prithivMLmods/Qwen-Image-HeadshotX`<br>`starsfriday/Qwen-Image-EVA-LoRA` |
+| Qwen-Image-Edit  | `ostris/qwen_image_edit_inpainting`<br>`lightx2v/Qwen-Image-Edit-2511-Lightning` |
+| Flux             | `dvyio/flux-lora-simple-illustration`<br>`XLabs-AI/flux-furry-lora`<br>`XLabs-AI/flux-RealismLora` |
+
+## Special requirements
+
+### Sliding Tile Attention
+
+- Currently, only Hopper GPUs (H100s) are supported.
diff --git a/sglang/docs/diffusion/contributing.md b/sglang/docs/diffusion/contributing.md
new file mode 100644
index 0000000000000000000000000000000000000000..7de656100e16a0a4a6a22b6a256d6e2619970ec5
--- /dev/null
+++ b/sglang/docs/diffusion/contributing.md
@@ -0,0 +1,67 @@
+# Contributing to SGLang Diffusion
+
+This guide outlines the requirements for contributing to the SGLang Diffusion module (`sglang.multimodal_gen`).
+
+## On AI-Assisted ("Vibe Coding") PRs
+
+Vibe-coded PRs are welcome — we judge code quality, not how it was produced. The bar is the same for all PRs:
+
+- **No over-commenting.** If the name says it all, skip the docstring.
+- **No over-catching.** Don't guard against errors that virtually never happen in practice.
+- **Test before submitting.** AI-generated code can be subtly wrong — verify correctness end-to-end.
+
+## Commit Message Convention
+
+We follow a structured commit message format to maintain a clean history.
+
+**Format:**
+```text
+[diffusion] <scope>: <subject>
+```
+
+**Examples:**
+- `[diffusion] cli: add --perf-dump-path argument`
+- `[diffusion] scheduler: fix deadlock in batch processing`
+- `[diffusion] model: support Stable Diffusion 3.5`
+
+**Rules:**
+- **Prefix**: Always start with `[diffusion]`.
+- **Scope** (Optional): `cli`, `scheduler`, `model`, `pipeline`, `docs`, etc.
+- **Subject**: Imperative mood, short and clear (e.g., "add feature" not "added feature").
+
+## Performance Reporting
+
+For PRs that impact **latency**, **throughput**, or **memory usage**, you **should** provide a performance comparison report.
+
+### How to Generate a Report
+
+1. **Baseline**: run the benchmark (for a single generation task)
+ ```bash
+   $ sglang generate --model-path <model_path> --prompt "A benchmark prompt" --perf-dump-path baseline.json
+ ```
+
+2. **New**: run the same benchmark, without modifying any server_args or sampling_params
+ ```bash
+   $ sglang generate --model-path <model_path> --prompt "A benchmark prompt" --perf-dump-path new.json
+ ```
+
+3. **Compare**: run the compare script, which will print a Markdown table to the console
+ ```bash
+ $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json [new2.json ...]
+ ### Performance Comparison Report
+ ...
+ ```
+4. **Paste**: paste the table into the PR description
+
+## CI-Based Change Protection
+
+Consider adding tests to the `pr-test` or `nightly-test` suites to safeguard your changes, especially for PRs that:
+
+- support a new model
+ - add a testcase for this new model to `testcase_configs.py`
+- support or fix important features
+- significantly improve performance
+
+Please run the corresponding test case, then update/add the baseline to `perf_baselines.json` by following the instructions printed in the console, if applicable.
+
+See [test](https://github.com/sgl-project/sglang/tree/main/python/sglang/multimodal_gen/test) for examples.
diff --git a/sglang/docs/diffusion/environment_variables.md b/sglang/docs/diffusion/environment_variables.md
new file mode 100644
index 0000000000000000000000000000000000000000..b02d7beb749b7840aae00a32d031dc334e5268ea
--- /dev/null
+++ b/sglang/docs/diffusion/environment_variables.md
@@ -0,0 +1,36 @@
+## Caching Acceleration
+
+These variables configure caching acceleration for Diffusion Transformer (DiT) models.
+SGLang supports multiple caching strategies - see [caching documentation](performance/cache/index.md) for an overview.
+
+### Cache-DiT Configuration
+
+See [cache-dit documentation](performance/cache/cache_dit.md) for detailed configuration.
+
+| Environment Variable | Default | Description |
+|-------------------------------------|---------|------------------------------------------|
+| `SGLANG_CACHE_DIT_ENABLED` | false | Enable Cache-DiT acceleration |
+| `SGLANG_CACHE_DIT_FN` | 1 | First N blocks to always compute |
+| `SGLANG_CACHE_DIT_BN` | 0 | Last N blocks to always compute |
+| `SGLANG_CACHE_DIT_WARMUP` | 4 | Warmup steps before caching |
+| `SGLANG_CACHE_DIT_RDT` | 0.24 | Residual difference threshold |
+| `SGLANG_CACHE_DIT_MC` | 3 | Max continuous cached steps |
+| `SGLANG_CACHE_DIT_TAYLORSEER` | false | Enable TaylorSeer calibrator |
+| `SGLANG_CACHE_DIT_TS_ORDER` | 1 | TaylorSeer order (1 or 2) |
+| `SGLANG_CACHE_DIT_SCM_PRESET` | none | SCM preset (none/slow/medium/fast/ultra) |
+| `SGLANG_CACHE_DIT_SCM_POLICY` | dynamic | SCM caching policy |
+| `SGLANG_CACHE_DIT_SCM_COMPUTE_BINS` | not set | Custom SCM compute bins |
+| `SGLANG_CACHE_DIT_SCM_CACHE_BINS` | not set | Custom SCM cache bins |
+
+## Cloud Storage
+
+These variables configure S3-compatible cloud storage for automatically uploading generated images and videos.
+
+| Environment Variable | Default | Description |
+|---------------------------------|---------|--------------------------------------------------------|
+| `SGLANG_CLOUD_STORAGE_TYPE` | not set | Set to `s3` to enable cloud storage |
+| `SGLANG_S3_BUCKET_NAME` | not set | The name of the S3 bucket |
+| `SGLANG_S3_ENDPOINT_URL` | not set | Custom endpoint URL (for MinIO, OSS, etc.) |
+| `SGLANG_S3_REGION_NAME` | us-east-1 | AWS region name |
+| `SGLANG_S3_ACCESS_KEY_ID` | not set | AWS Access Key ID |
+| `SGLANG_S3_SECRET_ACCESS_KEY` | not set | AWS Secret Access Key |
diff --git a/sglang/docs/diffusion/index.md b/sglang/docs/diffusion/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..3a1aa815a46257201644df1a643f6646abe98d2c
--- /dev/null
+++ b/sglang/docs/diffusion/index.md
@@ -0,0 +1,98 @@
+# SGLang Diffusion
+
+SGLang Diffusion is an inference framework for accelerated image and video generation using diffusion models. It provides an end-to-end unified pipeline with optimized kernels and an efficient scheduler loop.
+
+## Key Features
+
+- **Broad Model Support**: Wan series, FastWan series, Hunyuan, Qwen-Image, Qwen-Image-Edit, Flux, Z-Image, GLM-Image, and more
+- **Fast Inference**: Optimized kernels, efficient scheduler loop, and Cache-DiT acceleration
+- **Ease of Use**: OpenAI-compatible API, CLI, and Python SDK
+- **Multi-Platform**: NVIDIA GPUs (H100, H200, A100, B200, 4090), AMD GPUs (MI300X, MI325X) and Ascend NPU (A2, A3)
+
+---
+
+## Quick Start
+
+### Installation
+
+```bash
+uv pip install "sglang[diffusion]" --prerelease=allow
+```
+
+See [Installation Guide](installation.md) for more installation methods and ROCm-specific instructions.
+
+### Basic Usage
+
+Generate an image with the CLI:
+
+```bash
+sglang generate --model-path Qwen/Qwen-Image \
+ --prompt "A beautiful sunset over the mountains" \
+ --save-output
+```
+
+Or start a server with the OpenAI-compatible API:
+
+```bash
+sglang serve --model-path Qwen/Qwen-Image --port 30010
+```
+
+---
+
+## Documentation
+
+### Getting Started
+
+- **[Installation](installation.md)** - Install SGLang Diffusion via pip, uv, Docker, or from source
+- **[Compatibility Matrix](compatibility_matrix.md)** - Supported models and optimization compatibility
+
+### Usage
+
+- **[CLI Documentation](api/cli.md)** - Command-line interface for `sglang generate` and `sglang serve`
+- **[OpenAI API](api/openai_api.md)** - OpenAI-compatible API for image/video generation and LoRA management
+
+### Performance Optimization
+
+- **[Performance Overview](performance/index.md)** - Overview of all performance optimization strategies
+- **[Attention Backends](performance/attention_backends.md)** - Available attention backends (FlashAttention, SageAttention, etc.)
+- **[Caching Strategies](performance/cache/)** - Cache-DiT and TeaCache acceleration
+- **[Profiling](performance/profiling.md)** - Profiling techniques with PyTorch Profiler and Nsight Systems
+
+### Reference
+
+- **[Environment Variables](environment_variables.md)** - Configuration via environment variables
+- **[Support New Models](support_new_models.md)** - Guide for adding new diffusion models
+- **[Contributing](contributing.md)** - Contribution guidelines and commit message conventions
+- **[CI Performance](ci_perf.md)** - Performance baseline generation script
+
+---
+
+## CLI Quick Reference
+
+### Generate (one-off generation)
+
+```bash
+sglang generate --model-path <model_path> --prompt "<prompt>" --save-output
+```
+
+### Serve (HTTP server)
+
+```bash
+sglang serve --model-path <model_path> --port 30010
+```
+
+### Enable Cache-DiT acceleration
+
+```bash
+SGLANG_CACHE_DIT_ENABLED=true sglang generate --model-path <model_path> --prompt "<prompt>"
+```
+
+---
+
+## References
+
+- [SGLang GitHub](https://github.com/sgl-project/sglang)
+- [Cache-DiT](https://github.com/vipshop/cache-dit)
+- [FastVideo](https://github.com/hao-ai-lab/FastVideo)
+- [xDiT](https://github.com/xdit-project/xDiT)
+- [Diffusers](https://github.com/huggingface/diffusers)
diff --git a/sglang/docs/diffusion/installation.md b/sglang/docs/diffusion/installation.md
new file mode 100644
index 0000000000000000000000000000000000000000..4cd62b10a9d2e25c192f6d287ab9701a3c39d970
--- /dev/null
+++ b/sglang/docs/diffusion/installation.md
@@ -0,0 +1,95 @@
+# Install SGLang-Diffusion
+
+You can install SGLang-Diffusion using one of the methods below.
+
+## Standard Installation (NVIDIA GPUs)
+
+### Method 1: With pip or uv
+
+It is recommended to use uv for a faster installation:
+
+```bash
+pip install --upgrade pip
+pip install uv
+uv pip install "sglang[diffusion]" --prerelease=allow
+```
+
+### Method 2: From source
+
+```bash
+# Use the latest release branch
+git clone https://github.com/sgl-project/sglang.git
+cd sglang
+
+# Install the Python packages
+pip install --upgrade pip
+pip install -e "python[diffusion]"
+
+# With uv
+uv pip install -e "python[diffusion]" --prerelease=allow
+```
+
+### Method 3: Using Docker
+
+The Docker images are available on Docker Hub at [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang), built from the [Dockerfile](https://github.com/sgl-project/sglang/blob/main/docker/Dockerfile).
+Replace `<your_token>` below with your HuggingFace Hub [token](https://huggingface.co/docs/hub/en/security-tokens).
+
+```bash
+docker run --gpus all \
+ --shm-size 32g \
+ -p 30000:30000 \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HF_TOKEN=<your_token>" \
+ --ipc=host \
+ lmsysorg/sglang:dev \
+ zsh -c '\
+ echo "Installing diffusion dependencies..." && \
+ pip install -e "python[diffusion]" && \
+ echo "Starting SGLang-Diffusion..." && \
+ sglang generate \
+ --model-path black-forest-labs/FLUX.1-dev \
+ --prompt "A logo With Bold Large text: SGL Diffusion" \
+ --save-output \
+ '
+```
+
+## Platform-Specific: ROCm (AMD GPUs)
+
+For AMD Instinct GPUs (e.g., MI300X), you can use the ROCm-enabled Docker image:
+
+```bash
+docker run --device=/dev/kfd --device=/dev/dri --ipc=host \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env HF_TOKEN=<your_token> \
+ lmsysorg/sglang:v0.5.5.post2-rocm700-mi30x \
+ sglang generate --model-path black-forest-labs/FLUX.1-dev --prompt "A logo With Bold Large text: SGL Diffusion" --save-output
+```
+
+For detailed ROCm system configuration and installation from source, see [AMD GPUs](../platforms/amd_gpu.md).
+
+## Platform-Specific: MUSA (Moore Threads GPUs)
+
+For Moore Threads GPUs (MTGPU) with the MUSA software stack:
+
+```bash
+# Clone the repository
+git clone https://github.com/sgl-project/sglang.git
+cd sglang
+
+# Install the Python packages
+pip install --upgrade pip
+rm -f python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml
+pip install -e "python[all_musa]"
+```
+
+## Platform-Specific: Ascend NPU
+
+For Ascend NPU, please follow the [NPU installation guide](../platforms/ascend_npu.md).
+
+Quick test:
+
+```bash
+sglang generate --model-path black-forest-labs/FLUX.1-dev \
+ --prompt "A logo With Bold Large text: SGL Diffusion" \
+ --save-output
+```
diff --git a/sglang/docs/diffusion/performance/attention_backends.md b/sglang/docs/diffusion/performance/attention_backends.md
new file mode 100644
index 0000000000000000000000000000000000000000..9113d5bb15bb1b98b0da813c9c3b59203bc439c1
--- /dev/null
+++ b/sglang/docs/diffusion/performance/attention_backends.md
@@ -0,0 +1,131 @@
+# Attention Backends
+
+This document describes the attention backends available in sglang diffusion (`sglang.multimodal_gen`) and how to select them.
+
+## Overview
+
+Attention backends are defined by `AttentionBackendEnum` (`sglang.multimodal_gen.runtime.platforms.interface.AttentionBackendEnum`) and selected via the CLI flag `--attention-backend`.
+
+Backend selection is performed by the shared attention layers (e.g. `LocalAttention` / `USPAttention` / `UlyssesAttention` in `sglang.multimodal_gen.runtime.layers.attention.layer`) and therefore applies to any model component using these layers (e.g. diffusion transformer / DiT and encoders).
+
+When using the diffusers backend, `--attention-backend` is passed through to diffusers'
+`set_attention_backend` (e.g., `flash`, `_flash_3_hub`, `sage`, `xformers`, `native`).
+
+- **CUDA**: prefers FlashAttention (FA3/FA4) when supported; otherwise falls back to PyTorch SDPA.
+- **ROCm**: uses FlashAttention when available; otherwise falls back to PyTorch SDPA.
+- **MPS**: always uses PyTorch SDPA.
+- **NPU**: always uses PyTorch SDPA.
+
+## Backend options
+
+For SGLang-native pipelines, the CLI accepts the lowercase names of `AttentionBackendEnum`. The table below lists the backends implemented by the built-in platforms. `fa3`/`fa4` are accepted as aliases for `fa`.
+
+| CLI value | Enum value | Notes |
+|---|---|---|
+| `fa` / `fa3` / `fa4` | `FA` | FlashAttention. `fa3/fa4` are normalized to `fa` during argument parsing (`ServerArgs.__post_init__`). |
+| `torch_sdpa` | `TORCH_SDPA` | PyTorch `scaled_dot_product_attention`. |
+| `sliding_tile_attn` | `SLIDING_TILE_ATTN` | Sliding Tile Attention (STA). Requires `st_attn`. Configure via `--attention-backend-config`. |
+| `sage_attn` | `SAGE_ATTN` | Requires `sageattention`. Upstream SageAttention CUDA extensions target SM80/SM86/SM89/SM90/SM120 (compute capability 8.0/8.6/8.9/9.0/12.0); see upstream `setup.py`: https://github.com/thu-ml/SageAttention/blob/main/setup.py. |
+| `sage_attn_3` | `SAGE_ATTN_3` | Requires SageAttention3 installed per upstream instructions. |
+| `video_sparse_attn` | `VIDEO_SPARSE_ATTN` | Requires `vsa`. Configure `sparsity` via `--attention-backend-config`. |
+| `vmoba_attn` | `VMOBA_ATTN` | Requires `kernel.attn.vmoba_attn.vmoba`. Configure via `--attention-backend-config`. |
+| `aiter` | `AITER` | Requires `aiter`. |
+| `sparse_video_gen_2_attn` | `SPARSE_VIDEO_GEN_2_ATTN` | Requires `svg`. See installation instructions at https://github.com/svg-project/Sparse-VideoGen. |
+
+## Selection priority
+
+The selection order in `runtime/layers/attention/selector.py` is:
+
+1. `global_force_attn_backend(...)` / `global_force_attn_backend_context_manager(...)`
+2. CLI `--attention-backend` (`ServerArgs.attention_backend`)
+3. Auto selection (platform capability, dtype, and installed packages)
+
+## Configuration
+
+Some backends require additional configuration. You can pass these parameters via `--attention-backend-config`. This argument accepts:
+- A path to a JSON or YAML configuration file.
+- A JSON string (e.g., `'{"sparsity": 0.5}'`).
+- Key-value pairs (e.g., `"sparsity=0.5,enable_x=true"`).
+
+### Supported Configuration Parameters
+
+**Sliding Tile Attention (`sliding_tile_attn`)**
+
+| Parameter | Type | Description | Default |
+| :--- | :--- | :--- | :--- |
+| `mask_strategy_file_path` | `str` | **Required.** Path to the mask strategy JSON file. | - |
+| `sta_mode` | `str` | Mode of STA. | `STA_inference` |
+| `skip_time_steps` | `int` | Number of steps to use full attention before switching to sparse attention. | `15` |
+
+**Video Sparse Attention (`video_sparse_attn`)**
+
+| Parameter | Type | Description | Default |
+| :--- | :--- | :--- | :--- |
+| `sparsity` | `float` | Validation sparsity (0.0 - 1.0). | `0.0` |
+
+**V-MoBA (`vmoba_attn`)**
+
+| Parameter | Type | Description | Default |
+| :--- | :--- | :--- | :--- |
+| `temporal_chunk_size` | `int` | Chunk size for temporal dimension. | - |
+| `temporal_topk` | `int` | Top-K tokens to select in temporal dimension. | - |
+| `spatial_chunk_size` | `list[int]` | Chunk size for spatial dimension (H, W). | - |
+| `spatial_topk` | `int` | Top-K tokens to select in spatial dimension. | - |
+| `st_chunk_size` | `list[int]` | Chunk size for spatiotemporal dimension (T, H, W). | - |
+| `st_topk` | `int` | Top-K tokens to select in spatiotemporal dimension. | - |
+| `moba_select_mode` | `str` | Selection mode (e.g., `threshold`). | `threshold` |
+| `moba_threshold` | `float` | Threshold value for selection. | `0.25` |
+| `moba_threshold_type` | `str` | Type of thresholding (e.g., `query_head`). | `query_head` |
+| `first_full_step` | `int` | Number of initial steps to use full attention. | `12` |
+| `first_full_layer` | `int` | Number of initial layers to use full attention. | `0` |
+| `temporal_layer` | `int` | Number of temporal layers. | `1` |
+| `spatial_layer` | `int` | Number of spatial layers. | `1` |
+| `st_layer` | `int` | Number of spatiotemporal layers. | `1` |
+
+## Platform support matrix
+
+| Backend | CUDA | ROCm | MPS | NPU | Notes |
+|---|---:|---:|---:|---:|---|
+| `fa` | ✅ | ✅ | ❌ | ❌ | CUDA requires SM80+ and fp16/bf16. FlashAttention is only used when the required runtime is installed; otherwise it falls back to `torch_sdpa`. |
+| `torch_sdpa` | ✅ | ✅ | ✅ | ✅ | Most compatible option across platforms. |
+| `sliding_tile_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `st_attn`. Configure via `--attention-backend-config`. |
+| `sage_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only (optional dependency). |
+| `sage_attn_3` | ✅ | ❌ | ❌ | ❌ | CUDA-only (optional dependency). |
+| `video_sparse_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `vsa`. Configure `sparsity` via `--attention-backend-config`. |
+| `vmoba_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `kernel.attn.vmoba_attn.vmoba`. Configure via `--attention-backend-config`. |
+| `aiter` | ✅ | ❌ | ❌ | ❌ | Requires `aiter`. |
+| `sparse_video_gen_2_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `svg`. |
+
+## Usage
+
+### Select a backend via CLI
+
+```bash
+sglang generate \
+ --model-path <model-path> \
+ --prompt "..." \
+ --attention-backend fa
+```
+
+```bash
+sglang generate \
+ --model-path <model-path> \
+ --prompt "..." \
+ --attention-backend torch_sdpa
+```
+
+### Using Sliding Tile Attention (STA)
+
+```bash
+# Pass the mask strategy file path via config
+sglang generate \
+ --model-path <model-path> \
+ --prompt "..." \
+ --attention-backend sliding_tile_attn \
+ --attention-backend-config "mask_strategy_file_path=/abs/path/to/mask_strategy.json"
+```
+
+### Notes for ROCm / MPS
+
+- ROCm: use `--attention-backend torch_sdpa` or `fa` depending on what is available in your environment.
+- MPS: the platform implementation always uses `torch_sdpa`.
diff --git a/sglang/docs/diffusion/performance/cache/cache_dit.md b/sglang/docs/diffusion/performance/cache/cache_dit.md
new file mode 100644
index 0000000000000000000000000000000000000000..8d4ce599fe23ead66e2f4a963d8541de3095ea58
--- /dev/null
+++ b/sglang/docs/diffusion/performance/cache/cache_dit.md
@@ -0,0 +1,273 @@
+# Cache-DiT Acceleration
+
+SGLang integrates [Cache-DiT](https://github.com/vipshop/cache-dit), a caching acceleration engine for Diffusion Transformers (DiT), to achieve up to **1.69x inference speedup** with minimal quality loss.
+
+## Overview
+
+**Cache-DiT** uses intelligent caching strategies to skip redundant computation in the denoising loop:
+
+- **DBCache (Dual Block Cache)**: Dynamically decides when to cache transformer blocks based on residual differences
+- **TaylorSeer**: Uses Taylor expansion for calibration to optimize caching decisions
+- **SCM (Step Computation Masking)**: Step-level caching control for additional speedup
+
+## Basic Usage
+
+Enable Cache-DiT by exporting the environment variable and using `sglang generate` or `sglang serve`:
+
+```bash
+SGLANG_CACHE_DIT_ENABLED=true \
+sglang generate --model-path Qwen/Qwen-Image \
+ --prompt "A beautiful sunset over the mountains"
+```
+
+## Diffusers Backend
+
+Cache-DiT supports loading acceleration configs from a custom YAML file. For
+diffusers pipelines (`diffusers` backend), pass the YAML/JSON path via `--cache-dit-config`. This
+flow requires cache-dit >= 1.2.0 (`cache_dit.load_configs`).
+
+### Single GPU inference
+
+Define a `cache.yaml` file that contains:
+
+```yaml
+cache_config:
+ max_warmup_steps: 8
+ warmup_interval: 2
+ max_cached_steps: -1
+ max_continuous_cached_steps: 2
+ Fn_compute_blocks: 1
+ Bn_compute_blocks: 0
+ residual_diff_threshold: 0.12
+ enable_taylorseer: true
+ taylorseer_order: 1
+```
+
+Then apply the config with:
+
+```bash
+sglang generate \
+ --backend diffusers \
+ --model-path Qwen/Qwen-Image \
+ --cache-dit-config cache.yaml \
+ --prompt "A beautiful sunset over the mountains"
+```
+
+### Distributed inference
+
+- 1D Parallelism
+
+Define a parallelism only config yaml `parallel.yaml` file that contains:
+
+```yaml
+parallelism_config:
+ ulysses_size: auto
+ parallel_kwargs:
+ attention_backend: native
+ extra_parallel_modules: ["text_encoder", "vae"]
+```
+
+Then, apply the distributed inference acceleration config from yaml. `ulysses_size: auto` means that cache-dit will auto detect the `world_size` as the ulysses_size. Otherwise, you should manually set it to a specific integer, e.g., 4.
+
+Then apply the distributed config with the following command (note: add `--num-gpus N` to specify the number of GPUs for distributed inference):
+
+```bash
+sglang generate \
+ --backend diffusers \
+ --num-gpus 4 \
+ --model-path Qwen/Qwen-Image \
+ --cache-dit-config parallel.yaml \
+ --prompt "A futuristic cityscape at sunset"
+```
+
+- 2D Parallelism
+
+You can also define a 2D parallelism config yaml `parallel_2d.yaml` file that contains:
+
+```yaml
+parallelism_config:
+ ulysses_size: auto
+ tp_size: 2
+ parallel_kwargs:
+ attention_backend: native
+ extra_parallel_modules: ["text_encoder", "vae"]
+```
+
+Then, apply the 2D parallelism config from yaml. Here `tp_size: 2` means using tensor parallelism with size 2. The `ulysses_size: auto` means that cache-dit will auto detect the `world_size // tp_size` as the ulysses_size.
+
+- 3D Parallelism
+
+You can also define a 3D parallelism config yaml `parallel_3d.yaml` file that contains:
+
+```yaml
+parallelism_config:
+ ulysses_size: 2
+ ring_size: 2
+ tp_size: 2
+ parallel_kwargs:
+ attention_backend: native
+ extra_parallel_modules: ["text_encoder", "vae"]
+```
+
+Then, apply the 3D parallelism config from yaml. Here `ulysses_size: 2`, `ring_size: 2`, and `tp_size: 2` mean using ulysses parallelism with size 2, ring parallelism with size 2, and tensor parallelism with size 2.
+
+### Hybrid Cache and Parallelism
+
+Define a hybrid cache and parallel acceleration config yaml `hybrid.yaml` file that contains:
+
+```yaml
+cache_config:
+ max_warmup_steps: 8
+ warmup_interval: 2
+ max_cached_steps: -1
+ max_continuous_cached_steps: 2
+ Fn_compute_blocks: 1
+ Bn_compute_blocks: 0
+ residual_diff_threshold: 0.12
+ enable_taylorseer: true
+ taylorseer_order: 1
+parallelism_config:
+ ulysses_size: auto
+ parallel_kwargs:
+ attention_backend: native
+ extra_parallel_modules: ["text_encoder", "vae"]
+```
+
+Then, apply the hybrid cache and parallel acceleration config from yaml.
+
+```bash
+sglang generate \
+ --backend diffusers \
+ --num-gpus 4 \
+ --model-path Qwen/Qwen-Image \
+ --cache-dit-config hybrid.yaml \
+ --prompt "A beautiful sunset over the mountains"
+```
+
+## Advanced Configuration
+
+### DBCache Parameters
+
+DBCache controls block-level caching behavior:
+
+| Parameter | Env Variable | Default | Description |
+|-----------|---------------------------|---------|------------------------------------------|
+| Fn | `SGLANG_CACHE_DIT_FN` | 1 | Number of first blocks to always compute |
+| Bn | `SGLANG_CACHE_DIT_BN` | 0 | Number of last blocks to always compute |
+| W | `SGLANG_CACHE_DIT_WARMUP` | 4 | Warmup steps before caching starts |
+| R | `SGLANG_CACHE_DIT_RDT` | 0.24 | Residual difference threshold |
+| MC | `SGLANG_CACHE_DIT_MC` | 3 | Maximum continuous cached steps |
+
+### TaylorSeer Configuration
+
+TaylorSeer improves caching accuracy using Taylor expansion:
+
+| Parameter | Env Variable | Default | Description |
+|-----------|-------------------------------|---------|---------------------------------|
+| Enable | `SGLANG_CACHE_DIT_TAYLORSEER` | false | Enable TaylorSeer calibrator |
+| Order | `SGLANG_CACHE_DIT_TS_ORDER` | 1 | Taylor expansion order (1 or 2) |
+
+### Combined Configuration Example
+
+DBCache and TaylorSeer are complementary strategies that work together, you can configure both sets of parameters
+simultaneously:
+
+```bash
+SGLANG_CACHE_DIT_ENABLED=true \
+SGLANG_CACHE_DIT_FN=2 \
+SGLANG_CACHE_DIT_BN=1 \
+SGLANG_CACHE_DIT_WARMUP=4 \
+SGLANG_CACHE_DIT_RDT=0.4 \
+SGLANG_CACHE_DIT_MC=4 \
+SGLANG_CACHE_DIT_TAYLORSEER=true \
+SGLANG_CACHE_DIT_TS_ORDER=2 \
+sglang generate --model-path black-forest-labs/FLUX.1-dev \
+ --prompt "A curious raccoon in a forest"
+```
+
+### SCM (Step Computation Masking)
+
+SCM provides step-level caching control for additional speedup. It decides which denoising steps to compute fully and
+which to use cached results.
+
+**SCM Presets**
+
+SCM is configured with presets:
+
+| Preset | Compute Ratio | Speed | Quality |
+|----------|---------------|----------|------------|
+| `none` | 100% | Baseline | Best |
+| `slow` | ~75% | ~1.3x | High |
+| `medium` | ~50% | ~2x | Good |
+| `fast` | ~35% | ~3x | Acceptable |
+| `ultra` | ~25% | ~4x | Lower |
+
+**Usage**
+
+```bash
+SGLANG_CACHE_DIT_ENABLED=true \
+SGLANG_CACHE_DIT_SCM_PRESET=medium \
+sglang generate --model-path Qwen/Qwen-Image \
+ --prompt "A futuristic cityscape at sunset"
+```
+
+**Custom SCM Bins**
+
+For fine-grained control over which steps to compute vs cache:
+
+```bash
+SGLANG_CACHE_DIT_ENABLED=true \
+SGLANG_CACHE_DIT_SCM_COMPUTE_BINS="8,3,3,2,2" \
+SGLANG_CACHE_DIT_SCM_CACHE_BINS="1,2,2,2,3" \
+sglang generate --model-path Qwen/Qwen-Image \
+ --prompt "A futuristic cityscape at sunset"
+```
+
+**SCM Policy**
+
+| Policy | Env Variable | Description |
+|-----------|---------------------------------------|---------------------------------------------|
+| `dynamic` | `SGLANG_CACHE_DIT_SCM_POLICY=dynamic` | Adaptive caching based on content (default) |
+| `static` | `SGLANG_CACHE_DIT_SCM_POLICY=static` | Fixed caching pattern |
+
+## Environment Variables
+
+All Cache-DiT parameters can be configured via environment variables.
+See [Environment Variables](../../environment_variables.md) for the complete list.
+
+## Supported Models
+
+SGLang Diffusion x Cache-DiT supports almost all models originally supported in SGLang Diffusion:
+
+| Model Family | Example Models |
+|--------------|-----------------------------|
+| Wan | Wan2.1, Wan2.2 |
+| Flux | FLUX.1-dev, FLUX.2-dev |
+| Z-Image | Z-Image-Turbo |
+| Qwen | Qwen-Image, Qwen-Image-Edit |
+| Hunyuan | HunyuanVideo |
+
+## Performance Tips
+
+1. **Start with defaults**: The default parameters work well for most models
+2. **Use TaylorSeer**: It typically improves both speed and quality
+3. **Tune R threshold**: Lower values = better quality, higher values = faster
+4. **SCM for extra speed**: Use `medium` preset for good speed/quality balance
+5. **Warmup matters**: Higher warmup = more stable caching decisions
+
+## Limitations
+
+- **SGLang-native pipelines**: Distributed support (TP/SP) is not yet validated; Cache-DiT will be automatically
+ disabled when `world_size > 1`.
+- **SCM minimum steps**: SCM requires >= 8 inference steps to be effective
+- **Model support**: Only models registered in Cache-DiT's BlockAdapterRegister are supported
+
+## Troubleshooting
+
+### SCM disabled for low step count
+
+For models with < 8 inference steps (e.g., DMD distilled models), SCM will be automatically disabled. DBCache
+acceleration still works.
+
+## References
+
+- [Cache-DiT](https://github.com/vipshop/cache-dit)
+- [SGLang Diffusion](../index.md)
diff --git a/sglang/docs/diffusion/performance/cache/index.md b/sglang/docs/diffusion/performance/cache/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..d1f8e61d0978ba540d8b0eb7f8283bcc42e0ca1b
--- /dev/null
+++ b/sglang/docs/diffusion/performance/cache/index.md
@@ -0,0 +1,60 @@
+# Caching Acceleration for Diffusion Models
+
+SGLang provides multiple caching acceleration strategies for Diffusion Transformer (DiT) models. These strategies can significantly reduce inference time by skipping redundant computation.
+
+## Overview
+
+SGLang supports two complementary caching approaches:
+
+| Strategy | Scope | Mechanism | Best For |
+|----------|-------|-----------|----------|
+| **Cache-DiT** | Block-level | Skip individual transformer blocks dynamically | Advanced, higher speedup |
+| **TeaCache** | Timestep-level | Skip entire denoising steps based on L1 similarity | Simple, built-in |
+
+
+
+## Cache-DiT
+
+[Cache-DiT](https://github.com/vipshop/cache-dit) provides block-level caching with
+advanced strategies like DBCache and TaylorSeer. It can achieve up to **1.69x speedup**.
+
+See [cache_dit.md](cache_dit.md) for detailed configuration.
+
+### Quick Start
+
+```bash
+SGLANG_CACHE_DIT_ENABLED=true \
+sglang generate --model-path Qwen/Qwen-Image \
+ --prompt "A beautiful sunset over the mountains"
+```
+
+### Key Features
+
+- **DBCache**: Dynamic block-level caching based on residual differences
+- **TaylorSeer**: Taylor expansion-based calibration for optimized caching
+- **SCM**: Step-level computation masking for additional speedup
+
+## TeaCache
+
+TeaCache (Temporal similarity-based caching) accelerates diffusion inference by detecting when consecutive denoising steps are similar enough to skip computation entirely.
+
+See [teacache.md](teacache.md) for detailed documentation.
+
+### Quick Overview
+
+- Tracks L1 distance between modulated inputs across timesteps
+- When accumulated distance is below threshold, reuses cached residual
+- Supports CFG with separate positive/negative caches
+
+### Supported Models
+
+- Wan (wan2.1, wan2.2)
+- Hunyuan (HunyuanVideo)
+- Z-Image
+
+For Flux and Qwen models, TeaCache is automatically disabled when CFG is enabled.
+
+## References
+
+- [Cache-DiT Repository](https://github.com/vipshop/cache-dit)
+- [TeaCache Paper](https://arxiv.org/abs/2411.14324)
diff --git a/sglang/docs/diffusion/performance/cache/teacache.md b/sglang/docs/diffusion/performance/cache/teacache.md
new file mode 100644
index 0000000000000000000000000000000000000000..7960437c7b682daa103062cc454fa11c3cad3846
--- /dev/null
+++ b/sglang/docs/diffusion/performance/cache/teacache.md
@@ -0,0 +1,84 @@
+# TeaCache Acceleration
+
+> **Note**: This is one of two caching strategies available in SGLang.
+> For an overview of all caching options, see [caching](index.md).
+
+TeaCache (Temporal similarity-based caching) accelerates diffusion inference by detecting when consecutive denoising steps are similar enough to skip computation entirely.
+
+## Overview
+
+TeaCache works by:
+1. Tracking the L1 distance between modulated inputs across consecutive timesteps
+2. Accumulating the rescaled L1 distance over steps
+3. When accumulated distance is below a threshold, reusing the cached residual
+4. Supporting CFG (Classifier-Free Guidance) with separate positive/negative caches
+
+## How It Works
+
+### L1 Distance Tracking
+
+At each denoising step, TeaCache computes the relative L1 distance between the current and previous modulated inputs:
+
+```
+rel_l1 = |current - previous|.mean() / |previous|.mean()
+```
+
+This distance is then rescaled using polynomial coefficients and accumulated:
+
+```
+accumulated += poly(coefficients)(rel_l1)
+```
+
+### Cache Decision
+
+- If `accumulated >= threshold`: Force computation, reset accumulator
+- If `accumulated < threshold`: Skip computation, use cached residual
+
+### CFG Support
+
+For models that support CFG cache separation (Wan, Hunyuan, Z-Image), TeaCache maintains separate caches for positive and negative branches:
+- `previous_modulated_input` / `previous_residual` for positive branch
+- `previous_modulated_input_negative` / `previous_residual_negative` for negative branch
+
+For models that don't support CFG separation (Flux, Qwen), TeaCache is automatically disabled when CFG is enabled.
+
+## Configuration
+
+TeaCache is configured via `TeaCacheParams` in the sampling parameters:
+
+```python
+from sglang.multimodal_gen.configs.sample.teacache import TeaCacheParams
+
+params = TeaCacheParams(
+ teacache_thresh=0.1, # Threshold for accumulated L1 distance
+ coefficients=[1.0, 0.0, 0.0], # Polynomial coefficients for L1 rescaling
+)
+```
+
+### Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `teacache_thresh` | float | Threshold for accumulated L1 distance. Higher = more caching (faster) but potentially lower quality |
+| `coefficients` | list[float] | Polynomial coefficients for L1 rescaling. Model-specific tuning |
+
+### Model-Specific Configurations
+
+Different models may have different optimal configurations. The coefficients are typically tuned per-model to balance speed and quality.
+
+## Supported Models
+
+TeaCache is built into the following model families:
+
+| Model Family | CFG Cache Separation | Notes |
+|--------------|---------------------|-------|
+| Wan (wan2.1, wan2.2) | Yes | Full support |
+| Hunyuan (HunyuanVideo) | Yes | To be supported |
+| Z-Image | Yes | To be supported |
+| Flux | No | To be supported |
+| Qwen | No | To be supported |
+
+
+## References
+
+- [TeaCache: Accelerating Diffusion Models with Temporal Similarity](https://arxiv.org/abs/2411.14324)
diff --git a/sglang/docs/diffusion/performance/index.md b/sglang/docs/diffusion/performance/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..f61c4e93c17a1714a4cbf0b2043eddfa51202ab8
--- /dev/null
+++ b/sglang/docs/diffusion/performance/index.md
@@ -0,0 +1,72 @@
+# Performance Optimization
+
+SGLang-Diffusion provides multiple performance optimization strategies to accelerate inference. This section covers all available performance tuning options.
+
+## Overview
+
+| Optimization | Type | Description |
+|--------------|------|-------------|
+| **Cache-DiT** | Caching | Block-level caching with DBCache, TaylorSeer, and SCM |
+| **TeaCache** | Caching | Timestep-level caching using L1 similarity |
+| **Attention Backends** | Kernel | Optimized attention implementations (FlashAttention, SageAttention, etc.) |
+| **Profiling** | Diagnostics | PyTorch Profiler and Nsight Systems guidance |
+
+## Caching Strategies
+
+SGLang supports two complementary caching approaches:
+
+### Cache-DiT
+
+[Cache-DiT](https://github.com/vipshop/cache-dit) provides block-level caching with advanced strategies. It can achieve up to **1.69x speedup**.
+
+**Quick Start:**
+```bash
+SGLANG_CACHE_DIT_ENABLED=true \
+sglang generate --model-path Qwen/Qwen-Image \
+ --prompt "A beautiful sunset over the mountains"
+```
+
+**Key Features:**
+- **DBCache**: Dynamic block-level caching based on residual differences
+- **TaylorSeer**: Taylor expansion-based calibration for optimized caching
+- **SCM**: Step-level computation masking for additional speedup
+
+See [Cache-DiT Documentation](cache/cache_dit.md) for detailed configuration.
+
+### TeaCache
+
+TeaCache (Temporal similarity-based caching) accelerates diffusion inference by detecting when consecutive denoising steps are similar enough to skip computation entirely.
+
+**Quick Overview:**
+- Tracks L1 distance between modulated inputs across timesteps
+- When accumulated distance is below threshold, reuses cached residual
+- Supports CFG with separate positive/negative caches
+
+**Supported Models:** Wan (wan2.1, wan2.2), Hunyuan (HunyuanVideo), Z-Image
+
+See [TeaCache Documentation](cache/teacache.md) for detailed configuration.
+
+## Attention Backends
+
+Different attention backends offer varying performance characteristics depending on your hardware and model:
+
+- **FlashAttention**: Fastest on NVIDIA GPUs with fp16/bf16
+- **SageAttention**: Alternative optimized implementation
+- **xformers**: Memory-efficient attention
+- **SDPA**: PyTorch native scaled dot-product attention
+
+See [Attention Backends](attention_backends.md) for platform support and configuration options.
+
+## Profiling
+
+To diagnose performance bottlenecks, SGLang-Diffusion supports profiling tools:
+
+- **PyTorch Profiler**: Built-in Python profiling
+- **Nsight Systems**: GPU kernel-level analysis
+
+See [Profiling Guide](profiling.md) for detailed instructions.
+
+## References
+
+- [Cache-DiT Repository](https://github.com/vipshop/cache-dit)
+- [TeaCache Paper](https://arxiv.org/abs/2411.14324)
diff --git a/sglang/docs/diffusion/performance/profiling.md b/sglang/docs/diffusion/performance/profiling.md
new file mode 100644
index 0000000000000000000000000000000000000000..b445138c03ccd32cf2213b220a3d96ea02aa90ac
--- /dev/null
+++ b/sglang/docs/diffusion/performance/profiling.md
@@ -0,0 +1,136 @@
+# Profiling Multimodal Generation
+
+This guide covers profiling techniques for multimodal generation pipelines in SGLang.
+
+## PyTorch Profiler
+
+PyTorch Profiler provides detailed kernel execution time, call stack, and GPU utilization metrics.
+
+### Denoising Stage Profiling
+
+Profile the denoising stage with sampled timesteps (default: 5 steps after 1 warmup step):
+
+```bash
+sglang generate \
+ --model-path Qwen/Qwen-Image \
+ --prompt "A Logo With Bold Large Text: SGL Diffusion" \
+ --seed 0 \
+ --profile
+```
+
+**Parameters:**
+- `--profile`: Enable profiling for the denoising stage
+- `--num-profiled-timesteps N`: Number of timesteps to profile after warmup (default: 5)
+ - Smaller values reduce trace file size
+ - Example: `--num-profiled-timesteps 10` profiles 10 steps after 1 warmup step
+
+### Full Pipeline Profiling
+
+Profile all pipeline stages (text encoding, denoising, VAE decoding, etc.):
+
+```bash
+sglang generate \
+ --model-path Qwen/Qwen-Image \
+ --prompt "A Logo With Bold Large Text: SGL Diffusion" \
+ --seed 0 \
+ --profile \
+ --profile-all-stages
+```
+
+**Parameters:**
+- `--profile-all-stages`: Used with `--profile`, profile all pipeline stages instead of just denoising
+
+### Output Location
+
+By default, trace files are saved in the `./logs/` directory.
+
+The exact output file path will be shown in the console output, for example:
+
+```bash
+[mm-dd hh:mm:ss] Saved profiler traces to: /sgl-workspace/sglang/logs/mocked_fake_id_for_offline_generate-5_steps-global-rank0.trace.json.gz
+```
+
+### View Traces
+
+Load and visualize trace files at:
+- https://ui.perfetto.dev/ (recommended)
+- chrome://tracing (Chrome only)
+
+For large trace files, reduce `--num-profiled-timesteps` or avoid using `--profile-all-stages`.
+
+
+### `--perf-dump-path` (Stage/Step Timing Dump)
+
+Besides profiler traces, you can also dump a lightweight JSON report that contains:
+- stage-level timing breakdown for the full pipeline
+- step-level timing breakdown for the denoising stage (per diffusion step)
+
+This is useful to quickly identify which stage dominates end-to-end latency, and whether denoising steps have uniform runtimes (and if not, which step has an abnormal spike).
+
+The dumped JSON contains a `denoise_steps_ms` field formatted as an array of objects, each with a `step` key (the step index) and a `duration_ms` key.
+
+Example:
+
+```bash
+sglang generate \
+ --model-path <model-path> \
+ --prompt "<prompt>" \
+ --perf-dump-path perf.json
+```
+
+## Nsight Systems
+
+Nsight Systems provides low-level CUDA profiling with kernel details, register usage, and memory access patterns.
+
+### Installation
+
+See the [SGLang profiling guide](https://github.com/sgl-project/sglang/blob/main/docs/developer_guide/benchmark_and_profiling.md#profile-with-nsight) for installation instructions.
+
+### Basic Profiling
+
+Profile the entire pipeline execution:
+
+```bash
+nsys profile \
+ --trace-fork-before-exec=true \
+ --cuda-graph-trace=node \
+ --force-overwrite=true \
+ -o QwenImage \
+ sglang generate \
+ --model-path Qwen/Qwen-Image \
+ --prompt "A Logo With Bold Large Text: SGL Diffusion" \
+ --seed 0
+```
+
+### Targeted Stage Profiling
+
+Use `--delay` and `--duration` to capture specific stages and reduce file size:
+
+```bash
+nsys profile \
+ --trace-fork-before-exec=true \
+ --cuda-graph-trace=node \
+ --force-overwrite=true \
+ --delay 10 \
+ --duration 30 \
+ -o QwenImage_denoising \
+ sglang generate \
+ --model-path Qwen/Qwen-Image \
+ --prompt "A Logo With Bold Large Text: SGL Diffusion" \
+ --seed 0
+```
+
+**Parameters:**
+- `--delay N`: Wait N seconds before starting capture (skip initialization overhead)
+- `--duration N`: Capture for N seconds (focus on specific stages)
+- `--force-overwrite`: Overwrite existing output files
+
+## Notes
+
+- **Reduce trace size**: Use `--num-profiled-timesteps` with smaller values or `--delay`/`--duration` with Nsight Systems
+- **Stage-specific analysis**: Use `--profile` alone for denoising stage, add `--profile-all-stages` for full pipeline
+- **Multiple runs**: Profile with different prompts and resolutions to identify bottlenecks across workloads
+
+## FAQ
+
+- If you are profiling `sglang generate` with Nsight Systems and find that the generated profiler file did not capture any CUDA kernels, you can resolve this issue by increasing the model's inference steps to extend the execution time.
diff --git a/sglang/docs/diffusion/support_new_models.md b/sglang/docs/diffusion/support_new_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..3141d5affbfb14f4a5d859c32f1e0e75e0286340
--- /dev/null
+++ b/sglang/docs/diffusion/support_new_models.md
@@ -0,0 +1,104 @@
+# How to Support New Diffusion Models
+
+This document explains how to add support for new diffusion models in SGLang diffusion.
+
+## Architecture Overview
+
+SGLang diffusion is engineered for both performance and flexibility, built upon a modular pipeline architecture. This
+design allows developers to easily construct complex, customized pipelines for various diffusion models by combining and
+reusing different components.
+
+At its core, the architecture revolves around two key concepts, as highlighted in our [blog post](https://lmsys.org/blog/2025-11-07-sglang-diffusion/#architecture):
+
+- **`ComposedPipeline`**: This class orchestrates a series of `PipelineStage`s to define the complete generation process for a specific model. It acts as the main entry point for a model and manages the data flow between the different stages of the diffusion process.
+- **`PipelineStage`**: Each stage is a modular component that encapsulates a common function within the diffusion process. Examples include prompt encoding, the denoising loop, or VAE decoding. These stages are designed to be self-contained and reusable across different pipelines.
+
+## Key Components for Implementation
+
+To add support for a new diffusion model, you will primarily need to define or configure the following components:
+
+1. **`PipelineConfig`**: This is a dataclass that holds all the static configurations for your model pipeline. It includes paths to model components (like UNet, VAE, text encoders), precision settings (e.g., `fp16`, `bf16`), and other model-specific architectural parameters. Each model typically has its own subclass of `PipelineConfig`.
+
+2. **`SamplingParams`**: This dataclass defines the parameters that control the generation process at runtime. These are the user-provided inputs for a generation request, such as the `prompt`, `negative_prompt`, `guidance_scale`, `num_inference_steps`, `seed`, output dimensions (`height`, `width`), etc.
+
+3. **`ComposedPipeline` (not a config)**: This is the central class where you define the structure of your model's generation pipeline. You will create a new class that inherits from `ComposedPipelineBase` and, within it, instantiate and chain together the necessary `PipelineStage`s in the correct order. See `ComposedPipelineBase` and `PipelineStage` base definitions:
+ - [`ComposedPipelineBase`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/pipelines/composed_pipeline_base.py)
+ - [`PipelineStage`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/pipelines/stages/base.py)
+ - [Central registry (models/config mapping)](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/registry.py)
+
+4. **Modules (components referenced by the pipeline)**: Each pipeline references a set of modules that are loaded from the model repository (e.g., Diffusers `model_index.json`) and assembled via the registry/loader. Common modules include:
+ - `text_encoder`: Encodes text prompts into embeddings
+ - `tokenizer`: Tokenizes raw text input for the text encoder(s).
+ - `processor`: Preprocesses images and extracts features; often used in image-to-image tasks.
+ - `image_encoder`: Specialized image feature extractor (may be distinct from or combined with `processor`).
+ - `dit/transformer`: The core denoising network (DiT/UNet architecture) operating in latent space.
+ - `scheduler`: Controls the timestep schedule and denoising dynamics throughout inference.
+ - `vae`: Variational Autoencoder for encoding/decoding between pixel space and latent space.
+
+## Available Pipeline Stages
+
+You can build your custom `ComposedPipeline` by combining the following available stages as needed. Each stage is responsible for a specific part of the generation process.
+
+| Stage Class | Description |
+| -------------------------------- | ------------------------------------------------------------------------------------------------------- |
+| `InputValidationStage` | Validates the user-provided `SamplingParams` to ensure they are correct before starting the pipeline. |
+| `TextEncodingStage` | Encodes text prompts into embeddings using one or more text encoders. |
+| `ImageEncodingStage` | Encodes input images into embeddings, often used in image-to-image tasks. |
+| `ImageVAEEncodingStage` | Specifically encodes an input image into the latent space using a Variational Autoencoder (VAE). |
+| `TimestepPreparationStage` | Prepares the scheduler's timesteps for the diffusion process. |
+| `LatentPreparationStage` | Creates the initial noisy latent tensor that will be denoised. |
+| `DenoisingStage` | Executes the main denoising loop, iteratively applying the model (e.g., UNet) to refine the latents. |
+| `DecodingStage` | Decodes the final latent tensor from the denoising loop back into pixel space (e.g., an image) using the VAE. |
+| `DmdDenoisingStage` | A specialized denoising stage for certain model architectures. |
+| `CausalDMDDenoisingStage` | A specialized causal denoising stage for specific video models. |
+
+## Example: Implementing `Qwen-Image-Edit`
+
+To illustrate the process, let's look at how `Qwen-Image-Edit` is implemented. The typical implementation order is:
+
+1. **Analyze Required Modules**:
+ - Study the target model's components by examining its `model_index.json` or Diffusers implementation to identify required modules:
+ - `processor`: Image preprocessing and feature extraction
+ - `scheduler`: Diffusion timestep scheduling
+ - `text_encoder`: Text-to-embedding conversion
+ - `tokenizer`: Text tokenization for the encoder
+ - `transformer`: Core DiT denoising network
+ - `vae`: Variational autoencoder for latent encoding/decoding
+
+2. **Create Configs**:
+ - **PipelineConfig**: [`QwenImageEditPipelineConfig`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/configs/pipelines/qwen_image.py) defines model-specific parameters, precision settings, preprocessing functions, and latent shape calculations.
+ - **SamplingParams**: [`QwenImageSamplingParams`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/configs/sample/qwenimage.py) sets runtime defaults like `num_frames=1`, `guidance_scale=4.0`, `num_inference_steps=50`.
+
+3. **Implement Model Components**:
+ - Adapt or implement specific model components in the appropriate directories:
+ - **DiT/Transformer**: Implement in [`runtime/models/dits/`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/models/dits/) - e.g., [`qwen_image.py`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/models/dits/qwen_image.py) for Qwen's DiT architecture
+ - **Encoders**: Implement in [`runtime/models/encoders/`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/models/encoders/) - e.g., text encoders like [`qwen2_5vl.py`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/models/encoders/qwen2_5vl.py)
+ - **VAEs**: Implement in [`runtime/models/vaes/`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/models/vaes/) - e.g., [`autoencoder_kl_qwenimage.py`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/models/vaes/autoencoder_kl_qwenimage.py)
+ - **Schedulers**: Implement in [`runtime/models/schedulers/`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/models/schedulers/) if needed
+ - These components handle the core model logic, attention mechanisms, and data transformations specific to the target diffusion model.
+
+4. **Define Pipeline Class**:
+ - The [`QwenImageEditPipeline`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/runtime/architectures/basic/qwen_image/qwen_image.py) class inherits from `ComposedPipelineBase` and orchestrates stages sequentially.
+ - Declare required modules via `_required_config_modules` and implement the pipeline stages:
+
+ ```python
+ class QwenImageEditPipeline(ComposedPipelineBase):
+ pipeline_name = "QwenImageEditPipeline" # Matches Diffusers model_index.json
+ _required_config_modules = ["processor", "scheduler", "text_encoder", "tokenizer", "transformer", "vae"]
+
+ def create_pipeline_stages(self, server_args: ServerArgs):
+ self.add_stage(InputValidationStage())
+ self.add_stage(ImageEncodingStage(...))
+ self.add_stage(ImageVAEEncodingStage(...))
+ self.add_stage(TimestepPreparationStage(...))
+ self.add_stage(LatentPreparationStage(...))
+ self.add_stage(DenoisingStage(...))
+ self.add_stage(DecodingStage(...))
+ ```
+ The pipeline is constructed by adding stages in order. `Qwen-Image-Edit` uses `ImageEncodingStage` (for prompt and image processing) and `ImageVAEEncodingStage` (for latent extraction) before standard denoising and decoding.
+
+5. **Register Configs**:
+ - Register the configs in the central registry ([`registry.py`](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/registry.py)) via `_register_configs` to enable automatic loading and instantiation for the model. Modules are automatically loaded and injected based on the config and repository structure.
+
+By following this pattern of defining configurations and composing pipelines, you can integrate new diffusion models
+into SGLang with ease.
diff --git a/sglang/docs/get_started/install.md b/sglang/docs/get_started/install.md
new file mode 100644
index 0000000000000000000000000000000000000000..cee24d05c06fde0380f7db364e1dda3d69d9a46d
--- /dev/null
+++ b/sglang/docs/get_started/install.md
@@ -0,0 +1,226 @@
+# Install SGLang
+
+You can install SGLang using one of the methods below.
+This page primarily applies to common NVIDIA GPU platforms.
+For other or newer platforms, please refer to the dedicated pages for [AMD GPUs](../platforms/amd_gpu.md), [Intel Xeon CPUs](../platforms/cpu_server.md), [TPU](../platforms/tpu.md), [NVIDIA DGX Spark](https://lmsys.org/blog/2025-11-03-gpt-oss-on-nvidia-dgx-spark/), [NVIDIA Jetson](../platforms/nvidia_jetson.md), [Ascend NPUs](../platforms/ascend_npu.md), and [Intel XPU](../platforms/xpu.md).
+
+## Method 1: With pip or uv
+
+It is recommended to use uv for faster installation:
+
+```bash
+pip install --upgrade pip
+pip install uv
+uv pip install sglang
+```
+
+### For CUDA 13
+
+Docker is recommended (see Method 3 note on B300/GB300/CUDA 13). If you do not have Docker access, follow these steps:
+
+1. Install PyTorch with CUDA 13 support first:
+```bash
+# Replace X.Y.Z with the version required by your SGLang install
+uv pip install torch==X.Y.Z torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
+```
+
+2. Install sglang:
+```bash
+uv pip install sglang
+```
+
+3. Install the `sgl_kernel` wheel for CUDA 13 from [the sgl-project whl releases](https://github.com/sgl-project/whl/blob/gh-pages/cu130/sgl-kernel/index.html). Replace `X.Y.Z` with the `sgl_kernel` version required by your SGLang install (you can find this by running `uv pip show sgl_kernel`). Examples:
+```bash
+# x86_64
+uv pip install "https://github.com/sgl-project/whl/releases/download/vX.Y.Z/sgl_kernel-X.Y.Z+cu130-cp310-abi3-manylinux2014_x86_64.whl"
+
+# aarch64
+uv pip install "https://github.com/sgl-project/whl/releases/download/vX.Y.Z/sgl_kernel-X.Y.Z+cu130-cp310-abi3-manylinux2014_aarch64.whl"
+```
+
+### **Quick fixes to common problems**
+- If you encounter `OSError: CUDA_HOME environment variable is not set`, please set it to your CUDA install root with either of the following solutions:
+ 1. Use `export CUDA_HOME=/usr/local/cuda-<version>` (e.g., `/usr/local/cuda-12.8`) to set the `CUDA_HOME` environment variable.
+ 2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above.
+
+## Method 2: From source
+
+```bash
+# Use the last release branch
+git clone -b v0.5.9 https://github.com/sgl-project/sglang.git
+cd sglang
+
+# Install the python packages
+pip install --upgrade pip
+pip install -e "python"
+```
+
+**Quick fixes to common problems**
+
+- If you want to develop SGLang, you can try the dev docker image. Please refer to [setup docker container](../developer_guide/development_guide_using_docker.md#setup-docker-container). The docker image is `lmsysorg/sglang:dev`.
+
+## Method 3: Using docker
+
+The docker images are available on Docker Hub at [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
+Replace `<secret>` below with your Hugging Face Hub [token](https://huggingface.co/docs/hub/en/security-tokens).
+
+```bash
+docker run --gpus all \
+ --shm-size 32g \
+ -p 30000:30000 \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+ --env "HF_TOKEN=<secret>" \
+ --ipc=host \
+ lmsysorg/sglang:latest \
+ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+```
+
+For production deployments, use the `runtime` variant which is significantly smaller (~40% reduction) by excluding build tools and development dependencies:
+
+```bash
+docker run --gpus all \
+ --shm-size 32g \
+ -p 30000:30000 \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+ --env "HF_TOKEN=<secret>" \
+ --ipc=host \
+ lmsysorg/sglang:latest-runtime \
+ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+```
+
+You can also find the nightly docker images [here](https://hub.docker.com/r/lmsysorg/sglang/tags?name=nightly).
+
+Notes:
+- On B300/GB300 (SM103) or CUDA 13 environment, we recommend using the nightly image at `lmsysorg/sglang:dev-cu13` or stable image at `lmsysorg/sglang:latest-cu130-runtime`. Please do not re-install the project as editable inside the docker image, since it will override the version of libraries specified by the cu13 docker image.
+
+## Method 4: Using Kubernetes
+
+Please check out [OME](https://github.com/sgl-project/ome), a Kubernetes operator for enterprise-grade management and serving of large language models (LLMs).
+
+<details>
+<summary>More</summary>
+
+1. Option 1: For single node serving (typically when the model size fits into GPUs on one node)
+
+   Execute command `kubectl apply -f docker/k8s-sglang-service.yaml`, to create k8s deployment and service, with llama-31-8b as an example.
+
+2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`)
+
+   Modify the LLM model path and arguments as necessary, then execute command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml`, to create a two-node k8s StatefulSet and serving service.
+
+</details>
+
+## Method 5: Using docker compose
+
+<details>
+<summary>More</summary>
+
+> This method is recommended if you plan to serve it as a service.
+> A better approach is to use the [k8s-sglang-service.yaml](https://github.com/sgl-project/sglang/blob/main/docker/k8s-sglang-service.yaml).
+
+1. Copy the [compose.yml](https://github.com/sgl-project/sglang/blob/main/docker/compose.yaml) to your local machine
+2. Execute the command `docker compose up -d` in your terminal.
+
+</details>
+
+## Method 6: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+<summary>More</summary>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+
+</details>
+
+## Method 7: Run on AWS SageMaker
+
+<details>
+<summary>More</summary>
+
+To deploy SGLang on AWS SageMaker, check out [AWS SageMaker Inference](https://aws.amazon.com/sagemaker/ai/deploy).
+
+Amazon Web Services provides support for SGLang containers along with routine security patching. For available SGLang containers, check out [AWS SGLang DLCs](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#sglang-containers).
+
+To host a model with your own container, follow these steps:
+
+1. Build a docker container with [sagemaker.Dockerfile](https://github.com/sgl-project/sglang/blob/main/docker/sagemaker.Dockerfile) alongside the [serve](https://github.com/sgl-project/sglang/blob/main/docker/serve) script.
+2. Push your container onto AWS ECR.
+
+<details>
+<summary>Dockerfile Build Script: <code>build-and-push.sh</code></summary>
+
+```bash
+#!/bin/bash
+AWS_ACCOUNT="<your-aws-account-id>"
+AWS_REGION="<your-aws-region>"
+REPOSITORY_NAME="<your-repository-name>"
+IMAGE_TAG="<your-image-tag>"
+
+ECR_REGISTRY="${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com"
+IMAGE_URI="${ECR_REGISTRY}/${REPOSITORY_NAME}:${IMAGE_TAG}"
+
+echo "Starting build and push process..."
+
+# Login to ECR
+echo "Logging into ECR..."
+aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${ECR_REGISTRY}
+
+# Build the image
+echo "Building Docker image..."
+docker build -t ${IMAGE_URI} -f sagemaker.Dockerfile .
+
+echo "Pushing ${IMAGE_URI}"
+docker push ${IMAGE_URI}
+
+echo "Build and push completed successfully!"
+```
+
+</details>
+
+3. Deploy a model for serving on AWS SageMaker, refer to [deploy_and_serve_endpoint.py](https://github.com/sgl-project/sglang/blob/main/examples/sagemaker/deploy_and_serve_endpoint.py). For more information, check out [sagemaker-python-sdk](https://github.com/aws/sagemaker-python-sdk).
+   1. By default, the model server on SageMaker will run with the following command: `python3 -m sglang.launch_server --model-path /opt/ml/model --host 0.0.0.0 --port 8080`. This is optimal for hosting your own model with SageMaker.
+   2. To modify your model serving parameters, the [serve](https://github.com/sgl-project/sglang/blob/main/docker/serve) script allows for all available options within `python3 -m sglang.launch_server --help` cli by specifying environment variables with prefix `SM_SGLANG_`.
+   3. The serve script will automatically convert all environment variables with prefix `SM_SGLANG_` from `SM_SGLANG_INPUT_ARGUMENT` into `--input-argument` to be parsed into `python3 -m sglang.launch_server` cli.
+   4. For example, to run [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) with reasoning parser, simply add additional environment variables `SM_SGLANG_MODEL_PATH=Qwen/Qwen3-0.6B` and `SM_SGLANG_REASONING_PARSER=qwen3`.
+
+</details>
+
+## Common Notes
+
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please switch to other kernels by adding `--attention-backend triton --sampling-backend pytorch` and open an issue on GitHub.
+- To reinstall flashinfer locally, use the following command: `pip3 install --upgrade flashinfer-python --force-reinstall --no-deps` and then delete the cache with `rm -rf ~/.cache/flashinfer`.
+- When encountering `ptxas fatal : Value 'sm_103a' is not defined for option 'gpu-name'` on B300/GB300, fix it with `export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas`.
diff --git a/sglang/docs/references/custom_chat_template.md b/sglang/docs/references/custom_chat_template.md
new file mode 100644
index 0000000000000000000000000000000000000000..f22ee8bec30c0c48ddf6e4dbb6a7af9a2626f20f
--- /dev/null
+++ b/sglang/docs/references/custom_chat_template.md
@@ -0,0 +1,51 @@
+# Custom Chat Template
+
+**NOTE**: There are two chat template systems in SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py)).
+
+By default, the server uses the chat template specified in the model tokenizer from Hugging Face.
+It should just work for most official models such as Llama-2/Llama-3.
+
+If needed, you can also override the chat template when launching the server:
+
+```bash
+python -m sglang.launch_server \
+ --model-path meta-llama/Llama-2-7b-chat-hf \
+ --port 30000 \
+ --chat-template llama-2
+```
+
+If the chat template you are looking for is missing, you are welcome to contribute it or load it from a file.
+
+## JSON Format
+
+You can load the JSON format, which is defined by `conversation.py`.
+
+```json
+{
+ "name": "my_model",
+ "system": "<|im_start|>system",
+ "user": "<|im_start|>user",
+ "assistant": "<|im_start|>assistant",
+ "sep_style": "CHATML",
+ "sep": "<|im_end|>",
+ "stop_str": ["<|im_end|>", "<|im_start|>"]
+}
+```
+
+```bash
+python -m sglang.launch_server \
+ --model-path meta-llama/Llama-2-7b-chat-hf \
+ --port 30000 \
+ --chat-template ./my_model_template.json
+```
+
+## Jinja Format
+
+You can also use the [Jinja template format](https://huggingface.co/docs/transformers/main/en/chat_templating) as defined by Hugging Face Transformers.
+
+```bash
+python -m sglang.launch_server \
+ --model-path meta-llama/Llama-2-7b-chat-hf \
+ --port 30000 \
+ --chat-template ./my_model_template.jinja
+```
diff --git a/sglang/docs/references/environment_variables.md b/sglang/docs/references/environment_variables.md
new file mode 100644
index 0000000000000000000000000000000000000000..17f115f9a78d5e4aea886656de2a80e5b8359682
--- /dev/null
+++ b/sglang/docs/references/environment_variables.md
@@ -0,0 +1,172 @@
+# Environment Variables
+
+SGLang supports various environment variables that can be used to configure its runtime behavior. This document provides a comprehensive list and aims to stay updated over time.
+
+*Note: SGLang uses two prefixes for environment variables: `SGL_` and `SGLANG_`. This is likely due to historical reasons. While both are currently supported for different settings, future versions might consolidate them.*
+
+## General Configuration
+
+| Environment Variable | Description | Default Value |
+|-------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------|------------------------------|
+| `SGLANG_USE_MODELSCOPE` | Enable using models from ModelScope | `false` |
+| `SGLANG_HOST_IP` | Host IP address for the server | `0.0.0.0` |
+| `SGLANG_PORT` | Port for the server | auto-detected |
+| `SGLANG_LOGGING_CONFIG_PATH` | Custom logging configuration path | Not set |
+| `SGLANG_DISABLE_REQUEST_LOGGING` | Disable request logging | `false` |
+| `SGLANG_LOG_REQUEST_HEADERS` | Comma-separated list of additional HTTP headers to log when `--log-requests` is enabled. Appends to the default `x-smg-routing-key`. | Not set |
+| `SGLANG_HEALTH_CHECK_TIMEOUT` | Timeout for health check in seconds | `20` |
+| `SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL` | The interval of passes to collect the metric of selected count of physical experts on each layer and GPU rank. 0 means disabled. | `0` |
+| `SGLANG_FORWARD_UNKNOWN_TOOLS` | Forward unknown tool calls to clients instead of dropping them | `false` (drop unknown tools) |
+| `SGLANG_REQ_WAITING_TIMEOUT` | Timeout (in seconds) for requests waiting in the queue before being scheduled | `-1` |
+| `SGLANG_REQ_RUNNING_TIMEOUT` | Timeout (in seconds) for requests running in the decode batch | `-1` |
+
+## Performance Tuning
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_ENABLE_TORCH_INFERENCE_MODE` | Control whether to use torch.inference_mode | `false` |
+| `SGLANG_ENABLE_TORCH_COMPILE` | Enable torch.compile | `true` |
+| `SGLANG_SET_CPU_AFFINITY` | Enable CPU affinity setting (often set to `1` in Docker builds) | `0` |
+| `SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN` | Allows the scheduler to overwrite longer context length requests (often set to `1` in Docker builds) | `0` |
+| `SGLANG_IS_FLASHINFER_AVAILABLE` | Control FlashInfer availability check | `true` |
+| `SGLANG_SKIP_P2P_CHECK` | Skip P2P (peer-to-peer) access check | `false` |
+| `SGLANG_CHUNKED_PREFIX_CACHE_THRESHOLD` | Sets the threshold for enabling chunked prefix caching | `8192` |
+| `SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION` | Enable RoPE fusion in Fused Multi-Layer Attention | `1` |
+| `SGLANG_DISABLE_CONSECUTIVE_PREFILL_OVERLAP` | Disable overlap schedule for consecutive prefill batches | `false` |
+| `SGLANG_SCHEDULER_MAX_RECV_PER_POLL` | Set the maximum number of requests per poll, with a negative value indicating no limit | `-1` |
+| `SGLANG_DISABLE_FA4_WARMUP` | Disable Flash Attention 4 warmup passes (set to `1`, `true`, `yes`, or `on` to disable) | `false` |
+| `SGLANG_DATA_PARALLEL_BUDGET_INTERVAL` | Interval for DPBudget updates | `1` |
+| `SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_DEFAULT` | Default weight value for scheduler recv skipper counter (used when forward mode doesn't match specific modes). Only active when `--scheduler-recv-interval > 1`. The counter accumulates weights and triggers request polling when reaching the interval threshold. | `1000` |
+| `SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_DECODE` | Weight increment for decode forward mode in scheduler recv skipper. Works with `--scheduler-recv-interval` to control polling frequency during decode phase. | `1` |
+| `SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_VERIFY` | Weight increment for target verify forward mode in scheduler recv skipper. Works with `--scheduler-recv-interval` to control polling frequency during verification phase. | `1` |
+| `SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_NONE` | Weight increment when forward mode is None in scheduler recv skipper. Works with `--scheduler-recv-interval` to control polling frequency when no specific forward mode is active. | `1` |
+| `SGLANG_MM_BUFFER_SIZE_MB` | Size of preallocated GPU buffer (in MB) for multi-modal feature hashing optimization. When set to a positive value, temporarily moves features to GPU for faster hash computation, then moves them back to CPU to save GPU memory. Larger features benefit more from GPU hashing. Set to `0` to disable. | `0` |
+| `SGLANG_MM_PRECOMPUTE_HASH` | Enable precomputing of hash values for MultimodalDataItem | `false` |
+| `SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH` | Enable NCCL for gathering when preparing mlp sync batch under overlap scheduler (without this flag gloo is used for gathering) | `false` |
+| `SGLANG_SYMM_MEM_PREALLOC_GB_SIZE` | Size of preallocated GPU buffer (in GB) for NCCL symmetric memory pool to limit memory fragmentation. Only have an effect when server arg `--enable-symm-mem` is set. | `4` |
+| `SGLANG_CUSTOM_ALLREDUCE_ALGO` | The algorithm of custom all-reduce. Set to `oneshot` or `1stage` to force use one-shot. Set to `twoshot` or `2stage` to force use two-shot. | `` |
+
+
+## DeepGEMM Configuration (Advanced Optimization)
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_ENABLE_JIT_DEEPGEMM` | Enable Just-In-Time compilation of DeepGEMM kernels (enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) GPUs when the DeepGEMM package is installed; set to `"0"` to disable) | `"true"` |
+| `SGLANG_JIT_DEEPGEMM_PRECOMPILE` | Enable precompilation of DeepGEMM kernels | `"true"` |
+| `SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS` | Number of workers for parallel DeepGEMM kernel compilation | `4` |
+| `SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE` | Indicator flag used during the DeepGEMM precompile script | `"false"` |
+| `SGLANG_DG_CACHE_DIR` | Directory for caching compiled DeepGEMM kernels | `~/.cache/deep_gemm` |
+| `SGLANG_DG_USE_NVRTC` | Use NVRTC (instead of Triton) for JIT compilation (Experimental) | `"0"` |
+| `SGLANG_USE_DEEPGEMM_BMM` | Use DeepGEMM for Batched Matrix Multiplication (BMM) operations | `"false"` |
+| `SGLANG_JIT_DEEPGEMM_FAST_WARMUP` | Precompile fewer kernels during warmup, which reduces the warmup time from 30min to less than 3min. Might cause performance degradation during runtime. | `"false"` |
+
+## DeepEP Configuration
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_DEEPEP_BF16_DISPATCH` | Use Bfloat16 for dispatch | `"false"` |
+| `SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK` | The maximum number of dispatched tokens on each GPU | `"128"` |
+| `SGLANG_FLASHINFER_NUM_MAX_DISPATCH_TOKENS_PER_RANK` | The maximum number of dispatched tokens on each GPU for --moe-a2a-backend=flashinfer | `"1024"` |
+| `SGLANG_DEEPEP_LL_COMBINE_SEND_NUM_SMS` | Number of SMs used for DeepEP combine when single batch overlap is enabled | `"32"` |
+| `SGLANG_BLACKWELL_OVERLAP_SHARED_EXPERTS_OUTSIDE_SBO` | Run shared experts on an alternate stream when single batch overlap is enabled on GB200. When not setting this flag, shared experts and down gemm will be overlapped with DeepEP combine together. | `"false"` |
+
+## MORI Configuration
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_MORI_FP8_DISP` | Use FP8 for dispatch | `"false"` |
+| `SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK` | Maximum number of dispatch tokens per rank for MORI-EP buffer allocation | `4096` |
+| `SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD` | Threshold for switching between `InterNodeV1` and `InterNodeV1LL` kernel types. `InterNodeV1LL` is used if `SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK` is less than or equal to this threshold; otherwise, `InterNodeV1` is used. | `256` |
+| `SGLANG_MORI_QP_PER_TRANSFER` | Number of RDMA Queue Pairs (QPs) used per transfer operation | `1` |
+| `SGLANG_MORI_POST_BATCH_SIZE` | Number of RDMA work requests posted in a single batch to each QP | `-1` |
+| `SGLANG_MORI_NUM_WORKERS` | Number of worker threads in the RDMA executor thread pool | `1` |
+
+## NSA Backend Configuration (For DeepSeek V3.2)
+
+
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_NSA_FUSE_TOPK` | Fuse the operation of picking topk logits and picking topk indices from page table | `true` |
+| `SGLANG_NSA_ENABLE_MTP_PRECOMPUTE_METADATA` | Precompute metadata that can be shared among different draft steps when MTP is enabled | `true` |
+
+
+## Memory Management
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_DEBUG_MEMORY_POOL` | Enable memory pool debugging | `false` |
+| `SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION` | Clip max new tokens estimation for memory planning | `4096` |
+| `SGLANG_DETOKENIZER_MAX_STATES` | Maximum states for detokenizer | Default value based on system |
+| `SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK` | Enable checks for memory imbalance across Tensor Parallel ranks | `true` |
+| `SGLANG_MOONCAKE_CUSTOM_MEM_POOL` | Configure the custom memory pool type for Mooncake. Supports `NVLINK`, `BAREX`, `INTRA_NODE_NVLINK`. If set to `true`, it defaults to `NVLINK`. | `None` |
+
+## Model-Specific Options
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_USE_AITER` | Use AITER optimize implementation | `false` |
+| `SGLANG_MOE_PADDING` | Enable MoE padding (sets padding size to 128 if value is `1`, often set to `1` in Docker builds) | `0` |
+| `SGLANG_CUTLASS_MOE` (deprecated) | Use Cutlass FP8 MoE kernel on Blackwell GPUs (deprecated, use --moe-runner-backend=cutlass) | `false` |
+
+## Quantization
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_INT4_WEIGHT` | Enable INT4 weight quantization | `false` |
+| `SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2` | Apply per token group quantization kernel with fused silu and mul and masked m | `false` |
+| `SGLANG_FORCE_FP8_MARLIN` | Force using FP8 MARLIN kernels even if other FP8 kernels are available | `false` |
+| `SGLANG_FLASHINFER_FP4_GEMM_BACKEND` (deprecated) | Select backend for `mm_fp4` on Blackwell GPUs. **DEPRECATED**: Please use `--fp4-gemm-backend` instead. | `` |
+| `SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN` | Quantize q_b_proj from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` |
+| `SGLANG_MOE_NVFP4_DISPATCH` | Use nvfp4 for moe dispatch (on flashinfer_cutlass or flashinfer_cutedsl moe runner backend) | `"false"` |
+| `SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE` | Quantize moe of nextn layer from BF16 to FP8 when launching DeepSeek NVFP4 checkpoint | `false` |
+| `SGLANG_ENABLE_FLASHINFER_FP8_GEMM` (deprecated) | Use flashinfer kernels when running blockwise fp8 GEMM on Blackwell GPUs. **DEPRECATED**: Please use `--fp8-gemm-backend=flashinfer_trtllm` (SM100/SM103) or `--fp8-gemm-backend=flashinfer_cutlass` (SM120/SM121 and newer) instead. | `false` |
+| `SGLANG_SUPPORT_CUTLASS_BLOCK_FP8` (deprecated) | Use Cutlass kernels when running blockwise fp8 GEMM on Hopper or Blackwell GPUs. **DEPRECATED**: Please use `--fp8-gemm-backend=cutlass` instead. | `false` |
+| `SGLANG_QUANT_ALLOW_DOWNCASTING` | Allow weights downcasting | `false` |
+
+
+## Distributed Computing
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_BLOCK_NONZERO_RANK_CHILDREN` | Control blocking of non-zero rank children processes | `1` |
+| `SGLANG_IS_FIRST_RANK_ON_NODE` | Indicates if the current process is the first rank on its node | `"true"` |
+| `SGLANG_PP_LAYER_PARTITION` | Pipeline parallel layer partition specification | Not set |
+| `SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS` | Set one visible device per process for distributed computing | `false` |
+
+## Testing & Debugging (Internal/CI)
+
+*These variables are primarily used for internal testing, continuous integration, or debugging.*
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_IS_IN_CI` | Indicates if running in CI environment | `false` |
+| `SGLANG_IS_IN_CI_AMD` | Indicates running in AMD CI environment | `0` |
+| `SGLANG_TEST_RETRACT` | Enable retract decode testing | `false` |
+| `SGLANG_TEST_RETRACT_NO_PREFILL_BS` | When SGLANG_TEST_RETRACT is enabled, no prefill is performed if the batch size exceeds SGLANG_TEST_RETRACT_NO_PREFILL_BS. | `2 ** 31` |
+| `SGLANG_RECORD_STEP_TIME` | Record step time for profiling | `false` |
+| `SGLANG_TEST_REQUEST_TIME_STATS` | Test request time statistics | `false` |
+
+## Profiling & Benchmarking
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_TORCH_PROFILER_DIR` | Directory for PyTorch profiler output | `/tmp` |
+| `SGLANG_PROFILE_WITH_STACK` | Set `with_stack` option (bool) for PyTorch profiler (capture stack trace) | `true` |
+| `SGLANG_PROFILE_RECORD_SHAPES` | Set `record_shapes` option (bool) for PyTorch profiler (record shapes) | `true` |
+| `SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS` | Config BatchSpanProcessor.schedule_delay_millis if tracing is enabled | `500` |
+| `SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE` | Config BatchSpanProcessor.max_export_batch_size if tracing is enabled | `64` |
+
+## Storage & Caching
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_WAIT_WEIGHTS_READY_TIMEOUT` | Timeout period for waiting on weights | `120` |
+| `SGLANG_DISABLE_OUTLINES_DISK_CACHE` | Disable Outlines disk cache | `true` |
+| `SGLANG_USE_CUSTOM_TRITON_KERNEL_CACHE` | Use SGLang's custom Triton kernel cache implementation for lower overheads (automatically enabled on CUDA) | `false` |
+
+## Function Calling / Tool Use
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_TOOL_STRICT_LEVEL` | Controls the strictness level of tool call parsing and validation.<br>**Level 0**: Off - No strict validation<br>**Level 1**: Function strict - Enables structural tag constraints for all tools (even if none have `strict=True` set)<br>**Level 2**: Parameter strict - Enforces strict parameter validation for all tools, treating them as if they all have `strict=True` set | `0` |
diff --git a/sglang/docs/references/faq.md b/sglang/docs/references/faq.md
new file mode 100644
index 0000000000000000000000000000000000000000..ffa1a7c54fd5ebd3b40fd113196d2c14d765494f
--- /dev/null
+++ b/sglang/docs/references/faq.md
@@ -0,0 +1,39 @@
+# Troubleshooting and Frequently Asked Questions
+
+## Troubleshooting
+
+This page lists common errors and tips for resolving them.
+
+### CUDA Out of Memory
+If you encounter out-of-memory (OOM) errors, you can adjust the following parameters:
+
+- If OOM occurs during prefill, try reducing `--chunked-prefill-size` to `4096` or `2048`. This saves memory but slows down the prefill speed for long prompts.
+- If OOM occurs during decoding, try lowering `--max-running-requests`.
+- You can also decrease `--mem-fraction-static` to a smaller value, such as 0.8 or 0.7. This decreases the memory usage of the KV cache memory pool and helps prevent OOM errors during both prefill and decoding. However, it limits maximum concurrency and reduces peak throughput.
+- Another common case for OOM is requesting input logprobs for a long prompt as it requires significant memory. To address this, set `logprob_start_len` in your sampling parameters to include only the necessary parts. If you do need input logprobs for a long prompt, try reducing `--mem-fraction-static`.
+
+### CUDA Error: Illegal Memory Access Encountered
+This error may result from kernel errors or out-of-memory issues:
+- If it is a kernel error, resolving it may be challenging. Please file an issue on GitHub.
+- If it is an out-of-memory issue, it may sometimes be reported as this error instead of "Out of Memory." Refer to the section above for guidance on avoiding OOM issues.
+
+### The server hangs
+- If the server hangs during initialization or running, it can be caused by memory issues (out of memory), network issues (NCCL errors), or other bugs in SGLang.
+ - If it is out of memory, you might see that `avail mem` is very low during the initialization or right after initialization. In this case,
+ you can try to decrease `--mem-fraction-static`, decrease `--cuda-graph-max-bs`, or decrease `--chunked-prefill-size`.
+- For other bugs, please file an issue on GitHub.
+
+
+## Frequently Asked Questions
+
+### The results are not deterministic, even with a temperature of 0
+
+You may notice that when you send the same request twice, the results from the engine will be slightly different, even when the temperature is set to 0.
+
+From our initial investigation, this indeterminism arises from two factors: dynamic batching and prefix caching. Roughly speaking, dynamic batching accounts for about 95% of the indeterminism, while prefix caching accounts for the remaining portion. The server runs dynamic batching under the hood. Different batch sizes can cause PyTorch/CuBLAS to dispatch to different CUDA kernels, which can lead to slight numerical differences. This difference accumulates across many layers, resulting in nondeterministic output when the batch size changes. Similarly, when prefix caching is enabled, it can also dispatch to different kernels. Even when the computations are mathematically equivalent, small numerical differences from different kernel implementations lead to the final nondeterministic outputs.
+
+To achieve more deterministic outputs in the current code, you can add `--disable-radix-cache` and send only one request at a time. The results will be mostly deterministic under this setting.
+
+**Update**:
+Recently, we also introduced a deterministic mode, you can enable it with `--enable-deterministic-inference`.
+Please find more details in this blog post: https://lmsys.org/blog/2025-09-22-sglang-deterministic/
diff --git a/sglang/docs/references/frontend/choices_methods.md b/sglang/docs/references/frontend/choices_methods.md
new file mode 100644
index 0000000000000000000000000000000000000000..30a0a1814b2bad39d40511e20ee162f9d9ee40cb
--- /dev/null
+++ b/sglang/docs/references/frontend/choices_methods.md
@@ -0,0 +1,77 @@
+# Choices Methods in SGLang
+This doc describes the choices methods supported by SGLang.
+
+The optional `choices_method` arg determines how options supplied to SGLang's `choices` primitive are selected. Only the `RuntimeEndpoint` backend supports the `choices_method` arg. Other backends, such as `OpenAI`, have bespoke selection implementations due to API limitations.
+
+## Methods
+
+### Token Length Normalized
+
+Token length normalized is the default SGLang choices method. It selects the option with the highest average logprob across all of its tokens.
+
+Usage example (alternatively, simply omit the `choices_method` arg):
+```python
+@sgl.function
+def example(s):
+ s += sgl.user("What is the capital of France?")
+ s += sgl.assistant(
+ sgl.gen(
+ "answer",
+ choices=["London", "Paris", "Berlin"],
+ choices_method=sgl.token_length_normalized,
+ )
+ )
+```
+
+
+This can perform poorly if an option contains many tokens, where its later tokens are predicted with high confidence based on its earlier tokens. For instance, even strong models will fail the above example if the specified options are `["Paris", "Antidisestablishmentarianism"]`.
+
+### Greedy Token Selection
+
+Greedy token selection simply selects the option with the highest logprob for its initial token. For overlapping options where one option is a subset of a longer option, the logprobs of the shorter option are extended using its average logprob for comparison against the longer option.
+
+Usage example:
+```python
+@sgl.function
+def example(s):
+ s += sgl.user("What is the capital of France?")
+ s += sgl.assistant(
+ sgl.gen(
+ "answer",
+ choices=["London", "Paris", "Berlin"],
+ choices_method=sgl.greedy_token_selection,
+ )
+ )
+```
+
+This can perform poorly if an option misleads the model down a bad path based on an attractive initial token. For instance, greedy selection will result in an incorrect response for this example:
+```python
+@sgl.function
+def us_president_example(s):
+ s += sgl.user("Name a US president.")
+ s += sgl.assistant(
+ sgl.gen(
+ "answer",
+ choices=["Donald Duck", "Millard Fillmore"],
+ choices_method=sgl.greedy_token_selection,
+ )
+ )
+```
+
+### Unconditional Likelihood Normalized
+
+Unconditional likelihood normalized selects the option with the highest average token logprob once normalized by the unconditional token logprobs, as described in [this EleutherAI blogpost](https://blog.eleuther.ai/multiple-choice-normalization/). This method incurs an additional LLM call to obtain the unconditional likelihoods.
+
+Usage example:
+```python
+@sgl.function
+def example(s):
+ s += sgl.user("What is the capital of France?")
+ s += sgl.assistant(
+ sgl.gen(
+ "answer",
+ choices=["London", "Paris", "Berlin"],
+ choices_method=sgl.unconditional_likelihood_normalized,
+ )
+ )
+```
diff --git a/sglang/docs/references/frontend/frontend_index.rst b/sglang/docs/references/frontend/frontend_index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..62544cba59877553e55a3f3c8cebfec243c1c9b9
--- /dev/null
+++ b/sglang/docs/references/frontend/frontend_index.rst
@@ -0,0 +1,9 @@
+Frontend Language
+=================
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Frontend Language
+
+ frontend_tutorial.ipynb
+ choices_methods.md
diff --git a/sglang/docs/references/frontend/frontend_tutorial.ipynb b/sglang/docs/references/frontend/frontend_tutorial.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..fc458fd1899415bbf5219dd96c3e32fb20f42a4b
--- /dev/null
+++ b/sglang/docs/references/frontend/frontend_tutorial.ipynb
@@ -0,0 +1,456 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# SGLang Frontend Language"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "SGLang frontend language can be used to define simple and easy prompts in a convenient, structured way."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Launch A Server\n",
+ "\n",
+ "Launch the server in your terminal and wait for it to initialize."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sglang import assistant_begin, assistant_end\n",
+ "from sglang import assistant, function, gen, system, user\n",
+ "from sglang import image\n",
+ "from sglang import RuntimeEndpoint\n",
+ "from sglang.lang.api import set_default_backend\n",
+ "from sglang.srt.utils import load_image\n",
+ "from sglang.test.doc_patch import launch_server_cmd\n",
+ "from sglang.utils import print_highlight, terminate_process, wait_for_server\n",
+ "\n",
+ "server_process, port = launch_server_cmd(\n",
+ " \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
+ ")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n",
+ "print(f\"Server started on http://localhost:{port}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
    "Set the default backend. Note: Besides the local server, you may also use `OpenAI` or other API endpoints."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Basic Usage\n",
+ "\n",
    "The simplest way to use the SGLang frontend language is a simple question-answer dialog between a user and an assistant."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@function\n",
+ "def basic_qa(s, question):\n",
    "    s += system(f\"You are a helpful assistant that can answer questions.\")\n",
+ " s += user(question)\n",
+ " s += assistant(gen(\"answer\", max_tokens=512))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "state = basic_qa(\"List 3 countries and their capitals.\")\n",
+ "print_highlight(state[\"answer\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multi-turn Dialog\n",
+ "\n",
+ "SGLang frontend language can also be used to define multi-turn dialogs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@function\n",
+ "def multi_turn_qa(s):\n",
    "    s += system(f\"You are a helpful assistant that can answer questions.\")\n",
+ " s += user(\"Please give me a list of 3 countries and their capitals.\")\n",
+ " s += assistant(gen(\"first_answer\", max_tokens=512))\n",
+ " s += user(\"Please give me another list of 3 countries and their capitals.\")\n",
+ " s += assistant(gen(\"second_answer\", max_tokens=512))\n",
+ " return s\n",
+ "\n",
+ "\n",
+ "state = multi_turn_qa()\n",
+ "print_highlight(state[\"first_answer\"])\n",
+ "print_highlight(state[\"second_answer\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Control flow\n",
+ "\n",
+ "You may use any Python code within the function to define more complex control flows."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@function\n",
+ "def tool_use(s, question):\n",
+ " s += assistant(\n",
+ " \"To answer this question: \"\n",
+ " + question\n",
+ " + \". I need to use a \"\n",
+ " + gen(\"tool\", choices=[\"calculator\", \"search engine\"])\n",
+ " + \". \"\n",
+ " )\n",
+ "\n",
+ " if s[\"tool\"] == \"calculator\":\n",
+ " s += assistant(\"The math expression is: \" + gen(\"expression\"))\n",
+ " elif s[\"tool\"] == \"search engine\":\n",
+ " s += assistant(\"The key word to search is: \" + gen(\"word\"))\n",
+ "\n",
+ "\n",
+ "state = tool_use(\"What is 2 * 2?\")\n",
+ "print_highlight(state[\"tool\"])\n",
+ "print_highlight(state[\"expression\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Parallelism\n",
+ "\n",
+ "Use `fork` to launch parallel prompts. Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@function\n",
+ "def tip_suggestion(s):\n",
+ " s += assistant(\n",
+ " \"Here are two tips for staying healthy: \"\n",
+ " \"1. Balanced Diet. 2. Regular Exercise.\\n\\n\"\n",
+ " )\n",
+ "\n",
+ " forks = s.fork(2)\n",
+ " for i, f in enumerate(forks):\n",
+ " f += assistant(\n",
+ " f\"Now, expand tip {i+1} into a paragraph:\\n\"\n",
+ " + gen(\"detailed_tip\", max_tokens=256, stop=\"\\n\\n\")\n",
+ " )\n",
+ "\n",
+ " s += assistant(\"Tip 1:\" + forks[0][\"detailed_tip\"] + \"\\n\")\n",
+ " s += assistant(\"Tip 2:\" + forks[1][\"detailed_tip\"] + \"\\n\")\n",
+ " s += assistant(\n",
+ " \"To summarize the above two tips, I can say:\\n\" + gen(\"summary\", max_tokens=512)\n",
+ " )\n",
+ "\n",
+ "\n",
+ "state = tip_suggestion()\n",
+ "print_highlight(state[\"summary\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Constrained Decoding\n",
+ "\n",
+ "Use `regex` to specify a regular expression as a decoding constraint. This is only supported for local models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@function\n",
+ "def regular_expression_gen(s):\n",
+ " s += user(\"What is the IP address of the Google DNS servers?\")\n",
+ " s += assistant(\n",
+ " gen(\n",
+ " \"answer\",\n",
+ " temperature=0,\n",
    "            regex=r\"((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\",\n",
+ " )\n",
+ " )\n",
+ "\n",
+ "\n",
+ "state = regular_expression_gen()\n",
+ "print_highlight(state[\"answer\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Use `regex` to define a `JSON` decoding schema."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "character_regex = (\n",
+ " r\"\"\"\\{\\n\"\"\"\n",
+ " + r\"\"\" \"name\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
+ " + r\"\"\" \"house\": \"(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)\",\\n\"\"\"\n",
+ " + r\"\"\" \"blood status\": \"(Pure-blood|Half-blood|Muggle-born)\",\\n\"\"\"\n",
+ " + r\"\"\" \"occupation\": \"(student|teacher|auror|ministry of magic|death eater|order of the phoenix)\",\\n\"\"\"\n",
+ " + r\"\"\" \"wand\": \\{\\n\"\"\"\n",
+ " + r\"\"\" \"wood\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
+ " + r\"\"\" \"core\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
+ " + r\"\"\" \"length\": [0-9]{1,2}\\.[0-9]{0,2}\\n\"\"\"\n",
+ " + r\"\"\" \\},\\n\"\"\"\n",
+ " + r\"\"\" \"alive\": \"(Alive|Deceased)\",\\n\"\"\"\n",
+ " + r\"\"\" \"patronus\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
+ " + r\"\"\" \"bogart\": \"[\\w\\d\\s]{1,16}\"\\n\"\"\"\n",
+ " + r\"\"\"\\}\"\"\"\n",
+ ")\n",
+ "\n",
+ "\n",
+ "@function\n",
+ "def character_gen(s, name):\n",
+ " s += user(\n",
+ " f\"{name} is a character in Harry Potter. Please fill in the following information about this character.\"\n",
+ " )\n",
+ " s += assistant(gen(\"json_output\", max_tokens=256, regex=character_regex))\n",
+ "\n",
+ "\n",
+ "state = character_gen(\"Harry Potter\")\n",
+ "print_highlight(state[\"json_output\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Batching \n",
+ "\n",
+ "Use `run_batch` to run a batch of prompts."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@function\n",
+ "def text_qa(s, question):\n",
+ " s += user(question)\n",
+ " s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
+ "\n",
+ "\n",
+ "states = text_qa.run_batch(\n",
+ " [\n",
+ " {\"question\": \"What is the capital of the United Kingdom?\"},\n",
+ " {\"question\": \"What is the capital of France?\"},\n",
+ " {\"question\": \"What is the capital of Japan?\"},\n",
+ " ],\n",
+ " progress_bar=True,\n",
+ ")\n",
+ "\n",
+ "for i, state in enumerate(states):\n",
+ " print_highlight(f\"Answer {i+1}: {states[i]['answer']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Streaming \n",
+ "\n",
+ "Use `stream` to stream the output to the user."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@function\n",
+ "def text_qa(s, question):\n",
+ " s += user(question)\n",
+ " s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
+ "\n",
+ "\n",
+ "state = text_qa.run(\n",
+ " question=\"What is the capital of France?\", temperature=0.1, stream=True\n",
+ ")\n",
+ "\n",
+ "for out in state.text_iter():\n",
+ " print(out, end=\"\", flush=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Complex Prompts\n",
+ "\n",
+ "You may use `{system|user|assistant}_{begin|end}` to define complex prompts."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@function\n",
+ "def chat_example(s):\n",
+ " s += system(\"You are a helpful assistant.\")\n",
+ " # Same as: s += s.system(\"You are a helpful assistant.\")\n",
+ "\n",
+ " with s.user():\n",
+ " s += \"Question: What is the capital of France?\"\n",
+ "\n",
+ " s += assistant_begin()\n",
+ " s += \"Answer: \" + gen(\"answer\", max_tokens=100, stop=\"\\n\")\n",
+ " s += assistant_end()\n",
+ "\n",
+ "\n",
+ "state = chat_example()\n",
+ "print_highlight(state[\"answer\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(server_process)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multi-modal Generation\n",
+ "\n",
+ "You may use SGLang frontend language to define multi-modal prompts.\n",
+ "See [here](https://docs.sglang.io/supported_models/text_generation/multimodal_language_models.html) for supported models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "server_process, port = launch_server_cmd(\n",
+ " \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
+ ")\n",
+ "\n",
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n",
+ "print(f\"Server started on http://localhost:{port}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Ask a question about an image."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@function\n",
+ "def image_qa(s, image_file, question):\n",
+ " s += user(image(image_file) + question)\n",
+ " s += assistant(gen(\"answer\", max_tokens=256))\n",
+ "\n",
+ "\n",
+ "image_url = \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
+ "image_bytes, _ = load_image(image_url)\n",
+ "state = image_qa(image_bytes, \"What is in the image?\")\n",
+ "print_highlight(state[\"answer\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "terminate_process(server_process)"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sglang/docs/references/learn_more.md b/sglang/docs/references/learn_more.md
new file mode 100644
index 0000000000000000000000000000000000000000..f0d6ffb8b3b1b5e3624fcd15ff1648d7f6fcc715
--- /dev/null
+++ b/sglang/docs/references/learn_more.md
@@ -0,0 +1,9 @@
+# Learn More and Join the Community
+
+- The development roadmap: [https://roadmap.sglang.io](https://roadmap.sglang.io)
+- Join weekly public development meeting: [https://meet.sglang.io](https://meet.sglang.io)
+- Join Slack: [https://slack.sglang.io/](https://slack.sglang.io/)
+- Follow on X (formerly Twitter): [https://x.com/lmsysorg](https://x.com/lmsysorg)
+- Follow on LinkedIn: [https://www.linkedin.com/company/sgl-project/](https://www.linkedin.com/company/sgl-project/)
+- The latest SGLang features and updates are shared through the [LMSYS blog](https://lmsys.org/blog/)
+- More blogs, slides, and videos about SGLang at [https://github.com/sgl-project/sgl-learning-materials](https://github.com/sgl-project/sgl-learning-materials)
diff --git a/sglang/docs/references/multi_node_deployment/deploy_on_k8s.md b/sglang/docs/references/multi_node_deployment/deploy_on_k8s.md
new file mode 100644
index 0000000000000000000000000000000000000000..cfc099f56f434fa40e6f48cb062cf06fb675cb96
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/deploy_on_k8s.md
@@ -0,0 +1,337 @@
+# Deploy On Kubernetes
+
+This document is for deploying a RoCE network-based SGLang two-node inference service on a Kubernetes (K8S) cluster.
+
+[LeaderWorkerSet (LWS)](https://github.com/kubernetes-sigs/lws) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. A major use case is for multi-host/multi-node distributed inference.
+
+SGLang can also be deployed with LWS on Kubernetes for distributed model serving.
+
+Please see this guide for more details on deploying SGLang on Kubernetes using LWS.
+
+Here we take the deployment of DeepSeek-R1 as an example.
+
+## Prerequisites
+
+1. At least two Kubernetes nodes, each with two H20 systems and eight GPUs, are required.
+
+2. Make sure your K8S cluster has LWS correctly installed. If it hasn't been set up yet, please follow the [installation instructions](https://github.com/kubernetes-sigs/lws/blob/main/site/content/en/docs/installation/_index.md). **Note:** For LWS versions ≤0.5.x, you must use the Downward API to obtain `LWS_WORKER_INDEX`, as native support for this feature was introduced in v0.6.0.
+
+## Basic example
+
+For the basic example documentation, refer to [Deploy Distributed Inference Service with SGLang and LWS on GPUs](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/sglang).
+
+However, that document only covers the basic NCCL socket mode.
+
+In this section, we’ll make some simple modifications to adapt the setup to the RDMA scenario.
+
+## RDMA RoCE case
+
+* Check your env:
+
+```bash
+[root@node1 ~]# ibstatus
+Infiniband device 'mlx5_bond_0' port 1 status:
+ default gid: fe80:0000:0000:0000:0225:9dff:fe64:c79a
+ base lid: 0x0
+ sm lid: 0x0
+ state: 4: ACTIVE
+ phys state: 5: LinkUp
+ rate: 200 Gb/sec (2X NDR)
+ link_layer: Ethernet
+
+Infiniband device 'mlx5_bond_1' port 1 status:
+ default gid: fe80:0000:0000:0000:0225:9dff:fe6e:c3ec
+ base lid: 0x0
+ sm lid: 0x0
+ state: 4: ACTIVE
+ phys state: 5: LinkUp
+ rate: 200 Gb/sec (2X NDR)
+ link_layer: Ethernet
+
+Infiniband device 'mlx5_bond_2' port 1 status:
+ default gid: fe80:0000:0000:0000:0225:9dff:fe73:0dd7
+ base lid: 0x0
+ sm lid: 0x0
+ state: 4: ACTIVE
+ phys state: 5: LinkUp
+ rate: 200 Gb/sec (2X NDR)
+ link_layer: Ethernet
+
+Infiniband device 'mlx5_bond_3' port 1 status:
+ default gid: fe80:0000:0000:0000:0225:9dff:fe36:f7ff
+ base lid: 0x0
+ sm lid: 0x0
+ state: 4: ACTIVE
+ phys state: 5: LinkUp
+ rate: 200 Gb/sec (2X NDR)
+ link_layer: Ethernet
+```
+
+* Prepare the `lws.yaml` file for deploying on k8s.
+
+```yaml
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+ name: sglang
+spec:
+ replicas: 1
+ leaderWorkerTemplate:
+ size: 2
+ restartPolicy: RecreateGroupOnPodRestart
+ leaderTemplate:
+ metadata:
+ labels:
+ role: leader
+ spec:
+ dnsPolicy: ClusterFirstWithHostNet
+ hostNetwork: true
+ hostIPC: true
+ containers:
+ - name: sglang-leader
+ image: sglang:latest
+ securityContext:
+ privileged: true
+ env:
+ - name: NCCL_IB_GID_INDEX
+ value: "3"
+ command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --model-path
+ - /work/models
+ - --mem-fraction-static
+ - "0.93"
+ - --torch-compile-max-bs
+ - "8"
+ - --max-running-requests
+ - "20"
+ - --tp
+ - "16" # Size of Tensor Parallelism
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20000
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --host
+ - "0.0.0.0"
+ - --port
+ - "40000"
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ ports:
+ - containerPort: 40000
+ readinessProbe:
+ tcpSocket:
+ port: 40000
+ initialDelaySeconds: 15
+ periodSeconds: 10
+ volumeMounts:
+ - mountPath: /dev/shm
+ name: dshm
+ - name: model
+ mountPath: /work/models
+ - name: ib
+ mountPath: /dev/infiniband
+ volumes:
+ - name: dshm
+ emptyDir:
+ medium: Memory
+ - name: model
+ hostPath:
+ path: '< your models dir >' # modify it according your models dir
+ - name: ib
+ hostPath:
+ path: /dev/infiniband
+ workerTemplate:
+ spec:
+ dnsPolicy: ClusterFirstWithHostNet
+ hostNetwork: true
+ hostIPC: true
+ containers:
+ - name: sglang-worker
+ image: sglang:latest
+ securityContext:
+ privileged: true
+ env:
+ - name: NCCL_IB_GID_INDEX
+ value: "3"
+ command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --model-path
+ - /work/models
+ - --mem-fraction-static
+ - "0.93"
+ - --torch-compile-max-bs
+ - "8"
+ - --max-running-requests
+ - "20"
+ - --tp
+ - "16" # Size of Tensor Parallelism
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20000
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ volumeMounts:
+ - mountPath: /dev/shm
+ name: dshm
+ - name: model
+ mountPath: /work/models
+ - name: ib
+ mountPath: /dev/infiniband
+ volumes:
+ - name: dshm
+ emptyDir:
+ medium: Memory
+ - name: ib
+ hostPath:
+ path: /dev/infiniband
+ - name: model
+ hostPath:
+ path: /data1/models/deepseek_v3_moe
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: sglang-leader
+spec:
+ selector:
+ leaderworkerset.sigs.k8s.io/name: sglang
+ role: leader
+ ports:
+ - protocol: TCP
+ port: 40000
+ targetPort: 40000
+
+```
+
+* Then run `kubectl apply -f lws.yaml`. Checking the pods will show output like this.
+
+```text
+NAME READY STATUS RESTARTS AGE
+sglang-0 0/1 Running 0 9s
+sglang-0-1 1/1 Running 0 9s
+```
+
+Wait for the sglang leader (`sglang-0`) status to change to 1/1, which indicates it is `Ready`.
+
+You can use the command `kubectl logs -f sglang-0` to view the logs of the leader node.
+
+Once successful, you should see output like this:
+
+```text
+[2025-02-17 05:27:24 TP1] Capture cuda graph end. Time elapsed: 84.89 s
+[2025-02-17 05:27:24 TP6] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP0] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP7] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP3] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP2] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP4] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP1] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP5] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24] INFO: Started server process [1]
+[2025-02-17 05:27:24] INFO: Waiting for application startup.
+[2025-02-17 05:27:24] INFO: Application startup complete.
+[2025-02-17 05:27:24] INFO: Uvicorn running on http://0.0.0.0:40000 (Press CTRL+C to quit)
+[2025-02-17 05:27:25] INFO: 127.0.0.1:48908 - "GET /get_model_info HTTP/1.1" 200 OK
+[2025-02-17 05:27:25 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2025-02-17 05:27:32] INFO: 127.0.0.1:48924 - "POST /generate HTTP/1.1" 200 OK
+[2025-02-17 05:27:32] The server is fired up and ready to roll!
+```
+
+If it doesn’t start up successfully, please follow the debugging steps below to check for any remaining issues.
+
+### Debug
+
+* Set `NCCL_DEBUG=TRACE` to check if it is an NCCL communication problem.
+
+This should resolve most NCCL-related issues.
+
+***Notice: If you find that NCCL_DEBUG=TRACE is not effective in the container environment, but the process is stuck or you encounter hard-to-diagnose issues, try switching to a different container image. Some images may not handle standard error output properly.***
+
+#### RoCE scenario
+
+* Please make sure that RDMA devices are available in the cluster environment.
+* Please make sure that the nodes in the cluster have Mellanox NICs with RoCE support. In this example, the nodes use Mellanox ConnectX-series NICs (the `lspci` output below shows BlueField-3 integrated ConnectX-7 controllers), and the proper OFED driver has been installed. If not, please refer to the document [Install OFED Driver](https://docs.nvidia.com/networking/display/mlnxofedv461000/installing+mellanox+ofed) to install the driver.
+* Check your env:
+
+ ```shell
+ $ lspci -nn | grep Eth | grep Mellanox
+ 0000:7f:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+ 0000:7f:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+ 0000:c7:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+ 0000:c7:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+ 0001:08:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+ 0001:08:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+ 0001:a2:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+ 0001:a2:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+ ```
+
+* Check the OFED driver:
+
+ ```shell
+ ofed_info -s
+ OFED-internal-23.07-0.5.0:
+ ```
+
+* Show RDMA link status and check IB devices:
+
+ ```shell
+ $ rdma link show
+ 8/1: mlx5_bond_0/1: state ACTIVE physical_state LINK_UP netdev reth0
+ 9/1: mlx5_bond_1/1: state ACTIVE physical_state LINK_UP netdev reth2
+ 10/1: mlx5_bond_2/1: state ACTIVE physical_state LINK_UP netdev reth4
+ 11/1: mlx5_bond_3/1: state ACTIVE physical_state LINK_UP netdev reth6
+
+ $ ibdev2netdev
+ 8/1: mlx5_bond_0/1: state ACTIVE physical_state LINK_UP netdev reth0
+ 9/1: mlx5_bond_1/1: state ACTIVE physical_state LINK_UP netdev reth2
+ 10/1: mlx5_bond_2/1: state ACTIVE physical_state LINK_UP netdev reth4
+ 11/1: mlx5_bond_3/1: state ACTIVE physical_state LINK_UP netdev reth6
+ ```
+
+* Test RoCE network speed on the host:
+
+ ```shell
+ yum install qperf
+ # for server:
+ execute qperf
+ # for client
+ qperf -t 60 -cm1 rc_rdma_write_bw
+ ```
+
+* Check RDMA accessible in your container:
+
+ ```shell
+ # ibv_devices
+ # ibv_devinfo
+ ```
+
+## Keys to success
+
+* In the YAML configuration above, pay attention to the NCCL environment variables. For older versions of NCCL, you should check the `NCCL_IB_GID_INDEX` environment setting.
+* `NCCL_SOCKET_IFNAME` is also crucial, but in a containerized environment, this typically isn’t an issue.
+* In some cases, it’s necessary to configure `GLOO_SOCKET_IFNAME` correctly.
+* NCCL_DEBUG is essential for troubleshooting, but I've found that sometimes it doesn't show error logs within containers. This could be related to the Docker image you're using. You may want to try switching images if needed.
+* Avoid using Docker images based on Ubuntu 18.04, as they tend to have compatibility issues.
+
+## Remaining issues
+
+* In Kubernetes, Docker, or Containerd environments, we use hostNetwork to prevent performance degradation.
+* We utilize privileged mode, which isn’t secure. Additionally, in containerized environments, full GPU isolation cannot be achieved.
+
+## TODO
+
+* Integrate with [k8s-rdma-shared-dev-plugin](https://github.com/Mellanox/k8s-rdma-shared-dev-plugin).
diff --git a/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/d-svc.yaml b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/d-svc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..27f98009e1a4bf69f1caf93e84569719023f5c7f
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/d-svc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: deepseekr10528-decode-main
+spec:
+ selector:
+ leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
+ role: leader
+ ports:
+ - protocol: TCP
+ port: 30000
+ targetPort: 30000
diff --git a/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dbb51b51918ddaf513b0ee8f75c5c58349e3899a
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml
@@ -0,0 +1,290 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+ name: deepseekr10528-decode-main
+spec:
+ leaderWorkerTemplate:
+ leaderTemplate:
+ metadata:
+ labels:
+ role: leader
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --port
+ - "30000"
+ - --host
+ - "0.0.0.0"
+ - --model-path
+ - /work/models
+ - --chunked-prefill-size
+ - "262144"
+ - --page-size
+ - "64"
+ - --enable-dp-attention
+ - --enable-dp-lm-head
+ - --dp-size
+ - "16"
+ - --moe-a2a-backend
+ - deepep
+ - --disaggregation-mode
+ - decode
+ - --mem-fraction-static
+ - "0.849"
+ - --context-length
+ - "32768"
+ - --disaggregation-ib-device
+ - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+ - --cuda-graph-max-bs
+ - "64"
+ - --max-running-requests
+ - "2048"
+ - --tp-size
+ - "16" # Size of Tensor Parallelism
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ env:
+ - name: CUDA_LAUNCH_BLOCKING
+ value: "0"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+ value: "1"
+ - name: NVSHMEM_HCA_PE_MAPPING
+ value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: "none"
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: MC_TE_METRIC
+ value: "true"
+ - name: SGLANG_MOONCAKE_TRANS_THREAD
+ value: "16"
+ - name: SGLANG_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ image: lmsysorg/sglang:latest
+ name: sglang-leader
+ ports:
+ - containerPort: 30000
+ protocol: TCP
+ readinessProbe:
+ periodSeconds: 30
+ tcpSocket:
+ port: 30000
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+ - mountPath: /root/.cache
+ name: sgl-cache
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+ name: cf
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ # should modify according your deployment env
+ pd: "yes"
+ tolerations:
+ # should modify according your deployment env
+ - key: bopd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ volumes:
+ - hostPath:
+ path: /data1/sgl_cache1
+ type: DirectoryOrCreate
+ name: sgl-cache
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+ name: model
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - hostPath:
+ path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+ name: cf
+ restartPolicy: RecreateGroupOnPodRestart
+ size: 2
+ workerTemplate:
+ metadata: {}
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --model-path
+ - /work/models
+ - --chunked-prefill-size
+ - "262144"
+ - --page-size
+ - "64"
+ - --enable-dp-attention
+ - --enable-dp-lm-head
+ - --dp-size
+ - "16"
+ - --moe-a2a-backend
+ - deepep
+ - --disaggregation-mode
+ - decode
+ - --mem-fraction-static
+ - "0.849"
+ - --context-length
+ - "32768"
+ - --disaggregation-ib-device
+ - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+ - --cuda-graph-max-bs
+ - "64"
+ - --max-running-requests
+ - "2048"
+ - --tp-size
+ - "16" # Size of Tensor Parallelism
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ env:
+ - name: NVSHMEM_IB_TRAFFIC_CLASS
+ value: "16"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+ value: "1"
+ - name: NVSHMEM_HCA_PE_MAPPING
+ value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: "none"
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: MC_TE_METRIC
+ value: "true"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: SGLANG_MOONCAKE_TRANS_THREAD
+ value: "16"
+ - name: SGLANG_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ image: lmsysorg/sglang:latest
+ name: sglang-worker
+ ports:
+ - containerPort: 30001
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+ - mountPath: /root/.cache
+ name: sgl-cache
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+ name: cf
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ # should modify according your deployment env
+ pd: "yes"
+ tolerations:
+ # should modify according your deployment env
+ - key: bopd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ volumes:
+ - hostPath:
+ path: /data1/sgl_cache1
+ type: DirectoryOrCreate
+ name: sgl-cache
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+ name: model
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+ name: cf
+ networkConfig:
+ subdomainPolicy: Shared
+ replicas: 1
+ rolloutStrategy:
+ rollingUpdateConfiguration:
+ maxSurge: 0
+ maxUnavailable: 1
+ type: RollingUpdate
+ startupPolicy: LeaderCreated
diff --git a/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/lb.yaml b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/lb.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4ca690969ab8467032acb8d5be2e45ea95876ab4
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/lb.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: deepseekr10528-lb-main
+ labels:
+ app: deepseekr10528-lb
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: deepseekr10528-lb
+ template:
+ metadata:
+ labels:
+ app: deepseekr10528-lb
+ spec:
+ nodeSelector:
+ bo: "yes"
+ tolerations:
+ - key: bopd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ containers:
+ - name: sgl-minilb
+ image: lmsysorg/sglang:latest
+ command:
+ - python
+ - -m
+ - sglang_router.launch_router
+ - --pd-disaggregation
+ - --prefill
+ - http://deepseekr10528-prefill-main:30000
+ - --decode
+ - http://deepseekr10528-decode-main:30000
+ - --host
+ - 0.0.0.0
+ - --port
+ - "8000"
+ ports:
+ - containerPort: 8000
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: deepseekr10528-lb-service
+spec:
+ type: NodePort # NodePort is easy to test, you can also specify `ClusterIP`
+ selector:
+ app: deepseekr10528-lb
+ ports:
+ - protocol: TCP
+ port: 8000 # Service Port(In-Cluster)
+ targetPort: 8000 # Exposed Container
+ nodePort: 30800
diff --git a/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/p-svc.yaml b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/p-svc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6826a13dfa8f03c266e794cd914b5680b1148d9b
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/p-svc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: deepseekr10528-prefill-main
+spec:
+ selector:
+ leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
+ role: leader
+ ports:
+ - protocol: TCP
+ port: 30000
+ targetPort: 30000
diff --git a/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..11bbcd30a65669b3b1683ffb418125b9f2974948
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml
@@ -0,0 +1,304 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+ name: deepseekr10528-prefill-main
+spec:
+ leaderWorkerTemplate:
+ leaderTemplate:
+ metadata:
+ labels:
+ role: leader
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --port
+ - "30000"
+ - --host
+ - "0.0.0.0"
+ - --model-path
+ - /work/models
+ - --disaggregation-ib-device
+ # should modify according your rdma env
+ - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
+ - --chunked-prefill-size
+ - "524288"
+ - --max-prefill-tokens
+ - "32768"
+ - --page-size
+ - "64"
+ - --ep-dispatch-algorithm
+ - dynamic
+ - --eplb-algorithm
+ - deepseek
+ - --enable-dp-lm-head
+ - --enable-dp-attention
+ - --dp-size
+ - "16"
+ - --disable-radix-cache
+ - --moe-a2a-backend
+ - deepep
+ - --disaggregation-mode
+ - prefill
+ - --mem-fraction-static
+ - "0.7"
+ - --context-length
+ - "32768"
+ - --tp
+ - "16"
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ - --max-running-requests
+ - "1024"
+ env:
+ - name: NVSHMEM_HCA_PE_MAPPING
+ # should modify according your rdma env
+ value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+ value: "1"
+ - name: SGLANG_SET_CPU_AFFINITY
+ value: "true"
+ - name: SGLANG_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: none
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: MC_TE_METRIC
+ value: "false"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ image: lmsysorg/sglang:latest
+ name: sglang-leader
+ ports:
+ - containerPort: 30000
+ protocol: TCP
+ readinessProbe:
+ periodSeconds: 30
+ tcpSocket:
+ port: 30000
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+ name: cf
+ - mountPath: /root/.cache
+ name: sgl-cache
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ # should modify according your deployment env
+ pd: "yes"
+ tolerations:
+ # should modify according your deployment env
+ - key: bopd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ volumes:
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+ name: model
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - hostPath:
+ path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+ name: cf
+ - hostPath:
+ path: /data1/sgl_cache
+ type: DirectoryOrCreate
+ name: sgl-cache
+ restartPolicy: RecreateGroupOnPodRestart
+ size: 2
+ workerTemplate:
+ metadata: {}
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --model-path
+ - /work/models
+ - --disaggregation-ib-device
+ # should modify according your rdma env
+ - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
+ - --chunked-prefill-size
+ - "524288"
+ - --max-prefill-tokens
+ - "32768"
+ - --page-size
+ - "64"
+ - --ep-dispatch-algorithm
+ - dynamic
+ - --eplb-algorithm
+ - deepseek
+ # - --deepep-config
+ # - /home/aiges/tuned/tuned_8sms.json
+ # can be tuned using deepep test scripts
+ - --enable-dp-lm-head
+ - --enable-dp-attention
+ - --dp-size
+ - "16"
+ - --disable-radix-cache
+ - --moe-a2a-backend
+ - deepep
+ - --disaggregation-mode
+ - prefill
+ - --mem-fraction-static
+ - "0.7"
+ - --context-length
+ - "32768"
+ - --tp
+ - "16"
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ - --max-running-requests
+ - "1024"
+ env:
+ - name: SGLANG_SET_CPU_AFFINITY
+ value: "true"
+ - name: NVSHMEM_HCA_PE_MAPPING
+ # should modify according your rdma env
+ value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: NVSHMEM_IB_TRAFFIC_CLASS
+ value: "16"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+ value: "1"
+ - name: CUDA_LAUNCH_BLOCKING
+ value: "0"
+ - name: SGLANG_MOONCAKE_TRANS_THREAD
+ value: "8"
+ - name: SGLANG_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ - name: SGLANG_CHUNKED_PREFIX_CACHE_THRESHOLD
+ value: "0"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: none
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: MC_TE_METRIC
+ value: "true"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ image: lmsysorg/sglang:latest
+ name: sglang-worker
+ ports:
+ - containerPort: 30001
+ protocol: TCP
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+ - mountPath: /root/.cache
+ name: sgl-cache
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+ name: cf
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ # should modify according your deployment env
+ pd: "yes"
+ tolerations:
+ # should modify according your deployment env
+ - key: bopd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ volumes:
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+ name: model
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+ name: cf
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/sgl_cache
+ type: DirectoryOrCreate
+ name: sgl-cache
diff --git a/sglang/docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md b/sglang/docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md
new file mode 100644
index 0000000000000000000000000000000000000000..419474a4e55eac6dbb81fc5d6a1cf539ba470ed4
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md
@@ -0,0 +1,783 @@
+# LWS Based PD Deploy
+
+## 0. Prerequisites
+
+1. Kubernetes >= 1.26
+2. LWS (LeaderWorkerSet) installed on the cluster.
+
+## 1. Image Preparation
+
+`lmsysorg/sglang:deepep`
+
+## 2. Deployment Manifest Files
+
+***Notice: We will package all deployment files into Helm Chart format in the near future. Interested community members can contact us to contribute***
+
+### Prefill
+
+Prefill manifest file: [p.yaml](lws-examples/p.yaml)
+
+*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
+
+```yaml
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+ name: deepseekr10528-prefill-main
+spec:
+ leaderWorkerTemplate:
+ leaderTemplate:
+ metadata:
+ labels:
+ role: leader
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --port
+ - "30000"
+ - --host
+ - "0.0.0.0"
+ - --model-path
+ - /work/models
+ - --disaggregation-ib-device
+ # should modify according your rdma env
+ - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
+ - --chunked-prefill-size
+ - "524288"
+ - --max-prefill-tokens
+ - "32768"
+ - --page-size
+ - "64"
+ # - --init-expert-location
+ # - /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
+ - --ep-dispatch-algorithm
+ - dynamic
+ - --eplb-algorithm
+ - deepseek
+ # - --deepep-config
+ # - /home/aiges/tuned/tuned_8sms.json
+ - --enable-dp-lm-head
+ - --enable-dp-attention
+ - --dp-size
+ - "16"
+ - --disable-radix-cache
+ - --moe-a2a-backend
+ - deepep
+ - --disaggregation-mode
+ - prefill
+ - --mem-fraction-static
+ - "0.7"
+ - --context-length
+ - "32768"
+ - --tp
+ - "16"
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ - --max-running-requests
+ - "1024"
+ env:
+# - name: NVSHMEM_HCA_PE_MAPPING
+# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+# - name: NVSHMEM_HCA_LIST
+# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+ value: "1"
+ - name: SGLANG_SET_CPU_AFFINITY
+ value: "true"
+ - name: SGLANG_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: none
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: MC_TE_METRIC
+ value: "false"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ image: lmsysorg/sglang:deepep
+ name: sglang-leader
+ ports:
+ - containerPort: 30000
+ protocol: TCP
+ readinessProbe:
+ periodSeconds: 30
+ tcpSocket:
+ port: 30000
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+ name: cf
+ - mountPath: /root/.cache
+ name: sgl-cache
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ pd: "yes"
+ tolerations:
+ - key: pd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ volumes:
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+ name: model
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+ name: cf
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/sgl_cache
+ type: DirectoryOrCreate
+ name: sgl-cache
+ restartPolicy: RecreateGroupOnPodRestart
+ size: 2
+ workerTemplate:
+ metadata: {}
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --model-path
+ - /work/models
+ - --disaggregation-ib-device
+ - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
+ - --chunked-prefill-size
+ - "524288"
+ - --max-prefill-tokens
+ - "32768"
+ - --page-size
+ - "64"
+ #- --init-expert-location
+ #- /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
+ - --ep-dispatch-algorithm
+ - dynamic
+ - --eplb-algorithm
+ - deepseek
+# - --deepep-config
+# - /home/aiges/tuned/tuned_8sms.json
+ - --enable-dp-lm-head
+ - --enable-dp-attention
+ - --dp-size
+ - "16"
+ - --disable-radix-cache
+ - --moe-a2a-backend
+ - deepep
+ - --disaggregation-mode
+ - prefill
+ - --mem-fraction-static
+ - "0.7"
+ - --context-length
+ - "32768"
+ - --tp
+ - "16"
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ - --max-running-requests
+ - "1024"
+ env:
+ - name: SGLANG_SET_CPU_AFFINITY
+ value: "true"
+ - name: SGLANG_HACK_DEEPEP_NUM_SMS
+ value: "8"
+ - name: SGLANG_HACK_DEEPEP_NEW_MODE
+ value: "0"
+# - name: NVSHMEM_HCA_PE_MAPPING
+# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+# - name: NVSHMEM_HCA_LIST
+# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: NVSHMEM_IB_TRAFFIC_CLASS
+ value: "16"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+ value: "1"
+ - name: CUDA_LAUNCH_BLOCKING
+ value: "0"
+ - name: SGLANG_MOONCAKE_TRANS_THREAD
+ value: "8"
+ - name: SGLANG_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ - name: SGLANG_CHUNKED_PREFIX_CACHE_THRESHOLD
+ value: "0"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: none
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: MC_TE_METRIC
+ value: "true"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ image: lmsysorg/sglang:deepep
+ name: sglang-worker
+ ports:
+ - containerPort: 30001
+ protocol: TCP
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+
+ - mountPath: /root/.cache
+ name: sgl-cache
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+ name: cf
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ pd: "yes"
+ tolerations:
+ - key: pd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ volumes:
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - hostPath:
+ path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+ name: model
+ - hostPath:
+ path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+ name: cf
+ - hostPath:
+ path: /data1/sgl_cache
+ type: DirectoryOrCreate
+ name: sgl-cache
+
+```
+
+### Decode
+
+Decode node deployment manifest file: [d.yaml](lws-examples/d.yaml)
+
+*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
+
+```yaml
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+ name: deepseekr10528-decode-main
+spec:
+ leaderWorkerTemplate:
+ leaderTemplate:
+ metadata:
+ labels:
+ role: leader
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --port
+ - "30000"
+ - --host
+ - "0.0.0.0"
+ - --model-path
+ - /work/models
+ - --chunked-prefill-size
+ - "262144"
+ - --page-size
+ - "64"
+ - --enable-dp-attention
+ - --enable-dp-lm-head
+ - --dp-size
+ - "16"
+ - --moe-a2a-backend
+ - deepep
+ - --disaggregation-mode
+ - decode
+ - --mem-fraction-static
+ - "0.849"
+ - --context-length
+ - "32768"
+ - --disaggregation-ib-device
+ - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+ - --cuda-graph-max-bs
+ - "64"
+ - --max-running-requests
+ - "2048"
+ - --tp-size
+ - "16" # Size of Tensor Parallelism
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ env:
+ - name: CUDA_LAUNCH_BLOCKING
+ value: "0"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+ value: "1"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: "none"
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: MC_TE_METRIC
+ value: "true"
+ - name: SGLANG_MOONCAKE_TRANS_THREAD
+ value: "16"
+ - name: SGLANG_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ image: lmsysorg/sglang:deepep
+ name: sglang-leader
+ ports:
+ - containerPort: 30000
+ protocol: TCP
+ readinessProbe:
+ periodSeconds: 30
+ tcpSocket:
+ port: 30000
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+ - mountPath: /root/.cache
+ name: sgl-cache
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+ name: cf
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ pd: "yes"
+ tolerations:
+ - key: pd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ volumes:
+ - hostPath:
+ path: /data1/sgl_cache1
+ type: DirectoryOrCreate
+ name: sgl-cache
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+ name: model
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - hostPath:
+ path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+ name: cf
+ restartPolicy: RecreateGroupOnPodRestart
+ size: 2
+ workerTemplate:
+ metadata: {}
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --model-path
+ - /work/models
+ - --chunked-prefill-size
+ - "262144"
+ - --page-size
+ - "64"
+ - --enable-dp-attention
+ - --enable-dp-lm-head
+ #- --enable-two-batch-overlap
+ - --dp-size
+ - "16"
+ - --moe-a2a-backend
+ - deepep
+ - --disaggregation-mode
+ - decode
+ - --mem-fraction-static
+ - "0.849"
+ - --context-length
+ - "32768"
+ - --disaggregation-ib-device
+ # should modify according your rdma env
+ - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+ - --cuda-graph-max-bs
+ - "64"
+ - --max-running-requests
+ - "2048"
+ - --tp-size
+ - "16" # Size of Tensor Parallelism
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ env:
+ - name: SGLANG_HACK_DEEPEP_NUM_SMS
+ value: "24"
+ - name: SGLANG_HACK_DEEPEP_NEW_MODE
+ value: "0"
+ - name: NVSHMEM_IB_TRAFFIC_CLASS
+ value: "16"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+ value: "1"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: "none"
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: MC_TE_METRIC
+ value: "true"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: SGLANG_MOONCAKE_TRANS_THREAD
+ value: "16"
+ - name: SGLANG_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ image: lmsysorg/sglang:deepep
+ name: sglang-worker
+ ports:
+ - containerPort: 30001
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+ - mountPath: /root/.cache
+ name: sgl-cache
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+ name: cf
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ pd: "yes"
+ tolerations:
+ - key: pd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ volumes:
+ - hostPath:
+ path: /data1/sgl_cache1
+ type: DirectoryOrCreate
+ name: sgl-cache
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+ name: model
+ - hostPath:
+ # modify according to you deployment env
+ path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+ name: cf
+ networkConfig:
+ subdomainPolicy: Shared
+ replicas: 1
+ rolloutStrategy:
+ rollingUpdateConfiguration:
+ maxSurge: 0
+ maxUnavailable: 1
+ type: RollingUpdate
+ startupPolicy: LeaderCreated
+```
+
+Execute separately:
+
+```bash
+kubectl apply -f p.yaml
+kubectl apply -f d.yaml
+```
+
+At this point, we have completed the deployment of the 1P1D SGLang engine part.
+
+To allow our users to directly experience the model API, we still need a load balancer that routes each request first to a prefill instance and then to a decode instance. Different companies implement LBs differently, and the community will also officially release a new LB component written in Rust in the near future.
+
+Currently, we use a static K8S service + minilb approach to implement model API calls.
+
+### Creating Service for Prefill and Decode
+
+#### Create prefill k8s service
+[p-svc.yaml](lws-examples/p-svc.yaml)
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+ name: deepseekr10528-prefill-main
+spec:
+ selector:
+ leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
+ role: leader
+ ports:
+ - protocol: TCP
+ port: 30000
+ targetPort: 30000
+```
+Execute `kubectl apply -f p-svc.yaml`
+
+#### Create decode k8s service
+[d-svc.yaml](lws-examples/d-svc.yaml)
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+ name: deepseekr10528-decode-main
+spec:
+ selector:
+ leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
+ role: leader
+ ports:
+ - protocol: TCP
+ port: 30000
+ targetPort: 30000
+```
+Execute `kubectl apply -f d-svc.yaml`
+
+#### Deploy minilb and lb service
+[lb.yaml](lws-examples/lb.yaml)
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: deepseekr10528-lb-main
+ labels:
+ app: deepseekr10528-lb
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: deepseekr10528-lb
+ template:
+ metadata:
+ labels:
+ app: deepseekr10528-lb
+ spec:
+ nodeSelector:
+ pd: "yes"
+ tolerations:
+ - key: pd
+ operator: Exists
+ - key: node-role
+ operator: Exists
+ containers:
+ - name: sgl-minilb
+ image: lmsysorg/sglang:deepep
+ command:
+ - python
+ - -m
+ - sglang_router.launch_router
+ - --pd-disaggregation
+ - --prefill
+ - http://deepseekr10528-prefill-main:30000
+ - --decode
+ - http://deepseekr10528-decode-main:30000
+ - --host
+ - 0.0.0.0
+ - --port
+ - "8000"
+ ports:
+ - containerPort: 8000
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: deepseekr10528-lb-service
+spec:
+ type: NodePort
+ selector:
+ app: deepseekr10528-lb
+ ports:
+ - protocol: TCP
+ port: 8000 # Service Port(In-Cluster)
+ targetPort: 8000 # Exposed Container
+ nodePort: 30800
+```
+Execute `kubectl apply -f lb.yaml`
+
+After waiting for all model deployments to succeed, you will get the following output:
+
+```bash
+[root@ecs-001]# kubectl get po
+deepseekr10528-decode-main-0 1/1 Running 0 74m
+deepseekr10528-decode-main-0-1 1/1 Running 0 74m
+deepseekr10528-lb-main-9c5dbfc57-6lcbd 1/1 Running 0 22m
+deepseekr10528-prefill-main-0 1/1 Running 0 74m
+deepseekr10528-prefill-main-0-1 1/1 Running 0 74m
+[root@ecs-cbm-x1-pd-cpu-001 main_doc]# kubectl get svc |grep dee
+deepseekr10528-decode-main ClusterIP None 97m
+deepseekr10528-lb-service NodePort 172.16.242.169 8000:30800/TCP 22m
+deepseekr10528-prefill-main ClusterIP None 97m
+```
+
+At this point, access the API through any node's IP on nodePort 30800:
+
+```bash
+[root@ecs-001]# curl -X POST "http://{node_ip}:30800/v1/chat/completions" \
+> -H "Content-Type: application/json" \
+> -H "Authorization: Bearer None" \
+> -d '{
+> "rid":"ccccdd",
+> "model": "r1",
+> "messages": [
+> {"role": "system", "content": "0: You are a helpful AI assistant"},
+> {"role": "user", "content": "你是谁?."}
+> ],
+> "max_tokens":221
+> }'
+{"id":"ccccdd","object":"chat.completion","created":1750252498,"model":"qwen2","choices":[{"index":0,"message":{"role":"assistant","content":"\n嗯,用户问了一个很基础的自我介绍问题"你是谁?"。这可能是第一次互动时的常规开场白,也可能是想确认我的身份和功能范围。\n\n用户没有提供任何背景信息,语气简洁中性。这种场景下新用户的可能性较高,需要给出清晰友好的自我介绍,同时突出实用价值来降低陌生感。\n\n考虑到中文用户,应该用简体中文回复。重点要说明三点:身份归属(深度求索)、功能定位(AI助手)、服务范围(学习/工作/生活)。结尾用开放性问题引导对话很关键——既能了解需求,又能避免让用户面对空白输入框时不知所措。\n\n用波浪线结尾可以软化语气,那个笑脸表情😊刚好能中和AI的机械感。不过要控制表情符号数量,避免显得轻浮。\n\n你好呀!我是你的AI助手,由深度求索公司(DeepSeek)开发的语言模型,名字叫 **DeepSeek-R1**。你可以把我当成一个知识丰富、随叫随到的小帮手~😊\n\n我的任务就是陪你聊天、解答问题、","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"length","matched_stop":null}],"usage":{"prompt_tokens":14,"total_tokens":235,"completion_tokens":221,"prompt_tokens_details":null}}
+
+```
+## FAQ
+
+1. The current deployment startup parameters may not be fully compatible with all RDMA scenarios. Different RDMA NCCL-related environment configurations may be needed in different network environments.
+
+2. Some preset, optimized configurations for EPLB are not used here. You can adjust them according to [6017](https://github.com/sgl-project/sglang/issues/6017) as needed.
diff --git a/sglang/docs/references/multi_node_deployment/multi_node.md b/sglang/docs/references/multi_node_deployment/multi_node.md
new file mode 100644
index 0000000000000000000000000000000000000000..e6e5b53444fe93d8641e7509c07b42820d91dd8f
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/multi_node.md
@@ -0,0 +1,100 @@
+# Multi-Node Deployment
+
+## Llama 3.1 405B
+
+**Run 405B (fp16) on Two Nodes**
+
+```bash
+# replace 172.16.4.52:20000 with your own node ip address and port of the first node
+
+python3 -m sglang.launch_server \
+ --model-path meta-llama/Meta-Llama-3.1-405B-Instruct \
+ --tp 16 \
+ --dist-init-addr 172.16.4.52:20000 \
+ --nnodes 2 \
+ --node-rank 0
+
+python3 -m sglang.launch_server \
+ --model-path meta-llama/Meta-Llama-3.1-405B-Instruct \
+ --tp 16 \
+ --dist-init-addr 172.16.4.52:20000 \
+ --nnodes 2 \
+ --node-rank 1
+```
+
+Note that Llama 405B (fp8) can also be launched on a single node.
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+```
+
+## DeepSeek V3/R1
+
+Please refer to [DeepSeek documents for reference](https://docs.sglang.io/basic_usage/deepseek.html#running-examples-on-multi-node).
+
+## Multi-Node Inference on SLURM
+
+This example showcases how to serve an SGLang server across multiple nodes with SLURM. Submit the following job to the SLURM cluster.
+
+```
+#!/bin/bash -l
+
+#SBATCH -o SLURM_Logs/%x_%j_master.out
+#SBATCH -e SLURM_Logs/%x_%j_master.err
+#SBATCH -D ./
+#SBATCH -J Llama-405B-Online-Inference-TP16-SGL
+
+#SBATCH --nodes=2
+#SBATCH --ntasks=2
+#SBATCH --ntasks-per-node=1 # Ensure 1 task per node
+#SBATCH --cpus-per-task=18
+#SBATCH --mem=224GB
+#SBATCH --partition="lmsys.org"
+#SBATCH --gres=gpu:8
+#SBATCH --time=12:00:00
+
+echo "[INFO] Activating environment on node $SLURM_PROCID"
+if ! source ENV_FOLDER/bin/activate; then
+ echo "[ERROR] Failed to activate environment" >&2
+ exit 1
+fi
+
+# Define parameters
+model=MODEL_PATH
+tp_size=16
+
+echo "[INFO] Running inference"
+echo "[INFO] Model: $model"
+echo "[INFO] TP Size: $tp_size"
+
+# Set NCCL initialization address using the hostname of the head node
+HEAD_NODE=$(scontrol show hostname "$SLURM_NODELIST" | head -n 1)
+NCCL_INIT_ADDR="${HEAD_NODE}:8000"
+echo "[INFO] NCCL_INIT_ADDR: $NCCL_INIT_ADDR"
+
+# Launch the model server on each node using SLURM
+srun --ntasks=2 --nodes=2 --output="SLURM_Logs/%x_%j_node$SLURM_NODEID.out" \
+ --error="SLURM_Logs/%x_%j_node$SLURM_NODEID.err" \
+ python3 -m sglang.launch_server \
+ --model-path "$model" \
+ --grammar-backend "xgrammar" \
+ --tp "$tp_size" \
+ --dist-init-addr "$NCCL_INIT_ADDR" \
+ --nnodes 2 \
+ --node-rank "$SLURM_NODEID" &
+
+# Wait for the NCCL server to be ready on port 30000
+while ! nc -z "$HEAD_NODE" 30000; do
+ sleep 1
+ echo "[INFO] Waiting for $HEAD_NODE:30000 to accept connections"
+done
+
+echo "[INFO] $HEAD_NODE:30000 is ready to accept connections"
+
+# Keep the script running until the SLURM job times out
+wait
+```
+
+Then, you can test the server by sending requests following other [documents](https://docs.sglang.io/basic_usage/openai_api_completions.html).
+
+Thanks to [aflah02](https://github.com/aflah02) for providing the example, based on his [blog post](https://aflah02.substack.com/p/multi-node-llm-inference-with-sglang).
diff --git a/sglang/docs/references/multi_node_deployment/multi_node_index.rst b/sglang/docs/references/multi_node_deployment/multi_node_index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..78636869ec26c880c094052886983e84716aa246
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/multi_node_index.rst
@@ -0,0 +1,14 @@
+Multi-Node Deployment
+=====================
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Multi-Node Deployment
+
+ multi_node.md
+ deploy_on_k8s.md
+ lws_pd/lws_pd_deploy.md
+ rbg_pd/deepseekv32_pd.md
+
+- `Deploying DeepSeek with PD Disaggregation and Large-Scale Expert Parallelism on 96 H100 GPUs <https://lmsys.org/blog/2025-05-05-large-scale-ep/>`_
+- `Deploying Kimi K2 with PD Disaggregation and Large-Scale Expert Parallelism on 128 H200 GPUs `_
diff --git a/sglang/docs/references/multi_node_deployment/rbg_pd/deepseekv32_pd.md b/sglang/docs/references/multi_node_deployment/rbg_pd/deepseekv32_pd.md
new file mode 100644
index 0000000000000000000000000000000000000000..f3d4562b74cab8bb9264403477f048d174ca999d
--- /dev/null
+++ b/sglang/docs/references/multi_node_deployment/rbg_pd/deepseekv32_pd.md
@@ -0,0 +1,567 @@
+# DeepSeekV32-Exp RBG Based PD Deploy
+
+## 0. Prerequisites
+
+1. Kubernetes >= 1.26
+2. LWS installed on the cluster.
+3. RBG installed on the cluster.
+
+For RBG installation, please refer to: https://github.com/sgl-project/rbg
+
+## 1. Image Preparation
+
+`lmsysorg/sglang:latest`
+
+
+## 2. All-in-One Manifest File
+
+*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
+
+rbg-dsv32.yml
+
+```yaml
+apiVersion: workloads.x-k8s.io/v1alpha1
+kind: RoleBasedGroup
+metadata:
+ name: deepseek-rbg-32exp
+ namespace: default
+spec:
+ roles:
+ - name: prefill
+ replicas: 1
+ workload:
+ apiVersion: leaderworkerset.x-k8s.io/v1
+ kind: LeaderWorkerSet
+ restartPolicy: None
+ leaderWorkerSet:
+ size: 1
+ patchLeaderTemplate:
+ metadata:
+ labels:
+ role: leader
+ pd_role: prefill
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --model-path
+ - /work/models
+ - --port
+ - "30000"
+ - --trust-remote
+ - --host
+ - 0.0.0.0
+ - --disaggregation-ib-device
+ - mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7
+ - --disable-radix-cache
+ - --chunked-prefill-size
+ - "131072"
+ - --page-size
+ - "64"
+ # - --enable-eplb
+ - --ep-dispatch-algorithm
+ - dynamic
+ - --eplb-algorithm
+ - deepseek
+ - --enable-dp-lm-head
+ - --enable-dp-attention
+ - --dp-size
+ - "8"
+ - --moe-a2a-backend
+ - deepep
+ - --deepep-mode
+ - normal
+ - --disaggregation-mode
+ - prefill
+ - --mem-fraction-static
+ - "0.8"
+ - --max-prefill-tokens
+ - "32768"
+ - --context-length
+ - "32768"
+ - --tp
+ - "8"
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ - --max-running-requests
+ - "1024"
+ env:
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ livenessProbe:
+ failureThreshold: 3000
+ httpGet:
+ path: /health
+ port: 30000
+ initialDelaySeconds: 300
+ periodSeconds: 60
+ successThreshold: 1
+ timeoutSeconds: 10
+ readinessProbe:
+ failureThreshold: 20
+ httpGet:
+ path: /health
+ port: 30000
+ periodSeconds: 30
+ successThreshold: 1
+ timeoutSeconds: 10
+ name: sglang
+ ports:
+ - containerPort: 30000
+ name: sglang-http
+ protocol: TCP
+
+ patchWorkerTemplate: {}
+ template:
+ metadata:
+ labels:
+ inference-framework: sglang
+ inference-stack.io/monitoring: "enabled"
+ spec:
+ containers:
+ - name: sglang
+ image: lmsysorg/sglang:latest
+ env:
+ - name: SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK
+ value: "1"
+ - name: CUDA_LAUNCH_BLOCKING
+ value: "0"
+ - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
+ value: "1000000000"
+ - name: NVSHMEM_IB_TRAFFIC_CLASS
+ value: "16"
+ - name: NVSHMEM_DISABLE_P2P
+ value: "0"
+ - name: ENABLE_METRICS
+ value: "true"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NVSHMEM_IB_SL
+ value: "5"
+ - name: SGLANG_SET_CPU_AFFINITY
+ value: "true"
+ - name: SGL_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: "none"
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: NCCL_IB_TIMEOUT
+ value: "22"
+ - name: NCCL_IB_GID_INDEX
+ value: "3"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: NCCL_SOCKET_IFNAME
+ value: bond0
+ - name: GLOO_SOCKET_IFNAME
+ value: bond0
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME
+ value: "bond0"
+ - name: MC_TE_METRIC
+ value: "false"
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+ - mountPath: /root/.cache
+ name: sgl-cache
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang
+ name: src
+
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ pd: "yes"
+ tolerations:
+ - key: pd
+ operator: Exists
+ volumes:
+ - hostPath:
+ path: /var/run/sys-topology
+ name: topo
+ - hostPath:
+ path: /data1/sgl_cache4
+ type: DirectoryOrCreate
+ name: sgl-cache
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ path: /data/DeepSeek-V3.2-Exp
+ name: model
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - hostPath:
+ path: /data/src/sglang
+ type: DirectoryOrCreate
+ name: src
+
+ - name: decode
+ replicas: 1
+ workload:
+ apiVersion: leaderworkerset.x-k8s.io/v1
+ kind: LeaderWorkerSet
+ leaderWorkerSet:
+ size: 1
+ patchLeaderTemplate:
+ metadata:
+ labels:
+ role: leader
+ pd_role: decode
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --model-path
+ - /work/models
+ - --port
+ - "30000"
+ - --trust-remote
+ - --host
+ - 0.0.0.0
+ - --disaggregation-ib-device
+ - mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7
+ - --chunked-prefill-size
+ - "131072"
+ - --eplb-rebalance-layers-per-chunk
+ - "29"
+ - --page-size
+ - "64"
+ - --enable-dp-attention
+ - --enable-dp-lm-head
+ - --dp-size
+ - "8"
+ - --moe-a2a-backend
+ - deepep
+ - --deepep-mode
+ - low_latency
+ - --disaggregation-mode
+ - decode
+ - --mem-fraction-static
+ - "0.8"
+ - --context-length
+ - "32768"
+ - --max-running-requests
+ - "2048"
+ - --tp-size
+ - "8" # Size of Tensor Parallelism
+ - --cuda-graph-max-bs
+ - "16"
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ env:
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ livenessProbe:
+ failureThreshold: 30000
+ httpGet:
+ path: /health
+ port: 30000
+ initialDelaySeconds: 300
+ periodSeconds: 60
+ successThreshold: 1
+ timeoutSeconds: 10
+ name: sglang
+ readinessProbe:
+ failureThreshold: 20
+ httpGet:
+ path: /health
+ port: 30000
+ periodSeconds: 30
+ successThreshold: 1
+ timeoutSeconds: 10
+ patchWorkerTemplate:
+ spec:
+ containers:
+ - command:
+ - python3
+ - -m
+ - sglang.launch_server
+ - --model-path
+ - /work/models
+ - --crash-dump-folder
+ - /log
+ - --chunked-prefill-size
+ - "262144"
+ - --eplb-rebalance-layers-per-chunk
+ - "29"
+ - --page-size
+ - "64"
+ - --enable-dp-attention
+ - --enable-dp-lm-head
+ - --dp-size
+ - "32"
+ - --moe-a2a-backend
+ - "deepep"
+ - --deepep-mode
+ - low_latency
+ - --disaggregation-mode
+ - decode
+ - --mem-fraction-static
+ - "0.849"
+ - --context-length
+ - "32768"
+ - --disaggregation-ib-device
+ - mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7
+ - --max-running-requests
+ - "4096"
+ - --cuda-graph-max-bs
+ - "16"
+ - --tp-size
+ - "8" # Size of Tensor Parallelism
+ - --dist-init-addr
+ - $(LWS_LEADER_ADDRESS):20102
+ - --nnodes
+ - $(LWS_GROUP_SIZE)
+ - --node-rank
+ - $(LWS_WORKER_INDEX)
+ - --trust-remote-code
+ - --ep-num-redundant-experts
+ - "32"
+ - --moe-dense-tp-size
+ - "1"
+ env:
+ - name: LWS_WORKER_INDEX
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+ name: sglang
+ template:
+ metadata:
+ labels:
+ inference-framework: sglang-unuse
+ inference-stack.io/monitoring: "enabled"
+ spec:
+ containers:
+ - image: lmsysorg/sglang:latest
+ name: sglang
+ resources:
+ limits:
+ nvidia.com/gpu: "8"
+ securityContext:
+ capabilities:
+ add:
+ - IPC_LOCK
+ privileged: true
+ volumeMounts:
+ - mountPath: /root/.cache
+ name: sgl-cache
+ - mountPath: /dev/shm
+ name: dshm
+ - mountPath: /work/models
+ name: model
+ - mountPath: /dev/infiniband
+ name: ib
+ - mountPath: /sgl-workspace/sglang
+ name: src
+ env:
+ - name: SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK
+ value: "1"
+ - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT
+ value: "100000000"
+ - name: NVSHMEM_DISABLE_P2P
+ value: "0"
+ - name: NVSHMEM_IB_TRAFFIC_CLASS
+ value: "16"
+ - name: NVSHMEM_IB_SL
+ value: "5"
+ - name: ENABLE_METRICS
+ value: "true"
+ - name: CUDA_LAUNCH_BLOCKING
+ value: "0"
+ - name: NVSHMEM_IB_GID_INDEX
+ value: "3"
+ - name: NCCL_IB_QPS_PER_CONNECTION
+ value: "8"
+ - name: NCCL_IB_SPLIT_DATA_ON_QPS
+ value: "1"
+ - name: NCCL_NET_PLUGIN
+ value: "none"
+ - name: NCCL_IB_TC
+ value: "136"
+ - name: NCCL_IB_SL
+ value: "5"
+ - name: NCCL_IB_TIMEOUT
+ value: "22"
+ - name: NCCL_IB_GID_INDEX
+ value: "3"
+ - name: NCCL_MIN_NCHANNELS
+ value: "4"
+ - name: NCCL_SOCKET_IFNAME
+ value: bond0
+ - name: GLOO_SOCKET_IFNAME
+ value: bond0
+ - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME
+ value: "bond0"
+ - name: NCCL_IB_HCA
+ value: ^=mlx5_0,mlx5_5,mlx5_6
+ - name: MC_TE_METRIC
+ value: "false"
+ - name: SGL_ENABLE_JIT_DEEPGEMM
+ value: "1"
+ dnsPolicy: ClusterFirstWithHostNet
+ hostIPC: true
+ hostNetwork: true
+ nodeSelector:
+ pd: "yes"
+ tolerations:
+ - key: pd
+ operator: Exists
+ volumes:
+ - hostPath:
+ path: /var/run/sys-topology
+ name: topo
+ - hostPath:
+ path: /data1/sgl_cache4
+ type: DirectoryOrCreate
+ name: sgl-cache
+ - hostPath:
+ path: /data/src/sglang
+ type: DirectoryOrCreate
+ name: src
+ - emptyDir:
+ medium: Memory
+ name: dshm
+ - hostPath:
+ path: /data/DeepSeek-V3.2-Exp
+ name: model
+ - hostPath:
+ path: /dev/infiniband
+ name: ib
+ - name: router
+ replicas: 1
+ dependencies: [ "decode", "prefill" ]
+ template:
+ spec:
+ containers:
+ - name: scheduler
+ image: lmsysorg/sglang:latest
+ command:
+ - sh
+ - -c
+ - >
+ python3 -m sglang_router.launch_router
+ --host 0.0.0.0
+ --port 8080
+ --pd-disaggregation
+ --policy random
+ --service-discovery
+ --service-discovery-namespace ${NAMESPACE}
+ --service-discovery-port 30000
+ --prefill-selector pd_role=prefill
+ --decode-selector pd_role=decode
+ --max-payload-size 2147483648
+ --worker-startup-timeout-secs 1200
+ env:
+ - name: NAMESPACE
+ valueFrom:
+ fieldRef:
+ apiVersion: v1
+ fieldPath: metadata.namespace
+---
+apiVersion: v1
+kind: Service
+metadata:
+ labels:
+ app: deepseek-rbg-32exp
+ name: deepseek-rbg-32exp
+ namespace: default
+spec:
+ ports:
+ - name: http
+ port: 8080
+ protocol: TCP
+ targetPort: 8080
+ nodePort: 30080
+
+ selector:
+ rolebasedgroup.workloads.x-k8s.io/name: deepseek-rbg-32exp
+ rolebasedgroup.workloads.x-k8s.io/role: router
+ type: NodePort
+
+```
+
+```bash
+[root@ecs-001]# kubectl get po -n default
+deepseek-rbg-32exp-decode-main-0 1/1 Running 0 74m
+deepseek-rbg-32exp-decode-0-1 1/1 Running 0 74m
+deepseek-rbg-32exp-router-9c5dbfc57 1/1 Running 0 22m
+deepseek-rbg-32exp-prefill-0 1/1 Running 0 74m
+
+[root@ecs-cbm-x1-pd-cpu-001 main_doc]# kubectl get svc |grep dee
+deepseek-rbg-32exp-decode ClusterIP None 97m
+deepseek-rbg-32exp-router-service NodePort 172.16.242.169 8000:30800/TCP 22m
+deepseek-rbg-32exp-prefill ClusterIP None 97m
+```
+
+At this point, access the API through any node's IP on nodePort 30080 (as configured in the Service above):
+
+```bash
+[root@ecs-001]# curl -X POST "http://{node_ip}:30080/v1/chat/completions" \
+> -H "Content-Type: application/json" \
+> -H "Authorization: Bearer None" \
+> -d '{
+> "rid":"ccccdd",
+> "model": "dsv32",
+> "messages": [
+> {"role": "system", "content": "0: You are a helpful AI assistant"},
+> {"role": "user", "content": "你是谁?."}
+> ],
+> "max_tokens":221
+> }'
+{"id":"ccccdd","object":"chat.completion","created":1750252498,"model":"qwen2","choices":[{"index":0,"message":{"role":"assistant","content":"\n嗯,用户问了一个很基础的自我介绍问题"你是谁?"。这可能是第一次互动时的常规开场白,也可能是想确认我的身份和功能范围。\n\n用户没有提供任何背景信息,语气简洁中性。这种场景下新用户的可能性较高,需要给出清晰友好的自我介绍,同时突出实用价值来降低陌生感。\n\n考虑到中文用户,应该用简体中文回复。重点要说明三点:身份归属(深度求索)、功能定位(AI助手)、服务范围(学习/工作/生活)。结尾用开放性问题引导对话很关键——既能了解需求,又能避免让用户面对空白输入框时不知所措。\n\n用波浪线结尾可以软化语气,那个笑脸表情😊刚好能中和AI的机械感。不过要控制表情符号数量,避免显得轻浮。\n\n你好呀!我是你的AI助手,由深度求索公司(DeepSeek)开发的语言模型,名字叫 **DeepSeek-V32**。你可以把我当成一个知识丰富、随叫随到的小帮手~😊\n\n我的任务就是陪你聊天、解答问题、","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"length","matched_stop":null}],"usage":{"prompt_tokens":14,"total_tokens":235,"completion_tokens":221,"prompt_tokens_details":null}}
+
+```
+## FAQ
+
+1. The current deployment startup parameters may not be fully compatible with all RDMA scenarios. Different RDMA NCCL-related environment configurations may be needed in different network environments.
+
+2. Please ensure that the sglang code in the image has incorporated the changes from [PR #10912](https://github.com/sgl-project/sglang/pull/10912).
diff --git a/sglang/docs/references/post_training_integration.md b/sglang/docs/references/post_training_integration.md
new file mode 100644
index 0000000000000000000000000000000000000000..4dddf5905a86105f038233bd882e766a6cba224c
--- /dev/null
+++ b/sglang/docs/references/post_training_integration.md
@@ -0,0 +1,31 @@
+# Post-Training Integration
+
+SGLang has become the de facto inference backend for modern LLM training frameworks, powering state-of-the-art models across the industry. From GLM-4.6 to Qwen3, leading models leverage SGLang's high-performance inference during reinforcement learning and post-training workflows.
+
+What makes SGLang essential for post-training?
+
+- Open-To-Use Refit Functionality: diverse method for colocate or disaggregate
+- Easy To Postpone Generation: enable partial rollout and dedicated rollout control
+- Fine-Grained Engine Sleep And Wake Up: facilitate maximum-powered rollout and training
+- Training Serving Alignment: ensure the performance consistency in training and serving
+- Load Balancing Router: cache-aware load-balancing for high-throughput rollout
+- Deterministic Inference: ensure zero kl divergence between rollout and training
+
+These capabilities, combined with native integration support across major frameworks, have established SGLang as the infrastructure backbone for modern LLM/VLMs post-training. We also share our latest work in this slide, [Optimizing Large-Scale RL with SGLang](https://gamma.app/docs/Optimizing-RL-with-SGLang-y0kqgj877k34779).
+
+## Adoption
+
+- [**Miles**](https://github.com/radixark/miles): Enterprise-scale RL framework for large MoE models with SGLang-native rollout, speculative training, and production-grade stability
+- [**slime**](https://github.com/THUDM/slime): Post-training framework combining Megatron and SGLang, used to train GLM-4.6
+- [**AReaL**](https://github.com/inclusionAI/AReaL): Fully asynchronous RL system achieving 2.77x speedup with SGLang backend for continuous rollout generation
+- [**ROLL**](https://github.com/alibaba/ROLL): ROLL is an efficient and user-friendly RL library designed for Large Language Models utilizing Large Scale GPU resources
+- [**verl**](https://github.com/volcengine/verl): Full-stack RLHF framework supporting PPO, GRPO, and ReMax with modular SGLang integration
+- [**Unsloth**](https://docs.unsloth.ai/basics/inference-and-deployment/sglang-guide): 2x faster fine-tuning with optimized kernels, deploys seamlessly with SGLang inference
+- [**LLaMA Factory**](https://github.com/hiyouga/LLaMA-Factory): Unified framework for training 100+ LLMs with LoRA, QLoRA, and full fine-tuning methods
+- [**Tunix**](https://github.com/google/tunix): Google's JAX-native library for LLM post-training with SFT, DPO, PPO, and GRPO support
+- [**RL2**](https://github.com/ChenmienTan/RL2): Ray Less Reinforcement Learning, a concise library of post-training for large language models
+
+
+## Collaboration
+
+Due to the privacy of the design partners, we cannot list the companies that adopt SGLang for post-training. However, we are happy to share the details with you if you are interested and trust the choice among 10+ top companies and frontier labs across US and China. If you are interested in integrating SGLang with your training framework or need technical support, we're here to help! Reach out to us at **rl_team@lmsys.org** for partnerships, integration guidance, and custom feature development.
diff --git a/sglang/docs/references/production_metrics.md b/sglang/docs/references/production_metrics.md
new file mode 100644
index 0000000000000000000000000000000000000000..85a6ff8a64a6d9c5f1ab6aa21a5e207514f2d29f
--- /dev/null
+++ b/sglang/docs/references/production_metrics.md
@@ -0,0 +1,231 @@
+# Production Metrics
+
+SGLang exposes the following metrics via Prometheus. You can enable it by adding `--enable-metrics` when you launch the server.
+
+An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json).
+
+Here is an example of the metrics:
+
+```
+$ curl http://localhost:30000/metrics
+# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
+# TYPE sglang:prompt_tokens_total counter
+sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.128902e+06
+# HELP sglang:generation_tokens_total Number of generation tokens processed.
+# TYPE sglang:generation_tokens_total counter
+sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.557572e+06
+# HELP sglang:token_usage The token usage
+# TYPE sglang:token_usage gauge
+sglang:token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.28
+# HELP sglang:cache_hit_rate The cache hit rate
+# TYPE sglang:cache_hit_rate gauge
+sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.007507552643049313
+# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE sglang:time_to_first_token_seconds histogram
+sglang:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2.3518979474117756e+06
+sglang:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:time_to_first_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:time_to_first_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:time_to_first_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
+sglang:time_to_first_token_seconds_bucket{le="0.06",model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.0
+sglang:time_to_first_token_seconds_bucket{le="0.08",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="0.25",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 27.0
+sglang:time_to_first_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 140.0
+sglang:time_to_first_token_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 314.0
+sglang:time_to_first_token_seconds_bucket{le="7.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 941.0
+sglang:time_to_first_token_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1330.0
+sglang:time_to_first_token_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1970.0
+sglang:time_to_first_token_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2326.0
+sglang:time_to_first_token_seconds_bucket{le="25.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2417.0
+sglang:time_to_first_token_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2513.0
+sglang:time_to_first_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
+sglang:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
+# HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds
+# TYPE sglang:e2e_request_latency_seconds histogram
+sglang:e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.116093850019932e+06
+sglang:e2e_request_latency_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:e2e_request_latency_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="0.8",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="1.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="2.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0
+sglang:e2e_request_latency_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 10.0
+sglang:e2e_request_latency_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11.0
+sglang:e2e_request_latency_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 14.0
+sglang:e2e_request_latency_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 247.0
+sglang:e2e_request_latency_seconds_bucket{le="40.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 486.0
+sglang:e2e_request_latency_seconds_bucket{le="50.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 845.0
+sglang:e2e_request_latency_seconds_bucket{le="60.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1513.0
+sglang:e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
+sglang:e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
+# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds.
+# TYPE sglang:time_per_output_token_seconds histogram
+sglang:time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 866964.5791549598
+sglang:time_per_output_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
+sglang:time_per_output_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 73.0
+sglang:time_per_output_token_seconds_bucket{le="0.015",model_name="meta-llama/Llama-3.1-8B-Instruct"} 382.0
+sglang:time_per_output_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 593.0
+sglang:time_per_output_token_seconds_bucket{le="0.025",model_name="meta-llama/Llama-3.1-8B-Instruct"} 855.0
+sglang:time_per_output_token_seconds_bucket{le="0.03",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1035.0
+sglang:time_per_output_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1815.0
+sglang:time_per_output_token_seconds_bucket{le="0.05",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11685.0
+sglang:time_per_output_token_seconds_bucket{le="0.075",model_name="meta-llama/Llama-3.1-8B-Instruct"} 433413.0
+sglang:time_per_output_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 4.950195e+06
+sglang:time_per_output_token_seconds_bucket{le="0.15",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.039435e+06
+sglang:time_per_output_token_seconds_bucket{le="0.2",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.171662e+06
+sglang:time_per_output_token_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.266055e+06
+sglang:time_per_output_token_seconds_bucket{le="0.4",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.296752e+06
+sglang:time_per_output_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.312226e+06
+sglang:time_per_output_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.339675e+06
+sglang:time_per_output_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.357747e+06
+sglang:time_per_output_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.389414e+06
+sglang:time_per_output_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
+sglang:time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
+# HELP sglang:func_latency_seconds Function latency in seconds
+# TYPE sglang:func_latency_seconds histogram
+sglang:func_latency_seconds_sum{name="generate_request"} 4.514771912145079
+sglang:func_latency_seconds_bucket{le="0.05",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.07500000000000001",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.1125",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.16875",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.253125",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.3796875",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.56953125",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.8542968750000001",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="1.2814453125",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="1.9221679687500002",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="2.8832519531250003",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="4.3248779296875",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="6.487316894531251",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="9.730975341796876",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="14.596463012695313",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="21.89469451904297",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="32.84204177856446",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="49.26306266784668",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="+Inf",name="generate_request"} 14007.0
+sglang:func_latency_seconds_count{name="generate_request"} 14007.0
+# HELP sglang:num_running_reqs The number of running requests
+# TYPE sglang:num_running_reqs gauge
+sglang:num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 162.0
+# HELP sglang:num_used_tokens The number of used tokens
+# TYPE sglang:num_used_tokens gauge
+sglang:num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct"} 123859.0
+# HELP sglang:gen_throughput The generate throughput (token/s)
+# TYPE sglang:gen_throughput gauge
+sglang:gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct"} 86.50814177726902
+# HELP sglang:num_queue_reqs The number of requests in the waiting queue
+# TYPE sglang:num_queue_reqs gauge
+sglang:num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2826.0
+```
+
+## Setup Guide
+
+This section describes how to set up the monitoring stack (Prometheus + Grafana) provided in the `examples/monitoring` directory.
+
+### Prerequisites
+
+- Docker and Docker Compose installed
+- SGLang server running with metrics enabled
+
+### Usage
+
+1. **Start your SGLang server with metrics enabled:**
+
+ ```bash
+ python -m sglang.launch_server \
+ --model-path <your_model_path> \
+ --port 30000 \
+ --enable-metrics
+ ```
+ Replace `<your_model_path>` with the actual path to your model (e.g., `meta-llama/Meta-Llama-3.1-8B-Instruct`). Ensure the server is accessible from the monitoring stack (you might need `--host 0.0.0.0` if running in Docker). By default, the metrics endpoint will be available at `http://<your_server_host>:30000/metrics`.
+
+2. **Navigate to the monitoring example directory:**
+ ```bash
+ cd examples/monitoring
+ ```
+
+3. **Start the monitoring stack:**
+ ```bash
+ docker compose up -d
+ ```
+ This command will start Prometheus and Grafana in the background.
+
+4. **Access the monitoring interfaces:**
+ * **Grafana:** Open your web browser and go to [http://localhost:3000](http://localhost:3000).
+ * **Prometheus:** Open your web browser and go to [http://localhost:9090](http://localhost:9090).
+
+5. **Log in to Grafana:**
+ * Default Username: `admin`
+ * Default Password: `admin`
+ You will be prompted to change the password upon your first login.
+
+6. **View the Dashboard:**
+ The SGLang dashboard is pre-configured and should be available automatically. Navigate to `Dashboards` -> `Browse` -> `SGLang Monitoring` folder -> `SGLang Dashboard`.
+
+### Troubleshooting
+
+* **Port Conflicts:** If you encounter errors like "port is already allocated," check if other services (including previous instances of Prometheus/Grafana) are using ports `9090` or `3000`. Use `docker ps` to find running containers and `docker stop <container_id>` to stop them, or use `lsof -i :<port>` to find other processes using the ports. You might need to adjust the ports in the `docker-compose.yaml` file if they permanently conflict with other essential services on your system.
+
+To modify Grafana's port to another one (like 3090) in your Docker Compose file, you need to explicitly specify the port mapping under the grafana service.
+
+ Option 1: Add GF_SERVER_HTTP_PORT to the environment section:
+ ```
+ environment:
+ - GF_AUTH_ANONYMOUS_ENABLED=true
+ - GF_SERVER_HTTP_PORT=3090 # <-- Add this line
+ ```
+ Option 2: Use port mapping:
+ ```
+ grafana:
+ image: grafana/grafana:latest
+ container_name: grafana
+ ports:
+ - "3090:3000" # <-- Host:Container port mapping
+ ```
+* **Connection Issues:**
+ * Ensure both Prometheus and Grafana containers are running (`docker ps`).
+ * Verify the Prometheus data source configuration in Grafana (usually auto-configured via `grafana/datasources/datasource.yaml`). Go to `Connections` -> `Data sources` -> `Prometheus`. The URL should point to the Prometheus service (e.g., `http://prometheus:9090`).
+ * Confirm that your SGLang server is running and the metrics endpoint (`http://<sglang_server_host>:30000/metrics`) is accessible *from the Prometheus container*. If SGLang is running on your host machine and Prometheus is in Docker, use `host.docker.internal` (on Docker Desktop) or your machine's network IP instead of `localhost` in the `prometheus.yaml` scrape configuration.
+* **No Data on Dashboard:**
+ * Generate some traffic to your SGLang server to produce metrics. For example, run a benchmark:
+ ```bash
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 100 --random-input 128 --random-output 128
+ ```
+ * Check the Prometheus UI (`http://localhost:9090`) under `Status` -> `Targets` to see if the SGLang endpoint is being scraped successfully.
+ * Verify the `model_name` and `instance` labels in your Prometheus metrics match the variables used in the Grafana dashboard. You might need to adjust the Grafana dashboard variables or the labels in your Prometheus configuration.
+
+### Configuration Files
+
+The monitoring setup is defined by the following files within the `examples/monitoring` directory:
+
+* `docker-compose.yaml`: Defines the Prometheus and Grafana services.
+* `prometheus.yaml`: Prometheus configuration, including scrape targets.
+* `grafana/datasources/datasource.yaml`: Configures the Prometheus data source for Grafana.
+* `grafana/dashboards/config/dashboard.yaml`: Tells Grafana to load dashboards from the specified path.
+* `grafana/dashboards/json/sglang-dashboard.json`: The actual Grafana dashboard definition in JSON format.
+
+You can customize the setup by modifying these files. For instance, you might need to update the `static_configs` target in `prometheus.yaml` if your SGLang server runs on a different host or port.
+
+#### Check if the metrics are being collected
+
+Run:
+```
+python3 -m sglang.bench_serving \
+ --backend sglang \
+ --dataset-name random \
+ --num-prompts 3000 \
+ --random-input 1024 \
+ --random-output 1024 \
+ --random-range-ratio 0.5
+```
+
+to generate some requests.
+
+Then you should be able to see the metrics in the Grafana dashboard.
diff --git a/sglang/docs/references/production_request_trace.md b/sglang/docs/references/production_request_trace.md
new file mode 100644
index 0000000000000000000000000000000000000000..d1dfdd2f067da7ce3d784560d5350b01dcf04d9a
--- /dev/null
+++ b/sglang/docs/references/production_request_trace.md
@@ -0,0 +1,133 @@
+# Production Request Tracing
+
+SGLang exports request trace data based on the OpenTelemetry Collector. You can enable tracing by adding the `--enable-trace` and configure the OpenTelemetry Collector endpoint using `--otlp-traces-endpoint` when launching the server.
+
+You can find example screenshots of the visualization in https://github.com/sgl-project/sglang/issues/8965.
+
+## Setup Guide
+This section explains how to configure the request tracing and export the trace data.
+1. Install the required packages and tools
+ * install Docker and Docker Compose
+ * install the dependencies
+ ```bash
+ # enter the SGLang root directory
+ pip install -e "python[tracing]"
+
+ # or manually install the dependencies using pip
+ pip install opentelemetry-sdk opentelemetry-api opentelemetry-exporter-otlp opentelemetry-exporter-otlp-proto-grpc
+ ```
+
+2. Launch OpenTelemetry collector and Jaeger
+ ```bash
+ docker compose -f examples/monitoring/tracing_compose.yaml up -d
+ ```
+
+3. Start your SGLang server with tracing enabled
+ ```bash
+ # set env variables
+ export SGLANG_OTLP_EXPORTER_SCHEDULE_DELAY_MILLIS=500
+ export SGLANG_OTLP_EXPORTER_MAX_EXPORT_BATCH_SIZE=64
+ # start the prefill and decode server
+ python -m sglang.launch_server --enable-trace --otlp-traces-endpoint 0.0.0.0:4317
+ # start the model gateway
+ python -m sglang_router.launch_router --enable-trace --otlp-traces-endpoint 0.0.0.0:4317
+ ```
+
+ Replace `0.0.0.0:4317` with the actual endpoint of the OpenTelemetry collector. If you launched the OpenTelemetry collector with tracing_compose.yaml, the default receiving port is 4317.
+
+ To use the HTTP/protobuf span exporter, set the following environment variable and point to an HTTP endpoint, for example, `http://0.0.0.0:4318/v1/traces`.
+ ```bash
+ export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
+ ```
+
+
+4. Raise some requests
+5. Observe whether trace data is being exported
+ * Access port 16686 of Jaeger using a web browser to visualize the request traces.
+ * The OpenTelemetry Collector also exports trace data in JSON format to /tmp/otel_trace.json. In a follow-up patch, we will provide a tool to convert this data into a Perfetto-compatible format, enabling visualization of requests in the Perfetto UI.
+
+6. Dynamically adjust trace level
+ The trace level accepts configurable values from `0` to `3`. The meanings of different trace level values are as follows:
+ ```
+ 0: disable tracing
+ 1: Trace important slices
+ 2: Trace all slices except nested ones
+ 3: Trace all slices
+ ```
+ The trace level can be dynamically set via HTTP API, for example:
+ ```bash
+ curl http://0.0.0.0:30000/set_trace_level?level=2
+ ```
+ Replace `0.0.0.0:30000` with your actual server address, and replace `level=2` with the level you want to set.
+
+ **Note**: You must set the parameter `--enable-trace`; otherwise, the trace capability will not be enabled regardless of any dynamic adjustments to the trace level.
+
+## How to add Tracing for slices you're interested in? (API introduction)
+We have already inserted instrumentation points in the tokenizer and scheduler main threads. If you wish to trace additional request execution segments or perform finer-grained tracing, please use the APIs from the tracing package as described below.
+
+**All of the following implementations are done in python/sglang/srt/observability/req_time_stats.py. If you want to add another slice, please do it here.**
+
+1. Initialization
+
+ Every process involved in tracing during the initialization phase should execute:
+ ```python
+ process_tracing_init(otlp_traces_endpoint, server_name)
+ ```
+ The otlp_traces_endpoint is obtained from the arguments, and you can set server_name freely, but it should remain consistent across all processes.
+
+ Every thread involved in tracing during the initialization phase should execute:
+ ```python
+ trace_set_thread_info("thread label", tp_rank, dp_rank)
+ ```
+ The "thread label" can be regarded as the name of the thread, used to distinguish different threads in the visualization view.
+
+2. Create a trace context for a request
+ Each request needs to call `TraceReqContext()` to initialize a request context, which is used to generate slice spans and record request stage info. You can either store it within the request object or maintain it as a global variable.
+
+3. Mark the beginning and end of a request
+ ```
+ trace_ctx.trace_req_start()
+ trace_ctx.trace_req_finish()
+ ```
+ trace_req_start() and trace_req_finish() must be called within the same process, for example, in the tokenizer.
+
+4. Add tracing for a slice
+
+ * Add slice tracing normally:
+ ```python
+ trace_ctx.trace_slice_start(RequestStage.TOKENIZER.stage_name)
+ trace_ctx.trace_slice_end(RequestStage.TOKENIZER.stage_name)
+
+ or
+ trace_ctx.trace_slice(slice: TraceSliceContext)
+ ```
+
+ - The end of the last slice in a thread must be marked with thread_finish_flag=True, or explicitly call trace_ctx.abort(); otherwise, the thread's span will not be properly generated.
+ ```python
+ trace_ctx.slice_end(RequestStage.D.stage_name, thread_finish_flag = True)
+ trace_ctx.abort()
+ ```
+
+5. When the request execution flow transfers to another thread, the thread context needs to be explicitly rebuilt.
+ - receiver: Execute the following code after receiving the request via ZMQ
+ ```python
+ trace_ctx.rebuild_thread_context()
+ ```
+
+## How to Extend the Tracing Framework to Support Complex Tracing Scenarios
+
+The currently provided tracing package still has potential for further development. If you wish to build more advanced features upon it, you must first understand its existing design principles.
+
+The core of the tracing framework's implementation lies in the design of the span structure and the trace context. To aggregate scattered slices and enable concurrent tracking of multiple requests, we have designed a three-level trace context structure or span structure: `TraceReqContext`, `TraceThreadContext` and `TraceSliceContext`. Their relationship is as follows:
+```
+TraceReqContext (req_id="req-123")
+├── TraceThreadContext(thread_label="scheduler", tp_rank=0)
+| └── TraceSliceContext(slice_name="prefill")
+|
+└── TraceThreadContext(thread_label="scheduler", tp_rank=1)
+ └── TraceSliceContext(slice_name="prefill")
+```
+
+Each traced request maintains a global `TraceReqContext` and creates a corresponding request span. For every thread that processes the request, a `TraceThreadContext` is recorded and a thread span is created. The `TraceThreadContext` is nested within the `TraceReqContext`, and each currently traced code slice—potentially nested—is stored in its associated `TraceThreadContext`.
+
+In addition to the above hierarchy, each slice also records its previous slice via Span.add_link(), which can be used to trace the execution flow.
diff --git a/sglang/docs/references/release_lookup.rst b/sglang/docs/references/release_lookup.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2e8833f6c78d586cc2533725acbb028baa83b442
--- /dev/null
+++ b/sglang/docs/references/release_lookup.rst
@@ -0,0 +1,325 @@
+Release Lookup
+==============
+
+Find which SGLang release first included a specific PR or commit.
+
+.. raw:: html
+
+
+
+
+
+
+
+
+
+ Loading index…
+
+
+
Initializing…
+
+
+
diff --git a/sglang/docs/references/torch_compile_cache.md b/sglang/docs/references/torch_compile_cache.md
new file mode 100644
index 0000000000000000000000000000000000000000..f2bb257f4300df66f5432479d6563d933c5e29ee
--- /dev/null
+++ b/sglang/docs/references/torch_compile_cache.md
@@ -0,0 +1,13 @@
+# Enabling cache for torch.compile
+
+SGLang uses `max-autotune-no-cudagraphs` mode of torch.compile. The auto-tuning can be slow.
+If you want to deploy a model on many different machines, you can ship the torch.compile cache to these machines and skip the compilation steps.
+
+This is based on https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html
+
+
+1. Generate the cache by setting TORCHINDUCTOR_CACHE_DIR and running the model once.
+```
+TORCHINDUCTOR_CACHE_DIR=/root/inductor_root_cache python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile
+```
+2. Copy the cache folder to other machines and launch the server with `TORCHINDUCTOR_CACHE_DIR`.
diff --git a/sglang/docs/supported_models/extending/index.rst b/sglang/docs/supported_models/extending/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dbd5ff6cece4a3496498307ec8bebd922263d65a
--- /dev/null
+++ b/sglang/docs/supported_models/extending/index.rst
@@ -0,0 +1,12 @@
+Extending SGLang
+================
+
+Adding new models and alternative backends.
+
+.. toctree::
+ :maxdepth: 1
+
+ support_new_models.md
+ transformers_fallback.md
+ modelscope.md
+ mindspore_models.md
diff --git a/sglang/docs/supported_models/extending/mindspore_models.md b/sglang/docs/supported_models/extending/mindspore_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..ce82fec6867d77ddf5895c56b6f8171d348ff458
--- /dev/null
+++ b/sglang/docs/supported_models/extending/mindspore_models.md
@@ -0,0 +1,151 @@
+# MindSpore Models
+
+## Introduction
+
+MindSpore is a high-performance AI framework optimized for Ascend NPUs. This doc guides users to run MindSpore models in SGLang.
+
+## Requirements
+
+MindSpore currently only supports Ascend NPU devices. Users need to first install CANN 8.5.
+The CANN software packages can be downloaded from the [Ascend Official Website](https://www.hiascend.com).
+
+## Supported Models
+
+Currently, the following models are supported:
+
+- **Qwen3**: Dense and MoE models
+- **DeepSeek V3/R1**
+- *More models coming soon...*
+
+## Installation
+
+> **Note**: Currently, MindSpore models are provided by an independent package `sgl-mindspore`. Support for MindSpore is built upon current SGLang support for Ascend NPU platform. Please first [install SGLang for Ascend NPU](../../platforms/ascend_npu.md) and then install `sgl-mindspore`:
+
+```shell
+git clone https://github.com/mindspore-lab/sgl-mindspore.git
+cd sgl-mindspore
+pip install -e .
+```
+
+
+## Run Model
+
+Current SGLang-MindSpore supports Qwen3 and DeepSeek V3/R1 models. This doc uses Qwen3-8B as an example.
+
+### Offline inference
+
+Use the following script for offline inference:
+
+```python
+import sglang as sgl
+
+# Initialize the engine with MindSpore backend
+llm = sgl.Engine(
+ model_path="/path/to/your/model", # Local model path
+ device="npu", # Use NPU device
+ model_impl="mindspore", # MindSpore implementation
+ attention_backend="ascend", # Attention backend
+ tp_size=1, # Tensor parallelism size
+ dp_size=1 # Data parallelism size
+)
+
+# Generate text
+prompts = [
+ "Hello, my name is",
+ "The capital of France is",
+ "The future of AI is"
+]
+
+sampling_params = {"temperature": 0, "top_p": 0.9}
+outputs = llm.generate(prompts, sampling_params)
+
+for prompt, output in zip(prompts, outputs):
+ print(f"Prompt: {prompt}")
+ print(f"Generated: {output['text']}")
+ print("---")
+```
+
+### Start server
+
+Launch a server with MindSpore backend:
+
+```bash
+# Basic server startup
+python3 -m sglang.launch_server \
+ --model-path /path/to/your/model \
+ --host 0.0.0.0 \
+ --device npu \
+ --model-impl mindspore \
+ --attention-backend ascend \
+ --tp-size 1 \
+ --dp-size 1
+```
+
+For distributed server with multiple nodes:
+
+```bash
+# Multi-node distributed server
+python3 -m sglang.launch_server \
+ --model-path /path/to/your/model \
+ --host 0.0.0.0 \
+ --device npu \
+ --model-impl mindspore \
+ --attention-backend ascend \
+ --dist-init-addr 127.0.0.1:29500 \
+ --nnodes 2 \
+ --node-rank 0 \
+ --tp-size 4 \
+ --dp-size 2
+```
+
+## Troubleshooting
+
+#### Debug Mode
+
+Enable sglang debug logging by log-level argument.
+
+```bash
+python3 -m sglang.launch_server \
+ --model-path /path/to/your/model \
+ --host 0.0.0.0 \
+ --device npu \
+ --model-impl mindspore \
+ --attention-backend ascend \
+ --log-level DEBUG
+```
+
+Enable mindspore info and debug logging by setting environments.
+
+```bash
+export GLOG_v=1 # INFO
+export GLOG_v=0 # DEBUG
+```
+
+#### Explicitly select devices
+
+Use the following environment variable to explicitly select the devices to use.
+
+```shell
+export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 # to set device
+```
+
+#### Some communication environment issues
+
+In environments with special communication requirements, users need to set some environment variables.
+
+```shell
+export MS_ENABLE_LCCL=off # the LCCL communication mode is currently not supported in SGLang-MindSpore
+```
+
+#### Some dependencies of protobuf
+
+In environments with a special protobuf version, users need to set some environment variables to avoid a binary version mismatch.
+
+```shell
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python # to avoid protobuf binary version mismatch
+```
+
+## Support
+For MindSpore-specific issues:
+
+- Refer to the [MindSpore documentation](https://www.mindspore.cn/)
diff --git a/sglang/docs/supported_models/extending/transformers_fallback.md b/sglang/docs/supported_models/extending/transformers_fallback.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c7dd961c14292d36d415bdbd95369808a159bba
--- /dev/null
+++ b/sglang/docs/supported_models/extending/transformers_fallback.md
@@ -0,0 +1,58 @@
+# Transformers fallback in SGLang
+
+`sglang` can fall back to using models that are available in `transformers`. This works for most decoder-style language models and support for vision-language models is coming soon!
+
+## Example launch Command
+
+By default, we will use sglang implementation if it is available. Otherwise, we will fall back to transformers one. However, you can switch the implementation by setting `--model-impl` to `transformers`.
+
+```shell
+python3 -m sglang.launch_server \
+ --model-path meta-llama/Llama-3.2-1B-Instruct \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --model-impl transformers
+```
+
+## Supported features
+
+### Quantization
+
+The Transformers fallback supports most of the quantization methods available in SGLang (except GGUF). See the [Quantization page](../advanced_features/quantization.md) for more information about supported quantization in SGLang.
+
+### Remote code
+
+This fallback also means that any model on the hub that can be used in `transformers` with `trust_remote_code=True` that correctly implements attention can be used in production!
+
+A model just needs the following two things:
+
+```python
+from transformers import PreTrainedModel
+from torch import nn
+
+class MyAttention(nn.Module):
+
+ def forward(self, hidden_states, **kwargs): # <- kwargs are required
+
+ ...
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+ attn_output, attn_weights = attention_interface(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ **kwargs,
+ )
+ ...
+
+class MyModel(PreTrainedModel):
+ _supports_attention_backend = True
+```
+
+Here is what happens in the background:
+
+1. The config is loaded
+2. `MyModel` python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
+3. The `TransformersModel` backend is used. See `/srt/models/transformers`, which leverages `self.config._attn_implementation = "sglang"`, thus the need to use `ALL_ATTENTION_FUNCTIONS`.
+
+That's it!
diff --git a/sglang/docs/supported_models/retrieval_ranking/index.rst b/sglang/docs/supported_models/retrieval_ranking/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e7c669f9b7bea774c1af6ada02b7a0432c505a11
--- /dev/null
+++ b/sglang/docs/supported_models/retrieval_ranking/index.rst
@@ -0,0 +1,11 @@
+Retrieval & Ranking
+===================
+
+Models for embeddings, reranking, and classification.
+
+.. toctree::
+ :maxdepth: 1
+
+ embedding_models.md
+ rerank_models.md
+ classify_models.md
diff --git a/sglang/python/sglang/srt/batch_invariant_ops/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/batch_invariant_ops/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9ce36e537fc53c0923fcd178226d9017794e174
Binary files /dev/null and b/sglang/python/sglang/srt/batch_invariant_ops/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/batch_invariant_ops/__pycache__/batch_invariant_ops.cpython-311.pyc b/sglang/python/sglang/srt/batch_invariant_ops/__pycache__/batch_invariant_ops.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2cd3e24d6f83109c54647e6d509cbaeceb22ff3
Binary files /dev/null and b/sglang/python/sglang/srt/batch_invariant_ops/__pycache__/batch_invariant_ops.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a31261d515bed701177d3dccb4463db8e8b7de22
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/afmoe.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/afmoe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa80bebbf2dd5f3c3e6c69a00ff89883c9dfd2be
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/afmoe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/bailing_hybrid.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/bailing_hybrid.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d66898f61353426dde3b9e4fc4ebba96ec3d689
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/bailing_hybrid.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/chatglm.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/chatglm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..732b66b84c57522f31b2821b6d9bdba0a5bcca88
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/chatglm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/dbrx.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/dbrx.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9cbc71929bd1dd674b797580f270dc7380a349f
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/dbrx.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/deepseek_ocr.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/deepseek_ocr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29ddb6cdbe337ec40e14952a53c8f627f126a817
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/deepseek_ocr.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/deepseekvl2.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/deepseekvl2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99d2005be9147532d81526d7db2b70d8f2728962
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/deepseekvl2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/device_config.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/device_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b651834acf3e4aba51ea29fc3ffb9b12747a39b6
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/device_config.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/dots_ocr.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/dots_ocr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..000b8ff0b829aa56ca37c17ea4adf0ae24d93910
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/dots_ocr.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/dots_vlm.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/dots_vlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49cc30313a2e5d5fd4420541acfcf82d4a4064c8
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/dots_vlm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/exaone.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/exaone.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0c4af8578aeb99a61db6a79626593f7c7133806
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/exaone.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/falcon_h1.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/falcon_h1.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ee15dab6233f788367e9511382c2ac8f08b6779
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/falcon_h1.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/granitemoehybrid.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/granitemoehybrid.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc8b17898f671bdb5a663c6237abbb78553f57f2
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/granitemoehybrid.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/internvl.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/internvl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f355cb769627a04beaa662ddced88ccd252fcd69
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/internvl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/janus_pro.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/janus_pro.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11cdbcae097ef629b2f3b922b8554616e2b53069
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/janus_pro.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/jet_nemotron.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/jet_nemotron.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab94ff375df244bb3308b5998c214f96d7fc3b1f
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/jet_nemotron.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/jet_vlm.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/jet_vlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04a96f61ea9374039c407b951546113a4ea4af27
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/jet_vlm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/kimi_k25.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/kimi_k25.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e518883bdd72136bddeb21faaea8e1a502c5c4cd
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/kimi_k25.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/kimi_linear.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/kimi_linear.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9081fe6edca67a1b6874249f2d42780414d1f7c1
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/kimi_linear.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/kimi_vl.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/kimi_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e60ee78c2c074dd4557339cc5a493b7a21d4246
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/kimi_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/kimi_vl_moonvit.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/kimi_vl_moonvit.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0e2224aba391747ca1803eb655bd8c153340e8d
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/kimi_vl_moonvit.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/lfm2.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/lfm2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70bfec51340a7b717322dbfd2e6d814b7612c2b6
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/lfm2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/lfm2_moe.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/lfm2_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df817ef1fc5bf32e229fd755e0719dce1bef25f0
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/lfm2_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/load_config.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/load_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2dd5ec1d79de6735e376a117e60ff54f871ab4cc
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/load_config.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/longcat_flash.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/longcat_flash.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8377cd0e7717d24784b85cd55ac8f37bb4caa556
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/longcat_flash.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/mamba_utils.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/mamba_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..456be2d731d3def729045a346574793565024410
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/mamba_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/model_config.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/model_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6167504d524350525a2c487eedab139c8ae68cd8
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/model_config.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/modelopt_config.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/modelopt_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6eb0c8f4da289d0b8f9e443ad99ca439a602b867
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/modelopt_config.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/nano_nemotron_vl.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/nano_nemotron_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8301448d6e9c55aee50a6aa8e40bd5910217d5a5
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/nano_nemotron_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/nemotron_h.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/nemotron_h.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bdec361baed976aab29b847a9c0daea46a649378
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/nemotron_h.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/olmo3.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/olmo3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6475bce18cadc0fd9fa81ab7c62e38fe6ac5073c
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/olmo3.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/points_v15_chat.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/points_v15_chat.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..739fbb58c05d5fb826102af97f20e1dd0cc248d5
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/points_v15_chat.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/qwen3_5.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/qwen3_5.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e646afe740fc7dfc64d1a6248cd177e52073ba1f
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/qwen3_5.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/qwen3_next.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/qwen3_next.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31696897c491ecbf7a8b9acb6973cbf6b77c3d32
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/qwen3_next.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/qwen3_omni.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/qwen3_omni.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c66663cba3d1e941e407ee225fba7e5db06287ba
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/qwen3_omni.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/qwen3_vl.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/qwen3_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..136a9016ece148e6db5d2387b67093eb8de5908a
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/qwen3_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/radio.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/radio.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eea662d497908ae8ea9aa4e2bd6c48ec24538444
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/radio.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/step3_vl.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/step3_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5244f5a25479f0bf9f45b63a6ba008544eb1e1ab
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/step3_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/step3p5.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/step3p5.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5d96468cde3b301231489151641e72928b717e6
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/step3p5.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/update_config.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/update_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e95632e80d81467f3497ca44ce4a7a6f0e9c92d
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/update_config.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/configs/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/configs/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a700e84d287c7d58d0ef9ce1a4c06fe5e198ee05
Binary files /dev/null and b/sglang/python/sglang/srt/configs/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/connector/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75a62a09c8ec0de6897cd95bf99e2af49d99dad7
Binary files /dev/null and b/sglang/python/sglang/srt/connector/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/__pycache__/base_connector.cpython-311.pyc b/sglang/python/sglang/srt/connector/__pycache__/base_connector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a98c7bc4a961249e2e787d6e5adebe1abc2ede34
Binary files /dev/null and b/sglang/python/sglang/srt/connector/__pycache__/base_connector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/__pycache__/redis.cpython-311.pyc b/sglang/python/sglang/srt/connector/__pycache__/redis.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2be26c5ea60fba991fd5eae2e3c1e429ed9bc20
Binary files /dev/null and b/sglang/python/sglang/srt/connector/__pycache__/redis.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/__pycache__/remote_instance.cpython-311.pyc b/sglang/python/sglang/srt/connector/__pycache__/remote_instance.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eee88441b2433dcc30a8f4b023e9f71066e53093
Binary files /dev/null and b/sglang/python/sglang/srt/connector/__pycache__/remote_instance.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/__pycache__/s3.cpython-311.pyc b/sglang/python/sglang/srt/connector/__pycache__/s3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30a13e6ac49074e68f209e7b028fe05fcd8192c6
Binary files /dev/null and b/sglang/python/sglang/srt/connector/__pycache__/s3.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/connector/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c027d31fa515b11c757709cf23c6fa0458225b21
Binary files /dev/null and b/sglang/python/sglang/srt/connector/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/serde/__init__.py b/sglang/python/sglang/srt/connector/serde/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c05b20afa2c93e59f2d15981c1b1c840b244bb7e
--- /dev/null
+++ b/sglang/python/sglang/srt/connector/serde/__init__.py
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# inspired by LMCache
+from typing import Optional, Tuple
+
+import torch
+
+from sglang.srt.connector.serde.safe_serde import SafeDeserializer, SafeSerializer
+from sglang.srt.connector.serde.serde import Deserializer, Serializer
+
+
+def create_serde(serde_type: str) -> Tuple[Serializer, Deserializer]:
+ s: Optional[Serializer] = None
+ d: Optional[Deserializer] = None
+
+ if serde_type == "safe":
+ s = SafeSerializer()
+ d = SafeDeserializer()
+ else:
+ raise ValueError(f"Unknown serde type: {serde_type}")
+
+ return s, d
+
+
+__all__ = [
+ "Serializer",
+ "Deserializer",
+ "SafeSerializer",
+ "SafeDeserializer",
+ "create_serde",
+]
diff --git a/sglang/python/sglang/srt/connector/serde/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/connector/serde/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..004819501166ed1696a0d99c42cd105090fab606
Binary files /dev/null and b/sglang/python/sglang/srt/connector/serde/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/serde/__pycache__/safe_serde.cpython-311.pyc b/sglang/python/sglang/srt/connector/serde/__pycache__/safe_serde.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e91c476082fb7c01aa9963881e9b25e365f008d5
Binary files /dev/null and b/sglang/python/sglang/srt/connector/serde/__pycache__/safe_serde.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/serde/__pycache__/serde.cpython-311.pyc b/sglang/python/sglang/srt/connector/serde/__pycache__/serde.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a8f743f45f796b80b056e741ea3a6d43e446cac
Binary files /dev/null and b/sglang/python/sglang/srt/connector/serde/__pycache__/serde.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/connector/serde/safe_serde.py b/sglang/python/sglang/srt/connector/serde/safe_serde.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e75f9bfc4acd246c591f3c4a0d9e5c73a38ac35
--- /dev/null
+++ b/sglang/python/sglang/srt/connector/serde/safe_serde.py
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+import torch
+from safetensors.torch import load, save
+
+from sglang.srt.connector.serde.serde import Deserializer, Serializer
+
+
+class SafeSerializer(Serializer):
+
+ def __init__(self):
+ super().__init__()
+
+ def to_bytes(self, t: torch.Tensor) -> bytes:
+ return save({"tensor_bytes": t.cpu().contiguous()})
+
+
+class SafeDeserializer(Deserializer):
+
+ def __init__(self):
+ # TODO: dtype options
+ super().__init__(torch.float32)
+
+ def from_bytes_normal(self, b: Union[bytearray, bytes]) -> torch.Tensor:
+ return load(bytes(b))["tensor_bytes"]
+
+ def from_bytes(self, b: Union[bytearray, bytes]) -> torch.Tensor:
+ return self.from_bytes_normal(b)
diff --git a/sglang/python/sglang/srt/connector/serde/serde.py b/sglang/python/sglang/srt/connector/serde/serde.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d6f804d754fc055ce8edc8d91e2908dc3d8b839
--- /dev/null
+++ b/sglang/python/sglang/srt/connector/serde/serde.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import abc
+from abc import ABC, abstractmethod
+
+import torch
+
+
+class Serializer(ABC):
+
+ @abstractmethod
+ def to_bytes(self, t: torch.Tensor) -> bytes:
+ """
+ Serialize a pytorch tensor to bytes. The serialized bytes should contain
+ both the data and the metadata (shape, dtype, etc.) of the tensor.
+
+ Input:
+ t: the input pytorch tensor, can be on any device, in any shape,
+ with any dtype
+
+ Returns:
+ bytes: the serialized bytes
+ """
+ raise NotImplementedError
+
+
+class Deserializer(metaclass=abc.ABCMeta):
+
+ def __init__(self, dtype):
+ self.dtype = dtype
+
+ @abstractmethod
+ def from_bytes(self, bs: bytes) -> torch.Tensor:
+ """
+ Deserialize a pytorch tensor from bytes.
+
+ Input:
+ bytes: a stream of bytes
+
+ Output:
+ torch.Tensor: the deserialized pytorch tensor
+ """
+ raise NotImplementedError
diff --git a/sglang/python/sglang/srt/debug_utils/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/debug_utils/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ddf9beb5c3e575863f614cbf67796be60a63c2a5
Binary files /dev/null and b/sglang/python/sglang/srt/debug_utils/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/debug_utils/__pycache__/cuda_coredump.cpython-311.pyc b/sglang/python/sglang/srt/debug_utils/__pycache__/cuda_coredump.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9eaef2d2916146589211cf131a30d0b05060dfdf
Binary files /dev/null and b/sglang/python/sglang/srt/debug_utils/__pycache__/cuda_coredump.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/debug_utils/__pycache__/dumper.cpython-311.pyc b/sglang/python/sglang/srt/debug_utils/__pycache__/dumper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4a575235efbf285617c2f164fca1bbc36a0d120
Binary files /dev/null and b/sglang/python/sglang/srt/debug_utils/__pycache__/dumper.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/debug_utils/__pycache__/tensor_dump_forward_hook.cpython-311.pyc b/sglang/python/sglang/srt/debug_utils/__pycache__/tensor_dump_forward_hook.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2411d8c23d59fd67e7cd0ff2052a41cf35c7c8dd
Binary files /dev/null and b/sglang/python/sglang/srt/debug_utils/__pycache__/tensor_dump_forward_hook.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/__init__.py b/sglang/python/sglang/srt/debug_utils/comparator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aec40062be28b87b7987acba537b407ecf1ee8b
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/__init__.py
@@ -0,0 +1,9 @@
+from sglang.srt.debug_utils.comparator.aligner.entrypoint.traced_types import ( # noqa: F401
+ TracedAlignerPlan,
+)
+from sglang.srt.debug_utils.comparator.aligner.entrypoint.types import ( # noqa: F401
+ AlignerPlan,
+)
+from sglang.srt.debug_utils.comparator.output_types import ComparisonTensorRecord
+
+ComparisonTensorRecord.model_rebuild()
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/__main__.py b/sglang/python/sglang/srt/debug_utils/comparator/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..511d5f6d10e5da5ac4c0fd51b379dd8d2553edc2
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/__main__.py
@@ -0,0 +1,4 @@
+from sglang.srt.debug_utils.comparator.entrypoint import main
+
+if __name__ == "__main__":
+ main()
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/aligner/__init__.py b/sglang/python/sglang/srt/debug_utils/comparator/aligner/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/aligner/axis_aligner.py b/sglang/python/sglang/srt/debug_utils/comparator/aligner/axis_aligner.py
new file mode 100644
index 0000000000000000000000000000000000000000..34451d293b11e3bddbe4182f3e4f8e7a3d84eafc
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/aligner/axis_aligner.py
@@ -0,0 +1,190 @@
+from __future__ import annotations
+
+from typing import Optional
+
+import torch
+from einops import rearrange
+
+from sglang.srt.debug_utils.comparator.dims_spec import (
+ _FUSED_NAME_SEP,
+ DimSpec,
+ _SingletonDimUtil,
+ parse_dims,
+)
+from sglang.srt.debug_utils.comparator.log_sink import log_sink
+from sglang.srt.debug_utils.comparator.utils import Pair, _FrozenBase
+
+# --- types ---
+
+
+class AxisAlignerPlan(_FrozenBase):
+ pattern: Pair[Optional[str]] # einops pattern per side, None = no-op
+
+
+# --- planner ---
+
+
+def compute_axis_aligner_plan(
+ dims_str_pair: Pair[Optional[str]],
+) -> Optional[AxisAlignerPlan]:
+ if dims_str_pair.x is None or dims_str_pair.y is None:
+ return None
+
+ dims_pair: Pair[str] = Pair(x=dims_str_pair.x, y=dims_str_pair.y)
+ specs_pair: Pair[list[DimSpec]] = dims_pair.map(lambda s: parse_dims(s).dims)
+
+ if not _semantic_names_match(specs_pair):
+ return None
+
+ # Canonical dim order follows y; fused groups stay fused (flatten, not unflatten).
+ canonical_order: Optional[list[str]] = _build_canonical_order(specs_pair)
+ if canonical_order is None:
+ return None
+
+ pattern: Pair[Optional[str]] = specs_pair.map(
+ lambda specs: _build_side_pattern(specs=specs, canonical_order=canonical_order)
+ )
+
+ if pattern.x is None and pattern.y is None:
+ return None
+
+ return AxisAlignerPlan(pattern=pattern)
+
+
+def _semantic_names_match(specs_pair: Pair[list[DimSpec]]) -> bool:
+ """Check that both sides share the same semantic name set (ignoring squeeze dims)."""
+ names_pair: Pair[list[str]] = specs_pair.map(_expand_and_skip_squeeze)
+
+ if set(names_pair.x) == set(names_pair.y):
+ return True
+
+ # Local import to avoid circular dependency:
+ # output_types -> aligner/entrypoint/types -> axis_aligner -> output_types
+ from sglang.srt.debug_utils.comparator.output_types import ErrorLog
+
+ log_sink.add(
+ ErrorLog(
+ category="axis_aligner_dim_mismatch",
+ message=(
+ f"AxisAligner: dim name sets differ (x={names_pair.x}, y={names_pair.y}), "
+ f"skipping axis swap"
+ ),
+ )
+ )
+ return False
+
+
+def _expand_and_skip_squeeze(specs: list[DimSpec]) -> list[str]:
+ """Expand DimSpecs to flat semantic names, skipping squeeze dims."""
+ return [
+ name
+ for spec in specs
+ if not _SingletonDimUtil.is_squeeze(spec)
+ for name in spec.sub_dims
+ ]
+
+
+def _build_canonical_order(specs_pair: Pair[list[DimSpec]]) -> Optional[list[str]]:
+ """Build canonical dim order following y, preferring fused representation.
+
+ Each element is either a plain name (``"c"``) or a fused placeholder (``"a___b"``).
+ Fused groups from *either* side are merged — the separate side must flatten.
+ Squeeze dims are excluded.
+
+ Returns ``None`` if the two sides have overlapping but incompatible fused groups
+ (e.g. x fuses ``(a*b)`` while y fuses ``(b*c)``).
+ """
+ # Map each sub-dim name → (placeholder, siblings) from both sides
+ fused_lookup: dict[str, tuple[str, frozenset[str]]] = {}
+ for spec in (*specs_pair.x, *specs_pair.y):
+ if spec.is_fused:
+ placeholder: str = spec.sanitized_name
+ siblings: frozenset[str] = frozenset(spec.sub_dims)
+ for sub_name in spec.sub_dims:
+ existing: Optional[tuple[str, frozenset[str]]] = fused_lookup.get(
+ sub_name
+ )
+ if existing is not None and existing[1] != siblings:
+ from sglang.srt.debug_utils.comparator.output_types import ErrorLog
+
+ log_sink.add(
+ ErrorLog(
+ category="axis_aligner_fused_conflict",
+ message=(
+ f"AxisAligner: overlapping fused groups for sub-dim {sub_name!r} "
+ f"({existing[0]} vs {placeholder}), skipping axis alignment"
+ ),
+ )
+ )
+ return None
+ fused_lookup.setdefault(sub_name, (placeholder, siblings))
+
+ result: list[str] = []
+ consumed: set[str] = set()
+
+ for spec in specs_pair.y:
+ if _SingletonDimUtil.is_squeeze(spec):
+ continue
+
+ names: list[str] = spec.sub_dims
+ if any(n in consumed for n in names):
+ continue
+
+ entry: Optional[tuple[str, frozenset[str]]] = fused_lookup.get(names[0])
+ if entry is not None:
+ fused_placeholder, sibs = entry
+ result.append(fused_placeholder)
+ consumed.update(sibs)
+ else:
+ result.append(spec.name)
+ consumed.update(names)
+
+ return result
+
+
+def _build_side_pattern(
+ *, specs: list[DimSpec], canonical_order: list[str]
+) -> Optional[str]:
+ """Build an einops pattern for one side to reach ``canonical_order``.
+
+ Fused specs become their placeholder; separate specs that belong to a fused group
+ stay as individual names on the LHS and become ``(a b)`` on the RHS (einops flatten).
+ Squeeze dims (``1``) appear on the LHS but are dropped from the RHS.
+ """
+ source_tokens: list[str] = [spec.sanitized_name for spec in specs]
+
+ # Build per-side target: replace fused placeholders with ``(a b)`` only if this side
+ # has the sub-dims as separate (non-fused) names in the source
+ fused_placeholders: set[str] = {
+ spec.sanitized_name for spec in specs if spec.is_fused
+ }
+ target_tokens: list[str] = [
+ (
+ f"({t.replace(_FUSED_NAME_SEP, ' ')})"
+ if _FUSED_NAME_SEP in t and t not in fused_placeholders
+ else t
+ )
+ for t in canonical_order
+ ]
+
+ if source_tokens == target_tokens:
+ return None
+
+ return f"{' '.join(source_tokens)} -> {' '.join(target_tokens)}"
+
+
+# --- executor ---
+
+
+def execute_axis_aligner_plan(
+ tensor: torch.Tensor, plan: AxisAlignerPlan, *, side: str
+) -> torch.Tensor:
+ if side not in ("x", "y"):
+ raise ValueError(f"side must be 'x' or 'y', got {side!r}")
+
+ pattern: Optional[str] = plan.pattern.x if side == "x" else plan.pattern.y
+
+ if pattern is not None:
+ tensor = rearrange(tensor.rename(None), pattern)
+
+ return tensor
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/aligner/entrypoint/__init__.py b/sglang/python/sglang/srt/debug_utils/comparator/aligner/entrypoint/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/aligner/entrypoint/traced_types.py b/sglang/python/sglang/srt/debug_utils/comparator/aligner/entrypoint/traced_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ecdad4c207e580fa7abe8632430fba3ee336a2c
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/aligner/entrypoint/traced_types.py
@@ -0,0 +1,37 @@
+"""Traced wrapper types that embed execution traces (ShapeSnapshots) into plan nodes.
+
+These types are created *after* execution, pairing each sub-plan with its
+observed shape snapshot so that downstream formatters never need to manually
+zip plan + trace by index.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from sglang.srt.debug_utils.comparator.aligner.entrypoint.types import (
+ AlignerPerStepSubPlan,
+ AlignerPlan,
+)
+from sglang.srt.debug_utils.comparator.output_types import ShapeSnapshot
+from sglang.srt.debug_utils.comparator.utils import Pair, _StrictBase
+
+
class TracedSubPlan(_StrictBase):
    """One executed sub-plan paired with the shape snapshot observed when it ran."""

    # The sub-plan that was executed.
    plan: AlignerPerStepSubPlan
    # Shape snapshot captured during execution; None when no snapshot was recorded.
    snapshot: Optional[ShapeSnapshot] = None
+
+
class TracedStepPlan(_StrictBase):
    """Traced sub-plans belonging to a single step."""

    # Step index this trace belongs to.
    step: int
    # Indices of input objects for this step — presumably into the side's input list; confirm with the executor.
    input_object_indices: list[int]
    # Sub-plans executed for this step, each with its snapshot.
    sub_plans: list[TracedSubPlan]
+
+
class TracedSidePlan(_StrictBase):
    """All traced step plans for one side of the comparison (see ``Pair`` usage below)."""

    step_plans: list[TracedStepPlan]
+
+
class TracedAlignerPlan(_StrictBase):
    """The full aligner plan together with its per-side execution traces."""

    # The original (untraced) aligner plan.
    plan: AlignerPlan
    # Execution trace for each side (x and y).
    per_side: Pair[TracedSidePlan]
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/aligner/reorderer/__init__.py b/sglang/python/sglang/srt/debug_utils/comparator/aligner/reorderer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/aligner/token_aligner/__init__.py b/sglang/python/sglang/srt/debug_utils/comparator/aligner/token_aligner/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/aligner/token_aligner/concat_steps/executor.py b/sglang/python/sglang/srt/debug_utils/comparator/aligner/token_aligner/concat_steps/executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..201367d5ed9808e2baeff6ed3cfaca6628dd2797
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/aligner/token_aligner/concat_steps/executor.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from typing import Optional
+
+import torch
+
+from sglang.srt.debug_utils.comparator.dims_spec import (
+ SEQ_DIM_NAME,
+ TOKEN_DIM_NAME,
+)
+from sglang.srt.debug_utils.comparator.utils import Pair
+
# Dim index used when a tensor is unnamed or has no recognised token/seq dim.
_UNNAMED_TOKEN_DIM_FALLBACK: int = 0
+
+
def execute_token_aligner_concat_steps(
    tensor_of_step_pair: Pair[dict[int, torch.Tensor]],
) -> Pair[torch.Tensor]:
    """Concatenate each side's per-step tensors in step order, then truncate both
    sides to the shared minimum length along the token dim."""
    # The token dim is resolved once, from an arbitrary x-side tensor.
    sample: torch.Tensor = next(iter(tensor_of_step_pair.x.values()))
    token_dim: int = _resolve_token_dim(sample)

    joined: Pair[torch.Tensor] = tensor_of_step_pair.map(
        lambda steps: _concat_steps(steps, dim=token_dim)
    )
    shared_len: int = min(joined.x.shape[token_dim], joined.y.shape[token_dim])
    return joined.map(
        lambda t: t.narrow(dim=token_dim, start=0, length=shared_len)
    )
+
+
def _resolve_token_dim(tensor: torch.Tensor) -> int:
    """Return the index of the token/seq dim, or the fallback (dim 0) when the
    tensor is unnamed or has no recognised token/seq dim."""
    names: tuple[Optional[str], ...] = tensor.names
    # An unnamed first dim is treated as "tensor carries no dim names".
    if names[0] is None:
        return _UNNAMED_TOKEN_DIM_FALLBACK

    for wanted in (TOKEN_DIM_NAME, SEQ_DIM_NAME):
        if wanted in names:
            return names.index(wanted)

    return _UNNAMED_TOKEN_DIM_FALLBACK
+
+
+def _concat_steps(tensor_of_step: dict[int, torch.Tensor], *, dim: int) -> torch.Tensor:
+ return torch.cat([tensor_of_step[s] for s in sorted(tensor_of_step)], dim=dim)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/aligner/unsharder/parallel_info.py b/sglang/python/sglang/srt/debug_utils/comparator/aligner/unsharder/parallel_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ff4a005da158232c57af9d16579937ee8d4a6c6
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/aligner/unsharder/parallel_info.py
@@ -0,0 +1,45 @@
+from typing import Optional
+
+from sglang.srt.debug_utils.comparator.aligner.unsharder.types import AxisInfo
+from sglang.srt.debug_utils.comparator.dims_spec import ParallelAxis
+
# Meta keys that may carry per-framework parallel topology info.
_PARALLEL_INFO_KEYS = ("sglang_parallel_info", "megatron_parallel_info")
+
+
+def _is_error_sentinel(value: dict) -> bool:
+ """Check if a parallel_info dict is an error sentinel (e.g. {'megatron_error': True})."""
+ return any(k.endswith("_error") for k in value)
+
+
def normalize_parallel_info(meta: dict) -> dict[ParallelAxis, AxisInfo]:
    """Extract unified parallel info from dump meta.

    Scans ``_PARALLEL_INFO_KEYS`` for exactly one usable parallel_info dict
    (non-empty, not an error sentinel), then builds an :class:`AxisInfo` for
    every :class:`ParallelAxis` whose size is > 1.

    Raises:
        ValueError: If more than one usable parallel_info key is present.
    """
    info: Optional[dict] = None
    for key in _PARALLEL_INFO_KEYS:
        value = meta.get(key)
        # Skip missing keys, non-dicts, empty dicts, and error sentinels.
        if isinstance(value, dict) and value and not _is_error_sentinel(value):
            if info is not None:
                raise ValueError(
                    f"Meta contains multiple parallel_info keys among {_PARALLEL_INFO_KEYS}"
                )
            info = value

    if info is None:
        info = {}

    result: dict[ParallelAxis, AxisInfo] = {}
    for axis in ParallelAxis:
        axis_rank = info.get(f"{axis.value}_rank")
        axis_size = info.get(f"{axis.value}_size")

        # Recompute pseudo-axis lives at top-level meta, not inside parallel_info
        # NOTE(review): fallback triggers only when the rank is absent — if info has
        # the rank but not the size, top-level meta is never consulted for the size;
        # confirm this asymmetry is intended.
        if axis_rank is None:
            axis_rank = meta.get(f"{axis.value}_rank")
            axis_size = meta.get(f"{axis.value}_size")

        # Only record axes that are actually parallelised (size > 1).
        if axis_rank is not None and axis_size is not None and axis_size > 1:
            result[axis] = AxisInfo(
                axis_rank=axis_rank,
                axis_size=axis_size,
            )

    return result
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/aligner/unsharder/types.py b/sglang/python/sglang/srt/debug_utils/comparator/aligner/unsharder/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..27009041965876c6d78acbb2b45cda8bbe1ec40d
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/aligner/unsharder/types.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from typing import Annotated, Literal, Union
+
+from pydantic import Field, model_validator
+
+from sglang.srt.debug_utils.comparator.dims_spec import ParallelAxis
+from sglang.srt.debug_utils.comparator.utils import _FrozenBase
+
+
class AxisInfo(_FrozenBase):
    """Rank/size of one parallel axis for a dumped tensor shard."""

    # Zero-based rank of this shard along the axis.
    axis_rank: int
    # Number of shards along the axis.
    axis_size: int

    @model_validator(mode="after")
    def _validate_bounds(self) -> AxisInfo:
        # Enforce size > 0 and 0 <= rank < size.
        if self.axis_size <= 0:
            raise ValueError(f"axis_size must be > 0, got {self.axis_size}")
        if not (0 <= self.axis_rank < self.axis_size):
            raise ValueError(
                f"axis_rank must be in [0, {self.axis_size}), got {self.axis_rank}"
            )
        return self
+
+
class ConcatParams(_FrozenBase):
    """Params for the ``concat`` op: concatenate shards along ``dim_name``."""

    op: Literal["concat"] = "concat"
    # Name of the dim to concatenate along.
    dim_name: str


class CpThdConcatParams(_FrozenBase):
    """Params for the ``cp_thd_concat`` op: sequence-aware concat of CP shards."""

    op: Literal["cp_thd_concat"] = "cp_thd_concat"
    # Name of the dim to concatenate along.
    dim_name: str
    seq_lens_per_rank: list[int]  # per-seq token count on each rank, e.g. [50, 32, 46]


class PickParams(_FrozenBase):
    """Params for the ``pick`` op (no extra configuration)."""

    op: Literal["pick"] = "pick"


class ReduceSumParams(_FrozenBase):
    """Params for the ``reduce_sum`` op (no extra configuration)."""

    op: Literal["reduce_sum"] = "reduce_sum"


# Discriminated union over per-op parameter types, keyed by the ``op`` field.
UnsharderParams = Annotated[
    Union[ConcatParams, CpThdConcatParams, PickParams, ReduceSumParams],
    Field(discriminator="op"),
]
+
+
class UnsharderPlan(_FrozenBase):
    """One unsharding step: apply ``params`` to index ``groups`` of input tensors."""

    type: Literal["unsharder"] = "unsharder"
    # The parallel axis this step unshards.
    axis: ParallelAxis
    # Op-specific parameters (discriminated union on ``op``).
    params: UnsharderParams
    # groups[i] = indices in the input tensor list, which will be operated (e.g. concat) into i-th output tensor.
    #
    # Multistep example (CP=2, TP=2, 4 input tensors):
    #   plan[0] (CP): groups=[[0,2],[1,3]] — 4 tensors → 2 tensors
    #   plan[1] (TP): groups=[[0,1]] — 2 tensors → 1 tensor
    groups: list[list[int]]
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/bundle_comparator.py b/sglang/python/sglang/srt/debug_utils/comparator/bundle_comparator.py
new file mode 100644
index 0000000000000000000000000000000000000000..657e457a42b884c60a0fdcff903b928dabfe0895
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/bundle_comparator.py
@@ -0,0 +1,382 @@
+"""Compare two tensor bundles."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Optional, Union
+
+import torch
+
+from sglang.srt.debug_utils.comparator.aligner.entrypoint.executor import (
+ AlignerResult,
+ execute_aligner_plan,
+)
+from sglang.srt.debug_utils.comparator.aligner.entrypoint.planner import (
+ compute_aligner_plan,
+)
+from sglang.srt.debug_utils.comparator.aligner.entrypoint.types import AlignerPlan
+from sglang.srt.debug_utils.comparator.aligner.token_aligner.smart.types import (
+ TokenAlignerPlan,
+)
+from sglang.srt.debug_utils.comparator.dims_spec import (
+ SEQ_DIM_NAME,
+ TOKEN_DIM_NAME,
+ apply_dim_names,
+ parse_dims,
+ resolve_dim_names,
+)
+from sglang.srt.debug_utils.comparator.dp_utils import filter_to_non_empty_dp_rank
+from sglang.srt.debug_utils.comparator.log_sink import log_sink
+from sglang.srt.debug_utils.comparator.meta_overrider import MetaOverrider
+from sglang.srt.debug_utils.comparator.output_types import (
+ BundleFileInfo,
+ BundleSideInfo,
+ ComparisonNonTensorRecord,
+ ComparisonSkipRecord,
+ ComparisonTensorRecord,
+ ErrorLog,
+ _split_logs,
+)
+from sglang.srt.debug_utils.comparator.tensor_comparator.comparator import (
+ compare_tensor_pair,
+)
+from sglang.srt.debug_utils.comparator.utils import Pair
+from sglang.srt.debug_utils.dump_loader import LOAD_FAILED, ValueWithMeta
+
# Maps the internal side key ("x"/"y") to the user-facing side name for skip reasons.
_FAILED_SIDE_MAP: dict[str, str] = {"x": "baseline", "y": "target"}
+
+
def _collect_bundle_side_info(
    items: list[ValueWithMeta],
    metas: list[dict[str, Any]],
) -> BundleSideInfo:
    """Summarise one side's raw bundle: per-file shape/dtype/rank/parallel info.

    ``items`` and ``metas`` are parallel lists; every ``item.value`` must already
    be a tensor (asserted below — the non-tensor path is handled elsewhere).
    """
    # Deferred import — NOTE(review): presumably to avoid a circular import with
    # the display module; confirm.
    from sglang.srt.debug_utils.comparator.display import (
        _PARALLEL_INFO_KEYS,
        extract_parallel_info,
    )

    files: list[BundleFileInfo] = []
    for item, meta in zip(items, metas):
        assert isinstance(item.value, torch.Tensor)
        tensor: torch.Tensor = item.value

        # Merge parallel info from every known meta key into one flat dict.
        parallel_info: dict[str, str] = {}
        for key in _PARALLEL_INFO_KEYS:
            extract_parallel_info(row_data=parallel_info, info=meta.get(key, {}))

        files.append(
            BundleFileInfo(
                shape=list(tensor.shape),
                dtype=str(tensor.dtype),
                rank=meta.get("rank"),
                parallel_info=parallel_info if parallel_info else None,
            )
        )

    # Dims string is taken from the first meta only (assumed shared by the bundle).
    dims: Optional[str] = metas[0].get("dims") if metas else None
    return BundleSideInfo(num_files=len(files), files=files, dims=dims)
+
+
def compare_bundle_pair(
    *,
    name: str,
    filenames_pair: Pair[list[str]],
    dir_pair: Pair[Path],
    token_aligner_mode: Optional[str],
    token_aligner_plan: Optional[TokenAlignerPlan],
    diff_threshold: float,
    thd_seq_lens_by_step_pair: Pair[Optional[dict[int, list[int]]]] = Pair(
        x=None, y=None
    ),
    viz_output_dir: Optional[Path] = None,
    compute_per_token: bool = False,
    meta_overrider: Optional[MetaOverrider] = None,
) -> Union[ComparisonTensorRecord, ComparisonSkipRecord, ComparisonNonTensorRecord]:
    """Compare one matched bundle pair and attach logs collected during the run.

    Thin wrapper around :func:`_compare_bundle_pair_inner` that runs it inside a
    ``log_sink`` context, then copies the collected errors/infos onto the
    returned record via ``model_copy``.
    """
    with log_sink.context() as collected_logs:
        result = _compare_bundle_pair_inner(
            name=name,
            filenames_pair=filenames_pair,
            dir_pair=dir_pair,
            token_aligner_mode=token_aligner_mode,
            token_aligner_plan=token_aligner_plan,
            diff_threshold=diff_threshold,
            thd_seq_lens_by_step_pair=thd_seq_lens_by_step_pair,
            viz_output_dir=viz_output_dir,
            compute_per_token=compute_per_token,
            meta_overrider=meta_overrider,
        )

    # Split collected logs into errors vs infos and attach both to the record.
    errors, infos = _split_logs(collected_logs)
    return result.model_copy(update={"errors": errors, "infos": infos})
+
+
def _compare_bundle_pair_inner(
    *,
    name: str,
    filenames_pair: Pair[list[str]],
    dir_pair: Pair[Path],
    token_aligner_mode: Optional[str],
    token_aligner_plan: Optional[TokenAlignerPlan],
    diff_threshold: float,
    thd_seq_lens_by_step_pair: Pair[Optional[dict[int, list[int]]]] = Pair(
        x=None, y=None
    ),
    viz_output_dir: Optional[Path] = None,
    compute_per_token: bool = False,
    meta_overrider: Optional[MetaOverrider] = None,
) -> Union[ComparisonTensorRecord, ComparisonSkipRecord, ComparisonNonTensorRecord]:
    """Load, override, filter, and route one bundle pair to the right comparison path.

    Pipeline: load values → apply meta overrides → DP filter → dispatch to the
    non-tensor or tensor comparison. Returns a skip record when a side fails to
    load entirely.
    """
    # 1. Load all successfully loaded values
    all_pair: Pair[list[ValueWithMeta]] = Pair(
        x=_load_all_values(filenames=filenames_pair.x, base_path=dir_pair.x),
        y=_load_all_values(filenames=filenames_pair.y, base_path=dir_pair.y),
    )

    if not all_pair.x or not all_pair.y:
        reason = "baseline_load_failed" if not all_pair.x else "target_load_failed"
        return ComparisonSkipRecord(name=name, reason=reason)

    # 1b. Dims override: patch meta["dims"] before DP filter reads it
    # (--override-dims may add ``# dp:=moe_dp``, so it must run first)
    if meta_overrider is not None and not meta_overrider.is_empty:
        _apply = meta_overrider.apply_to_meta
        all_pair = Pair(
            x=[
                ValueWithMeta(
                    value=v.value, meta=_apply(name=name, meta=v.meta, side="baseline")
                )
                for v in all_pair.x
            ],
            y=[
                ValueWithMeta(
                    value=v.value, meta=_apply(name=name, meta=v.meta, side="target")
                )
                for v in all_pair.y
            ],
        )

    # 1c. DP filter: keep only the non-empty dp_rank
    all_pair = all_pair.map(
        lambda items: filter_to_non_empty_dp_rank(
            items, dp_group_alias=_extract_dp_alias_from_items(items)
        )
    )

    # 2. Check if any side has non-tensor values → non-tensor display path
    has_non_tensor: bool = any(
        not isinstance(it.value, torch.Tensor) for it in [*all_pair.x, *all_pair.y]
    )
    if has_non_tensor:
        return _compare_bundle_pair_non_tensor_type(name=name, value_pair=all_pair)

    # 3. All values are tensors → tensor comparison path
    return _compare_bundle_pair_tensor_type(
        name=name,
        valid_pair=all_pair,
        token_aligner_mode=token_aligner_mode,
        token_aligner_plan=token_aligner_plan,
        diff_threshold=diff_threshold,
        thd_seq_lens_by_step_pair=thd_seq_lens_by_step_pair,
        viz_output_dir=viz_output_dir,
        compute_per_token=compute_per_token,
    )
+
+
def _extract_dp_alias_from_items(items: list[ValueWithMeta]) -> Optional[str]:
    """Read the dp group alias declared in the first item's ``meta["dims"]``;
    None when there are no items or no dims string."""
    if not items:
        return None
    dims_str = items[0].meta.get("dims")
    return None if dims_str is None else parse_dims(dims_str).dp_group_alias
+
+
def _compare_bundle_pair_tensor_type(
    *,
    name: str,
    valid_pair: Pair[list[ValueWithMeta]],
    token_aligner_mode: Optional[str],
    token_aligner_plan: Optional[TokenAlignerPlan],
    diff_threshold: float,
    thd_seq_lens_by_step_pair: Pair[Optional[dict[int, list[int]]]] = Pair(
        x=None, y=None
    ),
    viz_output_dir: Optional[Path] = None,
    compute_per_token: bool = False,
) -> Union[ComparisonTensorRecord, ComparisonSkipRecord]:
    """Align both sides' tensors per the aligner plan, then compare them.

    Steps: plan from metas only → collect raw bundle info → name dims and
    execute the plan → compare the aligned tensors → optionally render a viz.
    Skips when either side is empty or alignment fails.
    """
    if not valid_pair.x or not valid_pair.y:
        reason = "baseline_load_failed" if not valid_pair.x else "target_load_failed"
        return ComparisonSkipRecord(name=name, reason=reason)

    # Plan (meta only, no tensor)
    metas_pair: Pair[list[dict[str, Any]]] = valid_pair.map(
        lambda items: [it.meta for it in items]
    )
    plan: AlignerPlan = compute_aligner_plan(
        metas_pair=metas_pair,
        token_aligner_mode=token_aligner_mode,
        token_aligner_plan=token_aligner_plan,
        thd_seq_lens_by_step_pair=thd_seq_lens_by_step_pair,
    )

    # Collect raw bundle info before alignment
    raw_bundle_info: Pair[BundleSideInfo] = Pair(
        x=_collect_bundle_side_info(items=valid_pair.x, metas=metas_pair.x),
        y=_collect_bundle_side_info(items=valid_pair.y, metas=metas_pair.y),
    )

    # Apply dim names to tensors, then execute
    tensors_pair: Pair[list[torch.Tensor]] = Pair(
        x=_apply_dim_names_from_meta(
            tensors=[it.value for it in valid_pair.x],
            metas=metas_pair.x,
        ),
        y=_apply_dim_names_from_meta(
            tensors=[it.value for it in valid_pair.y],
            metas=metas_pair.y,
        ),
    )
    aligner_result: AlignerResult = execute_aligner_plan(
        tensors_pair=tensors_pair, plan=plan
    )
    replicated_checks = aligner_result.replicated_checks

    # Alignment produced no tensors → report which side failed.
    if aligner_result.tensors is None:
        assert aligner_result.failed_side_xy is not None
        side_name: str = _FAILED_SIDE_MAP[aligner_result.failed_side_xy]
        reason: str = f"{side_name}_load_failed"
        return ComparisonSkipRecord(name=name, reason=reason)

    # Resolve seq_dim for per-token computation
    seq_dim: Optional[int] = (
        _resolve_seq_dim(aligner_result.tensors.y) if compute_per_token else None
    )

    # Compare — names are stripped because downstream ops reject named tensors.
    aligned_baseline: torch.Tensor = aligner_result.tensors.x.rename(None)
    aligned_target: torch.Tensor = aligner_result.tensors.y.rename(None)

    info = compare_tensor_pair(
        x_baseline=aligned_baseline,
        x_target=aligned_target,
        name=name,
        diff_threshold=diff_threshold,
        seq_dim=seq_dim,
    )
    record = ComparisonTensorRecord(
        **info.model_dump(),
        traced_plan=aligner_result.traced_plan,
        replicated_checks=replicated_checks,
        raw_bundle_info=raw_bundle_info,
    )

    # Visualization is best-effort; failures are logged inside the helper.
    if viz_output_dir is not None:
        _try_generate_viz(
            baseline=aligned_baseline,
            target=aligned_target,
            name=name,
            viz_output_dir=viz_output_dir,
        )

    return record
+
+
def _try_generate_viz(
    *,
    baseline: torch.Tensor,
    target: torch.Tensor,
    name: str,
    viz_output_dir: Path,
) -> None:
    """Best-effort: render a comparison figure for the pair into ``viz_output_dir``.

    Any failure is logged through ``log_sink`` instead of raised, so a broken
    visualization never aborts the comparison itself.
    """
    # Deferred imports — NOTE(review): presumably so viz dependencies are only
    # required when visualization is enabled; confirm.
    from sglang.srt.debug_utils.comparator.visualizer import (
        generate_comparison_figure,
    )
    from sglang.srt.debug_utils.comparator.visualizer.preprocessing import (
        _sanitize_filename,
    )

    filename: str = _sanitize_filename(name) + ".png"
    output_path: Path = viz_output_dir / filename

    try:
        generate_comparison_figure(
            baseline=baseline,
            target=target,
            name=name,
            output_path=output_path,
        )
    except Exception as exc:
        log_sink.add(
            ErrorLog(
                category="visualizer",
                message=f"Visualization failed for {name}: {exc}",
            )
        )
+
+
def _resolve_seq_dim(tensor: torch.Tensor) -> Optional[int]:
    """Find the token/seq dimension index from the tensor's named dims;
    None when the tensor is unnamed or has no token/seq dim."""
    names: tuple[Optional[str], ...] = tensor.names
    # An unnamed first dim means the tensor carries no dim names.
    if names[0] is None:
        return None

    for wanted in (TOKEN_DIM_NAME, SEQ_DIM_NAME):
        if wanted in names:
            return names.index(wanted)

    return None
+
+
def _compare_bundle_pair_non_tensor_type(
    *,
    name: str,
    value_pair: Pair[list[ValueWithMeta]],
) -> ComparisonNonTensorRecord:
    """Build a record comparing the first value of each side via ``==``.

    Only the first item per side is inspected; equality errors (e.g. types that
    raise on ``==``) are treated as "not equal".
    """
    lhs = value_pair.x[0].value
    rhs = value_pair.y[0].value

    try:
        equal = bool(lhs == rhs)
    except Exception:
        equal = False

    return ComparisonNonTensorRecord(
        name=name,
        baseline_value=repr(lhs),
        target_value=repr(rhs),
        baseline_type=type(lhs).__name__,
        target_type=type(rhs).__name__,
        values_equal=equal,
    )
+
+
def _apply_dim_names_from_meta(
    *,
    tensors: list[torch.Tensor],
    metas: list[dict[str, Any]],
) -> list[torch.Tensor]:
    """Attach dim names parsed from the first meta's ``dims`` string to every
    tensor; tensors pass through unchanged when no dims string is available."""
    dims_str = metas[0].get("dims") if metas else None
    if dims_str is None:
        return tensors

    names = resolve_dim_names(dims_str)
    return [apply_dim_names(tensor, names) for tensor in tensors]
+
+
def _load_all_values(filenames: list[str], base_path: Path) -> list[ValueWithMeta]:
    """Load every dump file under ``base_path``; failures are logged and skipped."""
    loaded: list[ValueWithMeta] = []
    for f in filenames:
        item = ValueWithMeta.load(base_path / f)
        if item.value is LOAD_FAILED:
            log_sink.add(
                ErrorLog(
                    category="load_failed",
                    message=f"Failed to load tensor file: {f}",
                )
            )
        else:
            loaded.append(item)
    return loaded
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/bundle_matcher.py b/sglang/python/sglang/srt/debug_utils/comparator/bundle_matcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..dacf73462c06c99f3323c8e671bf890b9770891c
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/bundle_matcher.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+import dataclasses
+from dataclasses import dataclass
+from typing import Any
+
+import polars as pl
+
+from sglang.srt.debug_utils.comparator.utils import Pair
+from sglang.srt.debug_utils.dump_loader import filter_rows
+
+
@dataclass(frozen=True)
class TensorFileInfo:
    """Identifies one dumped tensor file within a bundle."""

    # Dump filename (joined onto the dump dir by the loader).
    filename: str
    # Tensor name recorded in the dump row.
    name: str
    # Step index recorded in the dump row.
    step: int


# A bundle: all files on one side that share the same match-key values.
TensorBundleInfo = list[TensorFileInfo]
+
+
def match_bundles(
    *,
    dfs: Pair[pl.DataFrame],
    skip_keys: set[str],
) -> list[Pair[TensorBundleInfo]]:
    """Group both sides' rows into bundle pairs.

    Match keys are every column of the target (y) frame except ``skip_keys``;
    one bundle pair is produced per distinct key combination, preserving the
    y frame's row order.
    """
    key_columns: list[str] = [col for col in dfs.y.columns if col not in skip_keys]
    distinct_keys: pl.DataFrame = dfs.y.select(key_columns).unique(maintain_order=True)

    bundles: list[Pair[TensorBundleInfo]] = []
    for conditions in distinct_keys.iter_rows(named=True):
        # The lambda is invoked immediately by ``map``, so capturing the loop
        # variable is safe here.
        bundles.append(
            dfs.map(
                lambda df: _rows_to_tensor_infos(
                    filter_rows(df, conditions=conditions)
                )
            )
        )

    return bundles
+
+
def _rows_to_tensor_infos(rows: list[dict[str, Any]]) -> list[TensorFileInfo]:
    """Project each row dict onto TensorFileInfo's fields and build the infos,
    ignoring any extra columns."""
    wanted: set[str] = {field.name for field in dataclasses.fields(TensorFileInfo)}
    infos: list[TensorFileInfo] = []
    for row in rows:
        kwargs = {key: value for key, value in row.items() if key in wanted}
        infos.append(TensorFileInfo(**kwargs))
    return infos
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/__init__.py b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d648020968a2f139442370288f2dee513cc6be8
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/__init__.py
@@ -0,0 +1,49 @@
+from sglang.srt.debug_utils.comparator.dims_spec.dim_parser import parse_dim
+from sglang.srt.debug_utils.comparator.dims_spec.dims_parser import (
+ _SingletonDimUtil,
+ parse_dims,
+ resolve_dim_names,
+)
+from sglang.srt.debug_utils.comparator.dims_spec.tensor_naming import (
+ apply_dim_names,
+ find_dim_index,
+ resolve_dim_by_name,
+ strip_dim_names,
+)
+from sglang.srt.debug_utils.comparator.dims_spec.types import (
+ _FUSED_NAME_SEP,
+ BATCH_DIM_NAME,
+ SEQ_DIM_NAME,
+ SQUEEZE_DIM_NAME,
+ TOKEN_DIM_NAME,
+ DimSpec,
+ DimsSpec,
+ Ordering,
+ ParallelAxis,
+ ParallelModifier,
+ Reduction,
+ TokenLayout,
+)
+
+__all__ = [
+ "BATCH_DIM_NAME",
+ "SEQ_DIM_NAME",
+ "SQUEEZE_DIM_NAME",
+ "TOKEN_DIM_NAME",
+ "DimsSpec",
+ "DimSpec",
+ "Ordering",
+ "ParallelAxis",
+ "ParallelModifier",
+ "Reduction",
+ "TokenLayout",
+ "_FUSED_NAME_SEP",
+ "_SingletonDimUtil",
+ "apply_dim_names",
+ "find_dim_index",
+ "parse_dim",
+ "parse_dims",
+ "resolve_dim_by_name",
+ "resolve_dim_names",
+ "strip_dim_names",
+]
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/comment_parser.py b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/comment_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..222c5ecb11399e21dd26e01e325506bf446cb136
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/comment_parser.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+import re
+from typing import NamedTuple, Optional
+
+from sglang.srt.debug_utils.comparator.dims_spec.types import (
+ _AXIS_LOOKUP,
+ ParallelAxis,
+)
+
# ``dp:=<alias>`` — declares the dp group alias; group(1) is the alias.
_DP_ALIAS_PATTERN = re.compile(r"^dp:=(\w+)$")
# ``<axis>:replicated`` — declares an axis replicated; group(1) is the axis name.
_REPLICATED_PATTERN = re.compile(r"^(\w+):replicated$")
+
+
class _CommentSuffix(NamedTuple):
    """Parsed result of the ``#`` comment section of a dims string."""

    # Alias declared via ``dp:=<alias>``; None when absent.
    dp_group_alias: Optional[str] = None
    # Axes declared replicated via ``<axis>:replicated``.
    replicated_axes: frozenset[ParallelAxis] = frozenset()
+
+
def _parse_comment_suffix(declaration_part: str) -> _CommentSuffix:
    """Parse the ``#`` comment section for dp alias and replicated declarations.

    Each whitespace-separated token must be ``dp:=<alias>`` or
    ``<axis>:replicated``; anything else raises ``ValueError``, as do duplicate
    declarations of either kind.
    """
    dp_group_alias: Optional[str] = None
    replicated_axes: set[ParallelAxis] = set()

    for token in declaration_part.strip().split():
        # ``dp:=<alias>`` — at most one per comment section.
        dp_match = _DP_ALIAS_PATTERN.match(token)
        if dp_match is not None:
            if dp_group_alias is not None:
                raise ValueError(
                    f"Duplicate dp alias declaration: already have {dp_group_alias!r}, "
                    f"got {dp_match.group(1)!r}"
                )
            dp_group_alias = dp_match.group(1)
            continue

        # ``<axis>:replicated`` — axis must be known and not repeated.
        repl_match = _REPLICATED_PATTERN.match(token)
        if repl_match is not None:
            axis_str: str = repl_match.group(1)
            axis: Optional[ParallelAxis] = _AXIS_LOOKUP.get(axis_str)
            if axis is None:
                raise ValueError(
                    f"Unknown axis {axis_str!r} in replicated declaration: {token!r}"
                )
            if axis in replicated_axes:
                raise ValueError(
                    f"Duplicate replicated declaration for axis {axis_str!r}"
                )
            replicated_axes.add(axis)
            continue

        raise ValueError(
            f"Unrecognized token {token!r} in # comment section. "
            f"Expected 'dp:=' or ':replicated'."
        )

    return _CommentSuffix(
        dp_group_alias=dp_group_alias,
        replicated_axes=frozenset(replicated_axes),
    )
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/dim_parser.py b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/dim_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aac65be2d8dd4927c947bec4dd77b3f2be5132b
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/dim_parser.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+from sglang.srt.debug_utils.comparator.dims_spec.modifier_parser import (
+ _parse_modifiers,
+)
+from sglang.srt.debug_utils.comparator.dims_spec.types import (
+ SQUEEZE_DIM_NAME,
+ DimSpec,
+ ParallelModifier,
+)
+
+_DIM_PATTERN = re.compile(r"^(?P[a-zA-Z_]\w*)(?:\[(?P[^\]]+)\])?$")
+
+_FUSED_DIM_PATTERN = re.compile(r"^\((?P[^)]+)\)(?:\[(?P[^\]]+)\])?$")
+
+_SUB_DIM_NAME_PATTERN = re.compile(r"^[a-zA-Z_]\w*$")
+
+
def parse_dim(token: str) -> DimSpec:
    """Parse one dim token — a squeeze dim, a fused dim ``(a*b)[...]``, or a
    plain dim ``h[...]`` — into a :class:`DimSpec`."""
    if token == SQUEEZE_DIM_NAME:
        return DimSpec(name=SQUEEZE_DIM_NAME)

    fused = _FUSED_DIM_PATTERN.match(token)
    if fused is None:
        return _parse_single_dim(token)
    return _parse_fused_dim(token=token, fused_match=fused)
+
+
def _parse_single_dim(token: str) -> DimSpec:
    """Parse a non-fused dim token like ``h`` or ``h[tp]``.

    Raises:
        ValueError: If the token does not match the dim grammar.
    """
    match = _DIM_PATTERN.match(token)
    if match is None:
        raise ValueError(f"Invalid dim token: {token!r}")

    modifiers = _parse_modifiers(
        modifiers_str=match.group("modifiers"), dim_token=token
    )
    return DimSpec(name=match.group("name"), parallel_modifiers=modifiers)
+
+
def _parse_fused_dim(*, token: str, fused_match: re.Match[str]) -> DimSpec:
    """Parse a fused dim token like ``(a*b)[tp]``.

    Validates each sub-dim name, rejects duplicates and fewer than two
    sub-dims, and joins the sub-names with ``*`` into the spec's name.
    """
    inner: str = fused_match.group("inner")
    modifiers_str: Optional[str] = fused_match.group("modifiers")

    sub_names: list[str] = [part.strip() for part in inner.split("*")]

    # Report the first malformed sub-dim, in declaration order.
    invalid = [s for s in sub_names if not _SUB_DIM_NAME_PATTERN.match(s)]
    if invalid:
        raise ValueError(
            f"Invalid sub-dim {invalid[0]!r} in fused dim token: {token!r}"
        )

    if len(set(sub_names)) != len(sub_names):
        raise ValueError(f"Duplicate sub-dim names in fused dim token: {token!r}")

    if len(sub_names) < 2:
        raise ValueError(
            f"Fused dim must have at least 2 sub-dims, got {len(sub_names)} in: {token!r}"
        )

    return DimSpec(
        name="*".join(sub_names),
        parallel_modifiers=_parse_modifiers(
            modifiers_str=modifiers_str, dim_token=token
        ),
    )
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/dims_parser.py b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/dims_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e0a908956db0871be4e34bfbc9510cced944f3d
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/dims_parser.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+from typing import Optional
+
+from sglang.srt.debug_utils.comparator.dims_spec.comment_parser import (
+ _CommentSuffix,
+ _parse_comment_suffix,
+)
+from sglang.srt.debug_utils.comparator.dims_spec.dim_parser import parse_dim
+from sglang.srt.debug_utils.comparator.dims_spec.types import (
+ SQUEEZE_DIM_NAME,
+ DimSpec,
+ DimsSpec,
+ ParallelAxis,
+)
+
+
+class _SingletonDimUtil:
+ """Utilities for squeeze dims (name="1") and their singleton tensor-name mapping."""
+
+ PREFIX: str = "singleton"
+
+ @staticmethod
+ def is_squeeze(spec: DimSpec) -> bool:
+ return spec.name == SQUEEZE_DIM_NAME
+
+ @staticmethod
+ def filter_out(dim_specs: list[DimSpec]) -> list[DimSpec]:
+ return [s for s in dim_specs if not _SingletonDimUtil.is_squeeze(s)]
+
+ @staticmethod
+ def make_name(index: int) -> str:
+ return f"{_SingletonDimUtil.PREFIX}{index}"
+
+ @staticmethod
+ def is_singleton_name(name: str) -> bool:
+ return (
+ name.startswith(_SingletonDimUtil.PREFIX)
+ and name[len(_SingletonDimUtil.PREFIX) :].isdigit()
+ )
+
+ @staticmethod
+ def sanitize_names(names: list[str]) -> list[str]:
+ """Replace '1' with 'singleton0', 'singleton1', ... for named tensor compatibility."""
+ result: list[str] = []
+ sq_idx: int = 0
+
+ for name in names:
+ if name == SQUEEZE_DIM_NAME:
+ result.append(_SingletonDimUtil.make_name(sq_idx))
+ sq_idx += 1
+ else:
+ result.append(name)
+
+ return result
+
+
def parse_dims(dims_str: str) -> DimsSpec:
    """Parse ``"b s[cp:zigzag] h[tp] d # dp:=moe_dp ep:replicated"`` → :class:`DimsSpec`.

    The shape part (before ``#``) produces :pyattr:`DimsSpec.dims`.
    The declaration part (after ``#``) is scanned for:
    - ``dp:=`` → :pyattr:`DimsSpec.dp_group_alias`
    - ``axis:replicated`` → :pyattr:`DimsSpec.replicated_axes`

    Raises:
        ValueError: On an empty shape part, duplicate dim names, or an axis
            declared both sharded and replicated.
    """
    parts: list[str] = dims_str.split("#", maxsplit=1)
    raw: str = parts[0]

    if not raw.strip():
        raise ValueError("dims string must not be empty")

    dims: list[DimSpec] = [parse_dim(token) for token in raw.strip().split()]

    # Collect all semantic names (expanding fused sub-dims) for duplicate detection
    semantic_names: list[str] = []
    for spec in dims:
        # Squeeze dims ("1") are anonymous and may repeat freely.
        if _SingletonDimUtil.is_squeeze(spec):
            continue
        semantic_names.extend(spec.sub_dims)

    if len(semantic_names) != len(set(semantic_names)):
        duplicates = sorted({n for n in semantic_names if semantic_names.count(n) > 1})
        raise ValueError(f"Duplicate dim names: {duplicates}")

    comment_suffix: _CommentSuffix = (
        _parse_comment_suffix(parts[1]) if len(parts) > 1 else _CommentSuffix()
    )
    dp_group_alias: Optional[str] = comment_suffix.dp_group_alias
    replicated_axes: frozenset[ParallelAxis] = comment_suffix.replicated_axes

    # An axis cannot be both sharded (via a dim modifier) and declared replicated.
    sharded_axes: set[ParallelAxis] = {
        m.axis for spec in dims for m in spec.parallel_modifiers
    }
    conflict: frozenset[ParallelAxis] = replicated_axes & sharded_axes
    if conflict:
        conflict_names: str = ", ".join(sorted(a.value for a in conflict))
        raise ValueError(
            f"Axes declared as both sharded (in dim spec) and replicated "
            f"(in # declaration): {conflict_names}"
        )

    return DimsSpec(
        dims=dims,
        dp_group_alias=dp_group_alias,
        replicated_axes=replicated_axes,
    )
+
+
def resolve_dim_names(dims_str: str) -> list[str]:
    """Parse dims string and return tensor-compatible names ('1' → 'singleton0', ...)."""
    raw_names = [spec.sanitized_name for spec in parse_dims(dims_str).dims]
    return _SingletonDimUtil.sanitize_names(raw_names)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/modifier_parser.py b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/modifier_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1ecfa8879b043bfa31a0fb5b94815f515901898
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/modifier_parser.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+from typing import Optional
+
+from sglang.srt.debug_utils.comparator.dims_spec.types import (
+ _AXIS_LOOKUP,
+ _QUALIFIER_LOOKUP,
+ Ordering,
+ ParallelAxis,
+ ParallelModifier,
+ Reduction,
+)
+
+
def _parse_modifier_token(modifier_token: str, dim_token: str) -> ParallelModifier:
    """Parse 'sp', 'cp:zigzag', 'tp:partial', or 'cp:zigzag+partial' → ParallelModifier.

    Format: ``axis`` or ``axis:qual`` or ``axis:qual+qual``.
    Colon separates axis from qualifiers; ``+`` separates multiple qualifiers.
    """
    # Everything before the first colon is the axis; the rest (if any) is qualifiers.
    axis_part, _, quals_part = modifier_token.partition(":")

    axis_name: str = axis_part.strip()
    axis: Optional[ParallelAxis] = _AXIS_LOOKUP.get(axis_name)
    if axis is None:
        raise ValueError(
            f"Unknown axis {axis_name!r} in modifier {modifier_token!r} "
            f"of dim spec: {dim_token!r}"
        )

    ordering: Optional[Ordering] = None
    reduction: Optional[Reduction] = None

    for q_str in (token.strip() for token in quals_part.split("+")):
        # Empty tokens fall out of split(); 'sharded' is the implicit default.
        if not q_str or q_str == "sharded":
            continue
        qualifier: Optional[Ordering | Reduction] = _QUALIFIER_LOOKUP.get(q_str)
        if qualifier is None:
            raise ValueError(
                f"Unknown qualifier {q_str!r} in modifier "
                f"{modifier_token!r} of dim spec: {dim_token!r}"
            )
        if isinstance(qualifier, Ordering):
            if ordering is not None:
                raise ValueError(
                    f"Multiple ordering values in modifier "
                    f"{modifier_token!r} of dim spec: {dim_token!r}"
                )
            ordering = qualifier
        else:
            if reduction is not None:
                raise ValueError(
                    f"Multiple reduction values in modifier "
                    f"{modifier_token!r} of dim spec: {dim_token!r}"
                )
            reduction = qualifier

    return ParallelModifier(axis=axis, ordering=ordering, reduction=reduction)
+
+
def _parse_modifiers(
    *, modifiers_str: Optional[str], dim_token: str
) -> list[ParallelModifier]:
    """Parse a comma-separated modifier list; reject duplicate axes per dim."""
    if modifiers_str is None:
        return []

    result: list[ParallelModifier] = []
    used_axes: set[ParallelAxis] = set()

    for raw_token in modifiers_str.split(","):
        parsed: ParallelModifier = _parse_modifier_token(raw_token.strip(), dim_token)
        if parsed.axis in used_axes:
            raise ValueError(
                f"Duplicate axis {parsed.axis.value!r} in dim spec: {dim_token!r}"
            )
        used_axes.add(parsed.axis)
        result.append(parsed)

    return result
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/types.py b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..afb792076258aaaea8fde4b0e4fce52d4c1ef677
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/dims_spec/types.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+from enum import Enum
+from typing import Optional
+
+from sglang.srt.debug_utils.comparator.utils import _FrozenBase
+
# Canonical dim-name tokens used in dims strings.
TOKEN_DIM_NAME: str = "t"  # single flat token dim
BATCH_DIM_NAME: str = "b"  # batch dim
SEQ_DIM_NAME: str = "s"  # sequence dim
SQUEEZE_DIM_NAME: str = "1"  # size-1 dim; skipped in duplicate-name checks
+
+
class TokenLayout(Enum):
    """How tokens are laid out across dims."""

    T = "t"  # single flat token dim
    BS = "bs"  # separate batch + seq dims, need collapse
+
+
class ParallelAxis(Enum):
    """Parallelism axes a dim may be sharded (or declared replicated) over."""

    TP = "tp"
    CP = "cp"
    EP = "ep"
    SP = "sp"
    # NOTE(review): pseudo axis — semantics not visible in this module; confirm at usage sites.
    RECOMPUTE_PSEUDO = "recompute_pseudo"
+
+
class Ordering(Enum):
    """Token ordering qualifier for a sharded dim (e.g. ``cp:zigzag``)."""

    ZIGZAG = "zigzag"
    NATURAL = "natural"
+
+
class Reduction(Enum):
    """Reduction qualifier (e.g. ``tp:partial``); presumably means not-yet-reduced — confirm at usage sites."""

    PARTIAL = "partial"
+
+
class ParallelModifier(_FrozenBase):
    """Per-dim parallel annotation: an axis plus optional ordering/reduction qualifiers."""

    axis: ParallelAxis
    ordering: Optional[Ordering] = None
    reduction: Optional[Reduction] = None
+
+
# Reverse lookups from spec-string tokens to enum members.
_AXIS_LOOKUP: dict[str, ParallelAxis] = {m.value: m for m in ParallelAxis}
_QUALIFIER_LOOKUP: dict[str, Ordering | Reduction] = {
    **{m.value: m for m in Ordering},
    **{m.value: m for m in Reduction},
}

# Separator replacing '*' in fused dim names (safe for PyTorch named tensors).
_FUSED_NAME_SEP: str = "___"
+
+
class DimSpec(_FrozenBase):
    """One parsed dim: its (possibly fused) name plus parallel modifiers."""

    name: str
    parallel_modifiers: list[ParallelModifier] = []

    @property
    def sub_dims(self) -> list[str]:
        """Sub-dim names. Fused: ``["num_heads", "head_dim"]``; plain: ``["h"]``."""
        return self.name.split("*")

    @property
    def is_fused(self) -> bool:
        """True when the name fuses multiple sub-dims with ``*``."""
        return "*" in self.name

    @property
    def sanitized_name(self) -> str:
        """Name safe for PyTorch named tensors (``*`` → ``___``)."""
        return _FUSED_NAME_SEP.join(self.sub_dims) if self.is_fused else self.name
+
+
class DimsSpec(_FrozenBase):
    """Parsed result of a full dims string like ``"b s h[tp] # dp:=moe_dp"``."""

    dims: list[DimSpec]
    # Alternate dp-group name from the '#' suffix (e.g. "moe_dp"), if any.
    dp_group_alias: Optional[str] = None
    # Axes declared replicated in the '#' suffix; must not also be sharded in any dim.
    replicated_axes: frozenset[ParallelAxis] = frozenset()
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/display.py b/sglang/python/sglang/srt/debug_utils/comparator/display.py
new file mode 100644
index 0000000000000000000000000000000000000000..27ae0e955486b74bcc9effbd79347ed28c664784
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/display.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+from collections import defaultdict
+from io import StringIO
+from pathlib import Path
+from typing import Any, Optional
+
+import polars as pl
+
+from sglang.srt.debug_utils.comparator.output_types import (
+ InputIdsRecord,
+ RankInfoRecord,
+)
+from sglang.srt.debug_utils.comparator.report_sink import report_sink
+from sglang.srt.debug_utils.dump_loader import LOAD_FAILED, ValueWithMeta
+
+PARALLEL_INFO_KEYS: list[str] = ["sglang_parallel_info", "megatron_parallel_info"]
+
+
def emit_display_records(
    *,
    df: pl.DataFrame,
    dump_dir: Path,
    label: str,
    tokenizer: Any,
) -> None:
    """Collect rank-info and input-ids rows from one dump dir and add them to the report sink."""
    rank_rows = _collect_rank_info(df, dump_dir=dump_dir)
    token_rows = _collect_input_ids_and_positions(
        df, dump_dir=dump_dir, tokenizer=tokenizer
    )

    if rank_rows is not None:
        report_sink.add(RankInfoRecord(label=label, rows=rank_rows))
    if token_rows is not None:
        report_sink.add(InputIdsRecord(label=label, rows=token_rows))
+
+
def _render_polars_as_text(df: pl.DataFrame, *, title: Optional[str] = None) -> str:
    """Render a polars frame as a rich Table captured into a plain string."""
    from rich.console import Console
    from rich.table import Table

    table = Table(title=title)
    for column_name in df.columns:
        table.add_column(column_name)
    for row_values in df.iter_rows():
        table.add_row(*map(str, row_values))

    sink = StringIO()
    console = Console(file=sink, force_terminal=False, width=200)
    console.print(table)
    return sink.getvalue().rstrip("\n")
+
+
def _collect_rank_info(
    df: pl.DataFrame, dump_dir: Path
) -> Optional[list[dict[str, Any]]]:
    """One row per rank, built from each rank's first ``input_ids`` dump; None if absent."""
    per_rank = (
        df.filter(pl.col("name") == "input_ids")
        .sort("rank")
        .unique(subset=["rank"], keep="first")
    )
    if per_rank.is_empty():
        return None

    rows: list[dict[str, Any]] = []
    for entry in per_rank.to_dicts():
        meta: dict[str, Any] = ValueWithMeta.load(dump_dir / entry["filename"]).meta

        row: dict[str, Any] = {"rank": entry["rank"]}
        # Merge rank/size pairs from every known parallel-info block.
        for info_key in PARALLEL_INFO_KEYS:
            _extract_parallel_info(row_data=row, info=meta.get(info_key, {}))
        rows.append(row)

    return rows or None
+
+
def _collect_input_ids_and_positions(
    df: pl.DataFrame,
    dump_dir: Path,
    *,
    tokenizer: Any = None,
) -> Optional[list[dict[str, Any]]]:
    """One row per (step, rank) with input_ids/positions; decoded text when a tokenizer is given."""
    relevant = df.filter(pl.col("name").is_in(["input_ids", "positions"]))
    if relevant.is_empty():
        return None

    # (step, rank) -> {"input_ids": value, "positions": value}; failed loads skipped.
    grouped: dict[tuple[int, int], dict[str, Any]] = defaultdict(dict)
    for entry in relevant.to_dicts():
        loaded: ValueWithMeta = ValueWithMeta.load(dump_dir / entry["filename"])
        if loaded.value is LOAD_FAILED:
            continue
        grouped[(entry["step"], entry["rank"])][entry["name"]] = loaded.value

    rows: list[dict[str, Any]] = []
    for (step, rank), tensors in sorted(grouped.items()):
        ids_tensor = tensors.get("input_ids")
        positions_tensor = tensors.get("positions")

        ids_list: Optional[list[int]] = (
            None if ids_tensor is None else ids_tensor.flatten().tolist()
        )

        row: dict[str, Any] = {
            "step": step,
            "rank": rank,
            "num_tokens": None if ids_list is None else len(ids_list),
            "input_ids": "N/A" if ids_list is None else str(ids_list),
            "positions": (
                "N/A"
                if positions_tensor is None
                else str(positions_tensor.flatten().tolist())
            ),
        }

        if ids_list is not None and tokenizer is not None:
            row["decoded_text"] = repr(
                tokenizer.decode(ids_list, skip_special_tokens=False)
            )

        rows.append(row)

    return rows or None
+
+
+def _extract_parallel_info(row_data: dict[str, Any], info: dict[str, Any]) -> None:
+ if not info or info.get("error"):
+ return
+
+ for key in sorted(info.keys()):
+ if key.endswith("_rank"):
+ base: str = key[:-5]
+ size_key: str = f"{base}_size"
+ if size_key in info:
+ row_data[base] = f"{info[key]}/{info[size_key]}"
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/dp_utils.py b/sglang/python/sglang/srt/debug_utils/comparator/dp_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ab5d82d7cf9b5d3c866f7a9b5f91a42e5811c3e
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/dp_utils.py
@@ -0,0 +1,102 @@
+"""DP filtering: keep only the non-empty dp_rank items."""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import Optional
+
+import torch
+
+from sglang.srt.debug_utils.dump_loader import ValueWithMeta
+
# Meta keys that may carry the parallel-info block (checked in this order).
_PARALLEL_INFO_KEYS = ("sglang_parallel_info", "megatron_parallel_info")

# Default field names inside the parallel-info block (overridable via dp_group_alias).
_DP_RANK_FIELD = "dp_rank"
_DP_SIZE_FIELD = "dp_size"
+
+
def filter_to_non_empty_dp_rank(
    items: list[ValueWithMeta],
    *,
    dp_group_alias: Optional[str] = None,
) -> list[ValueWithMeta]:
    """Filter items to the single non-empty dp_rank.

    - dp_size <= 1: return items unchanged.
    - dp_size > 1: group by dp_rank, assert exactly one group has non-empty
      tensors, return that group.

    When *dp_group_alias* is set (e.g. ``"moe_dp"``), the function looks
    for ``<alias>_rank`` / ``<alias>_size`` instead of the default
    ``dp_rank`` / ``dp_size``. If the aliased fields are absent the
    filter is a noop (items returned unchanged).
    """
    if not items:
        return items

    # Probe the first item's meta; no dp info at all -> noop.
    dp_info: Optional[tuple[int, int]] = _extract_dp_info(
        items[0].meta, dp_group_alias=dp_group_alias
    )
    if dp_info is None:
        return items

    _dp_rank, dp_size = dp_info
    if dp_size <= 1:
        return items

    # Without any tensor values there is nothing to judge "emptiness" by.
    has_any_tensor: bool = any(isinstance(item.value, torch.Tensor) for item in items)
    if not has_any_tensor:
        return items

    # Bucket each item by its own dp_rank (fallback 0 when its meta lacks one).
    groups: dict[int, list[ValueWithMeta]] = defaultdict(list)
    for item in items:
        item_dp: Optional[tuple[int, int]] = _extract_dp_info(
            item.meta, dp_group_alias=dp_group_alias
        )
        rank: int = item_dp[0] if item_dp is not None else 0
        groups[rank].append(item)

    non_empty_ranks: list[int] = [
        rank for rank, group in groups.items() if _group_has_data(group)
    ]

    assert len(non_empty_ranks) == 1, (
        f"Expected exactly 1 non-empty dp_rank, got {len(non_empty_ranks)}: "
        f"ranks={non_empty_ranks}"
    )

    return groups[non_empty_ranks[0]]
+
+
def _extract_dp_info(
    meta: dict,
    *,
    dp_group_alias: Optional[str] = None,
) -> Optional[tuple[int, int]]:
    """Extract (dp_rank, dp_size) from meta's parallel_info block.

    When *dp_group_alias* is given, look for ``<alias>_rank``/``<alias>_size``
    instead of the default ``dp_rank``/``dp_size``. Returns None when no
    parallel-info block provides both fields.
    """
    rank_field: str = f"{dp_group_alias}_rank" if dp_group_alias else _DP_RANK_FIELD
    size_field: str = f"{dp_group_alias}_size" if dp_group_alias else _DP_SIZE_FIELD

    # Check each known parallel-info block in order; first complete pair wins.
    for key in _PARALLEL_INFO_KEYS:
        info = meta.get(key)
        if not isinstance(info, dict) or not info:
            continue

        dp_rank = info.get(rank_field)
        dp_size = info.get(size_field)
        if dp_rank is not None and dp_size is not None:
            return (int(dp_rank), int(dp_size))

    return None
+
+
+def _group_has_data(group: list[ValueWithMeta]) -> bool:
+ """Check if any tensor in the group is non-empty (numel > 0)."""
+ return any(
+ isinstance(item.value, torch.Tensor) and item.value.numel() > 0
+ for item in group
+ )
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/entrypoint.py b/sglang/python/sglang/srt/debug_utils/comparator/entrypoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ab483e1558f34cda1f30cbc560da29a2afb5747
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/entrypoint.py
@@ -0,0 +1,441 @@
+from __future__ import annotations
+
+import argparse
+import sys
+import traceback as _traceback_module
+from pathlib import Path
+from typing import Any, Iterator, Optional, Union
+
+import polars as pl
+
+from sglang.srt.debug_utils.comparator.aligner.token_aligner.entrypoint import (
+ TokenAlignerResult,
+ compute_maybe_token_aligner_result,
+)
+from sglang.srt.debug_utils.comparator.aligner.token_aligner.smart.aux_loader import (
+ AUX_NAMES,
+)
+from sglang.srt.debug_utils.comparator.aligner.token_aligner.smart.types import (
+ TokenAlignerPlan,
+)
+from sglang.srt.debug_utils.comparator.bundle_comparator import compare_bundle_pair
+from sglang.srt.debug_utils.comparator.bundle_matcher import (
+ TensorBundleInfo,
+ match_bundles,
+)
+from sglang.srt.debug_utils.comparator.display import emit_display_records
+from sglang.srt.debug_utils.comparator.meta_overrider import MetaOverrider
+from sglang.srt.debug_utils.comparator.output_types import (
+ ComparisonErrorRecord,
+ ComparisonNonTensorRecord,
+ ComparisonSkipRecord,
+ ComparisonTensorRecord,
+ ConfigRecord,
+ RecordLocation,
+ SummaryRecord,
+)
+from sglang.srt.debug_utils.comparator.per_token_visualizer import (
+ generate_per_token_heatmap,
+)
+from sglang.srt.debug_utils.comparator.preset import PRESETS, expand_preset
+from sglang.srt.debug_utils.comparator.report_sink import report_sink
+from sglang.srt.debug_utils.comparator.utils import (
+ Pair,
+ auto_descend_dir,
+ compute_exit_code,
+)
+from sglang.srt.debug_utils.dump_loader import read_meta, read_tokenizer_path
+
+_DEFAULT_SKIP_KEYS: set[str] = {"dump_index", "filename"}
+
+
def main() -> None:
    """CLI entry point: parse argv, run the comparison, exit with its status code."""
    args = parse_args(sys.argv[1:])
    sys.exit(run(args))
+
+
def run(args: argparse.Namespace) -> int:
    """Execute the comparison pipeline and return the process exit code.

    Phases: configure report sink → load metadata → emit display records →
    token alignment → bundle matching → per-bundle comparison → summary /
    exit-code computation. The report sink is always closed on exit.
    """
    # Early configure (no report path yet) so records emitted before the
    # target dir is resolved still have a configured sink.
    report_sink.configure(
        output_format=args.output_format,
        report_path=None,
        verbosity=args.verbosity,
    )

    dir_pair: Pair[Path] = Pair(
        x=auto_descend_dir(Path(args.baseline_path), label="baseline_path"),
        y=auto_descend_dir(Path(args.target_path), label="target_path"),
    )
    viz_output_dir: Optional[Path] = (
        Path(args.viz_output_dir) if args.viz_bundle_details else None
    )
    visualize_per_token: Optional[Path] = (
        Path(args.visualize_per_token) if args.visualize_per_token else None
    )
    override_config: Optional[Path] = (
        Path(args.override_config) if args.override_config else None
    )

    report_path: Optional[Path] = _resolve_report_path(
        target_path=dir_pair.y,
        report_path_arg=args.report_path,
    )
    # Reconfigure now that the report path is known.
    report_sink.configure(
        output_format=args.output_format,
        report_path=report_path,
        verbosity=args.verbosity,
    )

    try:
        report_sink.add(ConfigRecord(config=vars(args)))

        dfs: Pair[pl.DataFrame] = _read_df(
            dir_pair=dir_pair,
            start_step=args.start_step,
            end_step=args.end_step,
            filter_pattern=args.filter,
        )

        tokenizer: Any = _maybe_load_tokenizer(
            tokenizer_arg=args.tokenizer, dir_pair=dir_pair
        )
        for label, df, dump_dir in [
            ("baseline", dfs.x, dir_pair.x),
            ("target", dfs.y, dir_pair.y),
        ]:
            emit_display_records(
                df=df, dump_dir=dump_dir, label=label, tokenizer=tokenizer
            )

        ta_result: TokenAlignerResult = compute_maybe_token_aligner_result(
            dir_pair=dir_pair,
            dfs=dfs,
            token_aligner_mode=args.token_aligner,
        )

        # Smart alignment already consumed the aux tensors; exclude them
        # from the actual tensor comparison.
        if ta_result.mode == "smart":
            dfs = dfs.map(lambda df: df.filter(~pl.col("name").is_in(AUX_NAMES)))

        skip_keys: set[str] = _DEFAULT_SKIP_KEYS | set(args.grouping_skip_keys or [])
        bundle_info_pairs: list[Pair[TensorBundleInfo]] = match_bundles(
            dfs=dfs, skip_keys=skip_keys
        )

        meta_overrider: MetaOverrider = MetaOverrider.from_args_and_config(
            override_dims=args.override_dims,
            override_baseline_dims=args.override_baseline_dims,
            override_target_dims=args.override_target_dims,
            override_config=override_config,
        )

        comparison_records = _compare_bundle_pairs(
            bundle_info_pairs=bundle_info_pairs,
            dir_pair=dir_pair,
            token_aligner_mode=ta_result.mode,
            token_aligner_plan=ta_result.plan,
            diff_threshold=args.diff_threshold,
            thd_seq_lens_by_step_pair=ta_result.thd_seq_lens_by_step_pair,
            viz_output_dir=viz_output_dir,
            compute_per_token=visualize_per_token is not None,
            meta_overrider=meta_overrider,
        )
        summary, skipped_names, failed_names, errored_names = (
            _consume_comparison_records(
                comparison_records=comparison_records,
                visualize_per_token=visualize_per_token,
            )
        )
        return compute_exit_code(
            summary,
            allow_skipped_pattern=args.allow_skipped_pattern,
            skipped_names=skipped_names,
            allow_failed_pattern=args.allow_failed_pattern,
            failed_names=failed_names,
            errored_names=errored_names,
        )
    finally:
        report_sink.close()
        if report_path is not None:
            print(f"Report: {report_path}", file=sys.stderr)
+
+
+def _resolve_report_path(
+ *, target_path: Path, report_path_arg: Optional[str]
+) -> Optional[Path]:
+ if report_path_arg is not None:
+ return Path(report_path_arg) if report_path_arg else None
+ return target_path / "comparator_report.jsonl"
+
+
def _maybe_load_tokenizer(*, tokenizer_arg: Optional[str], dir_pair: Pair[Path]) -> Any:
    """Resolve a tokenizer path (CLI arg, else dump metadata) and load it.

    Best-effort: returns None when no path is found or loading fails —
    decoded text is optional display sugar.
    """
    path: Optional[str] = tokenizer_arg
    if path is None:
        for dump_dir in (dir_pair.x, dir_pair.y):
            path = read_tokenizer_path(dump_dir)
            if path is not None:
                break

    if path is None:
        return None

    try:
        from transformers import AutoTokenizer

        return AutoTokenizer.from_pretrained(path)
    except Exception:
        # Deliberate silent fallback: a missing/broken tokenizer must not
        # abort the comparison.
        return None
+
+
def _read_df(
    *,
    dir_pair: Pair[Path],
    start_step: int,
    end_step: int,
    filter_pattern: Optional[str],
) -> Pair[pl.DataFrame]:
    """Load dump metadata for both sides.

    Step-range and filename filters apply to the target side only; the
    baseline is loaded in full.
    """
    baseline_df = read_meta(dir_pair.x)

    target_df = read_meta(dir_pair.y).filter(
        (pl.col("step") >= start_step) & (pl.col("step") <= end_step)
    )
    if filter_pattern:
        target_df = target_df.filter(
            pl.col("filename").str.contains(filter_pattern)
        )

    required_columns = ["rank", "step", "dump_index", "name"]
    assert all(col in target_df.columns for col in required_columns)

    return Pair(x=baseline_df, y=target_df)
+
+
def _compare_bundle_pairs(
    *,
    bundle_info_pairs: list[Pair[TensorBundleInfo]],
    dir_pair: Pair[Path],
    token_aligner_mode: Optional[str],
    token_aligner_plan: Optional[TokenAlignerPlan],
    diff_threshold: float,
    thd_seq_lens_by_step_pair: Pair[Optional[dict[int, list[int]]]],
    viz_output_dir: Optional[Path] = None,
    compute_per_token: bool = False,
    meta_overrider: Optional[MetaOverrider] = None,
) -> Iterator[
    Union[
        ComparisonTensorRecord,
        ComparisonSkipRecord,
        ComparisonNonTensorRecord,
        ComparisonErrorRecord,
    ]
]:
    """Lazily compare each matched bundle pair, yielding one record per pair.

    Exceptions from a single comparison are converted into
    ComparisonErrorRecord so one bad tensor cannot abort the whole run.
    Pairs with no target-side bundles are skipped silently.
    """
    for bundle_info_pair in bundle_info_pairs:
        if not bundle_info_pair.y:
            continue

        name: str = bundle_info_pair.y[0].name
        filenames_pair: Pair[list[str]] = bundle_info_pair.map(
            lambda infos: [info.filename for info in infos]
        )

        record: Union[
            ComparisonTensorRecord,
            ComparisonSkipRecord,
            ComparisonNonTensorRecord,
            ComparisonErrorRecord,
        ]
        try:
            record = compare_bundle_pair(
                name=name,
                filenames_pair=filenames_pair,
                dir_pair=dir_pair,
                token_aligner_mode=token_aligner_mode,
                token_aligner_plan=token_aligner_plan,
                diff_threshold=diff_threshold,
                thd_seq_lens_by_step_pair=thd_seq_lens_by_step_pair,
                viz_output_dir=viz_output_dir,
                compute_per_token=compute_per_token,
                meta_overrider=meta_overrider,
            )
        except Exception as exc:
            record = ComparisonErrorRecord(
                name=name,
                exception_type=type(exc).__name__,
                traceback_str=_traceback_module.format_exc(),
            )

        # Attach a step location only when all target bundles share one step.
        target_steps: set[int] = {info.step for info in bundle_info_pair.y}
        step: Optional[int] = target_steps.pop() if len(target_steps) == 1 else None
        if step is not None:
            record = record.model_copy(update={"location": RecordLocation(step=step)})

        yield record
+
+
def _consume_comparison_records(
    *,
    comparison_records: Iterator[
        Union[
            ComparisonTensorRecord,
            ComparisonSkipRecord,
            ComparisonNonTensorRecord,
            ComparisonErrorRecord,
        ]
    ],
    visualize_per_token: Optional[Path] = None,
) -> tuple[SummaryRecord, list[str], list[str], list[str]]:
    """Drain records into the report sink and tally per-category counts.

    Returns (summary, skipped_names, failed_names, errored_names); also
    renders the per-token heatmap when requested and any tensor records exist.
    """
    counts: dict[str, int] = {"passed": 0, "failed": 0, "skipped": 0, "errored": 0}
    per_token_records: list[ComparisonTensorRecord] = []
    skipped_names: list[str] = []
    failed_names: list[str] = []
    errored_names: list[str] = []

    want_per_token: bool = visualize_per_token is not None
    for record in comparison_records:
        counts[record.category] += 1
        report_sink.add(record)
        if record.category == "skipped" and isinstance(record, ComparisonSkipRecord):
            skipped_names.append(record.name)
        if record.category == "failed":
            failed_names.append(record.name)
        if isinstance(record, ComparisonErrorRecord):
            errored_names.append(record.name)
        if want_per_token and isinstance(record, ComparisonTensorRecord):
            per_token_records.append(record)

    summary = SummaryRecord(total=sum(counts.values()), **counts)
    report_sink.add(summary)

    if want_per_token and per_token_records:
        generate_per_token_heatmap(
            records=per_token_records,
            output_path=visualize_per_token,
        )

    return summary, skipped_names, failed_names, errored_names
+
+
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse CLI arguments from an argv list. Applies preset expansion."""
    argv = expand_preset(argv, presets=PRESETS)

    parser = argparse.ArgumentParser()
    # Input selection
    parser.add_argument("--baseline-path", type=str)
    parser.add_argument("--target-path", type=str)
    parser.add_argument("--start-step", type=int, default=0)
    parser.add_argument("--end-step", type=int, default=1000000)
    parser.add_argument("--diff-threshold", type=float, default=1e-3)
    parser.add_argument(
        "--filter", type=str, default=None, help="Regex to filter filenames (include)"
    )
    # Output formatting
    parser.add_argument(
        "--output-format",
        type=str,
        choices=["text", "json"],
        default="text",
        help="Output format: text (default) or json (JSONL, one JSON object per line)",
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        choices=["minimal", "normal", "verbose"],
        default="normal",
        help="Output verbosity: minimal (1 line per tensor), normal (compact lifecycle), "
        "verbose (full detail). Default: normal",
    )
    parser.add_argument(
        "--preset",
        type=str,
        choices=list(PRESETS.keys()),
        default=None,
        help="Preset configuration (expanded before parsing). "
        f"Available: {list(PRESETS.keys())}",
    )
    parser.add_argument(
        "--grouping-skip-keys",
        nargs="*",
        default=None,
        help="Metadata keys to skip when grouping bundles (additive on top of "
        "always-skipped dump_index and filename). "
        "E.g. '--grouping-skip-keys rank step' skips rank and step.",
    )
    parser.add_argument(
        "--token-aligner",
        type=str,
        choices=["smart", "concat_steps"],
        default=None,
        help="Token aligner mode: concat_steps (BS=1, no aux needed) or smart (BS>1, sequence matching). "
        "Default None (per-step comparison).",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        default=None,
        help="Tokenizer path for decoding input_ids (auto-discovered from dump metadata if not set)",
    )
    # Visualization
    parser.add_argument(
        "--viz-bundle-details",
        action="store_true",
        default=False,
        help="Generate comparison heatmap/histogram PNG for each compared tensor",
    )
    parser.add_argument(
        "--viz-output-dir",
        type=str,
        default="/tmp/comparator_viz/",
        help="Output directory for visualization PNGs (default: /tmp/comparator_viz/)",
    )
    parser.add_argument(
        "--visualize-per-token",
        type=str,
        default=None,
        help="Output path for per-token relative difference heatmap PNG",
    )

    # Dims override
    parser.add_argument(
        "--override-dims",
        action="append",
        default=[],
        help="Override dims for both sides: 'name:dims_string' (repeatable)",
    )
    parser.add_argument(
        "--override-baseline-dims",
        action="append",
        default=[],
        help="Override dims for baseline only: 'name:dims_string' (repeatable)",
    )
    parser.add_argument(
        "--override-target-dims",
        action="append",
        default=[],
        help="Override dims for target only: 'name:dims_string' (repeatable)",
    )
    parser.add_argument(
        "--override-config",
        type=str,
        default=None,
        help="Path to YAML override config file (dims overrides, etc.)",
    )
    # Exit-code policy
    parser.add_argument(
        "--allow-skipped-pattern",
        type=str,
        default=".*",
        help="Regex pattern for tensor names allowed to be skipped. "
        "Default '.*' allows all skips. Use '^$' to forbid all skips.",
    )
    parser.add_argument(
        "--allow-failed-pattern",
        type=str,
        default=None,
        help="Regex pattern for tensor names allowed to fail without affecting exit code. "
        "Default None (all failures affect exit code).",
    )

    # Report output
    parser.add_argument(
        "--report-path",
        type=str,
        default=None,
        help="Path for JSONL report (default: /comparator_report.jsonl). "
        "Pass empty string '' to disable.",
    )

    return parser.parse_args(argv)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/log_sink.py b/sglang/python/sglang/srt/debug_utils/comparator/log_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..8515fa84776de7d2c8922faa7c10cc35829be79b
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/log_sink.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import Generator
+
+from sglang.srt.debug_utils.comparator.output_types import BaseLog
+
+
class LogSink:
    """Collects logs into the innermost active context bucket.

    Outside any context, each log is forwarded immediately to the report
    sink wrapped in a standalone LogRecord.
    """

    def __init__(self) -> None:
        # Stack of buckets; the last entry is the active context.
        self._stack: list[list[BaseLog]] = []

    @contextmanager
    def context(self) -> Generator[list[BaseLog], None, None]:
        """Push a fresh bucket, yield it, and pop it on exit."""
        bucket: list[BaseLog] = []
        self._stack.append(bucket)
        try:
            yield bucket
        finally:
            top = self._stack.pop()
            assert top is bucket

    def add(self, log: BaseLog) -> None:
        """Append to the active bucket, or emit directly when no context is open."""
        if not self._stack:
            # Deferred imports avoid a circular dependency at module load.
            from sglang.srt.debug_utils.comparator.output_types import (
                LogRecord,
                _split_logs,
            )
            from sglang.srt.debug_utils.comparator.report_sink import report_sink

            errors, infos = _split_logs([log])
            report_sink.add(LogRecord(errors=errors, infos=infos))
            return

        self._stack[-1].append(log)


log_sink = LogSink()
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/meta_overrider.py b/sglang/python/sglang/srt/debug_utils/comparator/meta_overrider.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1ae48eb25ca45c31aaf0c928eab8aae2e8c32de
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/meta_overrider.py
@@ -0,0 +1,107 @@
+"""Meta overrider: replace metadata fields without re-running dumps.
+
+Currently only overrides 'dims', but the design supports overriding
+additional meta fields (e.g. parallel_info) in the future.
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from typing import Any, Literal, Optional
+
+import yaml
+
+from sglang.srt.debug_utils.comparator.utils import _StrictBase
+
+
class MetaOverrideRule(_StrictBase):
    """Single override rule: regex match on tensor name → replacement meta field(s).

    Currently only 'dims' is supported; more fields may be added in the future.
    """

    match: str  # regex applied with re.search against the tensor name
    dims: str  # replacement dims string
    side: Literal["both", "baseline", "target"] = "both"  # which side(s) the rule applies to
+
+
class MetaOverrideConfig(_StrictBase):
    """YAML top-level config for overriding comparator behavior."""

    # Applied first-match-wins, in file order.
    overrides: list[MetaOverrideRule] = []
+
+
class MetaOverrider:
    """Holds override rules and applies first-match-wins replacement."""

    def __init__(self, rules: list[MetaOverrideRule]) -> None:
        self._rules: list[MetaOverrideRule] = rules

    @property
    def is_empty(self) -> bool:
        """True when no rules are configured."""
        return not self._rules

    @classmethod
    def from_args_and_config(
        cls,
        *,
        override_dims: list[str],
        override_baseline_dims: list[str],
        override_target_dims: list[str],
        override_config: Optional[Path],
    ) -> "MetaOverrider":
        """Build rules from CLI args (checked first) plus an optional YAML config."""
        rules: list[MetaOverrideRule] = []

        side_to_args: list[tuple[Literal["both", "baseline", "target"], list[str]]] = [
            ("both", override_dims),
            ("baseline", override_baseline_dims),
            ("target", override_target_dims),
        ]
        for side, raw_args in side_to_args:
            for raw in raw_args:
                name, dims_str = _parse_cli_override_arg(raw)
                rules.append(MetaOverrideRule(match=name, dims=dims_str, side=side))

        if override_config is not None:
            rules.extend(_load_yaml_rules(override_config))

        return cls(rules=rules)

    def apply_to_meta(
        self,
        *,
        name: str,
        meta: dict[str, Any],
        side: Literal["baseline", "target"],
    ) -> dict[str, Any]:
        """First-match-wins: return meta with dims replaced by the first matching rule for this side."""
        for rule in self._rules:
            if rule.side not in ("both", side):
                continue
            if re.search(rule.match, name):
                patched = dict(meta)
                patched["dims"] = rule.dims
                return patched

        return meta
+
+
+def _parse_cli_override_arg(raw: str) -> tuple[str, str]:
+ """Parse 'name:dims_string' from a CLI --override-* argument."""
+ parts: list[str] = raw.split(":", maxsplit=1)
+ if len(parts) != 2 or not parts[0].strip() or not parts[1].strip():
+ raise ValueError(
+ f"Invalid override format: {raw!r}; expected 'name:dims_string'"
+ )
+ return parts[0].strip(), parts[1].strip()
+
+
def _load_yaml_rules(path: Path) -> list[MetaOverrideRule]:
    """Load override rules from a YAML config file ([] for an empty file)."""
    with path.open() as fh:
        raw_data: Any = yaml.safe_load(fh)

    if raw_data is None:
        return []

    return MetaOverrideConfig.model_validate(raw_data).overrides
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/output_formatter.py b/sglang/python/sglang/srt/debug_utils/comparator/output_formatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5859ad0c48765b230abeb09bc3e3d4b5eb9cd4d
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/output_formatter.py
@@ -0,0 +1,331 @@
+"""Formatting functions for comparator output records.
+
+Extracted from output_types.py to separate data-structure definitions
+from rendering / formatting logic.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+from rich.console import Group
+from rich.markup import escape
+from rich.panel import Panel
+
+from sglang.srt.debug_utils.comparator.tensor_comparator.formatter import (
+ format_comparison,
+ format_replicated_checks,
+)
+
+if TYPE_CHECKING:
+ from rich.console import RenderableType
+
+ from sglang.srt.debug_utils.comparator.aligner.entrypoint.traced_types import (
+ TracedAlignerPlan,
+ TracedSubPlan,
+ )
+ from sglang.srt.debug_utils.comparator.aligner.entrypoint.types import AlignerPlan
+ from sglang.srt.debug_utils.comparator.output_types import (
+ ComparisonErrorRecord,
+ ComparisonNonTensorRecord,
+ ComparisonSkipRecord,
+ ComparisonTensorRecord,
+ ConfigRecord,
+ ErrorLog,
+ InfoLog,
+ LogRecord,
+ SummaryRecord,
+ _OutputRecord,
+ _TableRecord,
+ )
+
+Verbosity = Literal["minimal", "normal", "verbose"]
+
+
+# ── Record-level rendering (body + logs) ─────────────────────────────
+
+
def _render_record_rich(
    record: _OutputRecord, *, verbosity: Verbosity = "normal"
) -> RenderableType:
    """Render a record's rich body with any attached log lines appended."""
    body: RenderableType = record._format_rich_body(verbosity=verbosity)
    log_lines = _format_log_lines_rich(errors=record.errors, infos=record.infos)

    if not log_lines:
        return body

    joined = "\n".join(log_lines)
    # Plain strings can be concatenated; other renderables need a Group.
    if isinstance(body, str):
        return body + "\n" + joined
    return Group(body, joined)
+
+
def _render_record_text(record: _OutputRecord) -> str:
    """Render a record as plain text with any attached log lines appended."""
    parts: list[str] = [record._format_body()]
    suffix = _format_log_lines_text(errors=record.errors, infos=record.infos)
    if suffix:
        parts.append(suffix)
    return "\n".join(parts)
+
+
+def _format_log_lines_rich(
+ *, errors: list[ErrorLog], infos: list[InfoLog]
+) -> list[str]:
+ lines: list[str] = []
+
+ if errors:
+ lines.extend(f" [red]✗ {e.to_text()}[/]" for e in errors)
+ if infos:
+ lines.extend(f" [dim]ℹ {i.to_text()}[/]" for i in infos)
+
+ return lines
+
+
+def _format_log_lines_text(*, errors: list[ErrorLog], infos: list[InfoLog]) -> str:
+ lines: list[str] = []
+
+ if errors:
+ lines.extend(f" ✗ {e.to_text()}" for e in errors)
+ if infos:
+ lines.extend(f" ℹ {i.to_text()}" for i in infos)
+
+ return "\n".join(lines)
+
+
+# ── ConfigRecord ──────────────────────────────────────────────────────
+
+
+def _format_config_body(record: ConfigRecord) -> str:
+ return f"Config: {record.config}"
+
+
def _format_config_rich_body(
    record: ConfigRecord, verbosity: Verbosity = "normal"
) -> RenderableType:
    """Rich panel listing config key/value pairs.

    Keys and values are markup-escaped: config values frequently contain
    characters like '[' (paths, repr of lists) that rich would otherwise
    try to parse as markup tags.
    """
    lines: list[str] = [
        f" [bold]{escape(str(k))}[/] : {escape(str(v))}"
        for k, v in record.config.items()
    ]
    return Panel("\n".join(lines), title="Comparator Config", border_style="cyan")
+
+
+# ── ComparisonSkipRecord ─────────────────────────────────────────────
+
+
+def _format_skip_body(record: ComparisonSkipRecord) -> str:
+ return f"Skip: {record.name}{record._format_location_suffix()} ({record.reason})"
+
+
def _format_skip_rich_body(
    record: ComparisonSkipRecord, verbosity: Verbosity = "normal"
) -> RenderableType:
    """Dimmed one-liner for a skipped comparison; name/reason are escaped."""
    name = escape(record.name)
    reason = escape(record.reason)
    suffix = record._format_location_suffix()
    return f"[dim]⊘ {name}{suffix} ── skipped ({reason})[/]"
+
+
+# ── ComparisonErrorRecord ────────────────────────────────────────────
+
+
+def _format_error_body(record: ComparisonErrorRecord) -> str:
+ prefix: str = record._format_location_prefix()
+ return (
+ f"{prefix}Error: {record.name} ({record.exception_type})\n"
+ f"{record.traceback_str}"
+ )
+
+
def _format_error_rich_body(
    record: ComparisonErrorRecord, verbosity: Verbosity = "normal"
) -> RenderableType:
    """Rich rendering of an errored comparison; traceback hidden at 'minimal'."""
    header = (
        record._format_location_prefix_rich()
        + f"[bold red]{escape(record.name)} ── errored "
        + f"({escape(record.exception_type)})[/]"
    )
    if verbosity == "minimal":
        return header
    return f"{header}\n[dim]{escape(record.traceback_str)}[/]"
+
+
+# ── _TableRecord ─────────────────────────────────────────────────────
+
+
def _format_table_body(record: _TableRecord) -> str:
    """Plain-text table rendering of the record's rows via polars."""
    # Imported lazily so polars is only required when tables are rendered.
    import polars as pl

    from sglang.srt.debug_utils.comparator.display import _render_polars_as_text

    frame = pl.DataFrame(record.rows)
    return _render_polars_as_text(frame, title=record._table_title())
+
+
def _format_table_rich_body(
    record: _TableRecord, verbosity: Verbosity = "normal"
) -> RenderableType:
    """Rich table rendering of the record's rows via polars."""
    # Imported lazily so polars is only required when tables are rendered.
    import polars as pl

    from sglang.srt.debug_utils.comparator.display import (
        _render_polars_as_rich_table,
    )

    frame = pl.DataFrame(record.rows)
    return _render_polars_as_rich_table(frame, title=record._table_title())
+
+
+# ── ComparisonTensorRecord ───────────────────────────────────────────
+
+
def _format_tensor_comparison_body(record: ComparisonTensorRecord) -> str:
    """Plain-text tensor comparison: main diff, replica checks, aligner plan."""
    parts: list[str] = [record._format_location_prefix() + format_comparison(record)]
    if record.replicated_checks:
        parts.append(format_replicated_checks(record.replicated_checks))
    if record.traced_plan is not None:
        parts.append(_format_aligner_plan(record.traced_plan))
    return "\n".join(parts)
+
+
def _format_tensor_comparison_rich_body(
    record: ComparisonTensorRecord, verbosity: Verbosity = "normal"
) -> RenderableType:
    """Rich rendering of a tensor comparison (delegates to the tensor formatter)."""
    # Imported lazily to avoid a circular import with the tensor formatter.
    from sglang.srt.debug_utils.comparator.tensor_comparator.formatter import (
        format_comparison_rich,
    )

    prefix = record._format_location_prefix_rich()
    return prefix + format_comparison_rich(record=record, verbosity=verbosity)
+
+
+# ── ComparisonNonTensorRecord ────────────────────────────────────────
+
+
+def _format_non_tensor_body(record: ComparisonNonTensorRecord) -> str:
+ suffix: str = record._format_location_suffix()
+ if record.values_equal:
+ return f"NonTensor: {record.name}{suffix} = {record.baseline_value} ({record.baseline_type}) [equal]"
+ return (
+ f"NonTensor: {record.name}{suffix}\n"
+ f" baseline = {record.baseline_value} ({record.baseline_type})\n"
+ f" target = {record.target_value} ({record.target_type})"
+ )
+
+
def _format_non_tensor_rich_body(
    record: ComparisonNonTensorRecord, verbosity: Verbosity = "normal"
) -> RenderableType:
    """Rich rendering of a non-tensor comparison.

    All dynamic fields are escaped — including the type names, which were
    previously interpolated raw: a type like 'list[int]' contains '[' and
    would be misparsed as a rich markup tag.
    """
    suffix: str = record._format_location_suffix()
    name: str = escape(record.name)
    baseline_val: str = escape(record.baseline_value)
    target_val: str = escape(record.target_value)
    baseline_type: str = escape(record.baseline_type)
    target_type: str = escape(record.target_type)

    if record.values_equal:
        return (
            f"═ {name}{suffix} = {baseline_val} "
            f"({baseline_type}) [green]✓[/]"
        )
    return (
        f"═ [bold red]{name}{suffix}[/]\n"
        f" baseline = {baseline_val} ({baseline_type})\n"
        f" target = {target_val} ({target_type})"
    )
+
+
+# ── SummaryRecord ────────────────────────────────────────────────────
+
+
+def _format_summary_body(record: SummaryRecord) -> str:
+ text: str = (
+ f"Summary: {record.passed} passed, {record.failed} failed, "
+ f"{record.skipped} skipped (total {record.total})"
+ )
+ if record.errored > 0:
+ text += f", {record.errored} errored"
+ return text
+
+
def _format_summary_rich_body(
    record: SummaryRecord, verbosity: Verbosity = "normal"
) -> RenderableType:
    """Rich panel summarising pass/fail/skip(/error) counts."""
    segments: list[str] = [
        f"[bold green]{record.passed} passed[/]",
        f"[bold red]{record.failed} failed[/]",
        f"[yellow]{record.skipped} skipped[/]",
        f"{record.total} total",
    ]
    if record.errored > 0:
        segments.append(f"[bold red]{record.errored} errored[/]")
    return Panel(" │ ".join(segments), title="SUMMARY", border_style="bold")
+
+
+# ── LogRecord ────────────────────────────────────────────────────────
+
+
+def _format_log_body(record: LogRecord) -> str:
+ return ""
+
+
+# ── Standalone helpers ───────────────────────────────────────────────
+
+
def _format_aligner_plan(traced_plan: TracedAlignerPlan) -> str:
    """Plain-text summary of a traced aligner plan, one line per side."""
    lines: list[str] = ["Aligner Plan:"]

    # Pair convention in this module: x = baseline, y = target.
    sides = (
        ("baseline", traced_plan.per_side.x),
        ("target", traced_plan.per_side.y),
    )
    for side_label, traced_side in sides:
        if not traced_side.step_plans:
            lines.append(f" {side_label}: (no steps)")
            continue

        summaries: list[str] = []
        for traced_step in traced_side.step_plans:
            subs = [_format_sub_plan_text(sub) for sub in traced_step.sub_plans]
            body = ", ".join(subs) if subs else "passthrough"
            summaries.append(f"step={traced_step.step}: {body}")
        lines.append(f" {side_label}: [{'; '.join(summaries)}]")

    lines.extend(_format_cross_side_plan_text(traced_plan.plan))
    return "\n".join(lines)
+
+
+def _format_sub_plan_text(traced_sub: TracedSubPlan) -> str:
+ sub_desc: str = f"{traced_sub.plan.type}"
+
+ if traced_sub.snapshot is not None:
+ snap = traced_sub.snapshot
+ in_count: int = len(snap.input_shapes)
+ out_count: int = len(snap.output_shapes)
+ in_shape: str = str(snap.input_shapes[0]) if snap.input_shapes else "?"
+ out_shape: str = str(snap.output_shapes[0]) if snap.output_shapes else "?"
+ sub_desc += f" {in_count}x{in_shape} -> {out_count}x{out_shape}"
+
+ return sub_desc
+
+
+def _format_cross_side_plan_text(plan: AlignerPlan) -> list[str]:
+ lines: list[str] = []
+
+ if plan.token_aligner_plan is not None:
+ num_tokens: int = len(plan.token_aligner_plan.locators.x.steps)
+ lines.append(f" token_aligner: {num_tokens} tokens aligned")
+
+ if plan.axis_aligner_plan is not None:
+ parts: list[str] = []
+ if plan.axis_aligner_plan.pattern.x:
+ parts.append(f"x: {plan.axis_aligner_plan.pattern.x}")
+ if plan.axis_aligner_plan.pattern.y:
+ parts.append(f"y: {plan.axis_aligner_plan.pattern.y}")
+ lines.append(f" axis_aligner: {', '.join(parts)}")
+
+ return lines
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/output_types.py b/sglang/python/sglang/srt/debug_utils/comparator/output_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..c24615910ee5f28e14a82af7fbc3b8f8e6fce921
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/output_types.py
@@ -0,0 +1,312 @@
+from __future__ import annotations
+
+from abc import abstractmethod
+from typing import TYPE_CHECKING, Annotated, Any, Literal, Optional, Union
+
+from pydantic import ConfigDict, Discriminator, Field, TypeAdapter, model_validator
+from rich.console import RenderableType
+from rich.markup import escape
+
+from sglang.srt.debug_utils.comparator.output_formatter import ( # noqa: F401 — re-export
+ _format_aligner_plan as _format_aligner_plan,
+)
+from sglang.srt.debug_utils.comparator.output_formatter import (
+ _format_config_body,
+ _format_config_rich_body,
+ _format_error_body,
+ _format_error_rich_body,
+ _format_log_body,
+ _format_non_tensor_body,
+ _format_non_tensor_rich_body,
+ _format_skip_body,
+ _format_skip_rich_body,
+ _format_summary_body,
+ _format_summary_rich_body,
+ _format_table_body,
+ _format_table_rich_body,
+ _format_tensor_comparison_body,
+ _format_tensor_comparison_rich_body,
+ _render_record_rich,
+ _render_record_text,
+)
+from sglang.srt.debug_utils.comparator.tensor_comparator.types import (
+ DiffInfo,
+ TensorComparisonInfo,
+)
+from sglang.srt.debug_utils.comparator.utils import Pair, _StrictBase
+
+if TYPE_CHECKING:
+ from sglang.srt.debug_utils.comparator.aligner.entrypoint.traced_types import (
+ TracedAlignerPlan,
+ )
+ from sglang.srt.debug_utils.comparator.report_sink import Verbosity
+
+
class BaseLog(_StrictBase):
    """Base shape for a structured log entry attached to an output record."""

    # Grouping label for the entry; semantics set by the producer of the log.
    category: str
    # Human-readable message shown in rendered output.
    message: str

    def to_text(self) -> str:
        """Plain-text form of this entry (currently just the message)."""
        return self.message
+
+
class ErrorLog(BaseLog):
    """Log entry describing a problem encountered while comparing."""

    kind: Literal["error"] = "error"


class InfoLog(BaseLog):
    """Informational, non-fatal log entry."""

    kind: Literal["info"] = "info"


# Discriminated union of log entries; the "kind" field selects the model.
AnyLog = Annotated[Union[ErrorLog, InfoLog], Discriminator("kind")]
+
+
def _split_logs(logs: list[BaseLog]) -> tuple[list[ErrorLog], list[InfoLog]]:
    """Partition a mixed log list into (errors, infos), preserving order."""
    errors = list(filter(lambda entry: isinstance(entry, ErrorLog), logs))
    infos = list(filter(lambda entry: isinstance(entry, InfoLog), logs))
    return errors, infos
+
+
class ReplicatedCheckResult(_StrictBase):
    """Result of comparing one replica against a baseline replica.

    NOTE(review): per-field semantics below are inferred from names —
    confirm against the producer of these checks.
    """

    axis: str              # parallel axis the check was run over
    group_index: int       # index of the replication group
    compared_index: int    # replica index that was compared
    baseline_index: int    # replica index used as reference
    passed: bool
    atol: float            # absolute tolerance used by the check
    diff: Optional[DiffInfo] = None  # populated when a diff was computed
+
+
class BundleFileInfo(_StrictBase):
    """Per-file info within a bundle (one rank's raw tensor)."""

    shape: list[int]
    dtype: str
    rank: Optional[int] = None
    parallel_info: Optional[dict[str, str]] = None  # e.g. {"tp": "0/4", "ep": "1/2"}


class BundleSideInfo(_StrictBase):
    """Aggregate file info for one side of a comparison bundle."""

    num_files: int
    files: list[BundleFileInfo]
    dims: Optional[str] = None  # e.g. "b s h(tp) d"


class ShapeSnapshot(_StrictBase):
    """Input/output shapes captured around an aligner sub-step."""

    input_shapes: list[list[int]]
    output_shapes: list[list[int]]
+
+
class _OutputRecord(_StrictBase):
    """Base class for everything emitted to the report sink.

    Subclasses implement ``_format_body`` (plain text) and may override
    ``_format_rich_body``. Attached error/info logs are appended by the
    renderers in output_formatter.
    """

    errors: list[ErrorLog] = Field(default_factory=list)
    infos: list[InfoLog] = Field(default_factory=list)

    @abstractmethod
    def _format_body(self) -> str: ...

    def _format_rich_body(self, verbosity: Verbosity = "normal") -> RenderableType:
        # Default rich body is just the plain-text body.
        return self._format_body()

    def to_rich(self, verbosity: Verbosity = "normal") -> RenderableType:
        """Rich renderable for this record, logs included."""
        return _render_record_rich(self, verbosity=verbosity)

    def to_text(self) -> str:
        """Plain-text rendering of this record, logs included."""
        return _render_record_text(self)
+
+
class RecordLocation(_StrictBase):
    """Where a comparison happened; currently only an optional step index."""

    step: Optional[int] = None
+
+
class _BaseComparisonRecord(_OutputRecord):
    """Output record tied to a location (optional step index)."""

    location: RecordLocation = Field(default_factory=RecordLocation)

    def _format_location_prefix(self) -> str:
        """'[step=N] ' prefix for plain-text bodies, or '' when step is unset."""
        step = self.location.step
        return f"[step={step}] " if step is not None else ""

    def _format_location_prefix_rich(self) -> str:
        """Markup-escaped '[step=N] ' prefix for rich bodies."""
        step = self.location.step
        return escape(f"[step={step}]") + " " if step is not None else ""

    def _format_location_suffix(self) -> str:
        """' (step=N)' suffix, or '' when step is unset."""
        step = self.location.step
        return f" (step={step})" if step is not None else ""
+
+
class ConfigRecord(_OutputRecord):
    """Snapshot of the comparator configuration, emitted as a record."""

    type: Literal["config"] = "config"
    config: dict[str, Any]

    def _format_body(self) -> str:
        return _format_config_body(self)

    def _format_rich_body(self, verbosity: Verbosity = "normal") -> RenderableType:
        return _format_config_rich_body(self, verbosity=verbosity)
+
+
class ComparisonSkipRecord(_BaseComparisonRecord):
    """A comparison that was not performed, with a human-readable reason."""

    type: Literal["comparison_skip"] = "comparison_skip"
    name: str
    reason: str

    @property
    def category(self) -> str:
        # A skip with attached errors still counts as a failure.
        if self.errors:
            return "failed"
        return "skipped"

    def _format_body(self) -> str:
        return _format_skip_body(self)

    def _format_rich_body(self, verbosity: Verbosity = "normal") -> RenderableType:
        return _format_skip_rich_body(self, verbosity=verbosity)
+
+
class ComparisonErrorRecord(_BaseComparisonRecord):
    """A comparison that raised; stores the exception type and traceback text."""

    type: Literal["comparison_error"] = "comparison_error"
    name: str
    exception_type: str
    traceback_str: str

    @property
    def category(self) -> str:
        # Errors are always counted in the 'errored' bucket.
        return "errored"

    def _format_body(self) -> str:
        return _format_error_body(self)

    def _format_rich_body(self, verbosity: Verbosity = "normal") -> RenderableType:
        return _format_error_rich_body(self, verbosity=verbosity)
+
+
class _TableRecord(_OutputRecord):
    """Base for records rendered as a table of row dicts."""

    label: str
    rows: list[dict[str, Any]]

    @abstractmethod
    def _table_title(self) -> str: ...

    def _format_body(self) -> str:
        return _format_table_body(self)

    def _format_rich_body(self, verbosity: Verbosity = "normal") -> RenderableType:
        return _format_table_rich_body(self, verbosity=verbosity)
+
+
class RankInfoRecord(_TableRecord):
    """Table of per-rank info for one side (label = side name)."""

    type: Literal["rank_info"] = "rank_info"

    def _table_title(self) -> str:
        return f"{self.label} ranks"


class InputIdsRecord(_TableRecord):
    """Table of input_ids and positions for one side (label = side name)."""

    type: Literal["input_ids"] = "input_ids"

    def _table_title(self) -> str:
        return f"{self.label} input_ids & positions"
+
+
class ComparisonTensorRecord(TensorComparisonInfo, _BaseComparisonRecord):
    # extra="forbid" matches _StrictBase; defer_build presumably delays schema
    # construction until the forward-referenced TracedAlignerPlan resolves —
    # TODO(review): confirm.
    model_config = ConfigDict(extra="forbid", defer_build=True)

    type: Literal["comparison_tensor"] = "comparison_tensor"
    # Traced aligner plan used for this comparison, when tracing was enabled.
    traced_plan: Optional[TracedAlignerPlan] = None
    # Results of replica-consistency checks, when performed.
    replicated_checks: list[ReplicatedCheckResult] = Field(default_factory=list)
    # Raw per-side bundle metadata, when captured.
    raw_bundle_info: Optional[Pair[BundleSideInfo]] = None

    @property
    def category(self) -> str:
        """'passed' only if no errors, all replicated checks passed,
        and a diff exists and passed; otherwise 'failed'."""
        if self.errors:
            return "failed"
        if any(not check.passed for check in self.replicated_checks):
            return "failed"
        return "passed" if self.diff is not None and self.diff.passed else "failed"

    def _format_body(self) -> str:
        return _format_tensor_comparison_body(self)

    def _format_rich_body(self, verbosity: Verbosity = "normal") -> RenderableType:
        return _format_tensor_comparison_rich_body(self, verbosity=verbosity)
+
+
class ComparisonNonTensorRecord(_BaseComparisonRecord):
    """Comparison of a non-tensor value; values and type names are stored
    pre-stringified by the producer."""

    type: Literal["comparison_non_tensor"] = "comparison_non_tensor"
    name: str
    baseline_value: str
    target_value: str
    baseline_type: str
    target_type: str
    values_equal: bool

    @property
    def category(self) -> str:
        # Attached errors override the equality result.
        if self.errors:
            return "failed"
        return "passed" if self.values_equal else "failed"

    def _format_body(self) -> str:
        return _format_non_tensor_body(self)

    def _format_rich_body(self, verbosity: Verbosity = "normal") -> RenderableType:
        return _format_non_tensor_rich_body(self, verbosity=verbosity)
+
+
class SummaryRecord(_OutputRecord):
    """Final outcome counts; total must equal the sum of the buckets."""

    type: Literal["summary"] = "summary"
    total: int
    passed: int
    failed: int
    skipped: int
    errored: int = 0

    @model_validator(mode="after")
    def _validate_totals(self) -> "SummaryRecord":
        # Reject inconsistent summaries at construction time.
        expected: int = self.passed + self.failed + self.skipped + self.errored
        if self.total != expected:
            raise ValueError(
                f"total={self.total} != passed({self.passed}) + failed({self.failed}) "
                f"+ skipped({self.skipped}) + errored({self.errored}) = {expected}"
            )
        return self

    def _format_body(self) -> str:
        return _format_summary_body(self)

    def _format_rich_body(self, verbosity: Verbosity = "normal") -> RenderableType:
        return _format_summary_rich_body(self, verbosity=verbosity)
+
+
class LogRecord(_OutputRecord):
    """Record carrying only errors/infos; has no body of its own."""

    type: Literal["log"] = "log"

    def _format_body(self) -> str:
        return _format_log_body(self)
+
+
# Discriminated union over every record type; the "type" field selects the
# concrete model when deserializing (see parse_record_json).
AnyRecord = Annotated[
    Union[
        ConfigRecord,
        RankInfoRecord,
        InputIdsRecord,
        ComparisonSkipRecord,
        ComparisonErrorRecord,
        ComparisonTensorRecord,
        ComparisonNonTensorRecord,
        SummaryRecord,
        LogRecord,
    ],
    Discriminator("type"),
]
+
+
def _get_any_record_adapter() -> TypeAdapter:
    """Return a cached TypeAdapter for AnyRecord.

    Constructing a TypeAdapter builds a validator/serializer and is
    relatively expensive; pydantic recommends reusing one instance. The
    adapter is cached on the function object so repeated
    parse_record_json calls (e.g. reading a report file line by line)
    do not rebuild it.
    """
    adapter: Optional[TypeAdapter] = getattr(_get_any_record_adapter, "_cache", None)
    if adapter is None:
        adapter = TypeAdapter(AnyRecord)
        _get_any_record_adapter._cache = adapter
    return adapter
+
+
def parse_record_json(json_str: str | bytes) -> AnyRecord:
    """Parse one JSON-serialized record (e.g. a report-file line) into its model."""
    return _get_any_record_adapter().validate_json(json_str)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/per_token_visualizer.py b/sglang/python/sglang/srt/debug_utils/comparator/per_token_visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f1a30c2c30a9e549d44b162312b62f5835c18f8
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/per_token_visualizer.py
@@ -0,0 +1,83 @@
+"""Per-token relative difference heatmap generator.
+
+Produces a single PNG with rows = tensor names, columns = token positions,
+color = log10(rel_diff).
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Optional
+
+from sglang.srt.debug_utils.comparator.output_types import ComparisonTensorRecord
+
+
def generate_per_token_heatmap(
    *,
    records: list[ComparisonTensorRecord],
    output_path: Path,
) -> Optional[Path]:
    """Generate a per-token relative difference heatmap PNG.

    Returns the output path if a file was written, or None if no record
    carried per-token data.
    """
    rows = _collect_per_token_data(records=records)
    if not rows:
        return None

    _render_heatmap(rows_data=rows, output_path=output_path)
    return output_path
+
+
+def _collect_per_token_data(
+ *,
+ records: list[ComparisonTensorRecord],
+) -> list[tuple[str, list[float]]]:
+ rows: list[tuple[str, list[float]]] = []
+ for record in records:
+ if record.diff is None or record.diff.per_token_rel_diff is None:
+ continue
+ rows.append((record.name, record.diff.per_token_rel_diff))
+ return rows
+
+
def _render_heatmap(
    *,
    rows_data: list[tuple[str, list[float]]],
    output_path: Path,
) -> None:
    """Render rows_data as a log10(rel_diff) heatmap PNG at output_path."""
    import matplotlib
    import numpy as np

    # Select the non-interactive backend before pyplot is imported.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    labels = [name for name, _ in rows_data]
    width = max(len(values) for _, values in rows_data)

    # NaN padding keeps ragged rows blank rather than zero-colored.
    grid = np.full((len(rows_data), width), np.nan, dtype=np.float64)
    for row_idx, (_, values) in enumerate(rows_data):
        grid[row_idx, : len(values)] = values

    fig, ax = plt.subplots(
        figsize=(max(12.0, width * 0.15), max(6.0, len(rows_data) * 0.3))
    )
    # +1e-10 avoids log10(0) = -inf for exact matches.
    image = ax.imshow(
        np.log10(grid + 1e-10), aspect="auto", cmap="hot", interpolation="nearest"
    )

    ax.set_xlabel("Token Position")
    ax.set_ylabel("Tensor")
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=8)
    ax.set_title("Per-Token Relative Difference Heatmap")

    fig.colorbar(image, ax=ax).set_label("log10(rel_diff)")
    fig.tight_layout()

    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(str(output_path), dpi=150)
    plt.close(fig)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/preset.py b/sglang/python/sglang/srt/debug_utils/comparator/preset.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc315de998c5c44c6523d20e98b9d40e11050b10
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/preset.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
# Named CLI argument bundles selectable via --preset <name>.
PRESETS: dict[str, list[str]] = {
    # Bare --grouping-skip-keys with no key names.
    "raw": [
        "--grouping-skip-keys",
    ],
    # Skip "rank" when grouping dumps.
    "sglang_dev": [
        "--grouping-skip-keys",
        "rank",
    ],
    # Skip "rank" and "step", and concatenate steps in the token aligner.
    "sglang_megatron": [
        "--grouping-skip-keys",
        "rank",
        "step",
        "--token-aligner",
        "concat_steps",
    ],
}

# Preset applied when neither --preset nor --grouping-skip-keys is given.
DEFAULT_PRESET: str = "sglang_dev"
+
+
def expand_preset(argv: list[str], presets: dict[str, list[str]]) -> list[str]:
    """Expand ``--preset <name>`` into the corresponding argv fragment.

    If ``--preset`` is absent **and** ``--grouping-skip-keys`` is also absent,
    the DEFAULT_PRESET is applied automatically.
    """
    expanded = _expand_flag(argv, "--preset", presets)
    if expanded is not None:
        return expanded

    if "--grouping-skip-keys" in argv:
        return argv

    # No explicit preset or grouping keys: prepend the default preset.
    return presets[DEFAULT_PRESET] + argv
+
+
+def _expand_flag(
+ argv: list[str], flag: str, mapping: dict[str, list[str]]
+) -> list[str] | None:
+ """Replace ``flag `` in *argv* with the corresponding argv fragment from *mapping*."""
+ if flag not in argv:
+ return None
+
+ idx: int = argv.index(flag)
+ name: str = argv[idx + 1]
+ if name not in mapping:
+ raise ValueError(
+ f"Unknown value for {flag}: {name}. Available: {list(mapping.keys())}"
+ )
+
+ return argv[:idx] + mapping[name] + argv[idx + 2 :]
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/report_sink.py b/sglang/python/sglang/srt/debug_utils/comparator/report_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..61f9e9ac55ce21de71e806cc0ee73f65668991fa
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/report_sink.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from typing import IO, Literal, Optional
+
+from rich.console import Console
+
+from sglang.srt.debug_utils.comparator.output_types import _OutputRecord
+
+Verbosity = Literal["minimal", "normal", "verbose"]
+
+
class ReportSink:
    """Unified entry point for all record output.

    Records go to stdout (rich text, or JSON lines when output_format is
    "json") and, when configured with a report path, are mirrored as JSON
    lines to that file.
    """

    def __init__(self) -> None:
        self._output_format: str = "text"
        self._verbosity: Verbosity = "normal"
        self._report_file: Optional[IO[str]] = None
        self._report_path: Optional[Path] = None
        self._console: Optional[Console] = None

    @property
    def verbosity(self) -> Verbosity:
        return self._verbosity

    def configure(
        self,
        *,
        output_format: str = "text",
        report_path: Optional[Path] = None,
        verbosity: Verbosity = "normal",
    ) -> None:
        """Set output options and optionally open a JSONL report file.

        Failure to open the report file is non-fatal: a warning goes to
        stderr and stdout output continues.
        """
        self._output_format = output_format
        self._verbosity = verbosity

        if report_path is not None:
            # Fix: close any previously-opened report file so repeated
            # configure() calls do not leak the old file handle.
            self.close()
            try:
                report_path.parent.mkdir(parents=True, exist_ok=True)
                self._report_file = open(report_path, "w", encoding="utf-8")
                self._report_path = report_path
            except OSError as exc:
                print(
                    f"Warning: cannot open report file {report_path}: {exc}",
                    file=sys.stderr,
                )

    def add(self, record: _OutputRecord) -> None:
        """Emit a record to stdout and (if configured) the report file."""
        self._print_to_stdout(record)

        if self._report_file is not None:
            self._report_file.write(record.model_dump_json())
            self._report_file.write("\n")
            self._report_file.flush()

    def close(self) -> None:
        """Close the report file if open; safe to call repeatedly."""
        if self._report_file is not None:
            self._report_file.close()
            self._report_file = None

    @property
    def report_path(self) -> Optional[Path]:
        return self._report_path

    def _reset(self) -> None:
        """Restore all defaults and drop any open file/console."""
        self.close()
        self._output_format = "text"
        self._verbosity = "normal"
        self._report_path = None
        self._console = None

    def _get_console(self) -> Console:
        # Lazily create the console on first text-mode print.
        if self._console is None:
            self._console = Console()
        return self._console

    def _print_to_stdout(self, record: _OutputRecord) -> None:
        if self._output_format == "json":
            print(record.model_dump_json())
        else:
            console: Console = self._get_console()
            console.print(record.to_rich(verbosity=self._verbosity))
            console.print()  # blank line between records
+
+
# Module-level singleton sink shared across the comparator.
report_sink = ReportSink()
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/__init__.py b/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9974802d723d8d43493ed8c49fb07ec5f5edbe4
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/__init__.py
@@ -0,0 +1,3 @@
+from sglang.srt.debug_utils.comparator.tensor_comparator.comparator import (
+ compare_tensor_pair,
+)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/comparator.py b/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/comparator.py
new file mode 100644
index 0000000000000000000000000000000000000000..296429f9232ea51580ba0b49187a8b61a7ab93ba
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/comparator.py
@@ -0,0 +1,171 @@
+from typing import Optional
+
+import torch
+
+from sglang.srt.debug_utils.comparator.tensor_comparator.types import (
+ DEFAULT_PERCENTILES,
+ DiffInfo,
+ TensorComparisonInfo,
+ TensorInfo,
+ TensorStats,
+)
+from sglang.srt.debug_utils.comparator.utils import (
+ Pair,
+ argmax_coord,
+ calc_per_token_rel_diff,
+ calc_rel_diff,
+ compute_smaller_dtype,
+ try_unify_shape,
+)
+from sglang.srt.debug_utils.dumper import get_truncated_value
+
# At or above this element count, percentile/quantile computation is skipped.
QUANTILE_NUMEL_THRESHOLD = 10_000_000
# Value samples are only attached when max abs diff exceeds this.
SAMPLE_DIFF_THRESHOLD = 1e-3
+
+
def compare_tensor_pair(
    x_baseline: torch.Tensor,
    x_target: torch.Tensor,
    name: str = "",
    diff_threshold: float = 1e-3,
    seq_dim: Optional[int] = None,
) -> TensorComparisonInfo:
    """Compare a baseline/target tensor pair into a TensorComparisonInfo.

    Per-side stats are taken on the original tensors; diffs are computed
    in float32 after an attempted shape unification. When the original
    dtypes differ, a second diff is computed with both sides cast to the
    smaller dtype, to distinguish precision effects from real mismatches.

    Args:
        x_baseline: reference tensor.
        x_target: tensor under test.
        name: label stored on the result.
        diff_threshold: relative-diff threshold used for pass/fail.
        seq_dim: if given, a per-token relative diff along this dim is
            also computed (see compute_diff).
    """
    # Per-side stats/shape/dtype captured BEFORE unification and upcast.
    baseline_info = TensorInfo(
        shape=list(x_baseline.shape),
        dtype=str(x_baseline.dtype),
        stats=_compute_tensor_stats(x_baseline.float()),
    )
    target_info = TensorInfo(
        shape=list(x_target.shape),
        dtype=str(x_target.dtype),
        stats=_compute_tensor_stats(x_target.float()),
    )

    # Attempt to bring the baseline into the target's shape.
    x_baseline = try_unify_shape(x_baseline, target_shape=x_target.shape)
    unified_shape = list(x_baseline.shape)

    # Capture dtypes before the float32 upcast below.
    baseline_original_dtype = x_baseline.dtype
    target_original_dtype = x_target.dtype

    x_baseline_f = x_baseline.float()
    x_target_f = x_target.float()

    shape_mismatch = x_baseline_f.shape != x_target_f.shape

    diff: Optional[DiffInfo] = None
    diff_downcast: Optional[DiffInfo] = None
    downcast_dtype: Optional[torch.dtype] = None

    # Diffs are only meaningful when shapes agree after unification.
    if not shape_mismatch:
        diff = compute_diff(
            x_baseline=x_baseline_f,
            x_target=x_target_f,
            diff_threshold=diff_threshold,
            seq_dim=seq_dim,
        )

        # Attach (truncated) value samples only when the diff is notable.
        needs_sample = diff.max_abs_diff > SAMPLE_DIFF_THRESHOLD
        if needs_sample:
            baseline_info.sample = str(get_truncated_value(x_baseline_f))
            target_info.sample = str(get_truncated_value(x_target_f))

        if baseline_original_dtype != target_original_dtype:
            downcast_dtype = compute_smaller_dtype(
                Pair(x=baseline_original_dtype, y=target_original_dtype)
            )
            if downcast_dtype is not None:
                # Cast both float32 copies down and re-diff
                # (note: seq_dim is not forwarded here).
                diff_downcast = compute_diff(
                    x_baseline=x_baseline_f.to(downcast_dtype),
                    x_target=x_target_f.to(downcast_dtype),
                    diff_threshold=diff_threshold,
                )

    return TensorComparisonInfo(
        name=name,
        baseline=baseline_info,
        target=target_info,
        unified_shape=unified_shape,
        shape_mismatch=shape_mismatch,
        diff=diff,
        diff_downcast=diff_downcast,
        downcast_dtype=str(downcast_dtype) if downcast_dtype is not None else None,
    )
+
+
def _compute_tensor_stats(x: torch.Tensor) -> TensorStats:
    """Summary statistics for a tensor; all-zero stats for empty tensors."""
    if x.numel() == 0:
        return TensorStats(
            mean=0.0,
            abs_mean=0.0,
            std=0.0,
            min=0.0,
            max=0.0,
            percentiles={},
        )

    # Percentiles are skipped for very large tensors (quantile is expensive).
    with_quantiles = x.numel() < QUANTILE_NUMEL_THRESHOLD
    return TensorStats(
        mean=x.mean().item(),
        abs_mean=x.abs().mean().item(),
        std=x.std().item(),
        min=x.min().item(),
        max=x.max().item(),
        percentiles=_compute_percentiles(x, include=with_quantiles),
    )
+
+
def _compute_percentiles(x: torch.Tensor, *, include: bool) -> dict[int, float]:
    """Percentile values keyed by percentile; empty dict when include is False."""
    if not include:
        return {}
    x_float = x.float()
    result: dict[int, float] = {}
    for p in DEFAULT_PERCENTILES:
        result[p] = torch.quantile(x_float, p / 100.0).item()
    return result
+
+
def compute_diff(
    x_baseline: torch.Tensor,
    x_target: torch.Tensor,
    diff_threshold: float = 1e-3,
    seq_dim: Optional[int] = None,
) -> DiffInfo:
    """Elementwise diff summary between two same-shaped tensors.

    `passed` is rel_diff <= diff_threshold. Empty tensors return a
    trivially-passing all-zero DiffInfo. When seq_dim is given and within
    the tensor's rank, a per-token relative diff along it is included.
    """
    if x_baseline.numel() == 0:
        # Nothing to compare: report zeros and mark as passed.
        return DiffInfo(
            rel_diff=0.0,
            max_abs_diff=0.0,
            mean_abs_diff=0.0,
            abs_diff_percentiles={},
            max_diff_coord=[],
            baseline_at_max=0.0,
            target_at_max=0.0,
            diff_threshold=diff_threshold,
            passed=True,
        )

    raw_abs_diff = (x_target - x_baseline).abs()
    # Coordinate of the largest absolute difference.
    max_diff_coord = argmax_coord(raw_abs_diff)

    rel_diff = calc_rel_diff(x_target, x_baseline).item()
    max_abs_diff = raw_abs_diff.max().item()
    mean_abs_diff = raw_abs_diff.mean().item()

    # Quantiles are skipped for very large tensors (quantile is expensive).
    include_quantiles: bool = raw_abs_diff.numel() < QUANTILE_NUMEL_THRESHOLD

    per_token_rel_diff: Optional[list[float]] = None
    # NOTE(review): a negative seq_dim always passes this rank guard —
    # confirm callers only pass non-negative dims.
    if seq_dim is not None and x_baseline.dim() > seq_dim:
        per_token_rel_diff = calc_per_token_rel_diff(
            x_baseline, x_target, seq_dim=seq_dim
        ).tolist()

    return DiffInfo(
        rel_diff=rel_diff,
        max_abs_diff=max_abs_diff,
        mean_abs_diff=mean_abs_diff,
        abs_diff_percentiles=_compute_percentiles(
            raw_abs_diff, include=include_quantiles
        ),
        max_diff_coord=list(max_diff_coord),
        baseline_at_max=x_baseline[max_diff_coord].item(),
        target_at_max=x_target[max_diff_coord].item(),
        diff_threshold=diff_threshold,
        passed=rel_diff <= diff_threshold,
        per_token_rel_diff=per_token_rel_diff,
    )
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/formatter.py b/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/formatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..6021d065d2a7d1232b4160ce233509ea1cc6e2c8
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/formatter.py
@@ -0,0 +1,510 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal, Optional
+
+from rich.markup import escape
+
+from sglang.srt.debug_utils.comparator.aligner.unsharder.types import UnsharderPlan
+from sglang.srt.debug_utils.comparator.tensor_comparator.types import (
+ DiffInfo,
+ TensorComparisonInfo,
+ TensorInfo,
+ TensorStats,
+)
+
+if TYPE_CHECKING:
+ from sglang.srt.debug_utils.comparator.aligner.entrypoint.traced_types import (
+ TracedAlignerPlan,
+ TracedSubPlan,
+ )
+ from sglang.srt.debug_utils.comparator.aligner.entrypoint.types import AlignerPlan
+ from sglang.srt.debug_utils.comparator.output_types import (
+ BundleSideInfo,
+ ComparisonTensorRecord,
+ ReplicatedCheckResult,
+ ShapeSnapshot,
+ )
+ from sglang.srt.debug_utils.comparator.utils import Pair
+
+Verbosity = Literal["minimal", "normal", "verbose"]
+
+
def _esc_shape(shape: Optional[list[int]]) -> str:
    """Stringify a shape (or None) and escape it for Rich markup — bare
    brackets would otherwise be parsed as markup tags."""
    return escape(f"{shape}")
+
+
+def _strip_torch_prefix(dtype: str) -> str:
+ return dtype.replace("torch.", "")
+
+
+# ---------------------------------------------------------------------------
+# Number formatting
+# ---------------------------------------------------------------------------
+
+
+def _fmt_val(value: float) -> str:
+ return f"{value:.2e}"
+
+
+def _fmt_diff_colored(diff: float, *, threshold: float = 1e-2) -> str:
+ formatted: str = f"{diff:+.2e}"
+ if abs(diff) >= threshold:
+ return f"[yellow]{formatted}[/]"
+ return f"[dim]{formatted}[/]"
+
+
+# ---------------------------------------------------------------------------
+# Passed / color / marker helper
+# ---------------------------------------------------------------------------
+
+
+def _category_marker(category: str) -> tuple[bool, str, str]:
+ passed: bool = category == "passed"
+ color: str = "green" if passed else "red"
+ marker: str = f"[{color}]✅[/]" if passed else f"[{color}]❌[/]"
+ return passed, color, marker
+
+
+# ---------------------------------------------------------------------------
+# Stats formatting helpers (shared between compact / verbose)
+# ---------------------------------------------------------------------------
+
+
def _format_stat_line(stat_name: str, val_b: float, val_t: float, diff: float) -> str:
    """One aligned stats row: '<name>  <baseline> vs <target>   Δ <colored diff>'."""
    delta_part = _fmt_diff_colored(diff)
    return (
        f"    [blue]{stat_name:10s}[/]  {val_b:>10.4f} vs {val_t:>10.4f}"
        f"   Δ {delta_part}"
    )
+
+
+# ---------------------------------------------------------------------------
+# Old text-only formatters (kept for to_text() backward compatibility)
+# ---------------------------------------------------------------------------
+
+
def format_comparison(info: TensorComparisonInfo) -> str:
    """Legacy plain-text rendering of a tensor comparison (used by to_text()).

    Emits raw/unified shape+dtype lines, per-stat comparisons, then either a
    shape-mismatch warning (early return) or the diff details and samples.
    """
    lines: list[str] = []
    baseline = info.baseline
    target = info.target

    # 🟠 flags a dtype mismatch between the two sides.
    dtype_marker = "" if baseline.dtype == target.dtype else "🟠"
    lines.append(
        f"Raw "
        f"[shape] {baseline.shape} vs {target.shape}\t"
        f"[{dtype_marker}dtype] {baseline.dtype} vs {target.dtype}"
    )

    # Only mention shape unification when it actually changed the baseline shape.
    if info.unified_shape != baseline.shape:
        lines.append(
            f"Unify shape: {baseline.shape} -> {info.unified_shape} "
            f"(to match {target.shape})"
        )

    lines.append(
        f"After unify "
        f"[shape] {info.unified_shape} vs {target.shape}\t"
        f"[dtype] {baseline.dtype} vs {target.dtype}"
    )

    lines.extend(_format_stats_comparison(baseline=baseline.stats, target=target.stats))

    if info.shape_mismatch:
        # No diff is computable across mismatched shapes; stop here.
        lines.append("⚠️ Shape mismatch")
        return "\n".join(lines)

    if info.diff is not None:
        lines.extend(_format_diff(diff=info.diff))

    # Optional second diff computed after downcasting to a lower precision.
    if info.diff_downcast is not None and info.downcast_dtype is not None:
        lines.extend(
            _format_diff(
                diff=info.diff_downcast,
                prefix_text=f"When downcast to {info.downcast_dtype}: ",
            )
        )

    if baseline.sample is not None:
        lines.append(f"x_baseline(sample)={baseline.sample}")
    if target.sample is not None:
        lines.append(f"x_target(sample)={target.sample}")

    return "\n".join(lines)
+
+
def format_replicated_checks(checks: list[ReplicatedCheckResult]) -> str:
    """Plain-text summary of replicated-rank consistency checks, one line each."""
    rendered: list[str] = ["Replicated checks:"]

    for result in checks:
        status = "✅" if result.passed else "❌"

        if result.diff is None:
            body = "n/a diff"
        else:
            body = (
                f"rel_diff={result.diff.rel_diff:.6e} "
                f"max_abs_diff={result.diff.max_abs_diff:.6e} "
                f"mean_abs_diff={result.diff.mean_abs_diff:.6e}"
            )

        rendered.append(
            f"  {status} axis={result.axis} group={result.group_index} "
            f"idx={result.compared_index} vs {result.baseline_index}: "
            f"{body}"
        )

    return "\n".join(rendered)
+
+
def _format_stats_comparison(baseline: TensorStats, target: TensorStats) -> list[str]:
    """Rows comparing every scalar stat field, then the shared percentiles."""

    def row(label: str, value_baseline: float, value_target: float) -> str:
        return (
            f"[{label}] {value_baseline:.4f} vs {value_target:.4f} "
            f"(diff: {value_target - value_baseline:.4f})"
        )

    # Scalar fields first ("percentiles" is a dict, handled below).
    rows: list[str] = [
        row(field, getattr(baseline, field), getattr(target, field))
        for field in TensorStats.model_fields
        if field != "percentiles"
    ]

    # Only percentiles present on both sides are comparable.
    shared = set(baseline.percentiles) & set(target.percentiles)
    rows.extend(
        row(f"p{p}", baseline.percentiles[p], target.percentiles[p])
        for p in sorted(shared)
    )

    return rows
+
+
def _format_diff(diff: DiffInfo, prefix_text: str = "") -> list[str]:
    """Plain-text lines for one DiffInfo: pass/fail summary, worst coordinate,
    and (when present) abs-diff percentiles."""
    # Keep the original comparison direction so NaN rel_diff still renders "✅",
    # matching prior behavior.
    status = "❌" if diff.rel_diff > diff.diff_threshold else "✅"

    summary_line = (
        f"{prefix_text}{status} rel_diff={diff.rel_diff}\t"
        f"max_abs_diff={diff.max_abs_diff}\t"
        f"mean_abs_diff={diff.mean_abs_diff}"
    )
    coord_line = (
        f"max_abs_diff happens at coord={diff.max_diff_coord} with "
        f"baseline={diff.baseline_at_max} "
        f"target={diff.target_at_max}"
    )
    rendered = [summary_line, coord_line]

    if diff.abs_diff_percentiles:
        chips = " ".join(
            f"p{p}={value:.4f}"
            for p, value in sorted(diff.abs_diff_percentiles.items())
        )
        rendered.append(f"[abs_diff] {chips}")

    return rendered
+
+
+# ---------------------------------------------------------------------------
+# New Rich markup formatters
+# ---------------------------------------------------------------------------
+
+
def format_comparison_rich(
    record: ComparisonTensorRecord,
    verbosity: Verbosity = "normal",
) -> str:
    """Render a comparison record as Rich markup at the requested verbosity."""
    if verbosity != "minimal":
        # "normal" and "verbose" share one renderer; verbose just shows more.
        return _format_comparison_normal_or_verbose(
            record=record,
            verbose=(verbosity == "verbose"),
        )
    return _format_comparison_minimal(record)
+
+
def _format_comparison_minimal(record: ComparisonTensorRecord) -> str:
    """One-line Rich summary: marker, padded name, and the single key detail."""
    passed, color, marker = _category_marker(record.category)

    name_part: str = f"[bold {color}]{escape(record.name):30s}[/]"
    if record.diff is not None:
        detail = f" rel_diff={_fmt_val(record.diff.rel_diff)}"
    elif record.shape_mismatch:
        detail = " [yellow]shape mismatch[/]"
    else:
        detail = ""
    return f"{marker} {name_part}{detail}"
+
+
def _format_comparison_normal_or_verbose(
    *,
    record: ComparisonTensorRecord,
    verbose: bool,
) -> str:
    """Multi-line Rich rendering of one comparison record.

    Layout: header, key diff metrics, then optional Bundle / Plan sections,
    the Aligned shapes, Stats, and — when verbose or failed — percentiles,
    samples, and replicated checks.
    """
    passed, color, marker = _category_marker(record.category)

    baseline: TensorInfo = record.baseline
    target: TensorInfo = record.target
    aligned_shape: str = _esc_shape(record.unified_shape)
    dtype_str: str = _strip_torch_prefix(baseline.dtype)

    lines: list[str] = []

    # L0: Header
    lines.append(
        f"{marker} [bold {color}]{escape(record.name)}[/] "
        f"[dim cyan]── {dtype_str} {aligned_shape}[/]"
    )

    # L1: Key metrics
    if record.diff is not None:
        diff: DiffInfo = record.diff
        # Failures get a bold rel_diff to draw the eye.
        rel_style: str = f"bold {color}" if not passed else color
        lines.append(
            f"  [{rel_style}]rel_diff={_fmt_val(diff.rel_diff)}[/]"
            f" max_abs={_fmt_val(diff.max_abs_diff)}"
            f" mean_abs={_fmt_val(diff.mean_abs_diff)}"
        )

        if not passed:
            # Pinpoint the worst-offending element for failed comparisons.
            lines.append(
                f"    max_abs @ {_esc_shape(diff.max_diff_coord)}: "
                f"baseline={diff.baseline_at_max} target={diff.target_at_max}"
            )
    elif record.shape_mismatch:
        lines.append("  [yellow]⚠ Shape mismatch[/]")

    # Downcast info
    if record.diff_downcast is not None and record.downcast_dtype is not None:
        dc: DiffInfo = record.diff_downcast
        dc_marker: str = "[green]✅[/]" if dc.passed else "[red]❌[/]"
        lines.append(
            f"  {dc_marker} downcast to {record.downcast_dtype}: "
            f"rel_diff={_fmt_val(dc.rel_diff)}"
        )

    # Bundle section
    if record.raw_bundle_info is not None:
        lines.append("  [dim]Bundle[/]")
        lines.extend(
            _format_bundle_section(bundle_info=record.raw_bundle_info, verbose=verbose)
        )

    # Plan section
    if record.traced_plan is not None:
        lines.append("  [dim]Plan[/]")
        lines.extend(
            _format_plan_section_rich(
                traced_plan=record.traced_plan,
                verbose=verbose,
            )
        )

    # Aligned section
    lines.append("  [dim]Aligned[/]")
    lines.append(
        f"    {_esc_shape(record.unified_shape)} vs {_esc_shape(target.shape)}"
        f"  {baseline.dtype} vs {target.dtype}"
    )

    # Stats section
    lines.append("  [dim]Stats[/]")
    lines.extend(
        _format_stats_rich(
            baseline=baseline.stats, target=target.stats, verbose=verbose
        )
    )

    # Verbose mode and failures both unlock the detail sections below.
    show_detail: bool = verbose or not passed

    # Abs diff percentiles
    if show_detail and record.diff is not None and record.diff.abs_diff_percentiles:
        lines.append("  [dim]Abs Diff Percentiles[/]")
        lines.append("    " + _format_abs_diff_percentiles_rich(record.diff))

    # Samples
    if show_detail and baseline.sample is not None:
        lines.append("  [dim]Samples[/]")
        lines.append(f"    baseline {escape(baseline.sample)}")
        if target.sample is not None:
            lines.append(f"    target   {escape(target.sample)}")

    # Replicated checks
    if show_detail and record.replicated_checks:
        lines.append("  [dim]Replicated Checks[/]")
        for check in record.replicated_checks:
            chk_marker: str = "[green]✅[/]" if check.passed else "[red]❌[/]"
            if check.diff is not None:
                lines.append(
                    f"    {chk_marker} axis={check.axis} group={check.group_index}"
                    f" idx={check.compared_index} vs {check.baseline_index}"
                    f" rel_diff={_fmt_val(check.diff.rel_diff)}"
                    f" max_abs={_fmt_val(check.diff.max_abs_diff)}"
                )
            else:
                lines.append(
                    f"    {chk_marker} axis={check.axis} group={check.group_index}"
                    f" idx={check.compared_index} vs {check.baseline_index}: n/a"
                )

    return "\n".join(lines)
+
+
def _format_bundle_section(
    bundle_info: Pair[BundleSideInfo], *, verbose: bool = False
) -> list[str]:
    """Rich lines describing the raw dump files behind each side.

    Compact mode shows one summary line per side (file count, shared shape,
    dtype, dims); verbose mode additionally lists every file with its rank
    and parallel info.
    """
    lines: list[str] = []

    for label, side in [("baseline", bundle_info.x), ("target", bundle_info.y)]:
        if not side.files:
            lines.append(f"    {label} [dim](no files)[/]")
            continue

        # Dtype is taken from the first file; presumably all files in a
        # bundle share one dtype — TODO confirm upstream guarantee.
        dtype_desc: str = _strip_torch_prefix(side.files[0].dtype)

        if verbose:
            dims_part: str = f" dims: {side.dims}" if side.dims else ""
            lines.append(
                f"    {label} [cyan]{side.num_files} files[/]"
                f" {dtype_desc}{dims_part}"
            )

            # One row per file: index, shape, rank, and any parallel-info k=v pairs.
            for idx, f in enumerate(side.files):
                rank_part: str = f"rank={f.rank}" if f.rank is not None else ""
                par_part: str = ""
                if f.parallel_info:
                    par_part = " " + " ".join(
                        f"{k}={v}" for k, v in f.parallel_info.items()
                    )
                lines.append(
                    f"      [{idx}] {_esc_shape(f.shape)} {rank_part}{par_part}"
                )
        else:
            # Compact: collapse identical shapes into one descriptor.
            shapes: list[list[int]] = [f.shape for f in side.files]
            unique_shapes: set[str] = {str(s) for s in shapes}
            shape_desc: str
            if len(unique_shapes) == 1:
                shape_desc = _esc_shape(shapes[0])
            else:
                shape_desc = "mixed shapes"

            dims_part = f" [dim]dims: {side.dims}[/]" if side.dims else ""
            lines.append(
                f"    {label} [cyan]{side.num_files} files[/]"
                f" × {shape_desc} {dtype_desc}{dims_part}"
            )

    return lines
+
+
def _format_plan_section_rich(
    *,
    traced_plan: TracedAlignerPlan,
    verbose: bool = False,
) -> list[str]:
    """Per-side alignment pipeline rendered as 'op → op → …', followed by the
    cross-side (token/axis aligner) plan lines.

    Note: `verbose` is accepted for signature symmetry but currently unused.
    """
    lines: list[str] = []

    sides = (
        ("baseline", traced_plan.per_side.x),
        ("target", traced_plan.per_side.y),
    )
    for side_label, traced_side in sides:
        if traced_side.step_plans:
            rendered_steps: list[str] = []
            for traced_step in traced_side.step_plans:
                rendered_steps.extend(
                    _format_sub_plan_rich(traced_sub)
                    for traced_sub in traced_step.sub_plans
                )
            lines.append(f"    {side_label} " + " → ".join(rendered_steps))
        else:
            lines.append(f"    {side_label} [dim](passthrough)[/]")

    lines.extend(_format_cross_side_plan_rich(traced_plan.plan))
    return lines
+
+
def _format_sub_plan_rich(traced_sub: TracedSubPlan) -> str:
    """Render one sub-plan as '[magenta]op(axis)[/] N×[in] → M×[out]'."""
    plan = traced_sub.plan

    # Only unsharder plans carry an axis worth showing.
    axis_suffix: str = f"({plan.axis})" if isinstance(plan, UnsharderPlan) else ""

    transition: str = ""
    snapshot: Optional[ShapeSnapshot] = traced_sub.snapshot
    if snapshot:
        ins = snapshot.input_shapes
        outs = snapshot.output_shapes
        first_in = _esc_shape(ins[0]) if ins else "?"
        first_out = _esc_shape(outs[0]) if outs else "?"
        transition = f" {len(ins)}×{first_in} → {len(outs)}×{first_out}"

    return f"[magenta]{plan.type}{axis_suffix}[/]{transition}"
+
+
def _format_cross_side_plan_rich(plan: AlignerPlan) -> list[str]:
    """Lines for the cross-side parts of the plan: token aligner, axis aligner."""
    lines: list[str] = []

    token_plan = plan.token_aligner_plan
    if token_plan is not None:
        token_count = len(token_plan.locators.x.steps)
        lines.append(f"    token_aligner [dim]{token_count} tokens[/]")

    axis_plan = plan.axis_aligner_plan
    if axis_plan is not None:
        parts: list[str] = [
            f"{side}={pattern}"
            for side, pattern in (("x", axis_plan.pattern.x), ("y", axis_plan.pattern.y))
            if pattern
        ]
        if parts:
            lines.append(f"    axis_aligner [dim]{', '.join(parts)}[/]")
        else:
            lines.append("    axis_aligner [dim](no-op)[/]")

    return lines
+
+
def _format_stats_rich(
    *,
    baseline: TensorStats,
    target: TensorStats,
    verbose: bool = False,
) -> list[str]:
    """Rich rows comparing baseline vs target stats.

    Verbose: every scalar field plus shared percentiles, one row each.
    Compact: mean and std rows plus a combined [min, max] range row.
    """
    lines: list[str] = []

    if verbose:
        # All stat fields
        for stat_name in TensorStats.model_fields:
            if stat_name == "percentiles":
                continue
            val_b: float = getattr(baseline, stat_name)
            val_t: float = getattr(target, stat_name)
            lines.append(_format_stat_line(stat_name, val_b, val_t, val_t - val_b))

        # Percentiles — only those present on both sides are comparable.
        for p in sorted(set(baseline.percentiles) & set(target.percentiles)):
            val_b = baseline.percentiles[p]
            val_t = target.percentiles[p]
            lines.append(_format_stat_line(f"p{p}", val_b, val_t, val_t - val_b))
    else:
        # Compact: mean, std, range (min/max combined)
        for stat_name in ("mean", "std"):
            val_b = getattr(baseline, stat_name)
            val_t = getattr(target, stat_name)
            lines.append(_format_stat_line(stat_name, val_b, val_t, val_t - val_b))

        # Range line: combine min/max (escape brackets to avoid Rich markup)
        range_baseline: str = escape(f"[{baseline.min:.4f}, {baseline.max:.4f}]")
        range_target: str = escape(f"[{target.min:.4f}, {target.max:.4f}]")
        lines.append(f"    [blue]{'range':10s}[/]  {range_baseline} vs {range_target}")

    return lines
+
+
def _format_abs_diff_percentiles_rich(diff: DiffInfo) -> str:
    """Percentile chips 'pN=value'; tail chips (p>=99 with value > 0.1) are
    highlighted yellow."""
    chips: list[str] = []
    for percentile, value in sorted(diff.abs_diff_percentiles.items()):
        chip = f"p{percentile}={_fmt_val(value)}"
        if percentile >= 99 and value > 0.1:
            chip = f"[yellow]{chip}[/]"
        chips.append(chip)
    return " ".join(chips)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/types.py b/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..e505d022ec662afbc81b1f9d12e0d707d28df3c8
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/tensor_comparator/types.py
@@ -0,0 +1,45 @@
+from typing import Optional
+
+from sglang.srt.debug_utils.comparator.utils import _StrictBase
+
+DEFAULT_PERCENTILES: tuple[int, ...] = (1, 5, 50, 95, 99)
+
+
class TensorStats(_StrictBase):
    """Summary statistics of one tensor (as floats)."""

    mean: float
    abs_mean: float  # mean of |x|
    std: float
    min: float
    max: float
    percentiles: dict[int, float] = {}  # percentile -> value; empty when not computed
+
+
class TensorInfo(_StrictBase):
    """Shape, dtype, stats, and an optional printable sample of one tensor."""

    shape: list[int]
    dtype: str  # dtype rendered as string, e.g. "torch.float32"
    stats: TensorStats
    sample: Optional[str] = None  # pre-rendered sample text, if collected
+
+
class DiffInfo(_StrictBase):
    """Result of comparing two tensors (see compute_diff)."""

    rel_diff: float  # cosine-similarity-based relative difference (calc_rel_diff)
    max_abs_diff: float
    mean_abs_diff: float
    abs_diff_percentiles: dict[int, float] = {}  # percentiles of |diff|; empty when skipped
    max_diff_coord: list[int]  # coordinate of the max |diff| element
    baseline_at_max: float  # baseline value at max_diff_coord
    target_at_max: float  # target value at max_diff_coord
    diff_threshold: float  # pass criterion applied to rel_diff
    passed: bool  # rel_diff <= diff_threshold
    per_token_rel_diff: Optional[list[float]] = None  # rel diff per seq_dim position
+
+
class TensorComparisonInfo(_StrictBase):
    """Full record of one named baseline-vs-target tensor comparison."""

    name: str
    baseline: TensorInfo
    target: TensorInfo
    unified_shape: Optional[list[int]]  # baseline shape after unification to target
    shape_mismatch: bool  # True when no diff could be computed
    diff: Optional[DiffInfo] = None
    diff_downcast: Optional[DiffInfo] = None  # diff recomputed after downcasting
    downcast_dtype: Optional[str] = None  # dtype used for diff_downcast
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/utils.py b/sglang/python/sglang/srt/debug_utils/comparator/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e8cedc015b5944c157fbee95123f721e62ec42c
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/utils.py
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+import functools
+import re
+from pathlib import Path
+from typing import TYPE_CHECKING, Callable, Generic, Optional, Tuple, TypeVar
+
+import torch
+from pydantic import BaseModel, ConfigDict
+
+_T = TypeVar("_T")
+_U = TypeVar("_U")
+
+
+def _check_equal_lengths(**named_lists: list) -> None:
+ lengths: dict[str, int] = {name: len(lst) for name, lst in named_lists.items()}
+ unique: set[int] = set(lengths.values())
+ if len(unique) > 1:
+ details: str = ", ".join(f"{name}={length}" for name, length in lengths.items())
+ raise ValueError(f"Length mismatch: {details}")
+
+
def auto_descend_dir(directory: Path, label: str) -> Path:
    """If directory has no .pt files but exactly one subdirectory does, descend into it.

    Raises ValueError when the layout is ambiguous (>=2 subdirs with .pt)
    or when no .pt data is found at all.
    """
    if any(directory.glob("*.pt")):
        return directory

    with_data: list[Path] = [
        child
        for child in directory.iterdir()
        if child.is_dir() and any(child.glob("*.pt"))
    ]

    if len(with_data) >= 2:
        names: str = ", ".join(sorted(child.name for child in with_data))
        raise ValueError(
            f"{label}: directory {directory} has no .pt files at top level "
            f"and multiple subdirectories contain data ({names}). "
            f"Please specify the exact subdirectory."
        )

    if not with_data:
        raise ValueError(
            f"{label}: no .pt files found in {directory} or any of its subdirectories."
        )

    resolved: Path = with_data[0]

    # Imported lazily to avoid a module-level import cycle.
    from sglang.srt.debug_utils.comparator.log_sink import log_sink
    from sglang.srt.debug_utils.comparator.output_types import InfoLog

    log_sink.add(
        InfoLog(
            category="auto_descend",
            message=f"auto-descend {label}: {directory} -> {resolved}",
        )
    )
    return resolved
+
+
class _StrictBase(BaseModel):
    """Pydantic base model that rejects unknown fields (extra='forbid')."""

    model_config = ConfigDict(extra="forbid")
+
+
class _FrozenBase(BaseModel):
    """Immutable pydantic base model: fields frozen, unknown fields rejected."""

    model_config = ConfigDict(frozen=True, extra="forbid")
+
+
class Pair(_FrozenBase, Generic[_T]):
    """Immutable homogeneous pair; by convention x=baseline, y=target."""

    x: _T
    y: _T

    def map(self, fn: Callable[[_T], _U]) -> Pair[_U]:
        """Apply fn to both elements, returning a new Pair."""
        return Pair(x=fn(self.x), y=fn(self.y))
+
+
+def argmax_coord(x: torch.Tensor) -> Tuple[int, ...]:
+ flat_idx = x.argmax()
+ return tuple(idx.item() for idx in torch.unravel_index(flat_idx, x.shape))
+
+
def compute_smaller_dtype(
    dtypes: Pair[torch.dtype],
) -> Optional[torch.dtype]:
    """Return the lower-precision dtype of the pair (order-insensitive), or
    None when the combination is not in the known table."""
    known = {
        (torch.float32, torch.bfloat16): torch.bfloat16,
        # ... add more ...
    }
    forward = known.get((dtypes.x, dtypes.y))
    if forward is not None:
        return forward
    return known.get((dtypes.y, dtypes.x))
+
+
def try_unify_shape(x: torch.Tensor, target_shape: torch.Size) -> torch.Tensor:
    """Squeeze leading size-1 dims so x matches target_shape.

    Returns x unchanged when the trailing dims don't already match or the
    leading dims aren't all 1.
    """
    extra = len(x.shape) - len(target_shape)
    leading_all_ones = all(size == 1 for size in x.shape[:extra])
    if leading_all_ones and x.shape[extra:] == target_shape:
        result = x
        for _ in range(extra):
            result = result.squeeze(0)
        return result
    return x
+
+
# Copied from DeepGEMM
def calc_rel_diff(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Cosine-similarity-based relative difference between two tensors.

    Returns a scalar double tensor: 0 for identical inputs, 2 for x == -y.

    Fix over the original: when both inputs are all-zero the tensors are
    identical, but 2*sum(xy)/sum(xx+yy) is 0/0 (NaN) — and a NaN rel_diff
    makes `passed = rel_diff <= threshold` fail. Return 0 in that case.
    """
    x, y = x.double(), y.double()
    denominator = (x * x + y * y).sum()
    if denominator == 0:
        # Both tensors all-zero => identical, not NaN.
        return torch.zeros_like(denominator)
    sim = 2 * (x * y).sum() / denominator
    return 1 - sim
+
+
def calc_per_token_rel_diff(
    x: torch.Tensor, y: torch.Tensor, *, seq_dim: int
) -> torch.Tensor:
    """Cosine-distance-like metric per token position.

    Sums over all dims except seq_dim; the 1e-10 epsilon keeps all-zero
    tokens finite. Returns a float tensor with the shape of dim seq_dim.
    """
    x64, y64 = x.double(), y.double()
    reduce_dims: list[int] = [axis for axis in range(x64.dim()) if axis != seq_dim]

    if reduce_dims:
        denom = (x64 * x64 + y64 * y64).sum(dim=reduce_dims)
        cos_sim = 2 * (x64 * y64).sum(dim=reduce_dims) / (denom + 1e-10)
    else:
        # 1-D input along seq_dim: element-wise, nothing to reduce.
        denom = x64 * x64 + y64 * y64
        cos_sim = 2 * (x64 * y64) / (denom + 1e-10)

    return (1 - cos_sim).float()
+
+
+if TYPE_CHECKING:
+ from sglang.srt.debug_utils.comparator.output_types import SummaryRecord
+
+
def compute_exit_code(
    summary: SummaryRecord,
    *,
    allow_skipped_pattern: str,
    skipped_names: list[str],
    allow_failed_pattern: Optional[str],
    failed_names: list[str],
    errored_names: Optional[list[str]] = None,
) -> int:
    """Return 0 when the run is acceptable, 1 otherwise.

    Non-zero when: nothing passed, anything errored, or any failed/skipped
    name is not fully matched by its corresponding allow pattern (a None
    pattern allows only an empty name list).
    """

    def _allowed(pattern: Optional[str], names: list[str]) -> bool:
        # No pattern means nothing may appear in the list at all.
        if pattern is None:
            return not names
        matcher = re.compile(pattern)
        return all(matcher.fullmatch(name) for name in names)

    if summary.passed == 0:
        return 1
    if errored_names:
        return 1
    if not _allowed(allow_failed_pattern, failed_names):
        return 1
    if not _allowed(allow_skipped_pattern, skipped_names):
        return 1
    return 0
+
+
+def _is_all_match_pattern(*, pattern: Optional[str], strings: list[str]) -> bool:
+ if pattern is None:
+ return len(strings) == 0
+ compiled: re.Pattern[str] = re.compile(pattern)
+ return all(compiled.fullmatch(s) for s in strings)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/visualizer/__init__.py b/sglang/python/sglang/srt/debug_utils/comparator/visualizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..476ddce36cada3c1ff037dd072ab82601800c602
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/visualizer/__init__.py
@@ -0,0 +1,3 @@
+from sglang.srt.debug_utils.comparator.visualizer.figure import ( # noqa: F401
+ generate_comparison_figure,
+)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/visualizer/figure.py b/sglang/python/sglang/srt/debug_utils/comparator/visualizer/figure.py
new file mode 100644
index 0000000000000000000000000000000000000000..08c91928211fc292ac81c9268354299dcb83d312
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/visualizer/figure.py
@@ -0,0 +1,116 @@
+"""Main orchestration logic for comparison figure generation."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Optional
+
+import numpy as np
+import torch
+
+from sglang.srt.debug_utils.comparator.visualizer.preprocessing import (
+ _preprocess_tensor,
+)
+
+
@dataclass(frozen=True)
class _PanelContext:
    """Shared, preprocessed inputs handed to every panel's draw function."""

    baseline_2d: torch.Tensor  # 2-D, aspect-balanced baseline (see _preprocess_tensor)
    target_2d: torch.Tensor  # 2-D, aspect-balanced target
    diff: Optional[torch.Tensor]  # None when shapes differ
    name: str  # tensor name, used in panel titles
+
+
@dataclass(frozen=True)
class _Panel:
    """One figure row: a label, whether it needs a diff, and its draw function."""

    label: str
    requires_diff: bool  # skip this panel when shapes differ (ctx.diff is None)
    # draw(axes, row_idx, ctx) may return a stats line for the figure suptitle.
    draw: Callable[[np.ndarray, int, _PanelContext], Optional[str]]
+
+
def _build_panels() -> list[_Panel]:
    """Panel registry in render order; diff-dependent rows are filtered out by
    the caller when shapes differ."""
    # Imported lazily to avoid a circular import with panels.py.
    from sglang.srt.debug_utils.comparator.visualizer.panels import (
        _draw_baseline_heatmap,
        _draw_diff_heatmap,
        _draw_diff_histogram,
        _draw_hist2d,
        _draw_sampled,
        _draw_target_heatmap,
    )

    specs = [
        ("Baseline Heatmap", False, _draw_baseline_heatmap),
        ("Target Heatmap", False, _draw_target_heatmap),
        ("Abs Diff Heatmap", True, _draw_diff_heatmap),
        ("Abs Diff Hist", True, _draw_diff_histogram),
        ("Hist2D", True, _draw_hist2d),
        ("Sampled", True, _draw_sampled),
    ]
    return [
        _Panel(label=label, requires_diff=needs_diff, draw=draw_fn)
        for label, needs_diff, draw_fn in specs
    ]
+
+
def generate_comparison_figure(
    *,
    baseline: torch.Tensor,
    target: torch.Tensor,
    name: str,
    output_path: Path,
) -> None:
    """Generate a multi-panel comparison PNG for a baseline/target tensor pair.

    Panels (6 rows x 2 cols, left=normal, right=log10):
      Row 0: Baseline heatmap
      Row 1: Target heatmap
      Row 2: Abs Diff heatmap
      Row 3: Abs Diff histogram
      Row 4: Hist2D scatter (baseline vs target density)
      Row 5: Sampled scatter (10k sampled mini-heatmap)

    Diff-based rows (2-5) are omitted when the shapes differ. The file is
    written to output_path (parents created as needed); nothing is returned.
    """
    import matplotlib.pyplot as plt

    baseline_f: torch.Tensor = baseline.detach().cpu().float()
    target_f: torch.Tensor = target.detach().cpu().float()

    # Diff panels only make sense when the shapes match exactly.
    can_diff: bool = baseline_f.shape == target_f.shape

    baseline_2d: torch.Tensor = _preprocess_tensor(baseline_f)
    target_2d: torch.Tensor = _preprocess_tensor(target_f)

    diff: Optional[torch.Tensor] = (baseline_2d - target_2d).abs() if can_diff else None

    ctx = _PanelContext(
        baseline_2d=baseline_2d,
        target_2d=target_2d,
        diff=diff,
        name=name,
    )

    panels: list[_Panel] = _build_panels()
    active: list[_Panel] = [p for p in panels if not p.requires_diff or can_diff]

    nrows: int = len(active)
    ncols: int = 2
    fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 3.5 * nrows))
    if nrows == 1:
        # plt.subplots returns a 1-D axes array for a single row; panels index
        # axes[row, col], so force 2-D.
        axes = axes.reshape(1, -1)

    # Collect the optional per-panel stats lines for the figure title.
    stats_lines: list[str] = []
    for i, panel in enumerate(active):
        stats_line: Optional[str] = panel.draw(axes, i, ctx)
        if stats_line is not None:
            stats_lines.append(stats_line)

    # Reserve vertical space for the suptitle proportional to its line count
    # (heuristic constants tuned by eye).
    num_stats: int = len(stats_lines)
    title_height: float = 0.015 * num_stats + 0.015
    fig.suptitle(
        "\n".join(stats_lines),
        fontsize=9,
        family="monospace",
        y=1 - title_height / 2,
    )
    plt.tight_layout(rect=[0, 0, 1, 1 - title_height])
    output_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(str(output_path), dpi=150, bbox_inches="tight")
    plt.close(fig)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/visualizer/panels.py b/sglang/python/sglang/srt/debug_utils/comparator/visualizer/panels.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff9a6d6148ae7b4085db77b37aa127f0e03dabe0
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/visualizer/panels.py
@@ -0,0 +1,226 @@
+"""Panel draw functions for tensor comparison visualization."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+import numpy as np
+import torch
+
+from sglang.srt.debug_utils.comparator.visualizer.figure import _PanelContext
+from sglang.srt.debug_utils.comparator.visualizer.preprocessing import (
+ _SCATTER_SAMPLE_SIZE,
+ _format_log_ticks,
+ _format_stats,
+ _maybe_downsample_numpy,
+ _safe_hist,
+ _to_log10,
+)
+
+
def _draw_baseline_heatmap(
    axes: np.ndarray, row_idx: int, ctx: _PanelContext
) -> Optional[str]:
    """Row: baseline heatmap (linear + log10); returns a suptitle stats line."""
    tensor = ctx.baseline_2d
    _draw_heatmap_pair(axes, row_idx=row_idx, t=tensor, title=f"{ctx.name} Baseline")
    return _format_stats("Baseline", tensor)
+
+
def _draw_target_heatmap(
    axes: np.ndarray, row_idx: int, ctx: _PanelContext
) -> Optional[str]:
    """Row: target heatmap (linear + log10); returns a suptitle stats line."""
    tensor = ctx.target_2d
    _draw_heatmap_pair(axes, row_idx=row_idx, t=tensor, title=f"{ctx.name} Target")
    return _format_stats("Target", tensor)
+
+
def _draw_diff_heatmap(
    axes: np.ndarray, row_idx: int, ctx: _PanelContext
) -> Optional[str]:
    """Row: |baseline - target| heatmap; only called when shapes matched."""
    diff_tensor = ctx.diff
    assert diff_tensor is not None
    _draw_heatmap_pair(
        axes, row_idx=row_idx, t=diff_tensor, title=f"{ctx.name} Abs Diff"
    )
    return _format_stats("Abs Diff", diff_tensor)
+
+
def _draw_diff_histogram(
    axes: np.ndarray, row_idx: int, ctx: _PanelContext
) -> Optional[str]:
    """Row: histogram of |diff| (linear + log10); contributes no stats line."""
    diff_tensor = ctx.diff
    assert diff_tensor is not None
    _draw_histogram_pair(
        axes, row_idx=row_idx, diff=diff_tensor, label=f"{ctx.name} Abs Diff"
    )
    return None
+
+
def _draw_hist2d(axes: np.ndarray, row_idx: int, ctx: _PanelContext) -> Optional[str]:
    """Row: 2-D density of baseline vs target values; contributes no stats line."""
    _draw_scatter_hist2d(
        axes,
        row_idx=row_idx,
        baseline=ctx.baseline_2d,
        target=ctx.target_2d,
        label=ctx.name,
    )
    return None
+
+
def _draw_sampled(axes: np.ndarray, row_idx: int, ctx: _PanelContext) -> Optional[str]:
    """Row: side-by-side mini-heatmaps of co-sampled elements; no stats line."""
    _draw_scatter_sampled(
        axes,
        row_idx=row_idx,
        baseline=ctx.baseline_2d,
        target=ctx.target_2d,
        label=ctx.name,
    )
    return None
+
+
+# ────────────────────── internal drawing helpers ──────────────────────
+
+
def _draw_heatmap_pair(
    axes: np.ndarray,
    *,
    row_idx: int,
    t: torch.Tensor,
    title: str,
) -> None:
    """Draw one row: linear heatmap (left) and log10-magnitude heatmap (right),
    each with its own colorbar."""
    import matplotlib.pyplot as plt

    ax_normal = axes[row_idx, 0]
    ax_log = axes[row_idx, 1]

    im = ax_normal.imshow(t.numpy(), aspect="auto", cmap="viridis")
    ax_normal.set_title(title)
    plt.colorbar(im, ax=ax_normal)

    # Right panel shows log10(|t|); colorbar ticks are relabeled as 1e<exp>.
    im_log = ax_log.imshow(_to_log10(t).numpy(), aspect="auto", cmap="viridis")
    ax_log.set_title(f"{title} (Log10)")
    cbar = plt.colorbar(im_log, ax=ax_log)
    _format_log_ticks(cbar.ax, axis="y")
+
+
def _draw_histogram_pair(
    axes: np.ndarray,
    *,
    row_idx: int,
    diff: torch.Tensor,
    label: str,
) -> None:
    """Draw one row: histogram of diff values (left) and of log10(|diff|) (right)."""

    ax_normal = axes[row_idx, 0]
    ax_log = axes[row_idx, 1]

    # Downsampled to keep plotting fast on huge tensors.
    diff_flat: np.ndarray = _maybe_downsample_numpy(diff.flatten())

    _safe_hist(ax_normal, diff_flat, bins=100, edgecolor="none")
    ax_normal.set_title(f"{label} Histogram")
    ax_normal.set_xlabel("Abs Diff")
    ax_normal.set_ylabel("Count")

    # 1e-10 floor keeps zeros finite under log10.
    log_flat: np.ndarray = np.log10(np.abs(diff_flat) + 1e-10)
    _safe_hist(ax_log, log_flat, bins=100, edgecolor="none")
    ax_log.set_title(f"{label} Histogram (Log10)")
    ax_log.set_xlabel("Abs Diff")
    ax_log.set_ylabel("Count")
    _format_log_ticks(ax_log, axis="x")
+
+
def _draw_scatter_hist2d(
    axes: np.ndarray,
    *,
    row_idx: int,
    baseline: torch.Tensor,
    target: torch.Tensor,
    label: str,
) -> None:
    """Draw one row: baseline-vs-target density (hist2d) in linear scale (left)
    and in log10-magnitude scale (right), each with the y=x identity line.

    Points on the red dashed diagonal are elements where baseline == target.
    """
    import matplotlib.pyplot as plt

    ax_normal = axes[row_idx, 0]
    ax_log = axes[row_idx, 1]

    b_flat: np.ndarray = _maybe_downsample_numpy(baseline.flatten())
    t_flat: np.ndarray = _maybe_downsample_numpy(target.flatten())
    # Downsampling is independent per side; truncate to align the pairs.
    min_len: int = min(len(b_flat), len(t_flat))
    b_flat = b_flat[:min_len]
    t_flat = t_flat[:min_len]

    # Normal scale
    lim: float = float(max(np.abs(b_flat).max(), np.abs(t_flat).max())) * 1.05
    if lim == 0:
        # All-zero inputs: avoid a degenerate [0, 0] plot range.
        lim = 1.0
    _h, _xe, _ye, im = ax_normal.hist2d(
        b_flat,
        t_flat,
        bins=200,
        range=[[-lim, lim], [-lim, lim]],
        cmap="viridis",
        norm="log",
    )
    ax_normal.plot([-lim, lim], [-lim, lim], "r--", linewidth=0.5)
    ax_normal.set_title(f"{label} Hist2D")
    ax_normal.set_xlabel("Baseline")
    ax_normal.set_ylabel("Target")
    ax_normal.set_aspect("equal")
    plt.colorbar(im, ax=ax_normal)

    # Log scale
    b_log: np.ndarray = np.log10(np.abs(b_flat) + 1e-10)
    t_log: np.ndarray = np.log10(np.abs(t_flat) + 1e-10)
    # Pad the range by half a decade on each side.
    vmin: float = float(min(b_log.min(), t_log.min())) - 0.5
    vmax: float = float(max(b_log.max(), t_log.max())) + 0.5
    _h2, _xe2, _ye2, im2 = ax_log.hist2d(
        b_log,
        t_log,
        bins=200,
        range=[[vmin, vmax], [vmin, vmax]],
        cmap="viridis",
        norm="log",
    )
    ax_log.plot([vmin, vmax], [vmin, vmax], "r--", linewidth=0.5)
    ax_log.set_title(f"{label} Hist2D (Log10 Abs)")
    ax_log.set_xlabel("Baseline")
    ax_log.set_ylabel("Target")
    ax_log.set_aspect("equal")
    plt.colorbar(im2, ax=ax_log)
    _format_log_ticks(ax_log, axis="both")
+
+
def _draw_scatter_sampled(
    axes: np.ndarray,
    *,
    row_idx: int,
    baseline: torch.Tensor,
    target: torch.Tensor,
    label: str,
) -> None:
    """Draw one row: mini-heatmaps of up to _SCATTER_SAMPLE_SIZE elements,
    sampled at the SAME positions from both tensors and shown on a shared
    color scale so the two panels are directly comparable."""
    import matplotlib.pyplot as plt

    ax_baseline = axes[row_idx, 0]
    ax_target = axes[row_idx, 1]

    b_flat: np.ndarray = baseline.flatten().numpy()
    t_flat: np.ndarray = target.flatten().numpy()

    n_samples: int = min(_SCATTER_SAMPLE_SIZE, len(b_flat))
    # Fixed seed: the same figure is reproducible across runs.
    rng: np.random.Generator = np.random.default_rng(seed=42)
    indices: np.ndarray = np.sort(rng.choice(len(b_flat), n_samples, replace=False))
    b_sampled: np.ndarray = b_flat[indices]
    t_sampled: np.ndarray = t_flat[indices]

    # Truncate to a perfect square so the samples tile into a square image.
    side: int = int(np.sqrt(n_samples))
    n_use: int = side * side
    b_2d: np.ndarray = b_sampled[:n_use].reshape(side, side)
    t_2d: np.ndarray = t_sampled[:n_use].reshape(side, side)

    # Shared color limits across both panels.
    vmin: float = float(min(b_2d.min(), t_2d.min()))
    vmax: float = float(max(b_2d.max(), t_2d.max()))

    im_b = ax_baseline.imshow(b_2d, aspect="auto", cmap="viridis", vmin=vmin, vmax=vmax)
    ax_baseline.set_title(f"{label} Baseline (10k sampled)")
    plt.colorbar(im_b, ax=ax_baseline)

    im_t = ax_target.imshow(t_2d, aspect="auto", cmap="viridis", vmin=vmin, vmax=vmax)
    ax_target.set_title(f"{label} Target (10k sampled)")
    plt.colorbar(im_t, ax=ax_target)
diff --git a/sglang/python/sglang/srt/debug_utils/comparator/visualizer/preprocessing.py b/sglang/python/sglang/srt/debug_utils/comparator/visualizer/preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..67e1b14b82b39faf31e3c7b9d771c87684f8332d
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/comparator/visualizer/preprocessing.py
@@ -0,0 +1,101 @@
+"""Tensor preprocessing and utility functions for visualization."""
+
+from __future__ import annotations
+
+import math
+import re
+
+import numpy as np
+import torch
+
+_DOWNSAMPLE_THRESHOLD: int = 10_000_000
+_SCATTER_SAMPLE_SIZE: int = 10_000
+
+
def _preprocess_tensor(tensor: torch.Tensor) -> torch.Tensor:
    """Squeeze, force exactly two dims, then rebalance extreme aspect ratios."""
    out = tensor.squeeze()

    # Promote scalars/vectors to 2-D by prepending singleton dimensions.
    while out.ndim < 2:
        out = out.unsqueeze(0)
    # Collapse higher-rank tensors onto their last axis.
    if out.ndim > 2:
        out = out.reshape(-1, out.shape[-1])

    return _reshape_to_balanced_aspect(out)
+
+
+def _reshape_to_balanced_aspect(
+ t: torch.Tensor, max_ratio: float = 5.0
+) -> torch.Tensor:
+ assert t.ndim == 2
+
+ h, w = t.shape
+ ratio: float = h / w if w > 0 else float("inf")
+
+ if 1 / max_ratio <= ratio <= max_ratio:
+ return t
+
+ total: int = h * w
+ target_side: int = int(math.sqrt(total))
+
+ for new_h in range(target_side, 0, -1):
+ if total % new_h == 0:
+ new_w: int = total // new_h
+ new_ratio: float = new_h / new_w
+ if 1 / max_ratio <= new_ratio <= max_ratio:
+ return t.reshape(new_h, new_w)
+
+ return t.reshape(1, -1)
+
+
+# ────────────────────── utility ──────────────────────
+
+
+def _to_log10(t: torch.Tensor) -> torch.Tensor:
+ return t.abs().clamp(min=1e-10).log10()
+
+
def _format_log_ticks(ax: object, axis: str = "both") -> None:
    """Relabel tick values v as "1e<v>" (the axis holds log10-space data)."""
    from matplotlib.ticker import FuncFormatter

    def _as_power_of_ten(x: float, _pos: object) -> str:
        # Integer exponents render without a decimal point.
        return f"1e{int(x)}" if x == int(x) else f"1e{x:.1f}"

    formatter = FuncFormatter(_as_power_of_ten)
    if axis in ("x", "both"):
        ax.xaxis.set_major_formatter(formatter)
    if axis in ("y", "both"):
        ax.yaxis.set_major_formatter(formatter)
+
+
+def _format_stats(name: str, t: torch.Tensor) -> str:
+ return (
+ f"{name}: shape={tuple(t.shape)}, "
+ f"min={t.min().item():.4g}, max={t.max().item():.4g}, "
+ f"mean={t.mean().item():.4g}, std={t.std().item():.4g}"
+ )
+
+
+def _safe_hist(
+ ax: object, data: np.ndarray, *, bins: int = 100, **kwargs: object
+) -> None:
+ data_f64: np.ndarray = data.astype(np.float64)
+ try:
+ ax.hist(data_f64, bins=bins, **kwargs)
+ except ValueError:
+ ax.hist(data_f64, bins=max(1, len(np.unique(data_f64[:1000]))), **kwargs)
+
+
def _maybe_downsample_numpy(
    t: torch.Tensor,
    max_elements: int = _DOWNSAMPLE_THRESHOLD,
) -> np.ndarray:
    """Convert `t` to numpy, randomly subsampling when it exceeds `max_elements`.

    Tensors at or below the limit are returned as-is (original shape kept).
    Larger tensors yield a flat array of `max_elements` values drawn without
    replacement; the fixed seed makes repeated calls deterministic.
    """
    if t.numel() <= max_elements:
        return t.numpy()

    rng: np.random.Generator = np.random.default_rng(seed=0)
    indices: np.ndarray = rng.choice(t.numel(), max_elements, replace=False)
    # Bug fix: `rng.choice` produces *flat* indices in [0, numel), so index a
    # flattened view. Indexing the original (possibly multi-dimensional) array
    # applied them to axis 0 and raised IndexError for any tensor with
    # fewer rows than numel().
    return t.numpy().reshape(-1)[indices]
+
+
+def _sanitize_filename(name: str) -> str:
+ return re.sub(r"[/\.\s]+", "_", name).strip("_")
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/__init__.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9c815de6a6457949441664504217d4776d37b68
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/__init__.py
@@ -0,0 +1,51 @@
+from sglang.srt.debug_utils.schedule_simulator.data_source import (
+ generate_gsp_requests,
+ generate_random_requests,
+ load_from_request_logger,
+)
+from sglang.srt.debug_utils.schedule_simulator.entrypoint import create_arg_parser, main
+from sglang.srt.debug_utils.schedule_simulator.gpu_state import GPUState, StepRecord
+from sglang.srt.debug_utils.schedule_simulator.metrics import (
+ AttentionComputeBalancednessRecorder,
+ AvgBatchSizeRecorder,
+ BatchSizeBalancednessRecorder,
+ MetricRecorder,
+)
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+from sglang.srt.debug_utils.schedule_simulator.routers import (
+ RandomRouter,
+ RoundRobinRouter,
+ RouterPolicy,
+ StickyRouter,
+)
+from sglang.srt.debug_utils.schedule_simulator.schedulers import (
+ FIFOScheduler,
+ SchedulerPolicy,
+)
+from sglang.srt.debug_utils.schedule_simulator.simulator import (
+ SimulationResult,
+ Simulator,
+)
+
+__all__ = [
+ "SimRequest",
+ "GPUState",
+ "Simulator",
+ "SimulationResult",
+ "StepRecord",
+ "RouterPolicy",
+ "RandomRouter",
+ "RoundRobinRouter",
+ "StickyRouter",
+ "SchedulerPolicy",
+ "FIFOScheduler",
+ "MetricRecorder",
+ "BatchSizeBalancednessRecorder",
+ "AttentionComputeBalancednessRecorder",
+ "AvgBatchSizeRecorder",
+ "load_from_request_logger",
+ "generate_random_requests",
+ "generate_gsp_requests",
+ "create_arg_parser",
+ "main",
+]
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/__main__.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aff33faaad57421a1c95b5b893bfd81d7d3ac09b
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/__main__.py
@@ -0,0 +1,6 @@
from sglang.srt.debug_utils.schedule_simulator.entrypoint import create_arg_parser, main

# Module entry point: lets the simulator run as
# `python -m sglang.srt.debug_utils.schedule_simulator <args>`.
if __name__ == "__main__":
    parser = create_arg_parser()
    args = parser.parse_args()
    main(args)
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/data_source/__init__.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/data_source/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..116f4e4b52a466c7f0544c9dc14f544522f1f810
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/data_source/__init__.py
@@ -0,0 +1,13 @@
+from sglang.srt.debug_utils.schedule_simulator.data_source.data_loader import (
+ load_from_request_logger,
+)
+from sglang.srt.debug_utils.schedule_simulator.data_source.data_synthesis import (
+ generate_gsp_requests,
+ generate_random_requests,
+)
+
+__all__ = [
+ "load_from_request_logger",
+ "generate_random_requests",
+ "generate_gsp_requests",
+]
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/data_source/data_loader.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/data_source/data_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..17207b2a2ca4b9b549dfb643e7273f11772bcceb
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/data_source/data_loader.py
@@ -0,0 +1,34 @@
+import json
+from pathlib import Path
+from typing import List, Union
+
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+
+
def load_from_request_logger(file_path: Union[str, Path]) -> List[SimRequest]:
    """Parse a request-logger JSONL file into `SimRequest`s.

    Only `request.finished` events are kept; blank lines and lines that do
    not start with "{" are skipped.
    """
    path = Path(file_path)
    loaded: List[SimRequest] = []

    with path.open(encoding="utf-8") as fin:
        for idx, raw in enumerate(fin):
            stripped = raw.strip()
            # Skip blanks and anything that is clearly not a JSON object.
            if not stripped.startswith("{"):
                continue

            record = json.loads(stripped)
            if record.get("event") != "request.finished":
                continue

            meta = record["out"]["meta_info"]
            loaded.append(
                SimRequest(
                    request_id=record.get("rid", f"req_{idx}"),
                    input_len=meta["prompt_tokens"],
                    output_len=meta["completion_tokens"],
                )
            )

    return loaded
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/data_source/data_synthesis.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/data_source/data_synthesis.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ce19d053814560288d39b37d11490915545286d
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/data_source/data_synthesis.py
@@ -0,0 +1,79 @@
+import random
+from typing import List, Optional
+
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+
+
def generate_random_requests(
    num_requests: int,
    input_len: int,
    output_len: int,
    range_ratio: float = 1.0,
    seed: Optional[int] = None,
) -> List[SimRequest]:
    """Create `num_requests` synthetic requests with randomized lengths.

    Each request's input/output length is drawn via `_random_len` from
    [len * range_ratio, len]; passing `seed` makes the batch repeatable.
    """
    if seed is not None:
        random.seed(seed)

    result = [
        SimRequest(
            request_id=f"syn{idx}",
            input_len=_random_len(input_len, range_ratio),
            output_len=_random_len(output_len, range_ratio),
        )
        for idx in range(num_requests)
    ]

    print(
        f"Generated {len(result)} random requests "
        f"(input_len={input_len}, output_len={output_len}, range_ratio={range_ratio})"
    )
    return result
+
+
def generate_gsp_requests(
    num_groups: int,
    prompts_per_group: int,
    system_prompt_len: int,
    question_len: int,
    output_len: int,
    range_ratio: float = 1.0,
    seed: Optional[int] = None,
) -> List[SimRequest]:
    """Create generated-shared-prefix (GSP) requests.

    Every group shares one randomized system-prompt prefix; each prompt in
    the group adds its own question and output lengths. The final list is
    shuffled so groups arrive interleaved.
    """
    if seed is not None:
        random.seed(seed)

    result: List[SimRequest] = []
    for group_idx in range(num_groups):
        # One prefix draw per group — shared by all of its prompts.
        shared_prefix = _random_len(system_prompt_len, range_ratio)
        for _ in range(prompts_per_group):
            result.append(
                SimRequest(
                    request_id=f"gsp{len(result)}",
                    input_len=shared_prefix + _random_len(question_len, range_ratio),
                    output_len=_random_len(output_len, range_ratio),
                    group_id=f"g{group_idx}",
                    prefix_len=shared_prefix,
                )
            )

    random.shuffle(result)
    print(
        f"Generated {len(result)} GSP requests "
        f"({num_groups} groups x {prompts_per_group} prompts, "
        f"system_prompt_len={system_prompt_len}, question_len={question_len}, "
        f"output_len={output_len})"
    )
    return result
+
+
+def _random_len(full_len: int, range_ratio: float) -> int:
+ min_len = max(int(full_len * range_ratio), 1)
+ return random.randint(min_len, full_len)
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/entrypoint.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/entrypoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..1118e67d9a26cb884b59a962ecf2b8e01b5a47e5
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/entrypoint.py
@@ -0,0 +1,168 @@
+import argparse
+import json
+import random
+from typing import List
+
+from sglang.srt.debug_utils.schedule_simulator.data_source.data_loader import (
+ load_from_request_logger,
+)
+from sglang.srt.debug_utils.schedule_simulator.data_source.data_synthesis import (
+ generate_gsp_requests,
+ generate_random_requests,
+)
+from sglang.srt.debug_utils.schedule_simulator.metrics import (
+ AttentionComputeBalancednessRecorder,
+ AvgBatchSizeRecorder,
+ BatchSizeBalancednessRecorder,
+)
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+from sglang.srt.debug_utils.schedule_simulator.routers import (
+ RandomRouter,
+ RoundRobinRouter,
+ StickyRouter,
+)
+from sglang.srt.debug_utils.schedule_simulator.schedulers import FIFOScheduler
+from sglang.srt.debug_utils.schedule_simulator.simulator import (
+ SimulationResult,
+ Simulator,
+)
+
+
def create_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the schedule simulator.

    Exactly one data source must be selected: ``--input`` (replay a
    request-logger JSONL file), ``--synthetic`` (random lengths), or
    ``--synth-gsp`` (generated-shared-prefix groups).
    """
    parser = argparse.ArgumentParser(
        description="Schedule Simulator for analyzing request scheduling across GPUs"
    )

    # Data source selection (exactly one required).
    data_group = parser.add_mutually_exclusive_group(required=True)
    data_group.add_argument(
        "--input", type=str, help="Path to request_logger JSON file"
    )
    data_group.add_argument(
        "--synthetic", action="store_true", help="Use synthetic data generation"
    )
    data_group.add_argument(
        "--synth-gsp",
        action="store_true",
        help="Use generated-shared-prefix (GSP) data generation",
    )

    # Shared synthetic arguments
    parser.add_argument("--synth-seed", type=int, default=None)

    # Random dataset arguments (aligned with bench_serving.py --random-* options)
    parser.add_argument("--synth-random-num-requests", type=int, default=1000)
    parser.add_argument("--synth-random-input-len", type=int, default=1024)
    parser.add_argument("--synth-random-output-len", type=int, default=256)
    parser.add_argument("--synth-random-range-ratio", type=float, default=0.0)

    # GSP dataset arguments (aligned with bench_serving.py --gsp-* options)
    parser.add_argument("--synth-gsp-num-groups", type=int, default=64)
    parser.add_argument("--synth-gsp-prompts-per-group", type=int, default=16)
    parser.add_argument("--synth-gsp-system-prompt-len", type=int, default=2048)
    parser.add_argument("--synth-gsp-question-len", type=int, default=128)
    parser.add_argument("--synth-gsp-output-len", type=int, default=256)
    parser.add_argument("--synth-gsp-range-ratio", type=float, default=1.0)

    # Topology and policy arguments.
    parser.add_argument("--num-gpus-per-engine", type=int, default=8)
    parser.add_argument("--num-engines", type=int, default=1)
    parser.add_argument(
        "--router",
        type=str,
        choices=["random", "round_robin", "sticky"],
        default="round_robin",
    )
    parser.add_argument("--scheduler", type=str, choices=["fifo"], default="fifo")
    parser.add_argument("--max-total-tokens", type=int, default=100000)
    parser.add_argument(
        "--stop-criteria",
        type=str,
        choices=["all_done", "exist_no_pending"],
        default="all_done",
        help="all_done: run until all requests complete; exist_no_pending: stop when any GPU has no pending requests",
    )
    parser.add_argument("--max-steps", type=int, default=None)
    parser.add_argument("--output", type=str, default=None)
    # 0: log every 100 steps; 1: every step; 2: every step with request ids.
    parser.add_argument("--log-level", type=int, choices=[0, 1, 2], default=0)

    return parser
+
+
def _load_requests(args: argparse.Namespace) -> List[SimRequest]:
    """Materialize the request list from whichever data source was selected."""
    if args.input:
        loaded = load_from_request_logger(args.input)
        print(f"Loaded {len(loaded)} requests from {args.input}")
        return loaded

    if args.synth_gsp:
        return generate_gsp_requests(
            num_groups=args.synth_gsp_num_groups,
            prompts_per_group=args.synth_gsp_prompts_per_group,
            system_prompt_len=args.synth_gsp_system_prompt_len,
            question_len=args.synth_gsp_question_len,
            output_len=args.synth_gsp_output_len,
            range_ratio=args.synth_gsp_range_ratio,
            seed=args.synth_seed,
        )

    # Default branch: --synthetic (the arg group guarantees one was chosen).
    return generate_random_requests(
        num_requests=args.synth_random_num_requests,
        input_len=args.synth_random_input_len,
        output_len=args.synth_random_output_len,
        range_ratio=args.synth_random_range_ratio,
        seed=args.synth_seed,
    )
+
+
def _create_router(name: str, total_gpus: int):
    """Instantiate the router policy selected on the command line."""
    factories = {
        "random": RandomRouter,
        "round_robin": RoundRobinRouter,
        "sticky": StickyRouter,
    }
    factory = factories.get(name)
    if factory is None:
        raise ValueError(f"Unknown router: {name}")
    return factory(total_gpus)
+
+
def _create_scheduler(name: str):
    """Instantiate the scheduler policy selected on the command line."""
    if name != "fifo":
        raise ValueError(f"Unknown scheduler: {name}")
    return FIFOScheduler()
+
+
def main(args: argparse.Namespace) -> SimulationResult:
    """Run one simulation end-to-end: load requests, simulate, report.

    Prints a summary to stdout and optionally writes it as JSON to
    ``args.output``; returns the full ``SimulationResult`` for callers.
    """
    # Seed the global RNG so random-based routers are repeatable too,
    # not just the synthetic data generators.
    if args.synth_seed is not None:
        random.seed(args.synth_seed)
    requests = _load_requests(args)
    total_gpus = args.num_gpus_per_engine * args.num_engines
    router = _create_router(args.router, total_gpus)
    scheduler = _create_scheduler(args.scheduler)

    sim = Simulator(
        num_gpus_per_engine=args.num_gpus_per_engine,
        router=router,
        scheduler=scheduler,
        recorders=[
            BatchSizeBalancednessRecorder(),
            AttentionComputeBalancednessRecorder(),
            AvgBatchSizeRecorder(),
        ],
        log_level=args.log_level,
        max_total_tokens=args.max_total_tokens,
        stop_criteria=args.stop_criteria,
        max_steps=args.max_steps,
    )

    print(
        f"Running simulation with {args.num_gpus_per_engine} GPUs/engine x {args.num_engines} engines, router={args.router}, scheduler={args.scheduler}"
    )
    result = sim.run(requests)

    print("\n=== Summary ===")
    for key, value in result.summary.items():
        print(f"{key}: {value:.4f}" if isinstance(value, float) else f"{key}: {value}")

    if args.output:
        with open(args.output, "w") as f:
            json.dump(result.summary, f, indent=2)
        print(f"\nSummary saved to {args.output}")

    return result
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/gpu_state.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/gpu_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..634d01fefdbb706a1ae4ef7c2a745a038a669eda
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/gpu_state.py
@@ -0,0 +1,70 @@
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+
+
@dataclass
class StepRecord:
    """Per-GPU snapshot captured after one simulation step executes."""

    step: int  # step index this snapshot belongs to
    gpu_id: int  # which GPU this row describes
    running_count: int  # requests decoding during this step
    pending_count: int  # requests queued but not yet admitted
    total_seq_len: int  # tokens held on this GPU after the step
    running_req_ids: List[str] = field(default_factory=list)  # ids of running requests
    pending_req_ids: List[str] = field(default_factory=list)  # ids of queued requests
+
+
@dataclass
class GPUState:
    """Mutable per-GPU simulation state: a pending queue plus a running batch."""

    gpu_id: int
    max_total_tokens: int
    pending_requests: List[SimRequest] = field(default_factory=list)
    running_requests: List[SimRequest] = field(default_factory=list)

    def batch_size(self) -> int:
        """Number of requests currently running."""
        return len(self.running_requests)

    def total_attention_compute(self) -> int:
        """Sum of running sequence lengths (proxy for attention work)."""
        return sum(r.seq_len() for r in self.running_requests)

    def total_seq_len(self, extra_reqs: Optional[List[SimRequest]] = None) -> int:
        """Token footprint if `extra_reqs` also ran.

        A group's shared prefix is charged in full only for the first request
        of that group; later requests of the same group subtract it.
        """
        counted_groups = set()
        tokens = 0
        for r in self.running_requests + (extra_reqs or []):
            tokens += r.seq_len()
            if r.group_id is not None:
                if r.group_id in counted_groups:
                    tokens -= r.prefix_len  # prefix already counted for this group
                counted_groups.add(r.group_id)
        return tokens

    def is_valid(self) -> bool:
        """Whether the running batch fits within the token budget."""
        return self.total_seq_len() <= self.max_total_tokens

    def start_request(self, req: SimRequest) -> None:
        """Promote a pending request into the running batch."""
        assert req in self.pending_requests
        self.pending_requests.remove(req)
        self.running_requests.append(req)

    def evict_request(self, req: SimRequest) -> None:
        """Demote a running request back to the head of the pending queue."""
        assert req in self.running_requests
        self.running_requests.remove(req)
        self.pending_requests.insert(0, req)

    def execute_step(self) -> None:
        """Decode one token for every running request, retiring finished ones."""
        still_running = []
        for r in self.running_requests:
            r.decoded_tokens += 1
            if not r.is_finished():
                still_running.append(r)
        self.running_requests = still_running

    def get_step_record(self, step: int) -> StepRecord:
        """Snapshot the current queue/batch state for metrics and logging."""
        return StepRecord(
            step=step,
            gpu_id=self.gpu_id,
            running_count=len(self.running_requests),
            pending_count=len(self.pending_requests),
            total_seq_len=self.total_seq_len(),
            running_req_ids=[r.request_id for r in self.running_requests],
            pending_req_ids=[r.request_id for r in self.pending_requests],
        )
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/metrics.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..d640072afe11f3d691aa9b7f25c50c1f1a1fcd65
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/metrics.py
@@ -0,0 +1,60 @@
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, List
+
+from sglang.srt.debug_utils.schedule_simulator.gpu_state import GPUState
+
+
class MetricRecorder(ABC):
    """Observer hook: collects statistics after every simulation step."""

    @abstractmethod
    def on_step_end(self, step: int, gpu_states: List[GPUState]) -> None: ...

    @abstractmethod
    def get_summary(self) -> Dict[str, Any]: ...


class BalancednessRecorder(MetricRecorder):
    """Tracks how evenly a per-GPU scalar is spread across GPUs over time.

    Balancedness at a step is mean(values) / max(values): 1.0 when all GPUs
    carry identical load, approaching 0 as load concentrates on one GPU.
    """

    def __init__(self, name: str, value_fn: Callable[[GPUState], float]):
        self._name = name
        self._value_fn = value_fn
        self._history: List[float] = []

    def on_step_end(self, step: int, gpu_states: List[GPUState]) -> None:
        values = [self._value_fn(gpu) for gpu in gpu_states]
        max_val = max(values) if values else 0
        mean_val = sum(values) / len(values) if values else 0
        # A fully idle step (max == 0) counts as perfectly balanced.
        balancedness = mean_val / max_val if max_val > 0 else 1.0
        self._history.append(balancedness)

    def get_summary(self) -> Dict[str, Any]:
        # Fix: always emit the same key set. Previously the empty-history
        # case returned only the `_mean` key, so consumers iterating the
        # summary saw an inconsistent schema.
        if not self._history:
            return {
                f"{self._name}_mean": 0.0,
                f"{self._name}_min": 0.0,
                f"{self._name}_max": 0.0,
            }
        return {
            f"{self._name}_mean": sum(self._history) / len(self._history),
            f"{self._name}_min": min(self._history),
            f"{self._name}_max": max(self._history),
        }
+
+
def BatchSizeBalancednessRecorder() -> BalancednessRecorder:
    """Balancedness of per-GPU running batch sizes."""
    def _batch_size(gpu: GPUState) -> float:
        return gpu.batch_size()

    return BalancednessRecorder("batch_size_balancedness", _batch_size)
+
+
def AttentionComputeBalancednessRecorder() -> BalancednessRecorder:
    """Balancedness of per-GPU total attention compute (sum of seq lens)."""
    def _attention_compute(gpu: GPUState) -> float:
        return gpu.total_attention_compute()

    return BalancednessRecorder("attention_compute_balancedness", _attention_compute)
+
+
class AvgBatchSizeRecorder(MetricRecorder):
    """Reports the mean running batch size across all GPUs and steps."""

    def __init__(self):
        self._total_running = 0
        self._num_records = 0

    def on_step_end(self, step: int, gpu_states: List[GPUState]) -> None:
        self._total_running += sum(gpu.batch_size() for gpu in gpu_states)
        self._num_records += len(gpu_states)

    def get_summary(self) -> Dict[str, Any]:
        # 0.0 when no steps were recorded, avoiding division by zero.
        if not self._num_records:
            return {"avg_batch_size": 0.0}
        return {"avg_batch_size": self._total_running / self._num_records}
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/request.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/request.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c0ec13567e97ee18542b78bc0d6935e5cb51b83
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/request.py
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
@dataclass
class SimRequest:
    """A simulated request: fixed prompt/output sizes plus decode progress."""

    request_id: str  # unique id, surfaced in logs and step records
    input_len: int  # prompt length in tokens (includes any shared prefix)
    output_len: int  # tokens to decode before the request finishes
    decoded_tokens: int = 0  # decode progress, mutated during simulation
    group_id: Optional[str] = None  # shared-prefix group id, if any
    prefix_len: int = 0  # length of the group-shared prefix within input_len

    def seq_len(self) -> int:
        """Current sequence length: prompt plus tokens decoded so far."""
        return self.input_len + self.decoded_tokens

    def is_finished(self) -> bool:
        """True once the full output budget has been decoded."""
        return self.decoded_tokens >= self.output_len
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/__init__.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cd174ff56c974722a4e1356f0a58f747acc39eb
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/__init__.py
@@ -0,0 +1,8 @@
+from sglang.srt.debug_utils.schedule_simulator.routers.base import RouterPolicy
+from sglang.srt.debug_utils.schedule_simulator.routers.random_router import RandomRouter
+from sglang.srt.debug_utils.schedule_simulator.routers.round_robin_router import (
+ RoundRobinRouter,
+)
+from sglang.srt.debug_utils.schedule_simulator.routers.sticky_router import StickyRouter
+
+__all__ = ["RouterPolicy", "RandomRouter", "RoundRobinRouter", "StickyRouter"]
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/base.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa4f0a2ce5ce5096f61371ee7fe2eb843d77fe3d
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/base.py
@@ -0,0 +1,8 @@
+from abc import ABC, abstractmethod
+
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+
+
class RouterPolicy(ABC):
    """Strategy interface: choose the target GPU index for an incoming request."""

    @abstractmethod
    def route(self, incoming_request: SimRequest) -> int: ...
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/random_router.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/random_router.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3047ff13fe7098a872ca9671dc2198c2e25cdd0
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/random_router.py
@@ -0,0 +1,12 @@
+import random
+
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+from sglang.srt.debug_utils.schedule_simulator.routers.base import RouterPolicy
+
+
class RandomRouter(RouterPolicy):
    """Routes each request to a uniformly random GPU (stateless)."""

    def __init__(self, num_gpus: int):
        self._num_gpus = num_gpus

    def route(self, incoming_request: SimRequest) -> int:
        """Pick a GPU index in [0, num_gpus), independent of the request."""
        # randrange(n) is equivalent to randint(0, n - 1).
        return random.randrange(self._num_gpus)
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/round_robin_router.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/round_robin_router.py
new file mode 100644
index 0000000000000000000000000000000000000000..e639f0f16f902138dc20afdaaad2ae3221cfbdd0
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/round_robin_router.py
@@ -0,0 +1,13 @@
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+from sglang.srt.debug_utils.schedule_simulator.routers.base import RouterPolicy
+
+
class RoundRobinRouter(RouterPolicy):
    """Cycles through GPU indices 0..num_gpus-1 in arrival order."""

    def __init__(self, num_gpus: int):
        self._num_gpus = num_gpus
        self._counter = 0

    def route(self, incoming_request: SimRequest) -> int:
        """Return the next GPU in the cycle, ignoring the request itself."""
        chosen = self._counter % self._num_gpus
        self._counter += 1
        return chosen
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/sticky_router.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/sticky_router.py
new file mode 100644
index 0000000000000000000000000000000000000000..f34288b396ff81f7f1ef17e2f59cfe926f2528ba
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/routers/sticky_router.py
@@ -0,0 +1,20 @@
+import random
+from collections import defaultdict
+
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+from sglang.srt.debug_utils.schedule_simulator.routers.base import RouterPolicy
+
+
class StickyRouter(RouterPolicy):
    """Pins each request group to one randomly chosen GPU.

    The first request of a group draws a uniform random GPU; all later
    requests of that group reuse it. Ungrouped requests are routed
    uniformly at random every time.
    """

    def __init__(self, num_gpus: int):
        self._num_gpus = num_gpus
        self._group_to_gpu: dict = {}

    def _assign_gpu(self) -> int:
        return random.randint(0, self._num_gpus - 1)

    def route(self, incoming_request: SimRequest) -> int:
        gid = incoming_request.group_id
        if gid is None:
            # No group affinity — behave like RandomRouter.
            return self._assign_gpu()
        if gid not in self._group_to_gpu:
            # One RNG draw per previously unseen group.
            self._group_to_gpu[gid] = self._assign_gpu()
        return self._group_to_gpu[gid]
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/schedulers/__init__.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/schedulers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9be84e5be3c533062a32ef662cd1bdd77d53c54e
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/schedulers/__init__.py
@@ -0,0 +1,6 @@
+from sglang.srt.debug_utils.schedule_simulator.schedulers.base import SchedulerPolicy
+from sglang.srt.debug_utils.schedule_simulator.schedulers.fifo_scheduler import (
+ FIFOScheduler,
+)
+
+__all__ = ["SchedulerPolicy", "FIFOScheduler"]
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/schedulers/base.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/schedulers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..1df142d7f7852ed51167efcaf3c39e23d7aa6dc9
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/schedulers/base.py
@@ -0,0 +1,10 @@
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from sglang.srt.debug_utils.schedule_simulator.gpu_state import GPUState
+
+
class SchedulerPolicy(ABC):
    """Strategy interface: admit/evict requests on a single GPU each step."""

    @abstractmethod
    def schedule(self, gpu_state: "GPUState") -> None: ...
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/schedulers/fifo_scheduler.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/schedulers/fifo_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb20714e4c04caa79fe5700c72bf79153e4f1bc
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/schedulers/fifo_scheduler.py
@@ -0,0 +1,16 @@
+from typing import TYPE_CHECKING
+
+from sglang.srt.debug_utils.schedule_simulator.schedulers.base import SchedulerPolicy
+
+if TYPE_CHECKING:
+ from sglang.srt.debug_utils.schedule_simulator.gpu_state import GPUState
+
+
class FIFOScheduler(SchedulerPolicy):
    """Evicts from the back of the running batch when over budget, then
    admits every pending request whose tokens still fit."""

    def schedule(self, gpu_state: "GPUState") -> None:
        # Shed the most recently started requests until the batch fits.
        while gpu_state.running_requests and not gpu_state.is_valid():
            gpu_state.evict_request(gpu_state.running_requests[-1])

        # Admit pending requests in queue order; iterate over a copy because
        # start_request mutates the pending list.
        for candidate in list(gpu_state.pending_requests):
            projected = gpu_state.total_seq_len(extra_reqs=[candidate])
            if projected <= gpu_state.max_total_tokens:
                gpu_state.start_request(candidate)
diff --git a/sglang/python/sglang/srt/debug_utils/schedule_simulator/simulator.py b/sglang/python/sglang/srt/debug_utils/schedule_simulator/simulator.py
new file mode 100644
index 0000000000000000000000000000000000000000..157b16109c71cdfeb650a58693f95fed3a64a712
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/schedule_simulator/simulator.py
@@ -0,0 +1,122 @@
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from sglang.srt.debug_utils.schedule_simulator.gpu_state import GPUState, StepRecord
+from sglang.srt.debug_utils.schedule_simulator.metrics import MetricRecorder
+from sglang.srt.debug_utils.schedule_simulator.request import SimRequest
+from sglang.srt.debug_utils.schedule_simulator.routers.base import RouterPolicy
+from sglang.srt.debug_utils.schedule_simulator.schedulers.base import SchedulerPolicy
+
+
@dataclass
class SimulationResult:
    """Output of `Simulator.run`: per-step snapshots plus the metric summary."""

    step_records: List[StepRecord]  # one record per GPU per executed step
    summary: Dict[str, Any]  # merged output of all metric recorders
+
+
class Simulator:
    """Drives the simulated serving loop over a set of per-GPU states.

    Each iteration: route newly arrived requests, run the scheduler on every
    GPU, check the stop criteria, then decode one token on every running
    request, snapshot state, log, and record metrics.
    """

    def __init__(
        self,
        num_gpus_per_engine: int,
        router: RouterPolicy,
        scheduler: SchedulerPolicy,
        recorders: Optional[List[MetricRecorder]] = None,
        log_level: int = 0,
        max_total_tokens: int = 100000,
        stop_criteria: str = "all_done",
        max_steps: Optional[int] = None,
    ):
        self.num_gpus_per_engine = num_gpus_per_engine
        self.router = router
        self.scheduler = scheduler
        self.recorders = recorders or []
        # 0: log every 100th step; 1: every step; 2: every step with request ids.
        self.log_level = log_level
        self.max_total_tokens = max_total_tokens
        self.stop_criteria = stop_criteria
        self.max_steps = max_steps
        self.gpu_states: List[GPUState] = []
        self.step = 0

    def run(self, requests: List[SimRequest]) -> SimulationResult:
        """Simulate until the stop criteria fires and return the result.

        All requests arrive up front: they are routed once before the first
        loop iteration, then drained by the per-GPU schedulers.
        """
        self.gpu_states = [
            GPUState(gpu_id=i, max_total_tokens=self.max_total_tokens)
            for i in range(self.num_gpus_per_engine)
        ]
        self.step = 0
        step_records: List[StepRecord] = []
        incoming_requests = list(requests)

        while True:
            self._route_requests(incoming_requests)
            incoming_requests.clear()
            self._schedule_all_gpus()
            # Stop check happens after scheduling but before executing, so the
            # final (post-completion) state is never decoded or recorded.
            if self._should_stop():
                break
            self._execute_step()
            step_records.extend(
                gpu.get_step_record(self.step) for gpu in self.gpu_states
            )
            self._log_step()
            self._record_metrics()
            self.step += 1

        return SimulationResult(step_records=step_records, summary=self._get_summary())

    def _should_stop(self) -> bool:
        """True when max_steps is hit or the configured criteria is met."""
        if self.max_steps is not None and self.step >= self.max_steps:
            return True
        if self.stop_criteria == "exist_no_pending":
            return any(not gpu.pending_requests for gpu in self.gpu_states)
        if self.stop_criteria == "all_done":
            return not any(
                gpu.pending_requests or gpu.running_requests for gpu in self.gpu_states
            )
        raise ValueError(f"Unknown stop criteria: {self.stop_criteria}")

    def _route_requests(self, incoming_requests: List[SimRequest]) -> None:
        """Enqueue each request on its routed GPU.

        Requests routed to an index >= num_gpus_per_engine are dropped:
        only the first engine's GPUs are simulated here.
        """
        for req in incoming_requests:
            gpu_id = self.router.route(req)
            if gpu_id < self.num_gpus_per_engine:
                self.gpu_states[gpu_id].pending_requests.append(req)

    def _schedule_all_gpus(self) -> None:
        """Run the scheduler on every GPU and assert budgets are respected."""
        for gpu in self.gpu_states:
            self.scheduler.schedule(gpu)
            assert gpu.is_valid(), (
                f"GPU{gpu.gpu_id} invalid after scheduling "
                f"({gpu.total_seq_len()=}, {gpu.max_total_tokens=})"
            )

    def _execute_step(self) -> None:
        """Decode one token on every GPU's running batch."""
        for gpu in self.gpu_states:
            gpu.execute_step()

    def _log_step(self) -> None:
        """Print one status line per step (throttled to 1/100 at level 0)."""
        if self.log_level == 0 and self.step % 100 != 0:
            return
        parts = [f"step={self.step:<4}"]
        for gpu in self.gpu_states:
            r, q = len(gpu.running_requests), len(gpu.pending_requests)
            if self.log_level <= 1:
                parts.append(f"GPU{gpu.gpu_id}[R={r:<3} Q={q:<3}]")
            else:
                run_ids = _format_ids(gpu.running_requests)
                queue_ids = _format_ids(gpu.pending_requests)
                parts.append(f"GPU{gpu.gpu_id}[R={r}:{run_ids} Q={q}:{queue_ids}]")
        print(" | ".join(parts))

    def _record_metrics(self) -> None:
        """Notify every recorder about the step that just executed."""
        for recorder in self.recorders:
            recorder.on_step_end(self.step, self.gpu_states)

    def _get_summary(self) -> Dict[str, Any]:
        """Merge all recorder summaries into one flat dict."""
        return {k: v for r in self.recorders for k, v in r.get_summary().items()}
+
+
+def _format_ids(requests: List[SimRequest], limit: int = 5) -> str:
+ if not requests:
+ return "-"
+ ids = ",".join(r.request_id for r in requests[:limit])
+ if len(requests) > limit:
+ ids += f"...+{len(requests) - limit}"
+ return ids
diff --git a/sglang/python/sglang/srt/debug_utils/source_patcher/__init__.py b/sglang/python/sglang/srt/debug_utils/source_patcher/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c853fad17a753a5e242b097c8df72f3e75465fe6
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/source_patcher/__init__.py
@@ -0,0 +1,12 @@
+from sglang.srt.debug_utils.source_patcher.code_patcher import (
+ CodePatcher,
+ apply_patches_from_config,
+ patch_function,
+)
+from sglang.srt.debug_utils.source_patcher.types import (
+ EditSpec,
+ PatchApplicationError,
+ PatchConfig,
+ PatchSpec,
+ PatchState,
+)
diff --git a/sglang/python/sglang/srt/debug_utils/source_patcher/code_patcher.py b/sglang/python/sglang/srt/debug_utils/source_patcher/code_patcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1f2de063203f97b4a7404476cfdbf437aebd197
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/source_patcher/code_patcher.py
@@ -0,0 +1,188 @@
+import importlib
+import inspect
+import textwrap
+import types
+from collections.abc import Callable
+from typing import Any, Optional
+
+import yaml
+
+from sglang.srt.debug_utils.source_patcher.source_editor import apply_edits
+from sglang.srt.debug_utils.source_patcher.types import (
+ EditSpec,
+ PatchConfig,
+ PatchSpec,
+ PatchState,
+)
+
+
+def apply_patches_from_config(
+ yaml_content: str,
+ *,
+ extra_imports: Optional[list[str]] = None,
+) -> list[PatchState]:
+ """Parse a YAML config string and apply all patches.
+
+ Args:
+ yaml_content: YAML string with patch specifications.
+ extra_imports: Import lines inserted once at the top of each patched
+ function body (e.g. ["from pkg import foo"]). The caller (dumper)
+ uses this so users don't have to write boilerplate in YAML.
+ """
+ raw: dict[str, Any] = yaml.safe_load(yaml_content)
+ config: PatchConfig = PatchConfig(**raw)
+
+ if extra_imports:
+ config = _inject_preamble(config=config, extra_imports=extra_imports)
+
+ return _apply_specs(config.patches)
+
+
+class CodePatcher:
+ """Context manager that patches functions on enter and restores on exit."""
+
+ def __init__(self, *, patches: list[PatchSpec]) -> None:
+ self._patches = patches
+ self._states: list[PatchState] = []
+
+ def __enter__(self) -> "CodePatcher":
+ self._states = _apply_specs(self._patches)
+ return self
+
+ def __exit__(
+ self,
+ exc_type: Optional[type],
+ exc_val: Optional[BaseException],
+ exc_tb: Optional[Any],
+ ) -> None:
+ for state in reversed(self._states):
+ state.restore()
+ self._states.clear()
+
+
+def patch_function(
+    *,
+    target: Callable[..., Any],
+    edits: list[EditSpec],
+    preamble: str = "",
+) -> PatchState:
+    """Patch a function by modifying its source and replacing __code__.
+
+    1. inspect.getsource -> get original source
+    2. apply_edits -> modify source text
+    3. optionally prepend preamble (e.g. import lines) inside the function body
+    4. compile + exec -> get new code object
+    5. replace target.__code__
+
+    Returns PatchState that can restore the original code.
+    """
+    original_code: types.CodeType = target.__code__
+
+    source: str = inspect.getsource(target)
+    modified_source: str = apply_edits(source=source, edits=edits)
+    # Methods come back indented; dedent so the text compiles at module level.
+    modified_source = textwrap.dedent(modified_source)
+
+    if preamble.strip():
+        modified_source = _insert_preamble(source=modified_source, preamble=preamble)
+
+    # Compile against the original file name so tracebacks still point there.
+    code: types.CodeType = compile(modified_source, inspect.getfile(target), "exec")
+    temp_namespace: dict[str, Any] = {}
+    # Execute with the target's own globals so the new code resolves the same
+    # module-level names as the original.
+    exec(code, target.__globals__, temp_namespace)
+
+    # NOTE(review): for a decorated target, getsource includes the decorators and
+    # this lookup may return the wrapped object — presumably targets here are
+    # plain functions; confirm before patching decorated callables.
+    new_fn: Any = temp_namespace[target.__name__]
+    # Swapping __code__ patches every existing reference to the function in place.
+    target.__code__ = new_fn.__code__
+
+    return PatchState(target_fn=target, original_code=original_code)
+
+
+# --------------------------------- private ---------------------------------
+
+
+def _apply_specs(specs: list[PatchSpec]) -> list[PatchState]:
+ states: list[PatchState] = []
+ for spec in specs:
+ target_fn: Callable[..., Any] = _resolve_target(spec.target)
+ print(f"[source_patcher] patching {spec.target}")
+ state: PatchState = patch_function(
+ target=target_fn, edits=spec.edits, preamble=spec.preamble
+ )
+ states.append(state)
+ return states
+
+
+def _inject_preamble(*, config: PatchConfig, extra_imports: list[str]) -> PatchConfig:
+ """Set preamble on every PatchSpec so imports are inserted once at function top."""
+ import_block: str = "\n".join(extra_imports)
+ new_patches: list[PatchSpec] = []
+
+ for spec in config.patches:
+ existing: str = spec.preamble
+ combined: str = (
+ import_block + "\n" + existing if existing.strip() else import_block
+ )
+ new_patches.append(
+ PatchSpec(target=spec.target, edits=spec.edits, preamble=combined)
+ )
+
+ return PatchConfig(patches=new_patches)
+
+
+def _insert_preamble(*, source: str, preamble: str) -> str:
+ """Insert preamble lines right after the function signature (and optional docstring)."""
+ lines: list[str] = source.splitlines()
+
+ signature_end: int = _find_signature_end(lines)
+
+ body_start: int = signature_end + 1
+ body_indent: str = ""
+ for i in range(body_start, len(lines)):
+ if lines[i].strip():
+ body_indent = " " * (len(lines[i]) - len(lines[i].lstrip()))
+ body_start = i
+ break
+
+ preamble_lines: list[str] = [
+ body_indent + pl for pl in preamble.strip().splitlines()
+ ]
+ return "\n".join(lines[:body_start] + preamble_lines + lines[body_start:])
+
+
+def _find_signature_end(lines: list[str]) -> int:
+ """Find the line index where the function signature ends (the line with trailing colon)."""
+ for i, line in enumerate(lines):
+ if line.rstrip().endswith(":"):
+ return i
+ return 0
+
+
+def _resolve_target(qualified_name: str) -> Callable[..., Any]:
+    """Resolve 'pkg.mod.Class.method' to the actual function object.
+
+    Tries progressively shorter module paths from right to left,
+    then uses getattr for the remaining attribute chain.
+    """
+    parts: list[str] = qualified_name.split(".")
+
+    target: Any = None
+    # Try the longest importable prefix first, e.g. "pkg.mod.Class" before "pkg.mod".
+    for split_idx in range(len(parts), 0, -1):
+        module_path: str = ".".join(parts[:split_idx])
+        try:
+            target = importlib.import_module(module_path)
+            attr_parts: list[str] = parts[split_idx:]
+            break
+        except ImportError:
+            # NOTE(review): this also masks ImportErrors raised *inside* a module
+            # that does exist — presumably acceptable here; confirm when debugging.
+            continue
+    else:
+        # The loop never hit `break`: no prefix was importable.
+        raise ImportError(f"could not import any module prefix of '{qualified_name}'")
+
+    # Walk the remaining dotted attributes (classes, then the function itself).
+    for attr_name in attr_parts:
+        target = getattr(target, attr_name)
+
+    # NOTE(review): getattr on a class already unwraps classmethod descriptors,
+    # so this branch likely only fires for objects taken from a raw __dict__.
+    if isinstance(target, classmethod):
+        target = target.__func__
+    if not callable(target):
+        raise TypeError(
+            f"resolved target '{qualified_name}' is not callable: {type(target)}"
+        )
+
+    return target
diff --git a/sglang/python/sglang/srt/debug_utils/source_patcher/source_editor.py b/sglang/python/sglang/srt/debug_utils/source_patcher/source_editor.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f4b0805a7652cfb97bbc630a3c16d59afefa8ff
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/source_patcher/source_editor.py
@@ -0,0 +1,108 @@
+from sglang.srt.debug_utils.source_patcher.types import EditSpec, PatchApplicationError
+
+
+def apply_edits(*, source: str, edits: list[EditSpec]) -> str:
+ """Apply a sequence of match/replacement edits to source text.
+
+ Each edit is applied sequentially so later edits see the result of earlier ones.
+ """
+ result: str = source
+ for edit in edits:
+ result = _apply_single_edit(source=result, edit=edit)
+ return result
+
+
+def _apply_single_edit(*, source: str, edit: EditSpec) -> str:
+ """Apply a single match/replacement edit to the source text."""
+ match_text: str = edit.match.strip()
+ if not match_text:
+ raise PatchApplicationError("empty match text")
+
+ source_lines: list[str] = source.splitlines()
+ match_lines: list[str] = match_text.splitlines()
+
+ start_idx: int = _find_match(source_lines=source_lines, match_lines=match_lines)
+ match_len: int = len(match_lines)
+
+ original_indent: int = _leading_spaces(source_lines[start_idx])
+
+ effective_replacement: str = _resolve_replacement(edit=edit, match_text=match_text)
+ replacement_lines: list[str] = (
+ effective_replacement.splitlines() if effective_replacement else []
+ )
+ aligned: list[str] = _realign_replacement(
+ replacement_lines=replacement_lines, original_indent=original_indent
+ )
+ new_lines: list[str] = (
+ source_lines[:start_idx] + aligned + source_lines[start_idx + match_len :]
+ )
+
+ trailing_newline: str = "\n" if source.endswith("\n") else ""
+ return "\n".join(new_lines) + trailing_newline
+
+
+def _resolve_replacement(*, edit: EditSpec, match_text: str) -> str:
+ """Return the effective replacement text, handling replacement, prepend, and append modes."""
+ if edit.prepend.strip():
+ return edit.prepend.strip() + "\n" + match_text
+ if edit.append.strip():
+ return match_text + "\n" + edit.append.strip()
+ return edit.replacement.strip()
+
+
+def _find_match(*, source_lines: list[str], match_lines: list[str]) -> int:
+ """Find the start index of match_lines in source_lines (strip-compared).
+
+ Returns the index of the first matching line.
+ Raises PatchApplicationError if not found or found multiple times.
+ """
+ stripped_source: list[str] = [line.strip() for line in source_lines]
+ stripped_match: list[str] = [line.strip() for line in match_lines]
+ match_len: int = len(stripped_match)
+
+ found_indices: list[int] = [
+ i
+ for i in range(len(stripped_source) - match_len + 1)
+ if stripped_source[i : i + match_len] == stripped_match
+ ]
+
+ if len(found_indices) == 0:
+ preview: str = "\n".join(match_lines)
+ raise PatchApplicationError(f"match text not found in source:\n{preview}")
+ if len(found_indices) > 1:
+ preview = "\n".join(match_lines)
+ raise PatchApplicationError(
+ f"match text found multiple times ({len(found_indices)} occurrences) in source:\n{preview}"
+ )
+
+ return found_indices[0]
+
+
+def _realign_replacement(
+ *, replacement_lines: list[str], original_indent: int
+) -> list[str]:
+ """Realign replacement lines to the original indentation level.
+
+ Strategy:
+ - Take the leading spaces of the first non-empty replacement line as base_indent
+ - For each replacement line: remove base_indent, add original_indent
+ """
+ non_empty: list[str] = [line for line in replacement_lines if line.strip()]
+ if not non_empty:
+ return []
+
+ base_indent: int = _leading_spaces(non_empty[0])
+ result: list[str] = []
+
+ for line in replacement_lines:
+ if not line.strip():
+ result.append("")
+ else:
+ stripped = line[min(base_indent, len(line) - len(line.lstrip())) :]
+ result.append(" " * original_indent + stripped)
+
+ return result
+
+
+def _leading_spaces(line: str) -> int:
+ return len(line) - len(line.lstrip(" "))
diff --git a/sglang/python/sglang/srt/debug_utils/source_patcher/types.py b/sglang/python/sglang/srt/debug_utils/source_patcher/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ff44cba6e007af4e408a9622b2b145595593bb8
--- /dev/null
+++ b/sglang/python/sglang/srt/debug_utils/source_patcher/types.py
@@ -0,0 +1,63 @@
+import types
+from collections.abc import Callable
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, model_validator
+
+
+class PatchApplicationError(Exception):
+    """Raised when an edit's match text is empty, not found, or not unique in the source."""
+
+
+class _StrictBase(BaseModel):
+    """Shared pydantic base that rejects unknown keys, so config typos fail loudly."""
+
+    model_config = ConfigDict(extra="forbid")
+
+
+class EditSpec(_StrictBase):
+    """Specify one edit: replace, prepend before, or append after the matched text.
+
+    Use ``replacement`` to substitute the matched text (empty string = delete).
+    Use ``prepend`` to keep the matched text and add lines before it.
+    Use ``append`` to keep the matched text and add lines after it.
+    Only one of ``replacement``, ``prepend``, and ``append`` may be set.
+    """
+
+    # Text to locate in the target's source (matched line-by-line, whitespace-stripped).
+    match: str
+    replacement: str = ""
+    prepend: str = ""
+    append: str = ""
+
+    @model_validator(mode="after")
+    def _check_modes_mutually_exclusive(self) -> "EditSpec":
+        # A field counts as "set" only if it contains non-whitespace content.
+        active: list[str] = [
+            name
+            for name in ("replacement", "prepend", "append")
+            if getattr(self, name).strip()
+        ]
+        if len(active) > 1:
+            raise ValueError(
+                f"only one of 'replacement', 'prepend', 'append' may be set, "
+                f"got: {', '.join(active)}"
+            )
+        return self
+
+
+class PatchSpec(_StrictBase):
+    """One patch: a dotted target path plus the edits (and optional preamble) to apply."""
+
+    # Dotted path resolvable to a function, e.g. "pkg.mod.Class.method".
+    target: str
+    edits: list[EditSpec]
+    # Lines inserted once at the top of the patched function body.
+    preamble: str = ""
+
+
+class PatchConfig(_StrictBase):
+    """Top-level patch config: the full list of patches to apply."""
+
+    patches: list[PatchSpec]
+
+
+class PatchState:
+    """Holds what is needed to undo one patch: the function and its original code object."""
+
+    def __init__(
+        self, *, target_fn: Callable[..., Any], original_code: types.CodeType
+    ) -> None:
+        self.target_fn = target_fn
+        self.original_code = original_code
+
+    def restore(self) -> None:
+        """Swap the original code object back onto the patched function."""
+        self.target_fn.__code__ = self.original_code
diff --git a/sglang/python/sglang/srt/disaggregation/__pycache__/decode.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/__pycache__/decode.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..826613acff4bec0726731bb264de0f7bfc71da3b
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/__pycache__/decode.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/__pycache__/decode_kvcache_offload_manager.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/__pycache__/decode_kvcache_offload_manager.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..164f3a16c1e071f6a1679baa9a3ba6b15841813a
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/__pycache__/decode_kvcache_offload_manager.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/__pycache__/decode_schedule_batch_mixin.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/__pycache__/decode_schedule_batch_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc77bca7494b8196d8196cf7560a605cbb4a82f9
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/__pycache__/decode_schedule_batch_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/__pycache__/encode_receiver.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/__pycache__/encode_receiver.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f320660df5894d2faad7c61adea97253ae4623ce
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/__pycache__/encode_receiver.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/__pycache__/kv_events.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/__pycache__/kv_events.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83f2ec9f8868bc95cfcab05e5bedc6a5b7eb3ecb
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/__pycache__/kv_events.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/__pycache__/prefill.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/__pycache__/prefill.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..854714bcb84c48d847d829f9e75aa18299ebcc24
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/__pycache__/prefill.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec4f02e9a096f40323653a4d309370fdb27b638e
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/ascend/__init__.py b/sglang/python/sglang/srt/disaggregation/ascend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2550f91a4e7cd7871c1e5cafe8c9c2ca81cdc6f5
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/ascend/__init__.py
@@ -0,0 +1,6 @@
+from sglang.srt.disaggregation.ascend.conn import (
+ AscendKVBootstrapServer,
+ AscendKVManager,
+ AscendKVReceiver,
+ AscendKVSender,
+)
diff --git a/sglang/python/sglang/srt/disaggregation/ascend/conn.py b/sglang/python/sglang/srt/disaggregation/ascend/conn.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eff99404eecc5097ac14de0ca19556136b3dc8c
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/ascend/conn.py
@@ -0,0 +1,138 @@
+import concurrent.futures
+import logging
+from typing import List, Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+from sglang.srt.disaggregation.ascend.transfer_engine import AscendTransferEngine
+from sglang.srt.disaggregation.common.utils import group_concurrent_contiguous
+from sglang.srt.disaggregation.mooncake.conn import (
+ MooncakeKVBootstrapServer,
+ MooncakeKVManager,
+ MooncakeKVReceiver,
+ MooncakeKVSender,
+)
+from sglang.srt.utils import get_local_ip_auto
+
+logger = logging.getLogger(__name__)
+
+
+class AscendKVManager(MooncakeKVManager):
+    """Mooncake KV manager specialized for Ascend NPUs via AscendTransferEngine."""
+
+    def init_engine(self):
+        """Create the Ascend transfer engine bound to this rank's NPU."""
+        # TransferEngine initialized on ascend.
+        local_ip = get_local_ip_auto()
+        self.engine = AscendTransferEngine(
+            hostname=local_ip,
+            npu_id=self.kv_args.gpu_id,
+            disaggregation_mode=self.disaggregation_mode,
+        )
+
+    def register_buffer_to_engine(self):
+        """Register the KV-cache and aux buffers with the transfer engine."""
+        self.engine.batch_register(self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens)
+        # The Ascend backend optimizes batch registration for small memory blocks.
+        self.engine.batch_register(
+            self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
+        )
+
+    def send_kvcache(
+        self,
+        mooncake_session_id: str,
+        prefill_kv_indices: npt.NDArray[np.int32],
+        dst_kv_ptrs: list[int],
+        dst_kv_indices: npt.NDArray[np.int32],
+        executor: concurrent.futures.ThreadPoolExecutor,
+    ):
+        """Transfer the KV cache at the given indices to the decode side.
+
+        Returns 0 on success, otherwise the first non-zero engine status.
+        """
+        # Group by indices: contiguous runs of source/destination indices are
+        # merged so each run becomes a single transfer block.
+        prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
+            prefill_kv_indices, dst_kv_indices
+        )
+
+        if self.pp_size > 1:
+            # Pipeline parallel: only this stage's layers are transferred, with
+            # K and V pointer lists resolved separately for the stage.
+            src_k_ptrs, src_v_ptrs, dst_k_ptrs, dst_v_ptrs, layers_current_pp_stage = (
+                self.get_mha_kv_ptrs_with_pp(self.kv_args.kv_data_ptrs, dst_kv_ptrs)
+            )
+
+            # K layers first, then V layers (item lens are offset by the stage size).
+            layers_params = [
+                (
+                    src_k_ptrs[layer_id],
+                    dst_k_ptrs[layer_id],
+                    self.kv_args.kv_item_lens[layer_id],
+                )
+                for layer_id in range(layers_current_pp_stage)
+            ] + [
+                (
+                    src_v_ptrs[layer_id],
+                    dst_v_ptrs[layer_id],
+                    self.kv_args.kv_item_lens[layers_current_pp_stage + layer_id],
+                )
+                for layer_id in range(layers_current_pp_stage)
+            ]
+        else:
+            # No pipeline parallelism: one (src, dst, item_len) tuple per layer.
+            num_layers = len(self.kv_args.kv_data_ptrs)
+            layers_params = [
+                (
+                    self.kv_args.kv_data_ptrs[layer_id],
+                    dst_kv_ptrs[layer_id],
+                    self.kv_args.kv_item_lens[layer_id],
+                )
+                for layer_id in range(num_layers)
+            ]
+
+        # Expand one layer's pointer pair into per-run (src_addr, dst_addr, length) blocks.
+        def set_transfer_blocks(
+            src_ptr: int, dst_ptr: int, item_len: int
+        ) -> List[Tuple[int, int, int]]:
+            transfer_blocks = []
+            for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
+                src_addr = src_ptr + int(prefill_index[0]) * item_len
+                dst_addr = dst_ptr + int(decode_index[0]) * item_len
+                length = item_len * len(prefill_index)
+                transfer_blocks.append((src_addr, dst_addr, length))
+            return transfer_blocks
+
+        # Worker function for processing a single layer
+        def process_layer(src_ptr: int, dst_ptr: int, item_len: int) -> int:
+            transfer_blocks = set_transfer_blocks(src_ptr, dst_ptr, item_len)
+            return self._transfer_data(mooncake_session_id, transfer_blocks)
+
+        # Worker function for processing all layers in a batch
+        def process_layers(layers_params: List[Tuple[int, int, int]]) -> int:
+            transfer_blocks = []
+            for src_ptr, dst_ptr, item_len in layers_params:
+                transfer_blocks.extend(set_transfer_blocks(src_ptr, dst_ptr, item_len))
+            return self._transfer_data(mooncake_session_id, transfer_blocks)
+
+        if self.enable_custom_mem_pool:
+            # One executor task per layer; the first failure cancels the rest
+            # (already-running tasks cannot be cancelled, only pending ones).
+            futures = [
+                executor.submit(
+                    process_layer,
+                    src_ptr,
+                    dst_ptr,
+                    item_len,
+                )
+                for (src_ptr, dst_ptr, item_len) in layers_params
+            ]
+            for future in concurrent.futures.as_completed(futures):
+                status = future.result()
+                if status != 0:
+                    for f in futures:
+                        f.cancel()
+                    return status
+        else:
+            # Combining all layers' params in one batch transfer is more efficient
+            # compared to using multiple threads
+            return process_layers(layers_params)
+
+        return 0
+
+
+class AscendKVSender(MooncakeKVSender):
+    """Ascend KV sender; inherits Mooncake's behavior unchanged."""
+
+    pass
+
+
+class AscendKVReceiver(MooncakeKVReceiver):
+    """Ascend KV receiver; inherits Mooncake's behavior unchanged."""
+
+    pass
+
+
+class AscendKVBootstrapServer(MooncakeKVBootstrapServer):
+    """Ascend bootstrap server; inherits Mooncake's behavior unchanged."""
+
+    pass
diff --git a/sglang/python/sglang/srt/disaggregation/ascend/transfer_engine.py b/sglang/python/sglang/srt/disaggregation/ascend/transfer_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..53c402df1059e8d290f05dbb631b4685212af607
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/ascend/transfer_engine.py
@@ -0,0 +1,100 @@
+import logging
+import os
+from typing import List
+
+import torch
+
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.distributed.device_communicators.mooncake_transfer_engine import (
+ MooncakeTransferEngine,
+)
+
+try:
+ from memfabric_hybrid import TransferEngine
+
+ import_error = None
+except ImportError as e:
+ import_error = e
+ pass
+
+logger = logging.getLogger(__name__)
+
+
+class AscendTransferEngine(MooncakeTransferEngine):
+    """Transfer engine backed by memfabric_hybrid for Ascend NPU PD disaggregation."""
+
+    def __init__(
+        self,
+        hostname: str,
+        npu_id: int,
+        disaggregation_mode: DisaggregationMode,
+    ):
+        """Create and immediately initialize the engine for the given role.
+
+        Raises the deferred ImportError when memfabric_hybrid is missing, and
+        ValueError for modes other than PREFILL/DECODE.
+        """
+        if import_error is not None:
+            logger.warning(
+                "Please install memfabric_hybrid, for details, see docs/backend/pd_disaggregation.md"
+            )
+            raise import_error
+
+        self.engine = TransferEngine()
+        self.hostname = hostname
+        self.npu_id = npu_id
+
+        # Centralized storage address of the AscendTransferEngine
+        # NOTE(review): getenv returns None when ASCEND_MF_STORE_URL is unset;
+        # confirm engine.initialize tolerates a None store_url.
+        self.store_url = os.getenv("ASCEND_MF_STORE_URL")
+        if disaggregation_mode == DisaggregationMode.PREFILL:
+            self.role = "Prefill"
+        elif disaggregation_mode == DisaggregationMode.DECODE:
+            self.role = "Decode"
+        else:
+            logger.error(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+            raise ValueError(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+        # Session id uniquely identifies this rank: local hostname + engine RPC port.
+        self.session_id = f"{self.hostname}:{self.engine.get_rpc_port()}"
+        self.initialize()
+
+    def initialize(self) -> None:
+        """Pick the transfer protocol, warm up HCCL, and initialize the engine."""
+        # Imported lazily to avoid a circular import at module load time.
+        from sglang.srt.distributed.parallel_state import (
+            get_world_group,
+            get_world_size,
+        )
+
+        # SDMA is the default; DEVICE_RDMA only when explicitly requested.
+        transfer_protocol = self._get_transfer_protocol()
+        if transfer_protocol is None or transfer_protocol == "sdma":
+            trans_op_type = TransferEngine.TransDataOpType.SDMA
+        else:
+            trans_op_type = TransferEngine.TransDataOpType.DEVICE_RDMA
+        """with device RDMA for PD transfer"""
+        tmp_tensor = torch.zeros(1, device="npu")
+        output_tensor_list = [
+            torch.empty_like(tmp_tensor) for _ in range(get_world_size())
+        ]
+        # Initialize hccl in advance through all_gather to avoid conflicts with rdma initialization.
+        torch.distributed.all_gather(
+            output_tensor_list, tmp_tensor, group=get_world_group().device_group
+        )
+        """Initialize the ascend transfer instance."""
+        ret_value = self.engine.initialize(
+            self.store_url, self.session_id, self.role, self.npu_id, trans_op_type
+        )
+        if ret_value != 0:
+            logger.error("Ascend Transfer Engine initialization failed.")
+            raise RuntimeError("Ascend Transfer Engine initialization failed.")
+
+    def batch_register(self, ptrs: List[int], lengths: List[int]):
+        """Register memory regions with the engine, best-effort.
+
+        NOTE(review): failures are swallowed and only logged at DEBUG level —
+        callers cannot observe a failed registration; confirm this is intended.
+        """
+        try:
+            ret_value = self.engine.batch_register_memory(ptrs, lengths)
+        except Exception:
+            # Mark register as failed
+            ret_value = -1
+        if ret_value != 0:
+            logger.debug(f"Ascend memory registration for ptr {ptrs} failed.")
+
+    @staticmethod
+    def _get_transfer_protocol():
+        """Return a validated protocol from ASCEND_MF_TRANSFER_PROTOCOL, or None for default."""
+        protocol = os.getenv("ASCEND_MF_TRANSFER_PROTOCOL")
+        allowed_protocols = {"device_rdma", "sdma"}
+        if protocol and protocol.lower() in allowed_protocols:
+            return protocol.lower()
+        else:
+            logger.warning(
+                "Invalid or no transfer protocol specified, using default protocol."
+            )
+            return None
diff --git a/sglang/python/sglang/srt/disaggregation/base/__init__.py b/sglang/python/sglang/srt/disaggregation/base/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef0f797fcf74a675cc1daf9d94db7ab68ce048a7
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/base/__init__.py
@@ -0,0 +1,8 @@
+from sglang.srt.disaggregation.base.conn import (
+ BaseKVBootstrapServer,
+ BaseKVManager,
+ BaseKVReceiver,
+ BaseKVSender,
+ KVArgs,
+ KVPoll,
+)
diff --git a/sglang/python/sglang/srt/disaggregation/base/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/base/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a63d6323d03c0bf0c3f22a72ed725bd69f49f8ac
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/base/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/base/__pycache__/conn.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/base/__pycache__/conn.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f555959f710c34c022b19ad4f6673ce39171522
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/base/__pycache__/conn.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/base/conn.py b/sglang/python/sglang/srt/disaggregation/base/conn.py
new file mode 100644
index 0000000000000000000000000000000000000000..2309e8a83c3c85f9fec7677d0c7d561487508fb4
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/base/conn.py
@@ -0,0 +1,164 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+import numpy.typing as npt
+
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+ from sglang.srt.disaggregation.utils import DisaggregationMode
+
+
+class KVArgs:
+    """Attribute bag describing the KV/aux/state buffers a rank exposes for transfer.
+
+    Instances are populated attribute-by-attribute by the caller; there is no
+    __init__, so the fields below are class-level annotations only.
+    """
+
+    engine_rank: int
+    # Per-layer pointers and byte lengths of the KV cache buffers.
+    kv_data_ptrs: List[int]
+    kv_data_lens: List[int]
+    # Byte size of a single KV item within each layer's buffer.
+    kv_item_lens: List[int]
+    # Auxiliary buffers transferred alongside the KV cache.
+    aux_data_ptrs: List[int]
+    aux_data_lens: List[int]
+    aux_item_lens: List[int]
+    # Extra model-state buffers (see state_type for their meaning).
+    state_data_ptrs: List[int]
+    state_data_lens: List[int]
+    state_item_lens: List[int]
+    state_type: str  # "none", "mamba", "swa"
+    # for mamba state different tp slice transfer
+    state_dim_per_tensor: List[int]  # dimension to slice for each state tensor
+    # InfiniBand device name and traffic class used for RDMA transfers.
+    ib_device: str
+    ib_traffic_class: str
+    gpu_id: int
+    kv_head_num: int
+    total_kv_head_num: int
+    page_size: int
+    # for pp prefill
+    pp_rank: int
+    prefill_start_layer: int
+    # for system dp
+    system_dp_rank: int
+
+
+class KVPoll:
+    """Integer status codes returned by sender/receiver poll()."""
+
+    Failed = 0
+    Bootstrapping = 1
+    WaitingForInput = 2
+    Transferring = 3
+    Success = 4
+
+
+class BaseKVManager(ABC):
+    """Base class for managing transfer states"""
+
+    @abstractmethod
+    def __init__(
+        self,
+        args: KVArgs,
+        disaggregation_mode: DisaggregationMode,
+        server_args: ServerArgs,
+        is_mla_backend: Optional[bool] = False,
+    ): ...
+
+    @abstractmethod
+    def register_to_bootstrap(self):
+        """Register prefill server info to the bootstrap server."""
+        ...
+
+
+class BaseKVSender(ABC):
+    """Sender-side handle for transferring one request's KV cache to the decode server."""
+
+    @abstractmethod
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
+    ): ...
+
+    @abstractmethod
+    def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
+        """
+        Set req's index metadata locally or notify the decoder server about the kv indices length and aux index.
+        """
+        ...
+
+    @abstractmethod
+    def send(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        state_indices: Optional[List[int]] = None,
+    ):
+        """
+        Send the kv cache at the given kv indices and the extra cache/state at the given indices to the decoder server.
+        """
+        ...
+
+    @abstractmethod
+    def poll(self) -> KVPoll:
+        """
+        Check the status of the kv cache transfer.
+        """
+        ...
+
+    @abstractmethod
+    def failure_exception(self):
+        """
+        Raise an exception if the kv cache transfer fails.
+        """
+        ...
+
+
+class BaseKVReceiver(ABC):
+    """Receiver-side handle for accepting one request's KV cache from the prefill server."""
+
+    @abstractmethod
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: Optional[int] = None,
+    ): ...
+
+    @abstractmethod
+    def init(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        aux_index: Optional[int] = None,
+        state_indices: Optional[List[int]] = None,
+    ):
+        """
+        Set req's index metadata locally or notify the prefill server about the kv indices, aux index, and state_indices.
+        """
+        ...
+
+    @abstractmethod
+    def poll(self) -> KVPoll:
+        """
+        Check the status of the kv cache transfer.
+        """
+        ...
+
+    @abstractmethod
+    def failure_exception(self):
+        """
+        Raise an exception if the kv cache transfer fails.
+        """
+        ...
+
+    # Non-abstract no-op hooks: backends without internal state may leave these as-is.
+    def clear(self):
+        """
+        Clear any internal states.
+        """
+        pass
+
+    def abort(self):
+        """
+        Abort the current transfer.
+        """
+        pass
+
+
+class BaseKVBootstrapServer(ABC):
+    """Minimal interface for the bootstrap/rendezvous server bound to (host, port)."""
+
+    @abstractmethod
+    def __init__(self, host: str, port: int): ...
diff --git a/sglang/python/sglang/srt/disaggregation/common/__init__.py b/sglang/python/sglang/srt/disaggregation/common/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8294c3892087fd697a2e6cb6e90c974e12bec7e7
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/common/__init__.py
@@ -0,0 +1,5 @@
+from sglang.srt.disaggregation.common.conn import (
+ CommonKVBootstrapServer,
+ CommonKVManager,
+ CommonKVReceiver,
+)
diff --git a/sglang/python/sglang/srt/disaggregation/common/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/common/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f790c07413ae79370f75c736ef0e350e43801822
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/common/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/common/__pycache__/conn.cpython-311.pyc b/sglang/python/sglang/srt/disaggregation/common/__pycache__/conn.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa92b37f089edc01a4adaba8acf1b6797580a5ea
Binary files /dev/null and b/sglang/python/sglang/srt/disaggregation/common/__pycache__/conn.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/disaggregation/common/conn.py b/sglang/python/sglang/srt/disaggregation/common/conn.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc21f88826c3c100dc36f3a383fd9acc2f7c51d1
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/common/conn.py
@@ -0,0 +1,942 @@
+from __future__ import annotations
+
+import asyncio
+import dataclasses
+import logging
+import socket
+import threading
+import time
+from collections import defaultdict
+from functools import cache
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+import numpy as np
+import numpy.typing as npt
+import requests
+import zmq
+from aiohttp import web
+
+from sglang.srt.disaggregation.base.conn import (
+ BaseKVBootstrapServer,
+ BaseKVManager,
+ BaseKVReceiver,
+ BaseKVSender,
+ KVArgs,
+ KVPoll,
+)
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.environ import envs
+from sglang.srt.layers.dp_attention import (
+ get_attention_cp_rank,
+ get_attention_cp_size,
+ get_attention_dp_rank,
+ get_attention_dp_size,
+ get_attention_tp_rank,
+ get_attention_tp_size,
+)
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import (
+ format_tcp_address,
+ get_local_ip_auto,
+ get_zmq_socket_on_host,
+ is_valid_ipv6_address,
+ maybe_wrap_ipv6_address,
+)
+
+logger = logging.getLogger(__name__)
+
+
@dataclasses.dataclass
class PrefillServerInfo:
    """Parallel topology and cache settings advertised by a prefill server.

    Values may arrive as strings (JSON payloads), so ``__post_init__``
    normalizes every field to its declared type; ``page_size`` and
    ``kv_cache_dtype`` stay ``None`` when unset.
    """

    attn_tp_size: int
    attn_cp_size: int
    dp_size: int
    pp_size: int
    page_size: Optional[int]
    kv_cache_dtype: Optional[str]
    follow_bootstrap_room: bool

    def __post_init__(self):
        # The required parallel sizes are always coerced to int.
        for name in ("attn_tp_size", "attn_cp_size", "dp_size", "pp_size"):
            setattr(self, name, int(getattr(self, name)))
        # Optional fields keep None; otherwise normalize the type.
        if self.page_size is not None:
            self.page_size = int(self.page_size)
        if self.kv_cache_dtype is not None:
            self.kv_cache_dtype = str(self.kv_cache_dtype)
        self.follow_bootstrap_room = bool(self.follow_bootstrap_room)
+
+
@dataclasses.dataclass
class PrefillRankInfo:
    """Network endpoint of one registered prefill rank."""

    rank_ip: str
    rank_port: int

    def __post_init__(self):
        # Registration payloads may carry these as strings; normalize.
        self.rank_ip, self.rank_port = str(self.rank_ip), int(self.rank_port)
+
+
class CommonKVManager(BaseKVManager):
    """Shared per-rank manager for disaggregated (PD) KV-cache transfer.

    Responsibilities visible here:
      * binds a ZMQ PULL socket on the local IP for incoming transfer metadata;
      * PREFILL mode: registers this rank with the bootstrap server and tracks
        per-request transfer info and bootstrap timeouts;
      * DECODE mode: caches prefill server/parallel info, pooled connections,
        and heartbeat bookkeeping.
    """

    def __init__(
        self,
        args: KVArgs,
        disaggregation_mode: DisaggregationMode,
        server_args: ServerArgs,
        is_mla_backend: Optional[bool] = False,
    ):
        self.kv_args = args
        self.is_mla_backend = is_mla_backend
        self.disaggregation_mode = disaggregation_mode
        self.server_args = server_args
        # for p/d multi node infer
        self.bootstrap_host = server_args.host
        self.bootstrap_port = server_args.disaggregation_bootstrap_port
        self.dist_init_addr = server_args.dist_init_addr
        # Attention-group parallel topology of this rank.
        self.attn_tp_size = get_attention_tp_size()
        self.attn_tp_rank = get_attention_tp_rank()
        self.attn_cp_size = get_attention_cp_size()
        self.attn_cp_rank = get_attention_cp_rank()
        self.attn_dp_size = get_attention_dp_size()
        self.attn_dp_rank = get_attention_dp_rank()
        # With dp-attention enabled the system-level dp size collapses to 1.
        self.system_dp_size = (
            1 if server_args.enable_dp_attention else server_args.dp_size
        )
        # NOTE: a falsy system_dp_rank (None or 0) maps to 0 here.
        self.system_dp_rank = (
            self.kv_args.system_dp_rank if self.kv_args.system_dp_rank else 0
        )
        self.pp_size = server_args.pp_size
        self.pp_rank = self.kv_args.pp_rank
        self.local_ip = get_local_ip_auto()
        self.enable_all_cp_ranks_for_transfer = (
            envs.SGLANG_DISAGGREGATION_ALL_CP_RANKS_TRANSFER.get()
        )

        # bind zmq socket (PULL side; peers connect with PUSH).
        # The OS-assigned port is reported back via self.rank_port.
        context = zmq.Context()
        zmq_bind_host = maybe_wrap_ipv6_address(self.local_ip)
        self.rank_port, self.server_socket = get_zmq_socket_on_host(
            context, zmq.PULL, host=zmq_bind_host
        )
        logger.debug(f"kv manager bind to {zmq_bind_host}:{self.rank_port}")

        # Per-bootstrap-room transfer status and recorded failure reasons.
        self.request_status: Dict[int, KVPoll] = {}
        self.failure_records: Dict[int, str] = {}
        self.failure_lock = threading.Lock()

        if self.disaggregation_mode == DisaggregationMode.PREFILL:
            # When SGLANG_DISAGGREGATION_ALL_CP_RANKS_TRANSFER is True, all CP ranks
            # participate in KV transfer; Otherwise only CP rank 0 sends.
            self.is_dummy_cp_rank = (
                not self.enable_all_cp_ranks_for_transfer
                and self.attn_cp_size > 1
                and self.attn_cp_rank != 0
            )
            self.register_to_bootstrap()
            self.transfer_infos = {}
            self.decode_kv_args_table = {}
            self.pp_group = get_pp_group()
            # If a timeout happens on the prefill side, it means prefill instances
            # fail to receive the KV indices from the decode instance of this request.
            # These timeout requests should be aborted to release the tree cache.
            self.bootstrap_timeout = envs.SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT.get()
        elif self.disaggregation_mode == DisaggregationMode.DECODE:
            # Cached bootstrap info keyed by
            # "{addr}_{dp_rank}_{cp_rank}_{tp_rank}" (see CommonKVReceiver).
            self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
            self.connection_lock = threading.Lock()
            self.required_prefill_response_num_table: Dict[int, int] = {}
            self.prefill_info_table: Dict[str, PrefillServerInfo] = {}
            self.heartbeat_failures: Dict[str, int] = {}
            # One requests.Session per key, created lazily by defaultdict.
            self.session_pool: Dict = defaultdict(requests.Session)
            self.session_pool_lock = threading.Lock()
            self.addr_to_rooms_tracker: Dict[str, Set[int]] = defaultdict(set)
            self.prefill_response_tracker: Dict[int, Set[int]] = defaultdict(set)
            # Heartbeat interval should be at least 2 seconds
            self.heartbeat_interval = max(
                envs.SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL.get(), 2.0
            )
            # Heartbeat failure should be at least 1
            self.max_failures = max(
                envs.SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE.get(), 1
            )
            # If a timeout happens on the decode side, it means decode instances
            # fail to receive the KV Cache transfer done signal after bootstrapping.
            # These timeout requests should be aborted to release the tree cache.
            self.waiting_timeout = envs.SGLANG_DISAGGREGATION_WAITING_TIMEOUT.get()
        else:
            raise ValueError(
                f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
            )

    def check_status(self, bootstrap_room: int) -> KVPoll:
        # Raises KeyError for rooms that were never registered via update_status.
        return self.request_status[bootstrap_room]

    def update_status(self, bootstrap_room: int, status: KVPoll):
        # Failed overrides unconditionally; otherwise status only moves
        # forward (max of current and new), never backward.
        if bootstrap_room not in self.request_status:
            self.request_status[bootstrap_room] = status
        else:
            if status == KVPoll.Failed:
                self.request_status[bootstrap_room] = KVPoll.Failed
            else:
                self.request_status[bootstrap_room] = max(
                    self.request_status[bootstrap_room], status
                )

    def record_failure(self, bootstrap_room: int, failure_reason: str):
        # Store the human-readable reason for later failure_exception() calls.
        with self.failure_lock:
            self.failure_records[bootstrap_room] = failure_reason

    def ensure_parallel_info(
        self, bootstrap_addr: str, max_retries: int = 20, retry_interval: float = 1.0
    ) -> bool:
        """Fetch and cache prefill parallel info if not yet available.
        Returns True if info is available (cached or freshly fetched).
        Retries with backoff if the prefill server hasn't registered yet.

        Raises RuntimeError on a page-size or kv-cache-dtype mismatch
        between the prefill and decode servers.
        """
        if bootstrap_addr in self.prefill_info_table:
            return True
        info = None
        for attempt in range(max_retries):
            info = self._fetch_prefill_server_info(bootstrap_addr)
            if info is not None:
                break
            if attempt < max_retries - 1:
                logger.info(
                    f"Prefill server info not available from {bootstrap_addr}, "
                    f"retrying ({attempt + 1}/{max_retries})..."
                )
                time.sleep(retry_interval)
        if info is None:
            return False

        if info.page_size is not None and info.page_size != self.kv_args.page_size:
            raise RuntimeError(
                f"Page size mismatch: prefill server has page_size={info.page_size}, "
                f"but decode server has page_size={self.kv_args.page_size}. "
                f"Both servers must use the same --page-size value."
            )

        if (
            info.kv_cache_dtype is not None
            and info.kv_cache_dtype != self.server_args.kv_cache_dtype
        ):
            raise RuntimeError(
                f"KV cache dtype mismatch: prefill server has kv_cache_dtype={info.kv_cache_dtype}, "
                f"but decode server has kv_cache_dtype={self.server_args.kv_cache_dtype}. "
                f"Both servers must use the same --kv-cache-dtype value."
            )

        self.prefill_info_table[bootstrap_addr] = info
        logger.debug(f"Prefill parallel info for [{bootstrap_addr}]: {info}")
        return True

    @staticmethod
    def _fetch_prefill_server_info(
        bootstrap_addr: str,
    ) -> Optional[PrefillServerInfo]:
        """Fetch the prefill server info from the bootstrap server.

        All-(-1) query params are the sentinel meaning "return server-level
        info" (see CommonKVBootstrapServer._handle_route_get). Returns None
        on any HTTP or network failure.
        """
        try:
            url = f"http://{bootstrap_addr}/route?prefill_dp_rank={-1}&prefill_cp_rank={-1}&target_tp_rank={-1}&target_pp_rank={-1}"
            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                data = response.json()
                return PrefillServerInfo(**data)
            else:
                logger.error(
                    f"Failed to get prefill server info: {response.status_code}, {response.text}"
                )
                return None
        except Exception as e:
            logger.error(f"Error fetching prefill server info from bootstrap: {e}")
            return None

    def register_to_bootstrap(self):
        """Register prefill server info to bootstrap server via HTTP POST."""
        if self.dist_init_addr:
            # Multi-node case: bootstrap server's host is dist_init_addr
            if self.dist_init_addr.startswith("["):  # [ipv6]:port or [ipv6]
                if self.dist_init_addr.endswith("]"):
                    host = self.dist_init_addr
                else:
                    host, _ = self.dist_init_addr.rsplit(":", 1)
            else:
                # Resolve a hostname (or dotted IP) after stripping any port.
                host = socket.gethostbyname(self.dist_init_addr.rsplit(":", 1)[0])
        else:
            # Single-node case: bootstrap server's host is the same as http server's host
            host = self.bootstrap_host
        host = maybe_wrap_ipv6_address(host)

        bootstrap_server_url = f"{host}:{self.bootstrap_port}"
        url = f"http://{bootstrap_server_url}/route"
        # Full parallel coordinates of this rank plus its ZMQ endpoint;
        # consumed by CommonKVBootstrapServer._handle_route_put.
        payload = {
            "attn_tp_size": self.attn_tp_size,
            "attn_tp_rank": self.attn_tp_rank,
            "attn_cp_size": self.attn_cp_size,
            "attn_cp_rank": self.attn_cp_rank,
            "attn_dp_size": self.attn_dp_size,
            "attn_dp_rank": self.attn_dp_rank,
            "pp_size": self.pp_size,
            "pp_rank": self.pp_rank,
            "system_dp_size": self.system_dp_size,
            "system_dp_rank": self.system_dp_rank,
            "rank_ip": self.local_ip,
            "rank_port": self.rank_port,
            "page_size": self.kv_args.page_size,
            "kv_cache_dtype": self.server_args.kv_cache_dtype,
            "load_balance_method": self.server_args.load_balance_method,
        }

        try:
            response = requests.put(url, json=payload, timeout=5)
            if response.status_code == 200:
                logger.debug("Prefill successfully registered to bootstrap server.")
            else:
                logger.error(
                    f"Prefill instance failed to connect to bootstrap server: {response.status_code}, {response.text}"
                )
        except Exception as e:
            # Best-effort: failure is logged, not raised.
            logger.error(
                f"Prefill instance failed to register to bootstrap server: {e}"
            )

    # NOTE(review): functools.cache on an instance method keys the cache on
    # (self, endpoint, is_ipv6) and keeps `self` alive for the process
    # lifetime; each miss also creates a fresh zmq.Context(). The local name
    # `socket` shadows the imported `socket` module inside this method.
    # Compare with CommonKVReceiver._connect, which caches per-endpoint on a
    # shared class context — confirm whether this should do the same.
    @cache
    def _connect(self, endpoint: str, is_ipv6: bool = False):
        socket = zmq.Context().socket(zmq.PUSH)
        if is_ipv6:
            socket.setsockopt(zmq.IPV6, 1)
        socket.connect(endpoint)
        return socket

    def get_mha_kv_ptrs_with_pp(
        self, src_kv_ptrs: List[int], dst_kv_ptrs: List[int]
    ) -> Tuple[List[int], List[int], List[int], List[int], int]:
        """Slice MHA K/V layer pointers for this PP stage.

        ``src_kv_ptrs``/``dst_kv_ptrs`` are flat lists laid out as
        [K ptrs..., V ptrs...]; returns the per-stage source and destination
        K/V pointer lists plus the layer count of this PP stage.
        """
        start_layer = self.kv_args.prefill_start_layer
        num_kv_layers = len(src_kv_ptrs) // 2
        end_layer = start_layer + num_kv_layers
        dst_num_total_layers = len(dst_kv_ptrs) // 2
        src_k_ptrs = src_kv_ptrs[:num_kv_layers]
        src_v_ptrs = src_kv_ptrs[num_kv_layers:]
        if num_kv_layers == dst_num_total_layers:
            # Same layer count on both sides: take dst K/V halves whole.
            dst_k_ptrs = dst_kv_ptrs[:dst_num_total_layers]
            dst_v_ptrs = dst_kv_ptrs[dst_num_total_layers:]
        elif (
            num_kv_layers < dst_num_total_layers
            and dst_num_total_layers % num_kv_layers != 0
        ):
            # Case: Decode has draft model KV while Prefill is deployed without speculative decoding
            # dst_kv_ptrs layout: [K_main..., V_main..., draft_K..., draft_V...]
            # NOTE(review): this branch is only taken when the layer counts do
            # NOT divide evenly, so multiplier_ratio is a floored quotient —
            # confirm the `% != 0` guard is intended.
            multiplier_ratio = dst_num_total_layers // num_kv_layers
            dst_k_ptrs = dst_kv_ptrs[start_layer:end_layer]
            v_ptr_offset = num_kv_layers * multiplier_ratio
            dst_v_ptrs = dst_kv_ptrs[
                v_ptr_offset + start_layer : v_ptr_offset + end_layer
            ]
        else:
            # Decode pp size should be equal to prefill pp size or 1
            dst_k_ptrs = dst_kv_ptrs[start_layer:end_layer]
            dst_v_ptrs = dst_kv_ptrs[
                dst_num_total_layers + start_layer : dst_num_total_layers + end_layer
            ]
        layers_current_pp_stage = len(src_k_ptrs)
        return src_k_ptrs, src_v_ptrs, dst_k_ptrs, dst_v_ptrs, layers_current_pp_stage

    def get_mla_kv_ptrs_with_pp(
        self, src_kv_ptrs: List[int], dst_kv_ptrs: List[int]
    ) -> Tuple[List[int], List[int], int]:
        """Slice MLA combined-KV layer pointers for this PP stage.

        MLA stores one pointer per layer (no separate K/V halves); returns
        the source list, the matching destination slice, and the layer count.
        """
        start_layer = self.kv_args.prefill_start_layer
        end_layer = start_layer + len(src_kv_ptrs)
        if len(src_kv_ptrs) == len(dst_kv_ptrs):
            sliced_dst_kv_ptrs = dst_kv_ptrs
        else:
            # Decode pp size should be equal to prefill pp size or 1
            sliced_dst_kv_ptrs = dst_kv_ptrs[start_layer:end_layer]
        layers_current_pp_stage = len(src_kv_ptrs)
        return src_kv_ptrs, sliced_dst_kv_ptrs, layers_current_pp_stage
+
+
class CommonKVSender(BaseKVSender):
    """Prefill-side KV sender base for one request (bootstrap room).

    This base only performs bootstrap bookkeeping: it marks the room's
    poll status and, under dp load balancing, publishes the prefill
    dp_rank to the bootstrap server. ``send``/``poll`` are no-ops here;
    transfer backends are expected to override them.
    """

    def __init__(
        self,
        mgr: CommonKVManager,
        bootstrap_addr: str,
        bootstrap_room: int,
        dest_tp_ranks: List[int],
        pp_rank: int,
    ):
        self.kv_mgr = mgr
        self.bootstrap_room = bootstrap_room
        self.aux_index = None
        self.bootstrap_server_url = bootstrap_addr
        # inner state
        self.curr_idx = 0
        if self.kv_mgr.is_dummy_cp_rank:
            # Non-authoritative CP ranks are dummy participants.
            self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)
            return

        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
        if (
            self.kv_mgr.server_args.dp_size > 1
            and self.kv_mgr.server_args.load_balance_method != "follow_bootstrap_room"
        ):
            # With dp load balancing, the decode side cannot derive this
            # request's prefill dp_rank from the bootstrap room alone, so
            # publish it explicitly.
            self._register_prefill_dp_rank()

    def _register_prefill_dp_rank(self):
        """Register this request's prefill dp_rank to the bootstrap server."""
        url = f"http://{self.bootstrap_server_url}/register_dp_rank"
        payload = {
            "bootstrap_room": self.bootstrap_room,
            "dp_rank": self.kv_mgr.attn_dp_rank,
        }
        try:
            response = requests.post(url, json=payload, timeout=5)
            if response.status_code != 200:
                logger.error(
                    f"Failed to register prefill dp_rank: {response.status_code}, {response.text}"
                )
        except Exception as e:
            # Best-effort: a failed registration is logged, not raised.
            logger.error(f"Failed to register prefill dp_rank: {e}")

    def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
        """Record transfer sizing info before the first send() call."""
        self.num_kv_indices = num_kv_indices
        self.aux_index = aux_index
        logger.debug(
            f"CommonKVSender init with num_kv_indices: {num_kv_indices} and aux_index: {aux_index}"
        )

    def send(
        self,
        kv_indices: npt.NDArray[np.int32],
        state_indices: Optional[List[int]] = None,
    ):
        # No-op in the common base; transfer backends override.
        pass

    def poll(self) -> KVPoll:
        # No-op in the common base (returns None); transfer backends override.
        pass

    def failure_exception(self):
        # Fixed copy-paste in the message: this is the sender, not the receiver.
        raise Exception("Fake KVSender Exception")
+
+
class CommonKVReceiver(BaseKVReceiver):
    """Decode-side KV receiver base for one request (bootstrap room).

    Maps this decode rank onto the prefill server's TP/CP/PP topology,
    fetches the matching prefill rank endpoints from the bootstrap server
    (cached in the manager's connection_pool), and counts how many prefill
    responses this request must wait for.

    Class-level state: one shared ZMQ context plus an endpoint -> (socket,
    lock) cache, guarded by ``_global_lock``.
    """

    _ctx = zmq.Context()
    _socket_cache = {}
    _socket_locks = {}
    _global_lock = threading.Lock()

    def __init__(
        self,
        mgr: CommonKVManager,
        bootstrap_addr: str,
        bootstrap_room: Optional[int] = None,
        prefill_dp_rank: Optional[int] = None,
    ):
        self.bootstrap_room = bootstrap_room
        self.bootstrap_addr = bootstrap_addr
        self.kv_mgr = mgr
        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)

        # Parallel info must be known before rank mapping; on failure the
        # room is marked Failed and bootstrap_infos is left as None.
        if not self.kv_mgr.ensure_parallel_info(self.bootstrap_addr):
            self.kv_mgr.record_failure(
                self.bootstrap_room,
                f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
            )
            self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
            self.bootstrap_infos = None
            return

        self.prefill_info = self.kv_mgr.prefill_info_table[self.bootstrap_addr]

        # Rank mapping for PD with different TP sizes per rank for target DP/CP group
        if self.kv_mgr.attn_tp_size == self.prefill_info.attn_tp_size:
            # Equal TP sizes: one-to-one rank mapping.
            self.target_tp_rank = (
                self.kv_mgr.kv_args.engine_rank % self.kv_mgr.attn_tp_size
            )
            self.required_dst_info_num = 1
            self.required_prefill_response_num = 1
            self.target_tp_ranks = [self.target_tp_rank]
        elif self.kv_mgr.attn_tp_size > self.prefill_info.attn_tp_size:
            # Decode TP larger: several decode ranks share one prefill rank.
            if not self.kv_mgr.is_mla_backend:
                logger.warning_once(
                    "Performance is NOT guaranteed when using different TP sizes for non-MLA models. "
                )
            self.target_tp_rank = (
                self.kv_mgr.kv_args.engine_rank % self.kv_mgr.attn_tp_size
            ) // (self.kv_mgr.attn_tp_size // self.prefill_info.attn_tp_size)
            self.required_dst_info_num = (
                self.kv_mgr.attn_tp_size // self.prefill_info.attn_tp_size
            )
            self.required_prefill_response_num = 1
            self.target_tp_ranks = [self.target_tp_rank]
        else:
            # Prefill TP larger: one decode rank maps to a contiguous range
            # of prefill ranks.
            if not self.kv_mgr.is_mla_backend:
                logger.warning_once(
                    "Performance is NOT guaranteed when using different TP sizes for non-MLA models. "
                )
            # For non-MLA models, one decode rank needs to retrieve KVCache from multiple prefill ranks for non MLA models;
            self.target_tp_ranks = [
                rank
                for rank in range(
                    (self.kv_mgr.kv_args.engine_rank % self.kv_mgr.attn_tp_size)
                    * (self.prefill_info.attn_tp_size // self.kv_mgr.attn_tp_size),
                    (self.kv_mgr.kv_args.engine_rank % self.kv_mgr.attn_tp_size + 1)
                    * (self.prefill_info.attn_tp_size // self.kv_mgr.attn_tp_size),
                )
            ]

            # For MLA models, we can retrieve KVCache from only one prefill rank, but we still need to maintain
            # multiple connections in the connection pool and have to send dummy requests to other prefill ranks,
            # or the KVPoll will never be set correctly
            self.target_tp_rank = self.target_tp_ranks[0]
            self.required_dst_info_num = 1
            if self.kv_mgr.is_mla_backend:
                self.required_prefill_response_num = 1
            else:
                self.required_prefill_response_num = (
                    self.prefill_info.attn_tp_size // self.kv_mgr.attn_tp_size
                )

        # Decode cp size should be equal to 1
        # NOTE(review): the trailing comma makes the assert message a 1-tuple
        # (always truthy as a message); harmless but unconventional. Same on
        # the asserts below.
        assert self.kv_mgr.attn_cp_size == 1, (
            f"Decode cp size ({self.kv_mgr.attn_cp_size}) should be equal to 1",
        )
        if self.kv_mgr.attn_cp_size == self.prefill_info.attn_cp_size:
            # This means that the prefill cp size is 1
            assert self.prefill_info.attn_cp_size == 1, (
                f"When prefill cp size is 1, attn cp size should be 1, but got {self.kv_mgr.attn_cp_size}",
            )
            self.target_cp_ranks = [self.kv_mgr.attn_cp_rank]
        else:
            self.target_cp_ranks = [
                rank for rank in range(self.prefill_info.attn_cp_size)
            ]
            if not self.kv_mgr.enable_all_cp_ranks_for_transfer:
                # Only retrieve from prefill CP rank 0 when not using all ranks
                self.target_cp_ranks = self.target_cp_ranks[:1]
                self.required_prefill_response_num *= 1
            else:
                self.required_prefill_response_num *= (
                    self.prefill_info.attn_cp_size // self.kv_mgr.attn_cp_size
                )

        # Decode pp size should be equal to prefill pp size or 1
        assert (
            self.kv_mgr.pp_size == self.prefill_info.pp_size or self.kv_mgr.pp_size == 1
        ), (
            f"Decode pp size ({self.kv_mgr.pp_size}) should be equal to prefill pp size ({self.prefill_info.pp_size}) or 1",
        )
        if self.prefill_info.pp_size == self.kv_mgr.pp_size:
            self.target_pp_ranks = [self.kv_mgr.pp_rank]
        else:
            # Decode pp == 1: must collect from every prefill PP stage.
            self.target_pp_ranks = [rank for rank in range(self.prefill_info.pp_size)]
            self.required_prefill_response_num *= (
                self.prefill_info.pp_size // self.kv_mgr.pp_size
            )

        self.kv_mgr.required_prefill_response_num_table[self.bootstrap_room] = (
            self.required_prefill_response_num
        )

        assert (
            prefill_dp_rank is not None
        ), "prefill_dp_rank must be resolved before creating receiver"
        self.prefill_dp_rank = prefill_dp_rank
        self._setup_bootstrap_infos()

    def _setup_bootstrap_infos(self):
        """Populate self.bootstrap_infos for every target CP rank.

        Results are cached in the manager's connection_pool; a fresh fetch
        also triggers a one-time _register_kv_args(). On any failed fetch
        the room is marked Failed and the method returns early.
        """
        all_bootstrap_infos = []
        # NOTE: key distinguished by bootstrap_addr, prefill_dp_rank, prefill_cp_rank, and target_tp_rank
        for target_cp_rank in self.target_cp_ranks:
            bootstrap_key = f"{self.bootstrap_addr}_{self.prefill_dp_rank}_{target_cp_rank}_{self.target_tp_rank}"

            if bootstrap_key not in self.kv_mgr.connection_pool:
                bootstrap_infos = []
                for target_tp_rank in self.target_tp_ranks:
                    # Enable higher PP ranks to be bootstrapped earlier to make PP PD requests bootstrap more robust
                    for target_pp_rank in reversed(self.target_pp_ranks):
                        bootstrap_info = self._get_bootstrap_info_from_server(
                            self.prefill_dp_rank,
                            target_cp_rank,
                            target_tp_rank,
                            target_pp_rank,
                        )
                        if bootstrap_info is not None:
                            if self.kv_mgr.is_mla_backend:
                                # For MLA: target_tp_rank is the selected real rank, others are dummy ranks
                                bootstrap_info["is_dummy"] = not bool(
                                    target_tp_rank == self.target_tp_rank
                                    or self.target_tp_rank is None
                                )
                            else:
                                # For non-MLA: all target_tp_ranks are selected real ranks
                                bootstrap_info["is_dummy"] = False
                            logger.debug(
                                f"Fetched bootstrap info: {bootstrap_info} for DP {self.prefill_dp_rank} CP {target_cp_rank} TP {target_tp_rank} PP {target_pp_rank}"
                            )
                            bootstrap_infos.append(bootstrap_info)
                        else:
                            self.kv_mgr.record_failure(
                                self.bootstrap_room,
                                f"Could not fetch bootstrap info for: prefill_dp_rank: {self.prefill_dp_rank} prefill_cp_rank: {target_cp_rank} target_tp_rank: {target_tp_rank} and target_pp_rank {target_pp_rank}",
                            )
                            self.kv_mgr.update_status(
                                self.bootstrap_room, KVPoll.Failed
                            )
                            return

                self.bootstrap_infos = bootstrap_infos
                self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos

                # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
                self._register_kv_args()
            else:
                self.bootstrap_infos = self.kv_mgr.connection_pool[bootstrap_key]

            assert len(self.bootstrap_infos) > 0
            all_bootstrap_infos.extend(self.bootstrap_infos)

        # Final value aggregates the per-CP-rank lists collected above.
        self.bootstrap_infos = all_bootstrap_infos

    def _get_bootstrap_info_from_server(
        self, prefill_dp_rank, prefill_cp_rank, target_tp_rank, target_pp_rank
    ):
        """Fetch the bootstrap info from the bootstrap server.

        Returns the decoded JSON dict, or None on any HTTP/network error.
        """
        try:
            url = f"http://{self.bootstrap_addr}/route?prefill_dp_rank={prefill_dp_rank}&prefill_cp_rank={prefill_cp_rank}&target_tp_rank={target_tp_rank}&target_pp_rank={target_pp_rank}"
            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                bootstrap_info = response.json()
                return bootstrap_info
            else:
                logger.error(
                    f"Failed to get prefill server info: {response.status_code}, {response.text}"
                )
                return None
        except Exception as e:
            logger.error(f"Error fetching prefill info from bootstrap: {e}")
            return None

    @staticmethod
    def query_prefill_dp_ranks(
        bootstrap_addr: str, bootstrap_rooms: List[int]
    ) -> Dict[str, int]:
        """Batch query prefill dp_ranks for given bootstrap_rooms.

        Returns a mapping of room (as string key) -> dp_rank; empty dict on
        any HTTP or network failure.
        """
        try:
            url = f"http://{bootstrap_addr}/query_dp_ranks"
            response = requests.post(
                url,
                json={"bootstrap_rooms": bootstrap_rooms},
                timeout=5,
            )
            if response.status_code == 200:
                return response.json()
            else:
                logger.error(
                    f"Failed to query dp_ranks: {response.status_code}, {response.text}"
                )
                return {}
        except Exception as e:
            logger.error(f"Error querying dp_ranks from bootstrap: {e}")
            return {}

    @classmethod
    def _connect(cls, endpoint: str, is_ipv6: bool = False):
        # One PUSH socket per endpoint, shared class-wide; each socket has a
        # companion lock for serialized sends.
        with cls._global_lock:
            if endpoint not in cls._socket_cache:
                sock = cls._ctx.socket(zmq.PUSH)
                if is_ipv6:
                    sock.setsockopt(zmq.IPV6, 1)
                sock.connect(endpoint)
                cls._socket_cache[endpoint] = sock
                cls._socket_locks[endpoint] = threading.Lock()
            return cls._socket_cache[endpoint], cls._socket_locks[endpoint]

    @classmethod
    def _connect_to_bootstrap_server(cls, bootstrap_info: dict):
        # Resolve the prefill rank's ZMQ endpoint from its bootstrap record.
        ip_address = bootstrap_info["rank_ip"]
        port = bootstrap_info["rank_port"]
        is_ipv6_address = is_valid_ipv6_address(ip_address)
        sock, lock = cls._connect(
            format_tcp_address(ip_address, port), is_ipv6=is_ipv6_address
        )
        return sock, lock

    def _register_kv_args(self):
        # No-op in the common base; transfer backends override to push their
        # kv_args to the prefill side.
        pass

    def failure_exception(self):
        # Placeholder failure; transfer backends override with the recorded
        # failure reason.
        raise Exception("Fake KVReceiver Exception")
+
+
class CommonKVBootstrapServer(BaseKVBootstrapServer):
    """aiohttp-based bootstrap server that brokers prefill rank discovery.

    Prefill ranks PUT their topology and ZMQ endpoint to /route; decode
    receivers GET /route to look up endpoints (all -1 params return the
    server-level PrefillServerInfo). /register_dp_rank and /query_dp_ranks
    map bootstrap rooms to prefill dp_ranks under dp load balancing.

    The HTTP server runs its own asyncio event loop on a daemon thread
    started from __init__ via run().
    """

    def __init__(self, host: str, port: int):
        self.host = host
        self.port = port
        self.app = web.Application()
        self.store = dict()
        self.lock = asyncio.Lock()
        self._setup_routes()
        # Topology fields are unset until the first prefill PUT arrives.
        self.pp_size = None
        self.attn_tp_size = None
        self.attn_cp_size = None
        self.dp_size = None
        self.page_size = None
        self.kv_cache_dtype: Optional[str] = None
        self.follow_bootstrap_room: Optional[bool] = None
        # dp_group -> cp_rank -> tp_rank -> pp_rank -> endpoint info.
        self.prefill_port_table: Dict[
            int, Dict[int, Dict[int, Dict[int, PrefillRankInfo]]]
        ] = {}
        self.room_to_dp_rank: Dict[int, Dict[str, Union[int, float]]] = {}
        self._registered_count = 0
        self.entry_cleanup_interval = (
            envs.SGLANG_DISAGGREGATION_BOOTSTRAP_ENTRY_CLEANUP_INTERVAL.get()
        )

        # Start bootstrap server
        self.thread = threading.Thread(target=self._run_server, daemon=True)
        self.run()

    def run(self):
        self.thread.start()

    def _is_ready(self) -> bool:
        # Ready once every expected rank (dp * cp * tp * pp) has PUT /route.
        # NOTE(review): _registered_count increments on every PUT, so a rank
        # re-registering (e.g. after restart) also counts — confirm intended.
        if (
            self.attn_tp_size is None
            or self.attn_cp_size is None
            or self.pp_size is None
            or self.dp_size is None
        ):
            return False
        expected = self.dp_size * self.attn_cp_size * self.attn_tp_size * self.pp_size
        logger.debug(
            f"Expected {expected} prefill servers to be registered, {self._registered_count} registered so far"
        )
        return self._registered_count >= expected

    def _setup_routes(self):
        self.app.router.add_route("*", "/route", self._handle_route)
        self.app.router.add_post("/register_dp_rank", self._handle_register_dp_rank)
        self.app.router.add_post("/query_dp_ranks", self._handle_query_dp_ranks)
        self.app.router.add_get("/health", self._handle_health_check)

    async def _handle_health_check(self, request):
        return web.Response(text="OK", status=200)

    async def _handle_route(self, request: web.Request):
        # Dispatch /route by HTTP verb: PUT = register, GET = query.
        method = request.method
        if method == "PUT":
            return await self._handle_route_put(request)
        elif method == "GET":
            return await self._handle_route_get(request)
        else:
            return web.Response(
                text="Method not allowed", status=405, content_type="application/json"
            )

    async def _handle_route_put(self, request: web.Request):
        """Register one prefill rank's topology and ZMQ endpoint."""
        data = await request.json()
        attn_tp_size = data["attn_tp_size"]
        attn_tp_rank = data["attn_tp_rank"]
        attn_cp_size = data["attn_cp_size"]
        attn_cp_rank = data["attn_cp_rank"]
        attn_dp_size = data["attn_dp_size"]
        attn_dp_rank = data["attn_dp_rank"]
        pp_size = data["pp_size"]
        pp_rank = data["pp_rank"]
        system_dp_size = data["system_dp_size"]
        system_dp_rank = data["system_dp_rank"]
        rank_ip = data["rank_ip"]
        rank_port = int(data["rank_port"])
        # NOTE(review): int() here raises (-> 500) if the payload's page_size
        # is null, yet the check below guards `page_size is not None` —
        # confirm whether a null page_size is ever sent.
        page_size = int(data["page_size"])
        kv_cache_dtype = data["kv_cache_dtype"]

        # First registration pins the server-level topology fields.
        if self.attn_tp_size is None:
            self.attn_tp_size = attn_tp_size

        if self.attn_cp_size is None:
            self.attn_cp_size = attn_cp_size

        if self.dp_size is None:
            self.dp_size = attn_dp_size if system_dp_size == 1 else system_dp_size

        if self.pp_size is None:
            self.pp_size = pp_size

        if self.page_size is None and page_size is not None:
            self.page_size = page_size

        if self.kv_cache_dtype is None and kv_cache_dtype is not None:
            self.kv_cache_dtype = kv_cache_dtype

        if self.follow_bootstrap_room is None:
            load_balance_method = data.get(
                "load_balance_method", "follow_bootstrap_room"
            )
            self.follow_bootstrap_room = load_balance_method == "follow_bootstrap_room"

        # The effective dp group comes from whichever dp dimension is active.
        if system_dp_size == 1:
            dp_group = attn_dp_rank
        else:
            dp_group = system_dp_rank

        # Add lock to make sure thread-safe
        async with self.lock:
            dp_group_table = self.prefill_port_table.setdefault(dp_group, {})
            cp_group_table = dp_group_table.setdefault(attn_cp_rank, {})
            tp_group_table = cp_group_table.setdefault(attn_tp_rank, {})

            tp_group_table[pp_rank] = PrefillRankInfo(
                rank_ip=rank_ip,
                rank_port=rank_port,
            )

            self._registered_count += 1

        expected = self.dp_size * self.attn_cp_size * self.attn_tp_size * self.pp_size
        logger.debug(
            f"Register prefill bootstrap: DP{dp_group} CP{attn_cp_rank} TP{attn_tp_rank} PP{pp_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
            f" ({self._registered_count}/{expected} registered)"
        )

        return web.Response(text="OK", status=200)

    async def _handle_route_get(self, request: web.Request):
        """Look up one prefill rank's endpoint, or (all -1) server info.

        Returns 503 until every expected rank has registered, 404 for an
        unknown rank tuple, 400 for missing query params.
        """
        prefill_dp_rank = request.query.get("prefill_dp_rank")
        prefill_cp_rank = request.query.get("prefill_cp_rank")
        target_tp_rank = request.query.get("target_tp_rank")
        target_pp_rank = request.query.get("target_pp_rank")
        if (
            not prefill_dp_rank
            or not prefill_cp_rank
            or not target_tp_rank
            or not target_pp_rank
        ):
            return web.Response(text="Missing inputs for bootstrap server.", status=400)

        # Sentinel query (all -1): return server-level parallel info.
        if (
            int(prefill_dp_rank) == -1
            and int(prefill_cp_rank) == -1
            and int(target_tp_rank) == -1
            and int(target_pp_rank) == -1
        ):
            if not self._is_ready():
                return web.Response(
                    text=f"Prefill server not fully registered yet"
                    f" ({self._registered_count} workers registered).",
                    status=503,
                )
            info = PrefillServerInfo(
                attn_tp_size=self.attn_tp_size,
                attn_cp_size=self.attn_cp_size,
                dp_size=self.dp_size,
                pp_size=self.pp_size,
                page_size=self.page_size,
                kv_cache_dtype=self.kv_cache_dtype,
                follow_bootstrap_room=(
                    self.follow_bootstrap_room
                    if self.follow_bootstrap_room is not None
                    else True
                ),
            )
            return web.json_response(dataclasses.asdict(info), status=200)

        if not self._is_ready():
            return web.Response(
                text=f"Prefill server not fully registered yet"
                f" ({self._registered_count} workers registered).",
                status=503,
            )

        # Find corresponding prefill info
        try:
            async with self.lock:
                bootstrap_info = self.prefill_port_table[int(prefill_dp_rank)][
                    int(prefill_cp_rank)
                ][int(target_tp_rank)][int(target_pp_rank)]
        except KeyError:
            return web.Response(
                text=f"Bootstrap info not found for dp_rank={prefill_dp_rank} cp_rank={prefill_cp_rank} "
                f"tp_rank={target_tp_rank} pp_rank={target_pp_rank}",
                status=404,
            )

        return web.json_response(dataclasses.asdict(bootstrap_info), status=200)

    async def _handle_register_dp_rank(self, request: web.Request):
        """Record the prefill dp_rank chosen for a bootstrap room."""
        data = await request.json()
        bootstrap_room = int(data["bootstrap_room"])
        dp_rank = int(data["dp_rank"])
        async with self.lock:
            # Timestamp drives expiry in _cleanup_expired_entries.
            self.room_to_dp_rank[bootstrap_room] = {
                "dp_rank": dp_rank,
                "timestamp": time.time(),
            }
        logger.debug(f"Registered dp_rank={dp_rank} for {bootstrap_room=}")
        return web.Response(text="OK", status=200)

    async def _handle_query_dp_ranks(self, request: web.Request):
        """Batch-resolve bootstrap rooms to dp_ranks; unknown rooms omitted."""
        data = await request.json()
        bootstrap_rooms = data["bootstrap_rooms"]
        result = {}
        async with self.lock:
            for room in bootstrap_rooms:
                room_int = int(room)
                if room_int in self.room_to_dp_rank:
                    result[str(room_int)] = self.room_to_dp_rank[room_int]["dp_rank"]
        return web.json_response(result, status=200)

    async def _cleanup_expired_entries(self):
        """Remove entries older than cleanup interval from room_to_dp_rank."""
        while True:
            await asyncio.sleep(self.entry_cleanup_interval)
            current_time = time.time()
            async with self.lock:
                expired_keys = [
                    key
                    for key, value in self.room_to_dp_rank.items()
                    if current_time - value["timestamp"] > self.entry_cleanup_interval
                ]
                for key in expired_keys:
                    del self.room_to_dp_rank[key]
                if expired_keys:
                    logger.debug(
                        f"Cleaned up {len(expired_keys)} expired entries from room_to_dp_rank"
                    )

    def _run_server(self):
        """Thread target: own event loop running the aiohttp site forever."""
        try:
            # Event Loop
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)

            self._loop.create_task(self._cleanup_expired_entries())

            # Access logging only when debug logging is enabled.
            access_log = None
            if logging.getLogger(__name__).getEffectiveLevel() <= logging.DEBUG:
                access_log = self.app.logger

            self._runner = web.AppRunner(self.app, access_log=access_log)
            self._loop.run_until_complete(self._runner.setup())

            site = web.TCPSite(self._runner, host=self.host, port=self.port)
            self._loop.run_until_complete(site.start())
            self._loop.run_forever()
        except Exception as e:
            logger.error(f"Server error: {str(e)}")
        finally:
            # Cleanup
            self._loop.run_until_complete(self._runner.cleanup())
            self._loop.close()

    def close(self):
        """Shutdown"""
        # NOTE(review): self._loop is only assigned inside _run_server on the
        # server thread; calling close() before that assignment would raise
        # AttributeError rather than hit the `is not None` guard — confirm.
        if self._loop is not None and self._loop.is_running():
            self._loop.call_soon_threadsafe(self._loop.stop)
            logger.info("Stopping server loop...")

        if self.thread.is_alive():
            self.thread.join(timeout=2)
            logger.info("Server thread stopped")

    def poll(self) -> KVPoll: ...
diff --git a/sglang/python/sglang/srt/disaggregation/common/utils.py b/sglang/python/sglang/srt/disaggregation/common/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f3da21285a43c2f8a09a250e876253fc67fbba3
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/common/utils.py
@@ -0,0 +1,42 @@
+import threading
+from collections import deque
+from typing import List, Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+
+class FastQueue:
+ def __init__(self):
+ self._buf = deque()
+ self._cond = threading.Condition()
+
+ def put(self, item):
+ with self._cond:
+ self._buf.append(item)
+ # wake up a thread of wait()
+ self._cond.notify()
+
+ def get(self):
+ with self._cond:
+ # if queue is empty ,block until is notified()
+ while not self._buf:
+ self._cond.wait()
+ return self._buf.popleft()
+
+
+def group_concurrent_contiguous(
+ src_indices: npt.NDArray[np.int32], dst_indices: npt.NDArray[np.int32]
+) -> Tuple[List[npt.NDArray[np.int32]], List[npt.NDArray[np.int32]]]:
+ """Vectorised NumPy implementation."""
+ if src_indices.size == 0:
+ return [], []
+
+ brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
+ src_groups = np.split(src_indices, brk)
+ dst_groups = np.split(dst_indices, brk)
+
+ src_groups = [g.tolist() for g in src_groups]
+ dst_groups = [g.tolist() for g in dst_groups]
+
+ return src_groups, dst_groups
diff --git a/sglang/python/sglang/srt/disaggregation/mooncake/__init__.py b/sglang/python/sglang/srt/disaggregation/mooncake/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bea967e4e976123cf41d904e6536fb5f578ce0ad
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/mooncake/__init__.py
@@ -0,0 +1,6 @@
+from sglang.srt.disaggregation.mooncake.conn import (
+ MooncakeKVBootstrapServer,
+ MooncakeKVManager,
+ MooncakeKVReceiver,
+ MooncakeKVSender,
+)
diff --git a/sglang/python/sglang/srt/disaggregation/mooncake/conn.py b/sglang/python/sglang/srt/disaggregation/mooncake/conn.py
new file mode 100644
index 0000000000000000000000000000000000000000..f42ebfffc0a7e916db6dc7597192cf1bddeab3bd
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/mooncake/conn.py
@@ -0,0 +1,1397 @@
+from __future__ import annotations
+
+import concurrent.futures
+import ctypes
+import dataclasses
+import logging
+import os
+import struct
+import threading
+import time
+from collections import defaultdict
+from typing import List, Optional, Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+from sglang.srt.disaggregation.base.conn import KVArgs, KVPoll
+from sglang.srt.disaggregation.common.conn import (
+ CommonKVBootstrapServer,
+ CommonKVManager,
+ CommonKVReceiver,
+ CommonKVSender,
+)
+from sglang.srt.disaggregation.common.utils import (
+ FastQueue,
+ group_concurrent_contiguous,
+)
+from sglang.srt.disaggregation.mooncake.utils import (
+ check_mooncake_custom_mem_pool_enabled,
+)
+from sglang.srt.disaggregation.utils import (
+ DisaggregationMode,
+ filter_kv_indices_for_cp_rank,
+)
+from sglang.srt.distributed.parallel_state import get_mooncake_transfer_engine
+from sglang.srt.environ import envs
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import format_tcp_address, is_valid_ipv6_address
+
+logger = logging.getLogger(__name__)
+
+
+class KVTransferError(Exception):
+ def __init__(self, bootstrap_room: int, failure_reason: str):
+ super().__init__(failure_reason)
+ self.bootstrap_room = bootstrap_room
+ self.failure_reason = failure_reason
+
+ def __str__(self):
+ return f"KVTransferError(bootstrap_room={self.bootstrap_room}): {self.failure_reason}"
+
+
+# prefill
+@dataclasses.dataclass
+class TransferKVChunk:
+    # One unit of work queued by the prefill side for a transfer worker.
+    room: int  # bootstrap room id this chunk belongs to
+    prefill_kv_indices: npt.NDArray[np.int32]  # source KV slot indices for this chunk
+    index_slice: slice  # slice into the receiver's dst_kv_indices matching this chunk
+    is_last_chunk: bool  # True -> also send aux/state data after the KV cache
+    prefill_aux_index: Optional[int]  # aux-buffer slot on the prefill side
+    state_indices: Optional[List[int]]  # state/extra-pool indices, if the model has them
+
+
+# decode
+@dataclasses.dataclass
+class TransferInfo:
+ room: int
+ endpoint: str
+ dst_port: int
+ mooncake_session_id: str
+ dst_kv_indices: npt.NDArray[np.int32]
+ dst_aux_index: int
+ dst_state_indices: List[int]
+ required_dst_info_num: int
+ is_dummy: bool
+
+ @classmethod
+ def from_zmq(cls, msg: List[bytes]):
+ if msg[4] == b"" and msg[5] == b"":
+ is_dummy = True
+ dst_kv_indices = np.array([], dtype=np.int32)
+ dst_aux_index = None
+ dst_state_indices = []
+ else:
+ dst_kv_indices = np.frombuffer(msg[4], dtype=np.int32)
+ dst_aux_index = int(msg[5].decode("ascii"))
+ if msg[6] == b"":
+ dst_state_indices = []
+ else:
+ dst_state_indices = list(np.frombuffer(msg[6], dtype=np.int32))
+ is_dummy = False
+ return cls(
+ room=int(msg[0].decode("ascii")),
+ endpoint=msg[1].decode("ascii"),
+ dst_port=int(msg[2].decode("ascii")),
+ mooncake_session_id=msg[3].decode("ascii"),
+ dst_kv_indices=dst_kv_indices,
+ dst_aux_index=dst_aux_index,
+ dst_state_indices=dst_state_indices,
+ required_dst_info_num=int(msg[7].decode("ascii")),
+ is_dummy=is_dummy,
+ )
+
+
+# decode
+@dataclasses.dataclass
+class KVArgsRegisterInfo:
+ room: str
+ endpoint: str
+ dst_port: int
+ mooncake_session_id: str
+ dst_kv_ptrs: list[int]
+ dst_aux_ptrs: list[int]
+ dst_state_data_ptrs: list[int]
+ dst_tp_rank: int
+ dst_attn_tp_size: int
+ dst_kv_item_len: int
+ # for mamba state different tp slice transfer
+ dst_state_item_lens: list[int]
+ dst_state_dim_per_tensor: list[int]
+
+ @classmethod
+ def from_zmq(cls, msg: List[bytes]):
+ return cls(
+ room=str(msg[0].decode("ascii")),
+ endpoint=msg[1].decode("ascii"),
+ dst_port=int(msg[2].decode("ascii")),
+ mooncake_session_id=msg[3].decode("ascii"),
+ dst_kv_ptrs=list(struct.unpack(f"{len(msg[4])//8}Q", msg[4])),
+ dst_aux_ptrs=list(struct.unpack(f"{len(msg[5])//8}Q", msg[5])),
+ dst_state_data_ptrs=list(struct.unpack(f"{len(msg[6])//8}Q", msg[6])),
+ dst_tp_rank=int(msg[7].decode("ascii")),
+ dst_attn_tp_size=int(msg[8].decode("ascii")),
+ dst_kv_item_len=int(msg[9].decode("ascii")),
+ dst_state_item_lens=(
+ list(struct.unpack(f"{len(msg[10])//4}I", msg[10]))
+ if len(msg) > 10 and len(msg[10]) > 0
+ else []
+ ),
+ dst_state_dim_per_tensor=(
+ list(struct.unpack(f"{len(msg[11])//4}I", msg[11]))
+ if len(msg) > 11 and len(msg[11]) > 0
+ else []
+ ),
+ )
+
+
+class AuxDataCodec:
+ """Handles serialization and deserialization of auxiliary data buffers"""
+
+ @staticmethod
+ def serialize_data_from_buffer(src_addr, data_length):
+ """Serialize data from memory buffer to bytes"""
+ buffer = (ctypes.c_byte * data_length).from_address(src_addr)
+ return bytes(buffer)
+
+ @staticmethod
+ def deserialize_data_to_buffer(kv_args, buffer_index, aux_index, data):
+ """Deserialize bytes into target memory buffer"""
+ dst_aux_ptr = kv_args.aux_data_ptrs[buffer_index]
+ item_len = kv_args.aux_item_lens[buffer_index]
+ dst_addr = dst_aux_ptr + item_len * aux_index
+ buffer = (ctypes.c_byte * len(data)).from_address(dst_addr)
+ buffer[:] = data
+ return
+
+
+class MooncakeKVManager(CommonKVManager):
+    """KV transfer manager backed by the Mooncake transfer engine."""
+
+    # ZMQ frame header marking an aux-data payload (vs. a status message).
+    AUX_DATA_HEADER = b"AUX_DATA"
+
+ def __init__(
+ self,
+ args: KVArgs,
+ disaggregation_mode: DisaggregationMode,
+ server_args: ServerArgs,
+ is_mla_backend: Optional[bool] = False,
+ ):
+ super().__init__(args, disaggregation_mode, server_args, is_mla_backend)
+ self.init_engine()
+ self.register_buffer_to_engine()
+ if self.disaggregation_mode == DisaggregationMode.PREFILL:
+ self.start_prefill_thread()
+ self.session_failures = defaultdict(int)
+ self.failed_sessions = set()
+ self.session_lock = threading.Lock()
+ # Determine the number of threads to use for kv sender
+ cpu_count = os.cpu_count()
+ transfer_thread_pool_size = (
+ envs.SGLANG_DISAGGREGATION_THREAD_POOL_SIZE.get()
+ )
+ if transfer_thread_pool_size is None:
+ transfer_thread_pool_size = min(max(4, int(0.5 * cpu_count) // 8), 12)
+ transfer_queue_size = envs.SGLANG_DISAGGREGATION_QUEUE_SIZE.get()
+ self.transfer_queues: List[FastQueue] = [
+ FastQueue() for _ in range(transfer_queue_size)
+ ]
+ assert transfer_thread_pool_size >= transfer_queue_size, (
+ f"The environment variable SGLANG_DISAGGREGATION_THREAD_POOL_SIZE={transfer_thread_pool_size} must be "
+ f"greater than or equal to SGLANG_DISAGGREGATION_QUEUE_SIZE={transfer_queue_size}."
+ )
+ self.executors = [
+ concurrent.futures.ThreadPoolExecutor(
+ transfer_thread_pool_size // transfer_queue_size
+ )
+ for _ in range(transfer_queue_size)
+ ]
+ for queue, executor in zip(self.transfer_queues, self.executors):
+ threading.Thread(
+ target=self.transfer_worker, args=(queue, executor), daemon=True
+ ).start()
+ self.enable_custom_mem_pool, self.custom_mem_pool_type = (
+ check_mooncake_custom_mem_pool_enabled()
+ )
+ elif self.disaggregation_mode == DisaggregationMode.DECODE:
+ self.start_decode_thread()
+
+    def init_engine(self):
+        """Bind this manager to the process-wide Mooncake transfer engine."""
+        self.engine = get_mooncake_transfer_engine()
+
+ def register_buffer_to_engine(self):
+ # Batch register KV data buffers
+ if self.kv_args.kv_data_ptrs and self.kv_args.kv_data_lens:
+ self.engine.batch_register(
+ self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
+ )
+
+ # Batch register auxiliary data buffers
+ if self.kv_args.aux_data_ptrs and self.kv_args.aux_data_lens:
+ self.engine.batch_register(
+ self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
+ )
+
+ # Batch register state/extra pool data buffers
+ if self.kv_args.state_data_ptrs and self.kv_args.state_data_lens:
+ self.engine.batch_register(
+ self.kv_args.state_data_ptrs, self.kv_args.state_data_lens
+ )
+
+ def _transfer_data(self, mooncake_session_id, transfer_blocks):
+ if not transfer_blocks:
+ return 0
+
+ src_addrs, dst_addrs, lengths = zip(*transfer_blocks)
+ return self.engine.batch_transfer_sync(
+ mooncake_session_id, list(src_addrs), list(dst_addrs), list(lengths)
+ )
+
+    def _send_kvcache_generic(
+        self,
+        mooncake_session_id: str,
+        src_data_ptrs: list[int],
+        dst_data_ptrs: list[int],
+        item_lens: list[int],
+        prefill_data_indices: npt.NDArray[np.int32],
+        dst_data_indices: npt.NDArray[np.int32],
+        executor: concurrent.futures.ThreadPoolExecutor,
+    ) -> int:
+        """
+        Generic KV cache transfer supporting both MHA and MLA architectures.
+        This method is used by both send_kvcache (full pool) and maybe_send_extra.
+
+        Returns 0 on success, the engine's non-zero status on transfer
+        failure, or -1 when the PP layer layout is inconsistent.
+        """
+        # Group by indices for optimization
+        prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
+            prefill_data_indices, dst_data_indices
+        )
+
+        layers_params = None
+
+        # Decode pp size should be equal to prefill pp size or 1
+        if self.is_mla_backend:
+            # MLA: one fused KV tensor per layer.
+            src_kv_ptrs, dst_kv_ptrs, layers_current_pp_stage = (
+                self.get_mla_kv_ptrs_with_pp(src_data_ptrs, dst_data_ptrs)
+            )
+            layers_params = [
+                (
+                    src_kv_ptrs[layer_id],
+                    dst_kv_ptrs[layer_id],
+                    item_lens[layer_id],
+                )
+                for layer_id in range(layers_current_pp_stage)
+            ]
+        else:
+            # MHA: separate K and V tensors per layer.
+            src_k_ptrs, src_v_ptrs, dst_k_ptrs, dst_v_ptrs, layers_current_pp_stage = (
+                self.get_mha_kv_ptrs_with_pp(src_data_ptrs, dst_data_ptrs)
+            )
+            # item_lens structure: [k_layer0, k_layer1, ..., k_layerN, v_layer0, v_layer1, ..., v_layerN]
+            # Use correct item lengths for K and V separately
+            if layers_current_pp_stage > len(dst_k_ptrs):
+                logger.error(
+                    "Prefill transfer kvcache error, layers_current_pp_stage is out of range: "
+                    f"layers_current_pp_stage={layers_current_pp_stage}, len(dst_k_ptrs)={len(dst_k_ptrs)}"
+                )
+                return -1
+            layers_params = [
+                (
+                    src_k_ptrs[layer_id],
+                    dst_k_ptrs[layer_id],
+                    item_lens[layer_id],  # K item length
+                )
+                for layer_id in range(layers_current_pp_stage)
+            ] + [
+                (
+                    src_v_ptrs[layer_id],
+                    dst_v_ptrs[layer_id],
+                    item_lens[layers_current_pp_stage + layer_id],  # V item length
+                )
+                for layer_id in range(layers_current_pp_stage)
+            ]
+        assert layers_params is not None
+
+        def set_transfer_blocks(
+            src_ptr: int, dst_ptr: int, item_len: int
+        ) -> List[Tuple[int, int, int]]:
+            # Each contiguous index run becomes one (src, dst, length) block.
+            transfer_blocks = []
+            for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
+                src_addr = src_ptr + int(prefill_index[0]) * item_len
+                dst_addr = dst_ptr + int(decode_index[0]) * item_len
+                length = item_len * len(prefill_index)
+                transfer_blocks.append((src_addr, dst_addr, length))
+            return transfer_blocks
+
+        # Worker function for processing a single layer
+        def process_layer(src_ptr: int, dst_ptr: int, item_len: int) -> int:
+            transfer_blocks = set_transfer_blocks(src_ptr, dst_ptr, item_len)
+            return self._transfer_data(mooncake_session_id, transfer_blocks)
+
+        # Worker function for processing all layers in a batch
+        def process_layers(layers_params: List[Tuple[int, int, int]]) -> int:
+            transfer_blocks = []
+            for src_ptr, dst_ptr, item_len in layers_params:
+                transfer_blocks.extend(set_transfer_blocks(src_ptr, dst_ptr, item_len))
+            return self._transfer_data(mooncake_session_id, transfer_blocks)
+
+        if self.enable_custom_mem_pool:
+            # One transfer per layer on the executor; fail fast on first error.
+            futures = [
+                executor.submit(
+                    process_layer,
+                    src_ptr,
+                    dst_ptr,
+                    item_len,
+                )
+                for (src_ptr, dst_ptr, item_len) in layers_params
+            ]
+            for future in concurrent.futures.as_completed(futures):
+                status = future.result()
+                if status != 0:
+                    for f in futures:
+                        f.cancel()
+                    return status
+            return 0
+        else:
+            # Combining all layers' params in one batch transfer is more efficient
+            # compared to using multiple threads
+            return process_layers(layers_params)
+
+    def send_kvcache(
+        self,
+        mooncake_session_id: str,
+        prefill_kv_indices: npt.NDArray[np.int32],
+        dst_kv_ptrs: list[int],
+        dst_kv_indices: npt.NDArray[np.int32],
+        executor: concurrent.futures.ThreadPoolExecutor,
+    ):
+        """Send the main KV pool (full heads, matching TP sizes) for the
+        given index pairs; thin wrapper over _send_kvcache_generic."""
+        return self._send_kvcache_generic(
+            mooncake_session_id=mooncake_session_id,
+            src_data_ptrs=self.kv_args.kv_data_ptrs,
+            dst_data_ptrs=dst_kv_ptrs,
+            item_lens=self.kv_args.kv_item_lens,
+            prefill_data_indices=prefill_kv_indices,
+            dst_data_indices=dst_kv_indices,
+            executor=executor,
+        )
+
+    def send_kvcache_slice(
+        self,
+        mooncake_session_id: str,
+        prefill_kv_indices: npt.NDArray[np.int32],
+        dst_kv_ptrs: list[int],
+        dst_kv_indices: npt.NDArray[np.int32],
+        dst_tp_rank: int,
+        dst_attn_tp_size: int,
+        dst_kv_item_len: int,
+        executor: concurrent.futures.ThreadPoolExecutor,
+    ):
+        """
+        Sends KV cache slices from this Prefill rank to a target Decode rank,
+        supporting generic M-to-N TP size configurations.
+
+        NOTE: This implementation calls the transfer engine for each token slot within
+        each page to ensure correctness for any page_size and head-slicing configuration.
+        This may introduce performance overhead (increased TTFT) for long sequences.
+        """
+        # Extract configuration
+        local_tp_rank_in_group = self.kv_args.engine_rank % self.attn_tp_size
+        src_kv_item_len = self.kv_args.kv_item_lens[0]
+        dst_tp_rank_in_group = dst_tp_rank % dst_attn_tp_size
+        page_size = self.kv_args.page_size
+
+        # Use total KV head count (not per-rank) for correct head distribution.
+        # Per-rank kv_head_num is max(1, total//tp) which loses info when total < tp.
+        total_kv_heads = getattr(self.kv_args, "total_kv_head_num", 0)
+        if total_kv_heads <= 0:
+            # Fallback reconstruction when the attribute is absent or unset.
+            total_kv_heads = self.kv_args.kv_head_num * self.attn_tp_size
+
+        src_heads_per_rank = max(1, total_kv_heads // self.attn_tp_size)
+        dst_heads_per_rank = max(1, total_kv_heads // dst_attn_tp_size)
+        # Bytes occupied by one head within one token slot on the decode side.
+        bytes_per_head_slice_to_send = (
+            dst_kv_item_len // page_size // dst_heads_per_rank
+        )
+
+        # GQA replication: how many prefill ranks share the same KV head
+        src_replication = max(1, self.attn_tp_size // total_kv_heads)
+
+        # Determine slicing parameters based on TP configuration
+        if self.attn_tp_size > dst_attn_tp_size:
+            # Send KVCache from multiple prefill instances to 1 decode instance
+            src_head_start_offset = 0
+            num_heads_to_send = src_heads_per_rank
+            unique_head_idx = local_tp_rank_in_group // src_replication
+            dst_head_start_offset = (
+                unique_head_idx * src_heads_per_rank
+            ) % dst_heads_per_rank
+        else:
+            # Send KVCache from 1 prefill instance to multiple decode instances
+            src_head_start_offset = (
+                dst_tp_rank_in_group * dst_heads_per_rank
+            ) % src_heads_per_rank
+            num_heads_to_send = dst_heads_per_rank
+            dst_head_start_offset = 0
+
+        src_k_ptrs, src_v_ptrs, dst_k_ptrs, dst_v_ptrs, layers_current_pp_stage = (
+            self.get_mha_kv_ptrs_with_pp(self.kv_args.kv_data_ptrs, dst_kv_ptrs)
+        )
+
+        # Calculate precise byte offset and length for the sub-slice within the token
+        src_head_slice_offset = src_head_start_offset * bytes_per_head_slice_to_send
+        dst_head_slice_offset = dst_head_start_offset * bytes_per_head_slice_to_send
+        heads_bytes_per_token_to_send = num_heads_to_send * bytes_per_head_slice_to_send
+
+        # Sanity check: The data sub-slice to be sent should fit into the dst buffer.
+        # This means heads_bytes_per_token_to_send <= (dst_kv_item_len // page_size)
+        if heads_bytes_per_token_to_send > (dst_kv_item_len // page_size):
+            logger.error(
+                f"[{mooncake_session_id}] slice size ({heads_bytes_per_token_to_send}) exceeds "
+                f"target token slot size ({dst_kv_item_len // page_size})"
+            )
+            return -1
+
+        # Broadcast page indices against per-page token offsets to produce one
+        # (page, token) address per slice; int64 to avoid overflow on addresses.
+        prefill_page_indices = prefill_kv_indices.reshape(-1, 1).astype(np.int64)
+        decode_page_indices = dst_kv_indices.reshape(-1, 1).astype(np.int64)
+        tokens_per_page = np.arange(page_size, dtype=np.int64).reshape(1, -1)
+        bytes_per_token_on_prefill = src_kv_item_len // page_size
+        bytes_per_token_on_decode = dst_kv_item_len // page_size
+        src_token_slot_offsets = (
+            tokens_per_page * bytes_per_token_on_prefill + src_head_slice_offset
+        )
+        dst_token_slot_offsets = (
+            tokens_per_page * bytes_per_token_on_decode + dst_head_slice_offset
+        )
+
+        def process_layer_tp_aware(src_layer_ptr, dst_layer_ptr):
+            # One batched transfer per layer: every (page, token) slice pair.
+            src_page_base_addrs = src_layer_ptr + prefill_page_indices * src_kv_item_len
+            dst_page_base_addrs = dst_layer_ptr + decode_page_indices * dst_kv_item_len
+            src_slice_addrs = src_page_base_addrs + src_token_slot_offsets
+            dst_slice_addrs = dst_page_base_addrs + dst_token_slot_offsets
+
+            src_addr_list = src_slice_addrs.reshape(-1).tolist()
+            if not src_addr_list:
+                # Nothing to transfer for this layer.
+                return 0
+            dst_addr_list = dst_slice_addrs.reshape(-1).tolist()
+            total_slices = len(src_addr_list)
+            length_list = [heads_bytes_per_token_to_send] * total_slices
+            return self.engine.batch_transfer_sync(
+                mooncake_session_id, src_addr_list, dst_addr_list, length_list
+            )
+
+        futures = []
+        for i in range(layers_current_pp_stage):
+            futures.append(
+                executor.submit(process_layer_tp_aware, src_k_ptrs[i], dst_k_ptrs[i])
+            )
+        for i in range(layers_current_pp_stage):
+            futures.append(
+                executor.submit(process_layer_tp_aware, src_v_ptrs[i], dst_v_ptrs[i])
+            )
+
+        # Fail fast: cancel remaining layer transfers on the first error.
+        for future in concurrent.futures.as_completed(futures):
+            status = future.result()
+            if status != 0:
+                for f in futures:
+                    f.cancel()
+                return status
+
+        return 0
+
+    def send_aux(
+        self,
+        req: TransferInfo,
+        prefill_aux_index: int,
+        dst_aux_ptrs: list[int],
+    ):
+        """Send all aux buffers for one request via the transfer engine,
+        falling back to TCP when the NVLINK custom pool is in use or the
+        env override is set. Returns 0 on success."""
+        # TODO(shangming): Fix me when nvlink_transport of Mooncake is bug-free
+        if (
+            self.enable_custom_mem_pool and self.custom_mem_pool_type == "NVLINK"
+        ) or envs.SGLANG_MOONCAKE_SEND_AUX_TCP.get():
+            return self.send_aux_tcp(req, prefill_aux_index, dst_aux_ptrs)
+
+        transfer_blocks = []
+        prefill_aux_ptrs = self.kv_args.aux_data_ptrs
+        prefill_aux_item_lens = self.kv_args.aux_item_lens
+
+        # One block per aux buffer: same item length on both sides, offset by
+        # the respective slot index.
+        for i, dst_aux_ptr in enumerate(dst_aux_ptrs):
+            length = prefill_aux_item_lens[i]
+            src_addr = prefill_aux_ptrs[i] + length * prefill_aux_index
+            dst_addr = dst_aux_ptrs[i] + length * req.dst_aux_index
+            transfer_blocks.append((src_addr, dst_addr, length))
+
+        return self._transfer_data(req.mooncake_session_id, transfer_blocks)
+
+ def send_aux_tcp(
+ self,
+ req: TransferInfo,
+ prefill_aux_index: int,
+ dst_aux_ptrs: list[int],
+ ):
+ prefill_aux_ptrs = self.kv_args.aux_data_ptrs
+ prefill_aux_item_lens = self.kv_args.aux_item_lens
+
+ for i in range(len(prefill_aux_ptrs)):
+ length = prefill_aux_item_lens[i]
+ src_addr = prefill_aux_ptrs[i] + length * prefill_aux_index
+ data = AuxDataCodec.serialize_data_from_buffer(src_addr, length)
+
+ self.send_aux_data_to_endpoint(
+ remote=req.endpoint,
+ dst_port=req.dst_port,
+ room=req.room,
+ buffer_index=i,
+ aux_index=req.dst_aux_index,
+ data=data,
+ )
+
+ return 0
+
+    def send_aux_data_to_endpoint(
+        self,
+        remote: str,
+        dst_port: int,
+        room: int,
+        buffer_index: int,
+        aux_index: int,
+        data: bytes,
+    ):
+        """Ship one aux buffer to a decode endpoint as an AUX_DATA multipart
+        ZMQ message; the length frame lets the receiver validate the payload."""
+        socket = self._connect(
+            format_tcp_address(remote, dst_port), is_ipv6=is_valid_ipv6_address(remote)
+        )
+
+        # Frame layout must match _handle_aux_data on the decode side.
+        socket.send_multipart(
+            [
+                MooncakeKVManager.AUX_DATA_HEADER,
+                str(room).encode("ascii"),
+                str(buffer_index).encode("ascii"),
+                str(aux_index).encode("ascii"),
+                struct.pack(">I", len(data)),
+                data,
+            ]
+        )
+
+    def _handle_aux_data(self, msg: List[bytes]):
+        """Handle AUX_DATA messages received by the decode thread.
+
+        Frame layout (after the header at msg[0]): room, buffer_index,
+        aux_index, big-endian u32 payload length, payload. A length mismatch
+        is logged and the message is dropped rather than written.
+        """
+        room = int(msg[1].decode("ascii"))
+        buffer_index = int(msg[2].decode("ascii"))
+        aux_index = int(msg[3].decode("ascii"))
+        data_length = struct.unpack(">I", msg[4])[0]
+        data = msg[5]
+
+        if len(data) != data_length:
+            logger.error(f"AUX_DATA length mismatch for bootstrap_room {room}")
+            return
+
+        AuxDataCodec.deserialize_data_to_buffer(
+            self.kv_args, buffer_index, aux_index, data
+        )
+
+        logger.debug(
+            f"Received AUX_DATA for bootstrap_room {room} with length:{len(data)}"
+        )
+
+    def maybe_send_extra(
+        self,
+        req: TransferInfo,
+        prefill_state_indices: list[int],
+        dst_state_data_ptrs: list[int],
+        executor: concurrent.futures.ThreadPoolExecutor,
+        target_rank_registration_info: Optional[KVArgsRegisterInfo] = None,
+    ):
+        """Send state or extra pool data with type-specific handling.
+
+        Dispatches on kv_args.state_type: "mamba" uses dedicated (optionally
+        TP-sliced) state transfer; "swa"/"nsa" reuse the generic KV path;
+        anything else is a no-op. Returns the transfer status (0 = success).
+        """
+        state_type = getattr(self.kv_args, "state_type", "none")
+
+        if state_type == "mamba":
+            # Check if we need slice transfer for different TP sizes
+            if (
+                target_rank_registration_info is not None
+                and self.attn_tp_size != target_rank_registration_info.dst_attn_tp_size
+            ):
+                return self._send_mamba_state_slice(
+                    req,
+                    prefill_state_indices,
+                    dst_state_data_ptrs,
+                    target_rank_registration_info.dst_state_item_lens,
+                    target_rank_registration_info.dst_state_dim_per_tensor,
+                    target_rank_registration_info.dst_tp_rank,
+                    target_rank_registration_info.dst_attn_tp_size,
+                )
+            else:
+                return self._send_mamba_state(
+                    req,
+                    prefill_state_indices,
+                    dst_state_data_ptrs,
+                )
+        elif state_type in ["swa", "nsa"]:
+            # SWA and NSA hybrid models do not support different TP sizes yet
+            if (
+                target_rank_registration_info is not None
+                and not self.is_mla_backend
+                and self.attn_tp_size != target_rank_registration_info.dst_attn_tp_size
+            ):
+                raise RuntimeError(
+                    f"PD Disaggregation does NOT support PD different TP sizes for non-MLA {state_type.upper()} hybrid models yet."
+                )
+            # Trim the source list if it is longer than the destination's:
+            # mismatches are unexpected but logged rather than fatal.
+            if len(prefill_state_indices) < len(req.dst_state_indices):
+                logger.warning(
+                    f"len(prefill_state_indices) = {len(prefill_state_indices)}, len(dst_state_indices) = {len(req.dst_state_indices)}"
+                )
+                prefill_state_indices = prefill_state_indices[
+                    : len(req.dst_state_indices)
+                ]
+            # Reuse _send_kvcache_generic interface to send extra pool data
+            prefill_state_indices = np.array(prefill_state_indices, dtype=np.int32)
+            dst_state_indices = np.array(req.dst_state_indices, dtype=np.int32)
+            return self._send_kvcache_generic(
+                mooncake_session_id=req.mooncake_session_id,
+                src_data_ptrs=self.kv_args.state_data_ptrs,
+                dst_data_ptrs=dst_state_data_ptrs,
+                item_lens=self.kv_args.state_item_lens,
+                prefill_data_indices=prefill_state_indices,
+                dst_data_indices=dst_state_indices,
+                executor=executor,
+            )
+        else:
+            # No state/extra pool for this model type.
+            return 0
+
+ def _send_mamba_state(
+ self,
+ req: TransferInfo,
+ prefill_mamba_index: list[int],
+ dst_state_data_ptrs: list[int],
+ ):
+ """Transfer Mamba states."""
+ assert len(prefill_mamba_index) == 1, "Mamba should have single state index"
+
+ transfer_blocks = []
+ prefill_state_data_ptrs = self.kv_args.state_data_ptrs
+ prefill_state_item_lens = self.kv_args.state_item_lens
+
+ for i, dst_state_ptr in enumerate(dst_state_data_ptrs):
+ length = prefill_state_item_lens[i]
+ src_addr = prefill_state_data_ptrs[i] + length * int(prefill_mamba_index[0])
+ dst_addr = dst_state_ptr + length * int(req.dst_state_indices[0])
+ transfer_blocks.append((src_addr, dst_addr, length))
+
+ return self._transfer_data(req.mooncake_session_id, transfer_blocks)
+
+    def _send_mamba_state_slice(
+        self,
+        req: TransferInfo,
+        prefill_mamba_index: list[int],
+        dst_state_data_ptrs: list[int],
+        dst_state_item_lens: list[int],
+        dst_state_dim_per_tensor: list[int],
+        dst_tp_rank: int,
+        dst_attn_tp_size: int,
+    ):
+        """Transfer Mamba states with TP slice support.
+
+        Mamba state layout:
+        - conv_state: [num_layers, size+1, conv_dim/tp, conv_kernel-1]
+        - temporal_state: [num_layers, size+1, num_heads/tp, head_dim, state_size]
+
+        The 3rd dimension is sliced by TP. When prefill and decode have different
+        attn_tp_size, we need to slice the state accordingly.
+
+        Falls back to _send_mamba_state when per-tensor dimension info is
+        unavailable on either side. Returns the transfer status (0 = success).
+        """
+        logger.warning_once(
+            "Using Mamba state slice transfer for different TP sizes between prefill and decode. "
+            f"Prefill attn_tp_size={self.attn_tp_size}, Decode attn_tp_size={dst_attn_tp_size}. "
+            "Performance may be affected."
+        )
+        assert len(prefill_mamba_index) == 1, "Mamba should have single state index"
+
+        transfer_blocks = []
+        prefill_state_data_ptrs = self.kv_args.state_data_ptrs
+        prefill_state_item_lens = self.kv_args.state_item_lens
+        src_state_dim_per_tensor = getattr(self.kv_args, "state_dim_per_tensor", [])
+
+        # If no dimension info available, fall back to regular transfer
+        if not src_state_dim_per_tensor or not dst_state_dim_per_tensor:
+            return self._send_mamba_state(req, prefill_mamba_index, dst_state_data_ptrs)
+
+        local_tp_rank_in_group = self.kv_args.engine_rank % self.attn_tp_size
+        dst_tp_rank_in_group = dst_tp_rank % dst_attn_tp_size
+
+        for i, dst_state_ptr in enumerate(dst_state_data_ptrs):
+            src_item_len = prefill_state_item_lens[i]
+            dst_item_len = dst_state_item_lens[i]
+            src_dim = src_state_dim_per_tensor[i]
+            dst_dim = dst_state_dim_per_tensor[i]
+
+            # Calculate bytes per dimension slice
+            # item_len = dim * trailing_dims_size, so trailing_dims_size = item_len / dim
+            src_bytes_per_dim = src_item_len // src_dim
+            dst_bytes_per_dim = dst_item_len // dst_dim
+
+            # Determine slicing parameters based on TP configuration
+            if self.attn_tp_size > dst_attn_tp_size:
+                # Multiple prefill ranks send to 1 decode rank
+                # Each prefill sends all its dims to the appropriate offset in decode
+                src_dim_start = 0
+                num_dims_to_send = src_dim
+                dst_dim_start = local_tp_rank_in_group * src_dim
+            else:
+                # 1 prefill rank sends to multiple decode ranks
+                # Prefill sends a slice of its dims to each decode rank
+                src_dim_start = (dst_tp_rank_in_group * dst_dim) % src_dim
+                num_dims_to_send = dst_dim
+                dst_dim_start = 0
+
+            # Calculate byte offsets
+            src_dim_offset = src_dim_start * src_bytes_per_dim
+            dst_dim_offset = dst_dim_start * dst_bytes_per_dim
+            bytes_to_send = num_dims_to_send * src_bytes_per_dim
+
+            # Calculate addresses for this state tensor
+            src_addr = (
+                prefill_state_data_ptrs[i]
+                + src_item_len * int(prefill_mamba_index[0])
+                + src_dim_offset
+            )
+            dst_addr = (
+                dst_state_ptr
+                + dst_item_len * int(req.dst_state_indices[0])
+                + dst_dim_offset
+            )
+
+            transfer_blocks.append((src_addr, dst_addr, bytes_to_send))
+
+        return self._transfer_data(req.mooncake_session_id, transfer_blocks)
+
+    def sync_status_to_decode_endpoint(
+        self, remote: str, dst_port: int, room: int, status: int, prefill_rank: int
+    ):
+        """Push a (room, status, prefill_rank) status update to a decode rank
+        over ZMQ; frame layout matches the decode thread's status parser."""
+        self._connect(
+            format_tcp_address(remote, dst_port), is_ipv6=is_valid_ipv6_address(remote)
+        ).send_multipart(
+            [
+                str(room).encode("ascii"),
+                str(status).encode("ascii"),
+                str(prefill_rank).encode("ascii"),
+            ]
+        )
+
+    def transfer_worker(
+        self, queue: FastQueue, executor: concurrent.futures.ThreadPoolExecutor
+    ):
+        """Daemon loop: drain `queue` of TransferKVChunk items and ship each
+        chunk's KV (plus state/aux on the last chunk) to every registered
+        decode receiver, syncing per-room status back to the decode side.
+        """
+        while True:
+            try:
+                kv_chunk: TransferKVChunk = queue.get()
+                reqs_to_be_processed = (
+                    self.transfer_infos[kv_chunk.room].values()
+                    if kv_chunk.room in self.transfer_infos
+                    else []
+                )
+                polls = []
+                dst_ranks_infos = []
+                # Unique id per prefill sender so decode's response set size matches expected_response_num.
+                prefill_unique_rank = (
+                    self.attn_tp_rank * (self.pp_size * self.attn_cp_size)
+                    + self.pp_rank * self.attn_cp_size
+                    + self.attn_cp_rank
+                )
+                for req in reqs_to_be_processed:
+                    if not req.is_dummy:
+                        # Early exit if the request has failed
+                        with self.session_lock:
+                            if req.mooncake_session_id in self.failed_sessions:
+                                self.record_failure(
+                                    kv_chunk.room,
+                                    f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
+                                )
+                                self.update_status(kv_chunk.room, KVPoll.Failed)
+                                self.sync_status_to_decode_endpoint(
+                                    req.endpoint,
+                                    req.dst_port,
+                                    req.room,
+                                    KVPoll.Failed,
+                                    prefill_unique_rank,
+                                )
+                                break
+
+                        chunked_dst_kv_indice = req.dst_kv_indices[kv_chunk.index_slice]
+
+                        # NOTE: This is temporarily a workaround to deal with the case where the prefill_kv_indices
+                        # is mismatched with the dst_kv_indices when page size > 1, this should never happen.
+                        if len(chunked_dst_kv_indice) < len(
+                            kv_chunk.prefill_kv_indices
+                        ):
+                            logger.warning(
+                                f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
+                            )
+                            kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
+                                : len(chunked_dst_kv_indice)
+                            ]
+
+                        target_rank_registration_info: KVArgsRegisterInfo = (
+                            self.decode_kv_args_table[req.mooncake_session_id]
+                        )
+                        # Full-head transfer when TP layouts match (or MLA);
+                        # per-head slice transfer otherwise.
+                        if self.is_mla_backend or (
+                            self.attn_tp_size
+                            == target_rank_registration_info.dst_attn_tp_size
+                        ):
+                            ret = self.send_kvcache(
+                                req.mooncake_session_id,
+                                kv_chunk.prefill_kv_indices,
+                                target_rank_registration_info.dst_kv_ptrs,
+                                chunked_dst_kv_indice,
+                                executor,
+                            )
+                        else:
+                            ret = self.send_kvcache_slice(
+                                req.mooncake_session_id,
+                                kv_chunk.prefill_kv_indices,
+                                target_rank_registration_info.dst_kv_ptrs,
+                                chunked_dst_kv_indice,
+                                target_rank_registration_info.dst_tp_rank,
+                                target_rank_registration_info.dst_attn_tp_size,
+                                target_rank_registration_info.dst_kv_item_len,
+                                executor,
+                            )
+                        if ret != 0:
+                            with self.session_lock:
+                                self.session_failures[req.mooncake_session_id] += 1
+                                # Failures should never happen if the session is not dead, if the session fails once, mark it as failed
+                                if self.session_failures[req.mooncake_session_id] >= 1:
+                                    self.failed_sessions.add(req.mooncake_session_id)
+                                    logger.error(
+                                        f"Session {req.mooncake_session_id} failed."
+                                    )
+                            self.record_failure(
+                                kv_chunk.room,
+                                f"Failed to send kv chunk of {kv_chunk.room} to {req.endpoint}:{req.dst_port}",
+                            )
+                            self.update_status(kv_chunk.room, KVPoll.Failed)
+                            self.sync_status_to_decode_endpoint(
+                                req.endpoint,
+                                req.dst_port,
+                                req.room,
+                                KVPoll.Failed,
+                                prefill_unique_rank,
+                            )
+                            break
+
+                        if kv_chunk.is_last_chunk:
+                            if kv_chunk.state_indices is not None:
+                                self.maybe_send_extra(
+                                    req,
+                                    kv_chunk.state_indices,
+                                    target_rank_registration_info.dst_state_data_ptrs,
+                                    executor,
+                                    target_rank_registration_info,
+                                )
+
+                            # Only the last chunk we need to send the aux data
+                            ret = self.send_aux(
+                                req,
+                                kv_chunk.prefill_aux_index,
+                                target_rank_registration_info.dst_aux_ptrs,
+                            )
+                            polls.append(True if ret == 0 else False)
+                            dst_ranks_infos.append(
+                                (req.endpoint, req.dst_port, req.room)
+                            )
+
+                            # Only sync status when all the dst ranks have received the kvcache
+                            if len(polls) == req.required_dst_info_num:
+                                status = KVPoll.Success if all(polls) else KVPoll.Failed
+                                self.update_status(req.room, status)
+                                for endpoint, dst_port, room in dst_ranks_infos:
+                                    self.sync_status_to_decode_endpoint(
+                                        endpoint,
+                                        dst_port,
+                                        room,
+                                        status,
+                                        prefill_unique_rank,
+                                    )
+                    else:
+                        # Dummy request means the decode instance is not used, so its status can be marked as success directly
+                        # Dummy request does not need to sync status to decode endpoint
+                        if kv_chunk.is_last_chunk and req.room in self.request_status:
+                            self.update_status(req.room, KVPoll.Success)
+
+                # Drop per-room bookkeeping once the room finished (or was
+                # already removed from request_status).
+                if (
+                    kv_chunk.room not in self.request_status
+                    or self.check_status(kv_chunk.room) == KVPoll.Success
+                ):
+                    if kv_chunk.room in self.transfer_infos:
+                        self.transfer_infos.pop(kv_chunk.room)
+
+            except Exception as e:
+                # NOTE(shangming): Remove this when we make sure the transfer thread is bug-free
+                raise RuntimeError(
+                    f"Transfer thread failed because of {e}. Prefill instance with bootstrap_port={self.bootstrap_port} is dead."
+                )
+
+    def start_prefill_thread(self):
+        """Start the ZMQ listener that receives decode-side registrations and
+        pre-alloc notifications, moving rooms to WaitingForInput once every
+        expected decode rank has checked in."""
+
+        def bootstrap_thread():
+            """This thread recvs pre-alloc notification from the decode engine"""
+            # KVPoll.Bootstrapping -> KVPoll.WaitingForInput
+            while True:
+                waiting_req_bytes = self.server_socket.recv_multipart()
+                room = waiting_req_bytes[0].decode("ascii")
+                mooncake_session_id = waiting_req_bytes[3].decode("ascii")
+                if room == "None":
+                    # room == "None" marks a KVArgs registration handshake,
+                    # not a per-request notification.
+                    self.decode_kv_args_table[mooncake_session_id] = (
+                        KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
+                    )
+                    with self.session_lock:
+                        # A re-registration clears any prior failure state for
+                        # this session (e.g. a restarted decode instance).
+                        if mooncake_session_id in self.failed_sessions:
+                            self.failed_sessions.remove(mooncake_session_id)
+                        if mooncake_session_id in self.session_failures:
+                            del self.session_failures[mooncake_session_id]
+                    logger.debug(
+                        f"Register KVArgs from {mooncake_session_id} successfully"
+                    )
+                    continue
+                else:
+                    required_dst_info_num = int(waiting_req_bytes[7].decode("ascii"))
+                    room = int(room)
+                    if room not in self.transfer_infos:
+                        self.transfer_infos[room] = {}
+
+                    self.transfer_infos[room][mooncake_session_id] = (
+                        TransferInfo.from_zmq(waiting_req_bytes)
+                    )
+                    # NOTE: after bootstrapping we can mark the req as waiting for input
+                    if len(self.transfer_infos[room]) == required_dst_info_num:
+                        self.update_status(room, KVPoll.WaitingForInput)
+
+        threading.Thread(target=bootstrap_thread).start()
+
+ def start_decode_thread(self):
+ def decode_thread():
+ while True:
+ msg = self.server_socket.recv_multipart()
+ if msg[0] == MooncakeKVManager.AUX_DATA_HEADER:
+ self._handle_aux_data(msg)
+ continue
+
+ bootstrap_room, status, prefill_rank = msg
+ status = int(status.decode("ascii"))
+ bootstrap_room = int(bootstrap_room.decode("ascii"))
+ prefill_rank = int(prefill_rank.decode("ascii"))
+
+ if status == KVPoll.Success:
+ if bootstrap_room in self.request_status:
+ self.prefill_response_tracker[bootstrap_room].add(prefill_rank)
+ expected_response_num = (
+ self.required_prefill_response_num_table[bootstrap_room]
+ )
+ arrived_response_num = len(
+ self.prefill_response_tracker[bootstrap_room]
+ )
+ if arrived_response_num == expected_response_num:
+ self.update_status(bootstrap_room, KVPoll.Success)
+ elif status == KVPoll.Failed:
+ self.record_failure(
+ bootstrap_room,
+ "Failed to get kvcache from prefill instance, it might be dead",
+ )
+ self.update_status(bootstrap_room, status)
+
+ def heartbeat_checker():
+ while True:
+ time.sleep(self.heartbeat_interval)
+ with self.connection_lock:
+ addresses = list(self.prefill_info_table.keys())
+
+ for bootstrap_addr in addresses:
+ session = None
+ try:
+ with self.session_pool_lock:
+ session = self.session_pool[bootstrap_addr]
+ response = session.get(
+ f"http://{bootstrap_addr}/health",
+ timeout=(2, 3),
+ headers={"Connection": "keep-alive"},
+ )
+ if response.status_code == 200:
+ self.heartbeat_failures[bootstrap_addr] = 0
+
+ current_rooms = self.addr_to_rooms_tracker[
+ bootstrap_addr
+ ].copy()
+
+ for bootstrap_room in current_rooms:
+ # Remove KVPoll.Success requests from the tracker
+ if bootstrap_room not in self.request_status:
+ self.addr_to_rooms_tracker[bootstrap_addr].discard(
+ bootstrap_room
+ )
+ else:
+ logger.info(
+ f"Attempting to reconnect to {bootstrap_addr}..."
+ )
+ self.heartbeat_failures[bootstrap_addr] = (
+ self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+ )
+ with self.session_pool_lock:
+ if bootstrap_addr in self.session_pool:
+ del self.session_pool[bootstrap_addr]
+ except Exception:
+ logger.info(f"Attempting to reconnect to {bootstrap_addr}...")
+ self.heartbeat_failures[bootstrap_addr] = (
+ self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+ )
+
+ if (
+ self.heartbeat_failures.get(bootstrap_addr, 0)
+ >= self.max_failures
+ ):
+ self._handle_node_failure(bootstrap_addr)
+ with self.session_pool_lock:
+ if bootstrap_addr in self.session_pool:
+ del self.session_pool[bootstrap_addr]
+
+ threading.Thread(target=decode_thread).start()
+ threading.Thread(target=heartbeat_checker).start()
+
+ def add_transfer_request(
+ self,
+ bootstrap_room: int,
+ kv_indices: npt.NDArray[np.int32],
+ index_slice: slice,
+ is_last_chunk: bool,
+ aux_index: Optional[int] = None,
+ state_indices: Optional[List[int]] = None,
+ ):
+ assert self.disaggregation_mode == DisaggregationMode.PREFILL
+ assert not is_last_chunk or (is_last_chunk and aux_index is not None)
+
+ if (
+ bootstrap_room not in self.request_status
+ or self.check_status(bootstrap_room) == KVPoll.Failed
+ ):
+ logger.debug(
+ "Request with bootstrap_room=%s already failed", bootstrap_room
+ )
+ return
+
+ if bootstrap_room not in self.transfer_infos:
+ # This means that the current rank is a dummy rank for this request,
+ # and it has already been marked as success, so there is no need to
+ # add further chunks into the transfer queue.
+ return
+
+ # NOTE(shangming): sharding according to the dst_infos to make sure
+ # requests with the same dst_sessions will be added into the same
+ # queue, which enables early abort with failed sessions.
+ dst_infos = self.transfer_infos[bootstrap_room].keys()
+ session_port_sum = sum(int(session.rsplit(":", 1)[1]) for session in dst_infos)
+ shard_idx = session_port_sum % len(self.transfer_queues)
+
+ self.transfer_queues[shard_idx].put(
+ TransferKVChunk(
+ room=bootstrap_room,
+ prefill_kv_indices=kv_indices,
+ index_slice=index_slice,
+ is_last_chunk=is_last_chunk,
+ prefill_aux_index=aux_index,
+ state_indices=state_indices,
+ )
+ )
+
+ def get_session_id(self):
+ return self.engine.get_session_id()
+
+ def _handle_node_failure(self, failed_bootstrap_addr):
+ with self.connection_lock:
+ keys_to_remove = [
+ k for k in self.connection_pool if k.startswith(failed_bootstrap_addr)
+ ]
+ for k in keys_to_remove:
+ del self.connection_pool[k]
+
+ possible_affected_rooms = self.addr_to_rooms_tracker.get(
+ failed_bootstrap_addr, []
+ )
+ self.prefill_info_table.pop(failed_bootstrap_addr, None)
+ self.addr_to_rooms_tracker.pop(failed_bootstrap_addr, None)
+
+ # Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
+ affected_rooms = []
+ for room in possible_affected_rooms:
+ if (
+ room in self.request_status
+ and self.check_status(room) != KVPoll.Success
+ ):
+ self.record_failure(
+ room,
+ f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr})",
+ )
+ self.update_status(room, KVPoll.Failed)
+ affected_rooms.append(room)
+ logger.error(
+ f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr}), {len(affected_rooms)} requests affected"
+ )
+
+
class MooncakeKVSender(CommonKVSender):
    """Prefill-side KV sender backed by the Mooncake transfer engine."""

    def __init__(
        self,
        mgr: MooncakeKVManager,
        bootstrap_addr: str,
        bootstrap_room: int,
        dest_tp_ranks: List[int],
        pp_rank: int,
    ):
        super().__init__(mgr, bootstrap_addr, bootstrap_room, dest_tp_ranks, pp_rank)
        # Terminal status (Success/Failed) once the transfer concludes.
        self.conclude_state = None
        # Creation time; drives the bootstrap timeout in poll().
        self.init_time = time.time()

    def send(
        self,
        kv_indices: npt.NDArray[np.int32],
        state_indices: Optional[List[int]] = None,
    ):
        """Queue one chunk of KV indices for transfer.

        The final chunk additionally carries the aux index and the optional
        state indices.
        """
        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
        self.curr_idx += len(kv_indices)
        is_last_chunk = self.curr_idx == self.num_kv_indices

        # Special handling for cp
        if self.kv_mgr.enable_all_cp_ranks_for_transfer:
            kv_indices, index_slice = filter_kv_indices_for_cp_rank(
                self.kv_mgr,
                kv_indices,
                index_slice,
            )
        elif self.kv_mgr.is_dummy_cp_rank:
            # Dummy cp ranks transfer nothing; the last chunk just marks
            # the request as done.
            if is_last_chunk:
                self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Success)
            return

        if is_last_chunk:
            self.kv_mgr.add_transfer_request(
                self.bootstrap_room,
                kv_indices,
                index_slice,
                True,
                aux_index=self.aux_index,
                state_indices=state_indices,
            )
        else:
            self.kv_mgr.add_transfer_request(
                self.bootstrap_room,
                kv_indices,
                index_slice,
                False,
            )

    def poll(self) -> KVPoll:
        """Return the current status, failing requests that remain in the
        bootstrapping phase past the configured timeout."""
        if self.conclude_state is not None:
            return self.conclude_state

        status = self.kv_mgr.check_status(self.bootstrap_room)
        if status in (KVPoll.Success, KVPoll.Failed):
            # Cache the terminal status so later polls are cheap.
            self.conclude_state = status
        elif status == KVPoll.Bootstrapping and self.init_time is not None:
            elapsed = time.time() - self.init_time
            if elapsed >= self.kv_mgr.bootstrap_timeout:
                logger.warning_once(
                    "Some requests timed out when bootstrapping, "
                    "which means prefill instances fail to receive the KV indices from the decode instance of this request. "
                    "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
                )
                self.kv_mgr.record_failure(
                    self.bootstrap_room,
                    f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
                )
                self.conclude_state = KVPoll.Failed
                return KVPoll.Failed

        return status

    def clear(self) -> None:
        """Drop this request's status entry from the manager, if present."""
        self.kv_mgr.request_status.pop(self.bootstrap_room, None)

    def failure_exception(self):
        """Raise the recorded failure reason for this request as KVTransferError."""
        # Explicitly set the status to failure since this request has failed in another rank
        if self.conclude_state is None:
            self.conclude_state = KVPoll.Failed

        self.clear()

        default_reason = "Failed due to an unknown reason from another rank"
        with self.kv_mgr.failure_lock:
            failure_reason = self.kv_mgr.failure_records.pop(
                self.bootstrap_room, default_reason
            )
        raise KVTransferError(self.bootstrap_room, failure_reason)

    def abort(self):
        """Record an abort and conclude this request as failed."""
        self.kv_mgr.record_failure(
            self.bootstrap_room,
            "Aborted by AbortReq.",
        )
        # Explicitly set the status to failure since this request has been aborted
        self.conclude_state = KVPoll.Failed
+
+
class MooncakeKVReceiver(CommonKVReceiver):
    """Decode-side KV receiver backed by the Mooncake transfer engine."""

    def __init__(
        self,
        mgr: MooncakeKVManager,
        bootstrap_addr: str,
        bootstrap_room: Optional[int] = None,
        prefill_dp_rank: Optional[int] = None,
    ):
        self.session_id = mgr.get_session_id()
        # Terminal status (Success/Failed); None while still in flight.
        self.conclude_state = None
        # Set by init(); drives the WaitingForInput timeout in poll().
        self.init_time = None
        super().__init__(mgr, bootstrap_addr, bootstrap_room, prefill_dp_rank)

        # Track the room under its bootstrap address so heartbeat-driven node
        # failure can fail it, then mark it as waiting for the KV transfer.
        self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].add(self.bootstrap_room)
        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)

    def _register_kv_args(self):
        """Send this rank's buffer base pointers to each prefill bootstrap peer.

        The first frame is the literal string "None", which the prefill
        bootstrap thread uses to distinguish registration messages from
        per-request notifications.
        """
        for bootstrap_info in self.bootstrap_infos:
            # Base pointers travel as packed uint64 values.
            packed_kv_data_ptrs = b"".join(
                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.kv_data_ptrs
            )
            packed_aux_data_ptrs = b"".join(
                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.aux_data_ptrs
            )
            packed_state_data_ptrs = b"".join(
                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.state_data_ptrs
            )
            # Pack state_item_lens and state_dim_per_tensor for mamba state slice transfer
            packed_state_item_lens = b"".join(
                struct.pack("I", item_len)
                for item_len in self.kv_mgr.kv_args.state_item_lens
            )
            # state_dim_per_tensor may be absent on older KVArgs; default to empty.
            state_dim_per_tensor = getattr(
                self.kv_mgr.kv_args, "state_dim_per_tensor", []
            )
            packed_state_dim_per_tensor = b"".join(
                struct.pack("I", dim) for dim in state_dim_per_tensor
            )
            # Note(shangming): No need to add pp rank here since decode pp size should be equal to prefill pp size or 1
            tp_rank = self.kv_mgr.kv_args.engine_rank
            kv_item_len = self.kv_mgr.kv_args.kv_item_lens[0]
            dst_tp_rank = str(tp_rank).encode("ascii")
            dst_attn_tp_size = str(self.kv_mgr.attn_tp_size).encode("ascii")
            dst_kv_item_len = str(kv_item_len).encode("ascii")

            sock, lock = self._connect_to_bootstrap_server(bootstrap_info)
            with lock:
                # Frame order must match the prefill bootstrap thread's parser.
                sock.send_multipart(
                    [
                        "None".encode("ascii"),
                        self.kv_mgr.local_ip.encode("ascii"),
                        str(self.kv_mgr.rank_port).encode("ascii"),
                        self.session_id.encode("ascii"),
                        packed_kv_data_ptrs,
                        packed_aux_data_ptrs,
                        packed_state_data_ptrs,
                        dst_tp_rank,
                        dst_attn_tp_size,
                        dst_kv_item_len,
                        packed_state_item_lens,
                        packed_state_dim_per_tensor,
                    ]
                )

    def init(
        self,
        kv_indices: npt.NDArray[np.int32],
        aux_index: Optional[int] = None,
        state_indices: Optional[List[int]] = None,
    ):
        """Send destination KV/aux/state indices for this room to every
        prefill rank, kicking off the actual KV transfer."""
        if self.bootstrap_infos is None:
            # Bootstrap lookup failed earlier; fail the request up front.
            self.kv_mgr.record_failure(
                self.bootstrap_room,
                f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
            )
            self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
            return

        for bootstrap_info in self.bootstrap_infos:
            sock, lock = self._connect_to_bootstrap_server(bootstrap_info)
            # Dummy ranks are still notified but receive empty payloads.
            is_dummy = bootstrap_info["is_dummy"]

            with lock:
                sock.send_multipart(
                    [
                        str(self.bootstrap_room).encode("ascii"),
                        self.kv_mgr.local_ip.encode("ascii"),
                        str(self.kv_mgr.rank_port).encode("ascii"),
                        self.session_id.encode("ascii"),
                        kv_indices.tobytes() if not is_dummy else b"",
                        str(aux_index).encode("ascii") if not is_dummy else b"",
                        (
                            np.array(
                                state_indices,
                                dtype=np.int32,
                            ).tobytes()
                            if not is_dummy and state_indices is not None
                            else b""
                        ),
                        str(self.required_dst_info_num).encode("ascii"),
                    ]
                )
        # Start the WaitingForInput timeout clock.
        self.init_time = time.time()

    def poll(self) -> KVPoll:
        """Return the current status, failing requests that wait too long for
        the transfer-done signal."""
        if self.conclude_state is None:
            status = self.kv_mgr.check_status(self.bootstrap_room)
            if status in (KVPoll.Success, KVPoll.Failed):
                # Cache the terminal status so later polls are cheap.
                self.conclude_state = status
            elif status == KVPoll.WaitingForInput:
                if self.init_time is not None:
                    now = time.time()
                    elapsed = now - self.init_time
                    if elapsed >= self.kv_mgr.waiting_timeout:
                        logger.warning_once(
                            "Some requests fail to receive KV Cache transfer done signal after bootstrapping. "
                            "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
                        )
                        self.kv_mgr.record_failure(
                            self.bootstrap_room,
                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.WaitingForInput",
                        )
                        self.conclude_state = KVPoll.Failed
                        return KVPoll.Failed

            return status

        else:
            return self.conclude_state

    def clear(self) -> None:
        """Drop all per-room bookkeeping for this request from the manager."""
        if self.bootstrap_room in self.kv_mgr.request_status:
            self.kv_mgr.request_status.pop(self.bootstrap_room)

        if self.bootstrap_room in self.kv_mgr.required_prefill_response_num_table:
            self.kv_mgr.required_prefill_response_num_table.pop(self.bootstrap_room)

        if self.bootstrap_room in self.kv_mgr.prefill_response_tracker:
            self.kv_mgr.prefill_response_tracker.pop(self.bootstrap_room)

    def failure_exception(self):
        """Raise the recorded failure reason for this request as KVTransferError."""
        # Explicitly set the status to failure since this request has failed in another rank
        if self.conclude_state is None:
            self.conclude_state = KVPoll.Failed

        self.clear()

        with self.kv_mgr.failure_lock:
            failure_reason = self.kv_mgr.failure_records.pop(
                self.bootstrap_room, "Failed due to an unknown reason from another rank"
            )
        raise KVTransferError(self.bootstrap_room, failure_reason)

    def abort(self):
        """Record an abort and conclude this request as failed."""
        self.kv_mgr.record_failure(
            self.bootstrap_room,
            "Aborted by AbortReq.",
        )
        # Explicitly set the status to failure since this request has been aborted
        self.conclude_state = KVPoll.Failed
+
+
class MooncakeKVBootstrapServer(CommonKVBootstrapServer):
    """Mooncake uses the common bootstrap server implementation unchanged."""

    pass
diff --git a/sglang/python/sglang/srt/disaggregation/nixl/__init__.py b/sglang/python/sglang/srt/disaggregation/nixl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4df7baba2dfae9aeb50ff3247e2cc2a38e722c4e
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/nixl/__init__.py
@@ -0,0 +1,6 @@
+from sglang.srt.disaggregation.nixl.conn import (
+ NixlKVBootstrapServer,
+ NixlKVManager,
+ NixlKVReceiver,
+ NixlKVSender,
+)
diff --git a/sglang/python/sglang/srt/disaggregation/nixl/conn.py b/sglang/python/sglang/srt/disaggregation/nixl/conn.py
new file mode 100644
index 0000000000000000000000000000000000000000..764fd9e4268956f7c7bdd75e7152f47e827ca3a0
--- /dev/null
+++ b/sglang/python/sglang/srt/disaggregation/nixl/conn.py
@@ -0,0 +1,1097 @@
+from __future__ import annotations
+
+import dataclasses
+import logging
+import struct
+import threading
+import time
+import uuid
+from collections import defaultdict
+from typing import Dict, List, Optional, Set
+
+import numpy as np
+import numpy.typing as npt
+
+from sglang.srt.disaggregation.base.conn import KVArgs, KVPoll
+from sglang.srt.disaggregation.common.conn import (
+ CommonKVBootstrapServer,
+ CommonKVManager,
+ CommonKVReceiver,
+ CommonKVSender,
+)
+from sglang.srt.disaggregation.common.utils import group_concurrent_contiguous
+from sglang.srt.disaggregation.utils import (
+ DisaggregationMode,
+ filter_kv_indices_for_cp_rank,
+)
+from sglang.srt.environ import envs
+from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+GUARD = "NixlMsgGuard".encode("ascii")
+
+
@dataclasses.dataclass
class TransferInfo:
    """Contains indices for a transfer, sent by KVReceiver. Received by prefill bootstrap thread."""

    room: int
    endpoint: str
    dst_port: int
    agent_name: str
    dst_kv_indices: npt.NDArray[np.int32]
    dst_aux_index: int
    required_dst_info_num: int
    dst_state_indices: List[int]

    def is_dummy(self):
        """A dummy transfer carries no destination KV indices."""
        return self.dst_kv_indices.size == 0

    @classmethod
    def from_zmq(cls, msg: List[bytes]):
        """Decode a multipart ZMQ message into a TransferInfo."""
        # Frame 7, when present and non-empty, holds packed int32 state indices.
        has_state = len(msg) > 7 and msg[7] != b""
        state_indices = (
            list(np.frombuffer(msg[7], dtype=np.int32)) if has_state else []
        )

        return cls(
            room=int(msg[0].decode("ascii")),
            endpoint=msg[1].decode("ascii"),
            dst_port=int(msg[2].decode("ascii")),
            agent_name=msg[3].decode("ascii"),
            dst_kv_indices=np.frombuffer(msg[4], dtype=np.int32),
            dst_aux_index=int(msg[5].decode("ascii")),
            required_dst_info_num=int(msg[6].decode("ascii")),
            dst_state_indices=state_indices,
        )
+
+
@dataclasses.dataclass
class KVArgsRegisterInfo:
    """Contains base pointers and other info which only needs to be sent once by KVReceiver. Received by prefill bootstrap thread."""

    room: str
    endpoint: str
    dst_port: int
    agent_name: str
    agent_metadata: bytes
    dst_kv_ptrs: list[int]
    dst_aux_ptrs: list[int]
    dst_state_data_ptrs: list[int]
    gpu_id: int
    decode_tp_size: int
    decode_tp_rank: int
    dst_kv_item_len: int

    @classmethod
    def from_zmq(cls, msg: List[bytes]):
        """Decode a multipart ZMQ registration message into a KVArgsRegisterInfo."""

        def unpack_ptrs(raw: bytes) -> list:
            # Pointers travel as packed uint64 values, 8 bytes each.
            return list(struct.unpack(f"{len(raw) // 8}Q", raw))

        # Frame 7, when present and non-empty, carries state-pool base pointers.
        has_state = len(msg) > 7 and msg[7] != b""
        state_ptrs = unpack_ptrs(msg[7]) if has_state else []

        return cls(
            room=str(msg[0].decode("ascii")),
            endpoint=msg[1].decode("ascii"),
            dst_port=int(msg[2].decode("ascii")),
            agent_name=msg[3].decode("ascii"),
            agent_metadata=msg[4],
            dst_kv_ptrs=unpack_ptrs(msg[5]),
            dst_aux_ptrs=unpack_ptrs(msg[6]),
            dst_state_data_ptrs=state_ptrs,
            gpu_id=int(msg[8].decode("ascii")),
            decode_tp_size=int(msg[9].decode("ascii")),
            decode_tp_rank=int(msg[10].decode("ascii")),
            dst_kv_item_len=int(msg[11].decode("ascii")),
        )
+
+
@dataclasses.dataclass
class TransferStatus:
    """Used by KV Receiver to know when a transfer is done."""

    # KV chunks received per pp_rank: {pp_rank: set of chunk_ids}
    received_kvs_per_pp: Dict[int, Set[int]] = dataclasses.field(
        default_factory=lambda: defaultdict(set)
    )
    # Expected chunk count per pp_rank (set when is_last=True): {pp_rank: expected_count}
    expected_kvs_per_pp: Dict[int, int] = dataclasses.field(default_factory=dict)
    # Number of PP ranks expected to send data.
    num_pp_ranks_expected: Optional[int] = None
    # Whether aux data has been received.
    received_aux: bool = False
    # PP ranks that have sent state data (state is layer-specific, each PP rank sends its portion).
    received_state_per_pp: Set[int] = dataclasses.field(default_factory=set)
    # Whether state data is expected (set based on state_type).
    expects_state: bool = False
    # Mark as failed
    is_failure: bool = False

    def is_done(self):
        """True once the transfer has concluded, successfully or by failure."""
        # A failed transfer counts as concluded immediately.
        if self.is_failure:
            return True
        # Aux data and the expected rank count are prerequisites.
        if self.num_pp_ranks_expected is None or not self.received_aux:
            return False
        expected_ranks = self.num_pp_ranks_expected
        # When state data is expected, every PP rank must have delivered it.
        if self.expects_state and len(self.received_state_per_pp) < expected_ranks:
            return False
        # Every PP rank must have reported how many chunks it will send...
        if len(self.expected_kvs_per_pp) < expected_ranks:
            return False
        # ...and all of those chunks must have arrived.
        return all(
            len(self.received_kvs_per_pp[pp_rank]) == expected
            for pp_rank, expected in self.expected_kvs_per_pp.items()
        )

    def is_failed(self):
        """True when the transfer was marked as failed."""
        return self.is_failure
+
+
+class NixlKVManager(CommonKVManager):
    def __init__(
        self,
        args: KVArgs,
        disaggregation_mode: DisaggregationMode,
        server_args: ServerArgs,
        is_mla_backend: Optional[bool] = False,
    ):
        """Set up the NIXL agent, register buffers, and start mode-specific threads.

        Raises:
            ImportError: if the ``nixl`` package is not installed.
            ValueError: if the configured NIXL backend plugin is unavailable,
                or the disaggregation mode is unsupported.
        """
        super().__init__(args, disaggregation_mode, server_args, is_mla_backend)
        try:
            from nixl._api import nixl_agent, nixl_agent_config
        except ImportError as e:
            raise ImportError(
                "Please install NIXL by following the instructions at "
                "https://github.com/ai-dynamo/nixl/blob/main/README.md "
                "to run SGLang with NixlTransferEngine."
            ) from e

        # Backend plugin name comes from SGLANG_DISAGGREGATION_NIXL_BACKEND.
        backend = envs.SGLANG_DISAGGREGATION_NIXL_BACKEND.get()
        agent_config = nixl_agent_config(
            backends=[backend],
            # Only the prefill side drives transfers, so only it gets workers.
            num_threads=(8 if disaggregation_mode == DisaggregationMode.PREFILL else 0),
        )
        self.agent = nixl_agent(str(uuid.uuid4()), agent_config)

        available_plugins = self.agent.get_plugin_list()
        if backend not in available_plugins:
            raise ValueError(
                f"NIXL backend '{backend}' not found. Available: {available_plugins}. "
                f"Please install the required NIXL plugin or choose from: {available_plugins}"
            )
        logger.info(f"NIXL KVManager initialized with backend: {backend}")

        self.register_buffer_to_engine()

        if self.disaggregation_mode == DisaggregationMode.PREFILL:
            self._start_bootstrap_thread()
        elif self.disaggregation_mode == DisaggregationMode.DECODE:
            # Decode side tracks per-room progress and watches prefill health.
            self.transfer_statuses: Dict[int, TransferStatus] = defaultdict(
                TransferStatus
            )
            self._start_heartbeat_checker_thread()
        else:
            raise ValueError(
                f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
            )
+
    def _start_heartbeat_checker_thread(self):
        """
        Start the heartbeat checker thread for Decode worker.
        TODO (smor): unite nixl heartbeat checker with mooncake's.
        """

        def heartbeat_checker():
            # Poll each known prefill bootstrap server's /health endpoint and
            # escalate to _handle_node_failure after max_failures misses.
            while True:
                time.sleep(self.heartbeat_interval)
                with self.connection_lock:
                    # Snapshot under the lock; the table may mutate concurrently.
                    addresses = list(self.prefill_info_table.keys())

                for bootstrap_addr in addresses:
                    session = None
                    try:
                        with self.session_pool_lock:
                            session = self.session_pool[bootstrap_addr]
                        response = session.get(
                            f"http://{bootstrap_addr}/health",
                            timeout=(2, 3),
                            headers={"Connection": "keep-alive"},
                        )
                        if response.status_code == 200:
                            # Healthy: reset the consecutive-failure counter.
                            self.heartbeat_failures[bootstrap_addr] = 0

                        else:
                            logger.info(
                                f"Attempting to reconnect to {bootstrap_addr}..."
                            )
                            self.heartbeat_failures[bootstrap_addr] = (
                                self.heartbeat_failures.get(bootstrap_addr, 0) + 1
                            )
                            # Drop the pooled session so the next probe reconnects.
                            with self.session_pool_lock:
                                if bootstrap_addr in self.session_pool:
                                    del self.session_pool[bootstrap_addr]
                    except Exception:
                        logger.info(f"Attempting to reconnect to {bootstrap_addr}...")
                        self.heartbeat_failures[bootstrap_addr] = (
                            self.heartbeat_failures.get(bootstrap_addr, 0) + 1
                        )

                    if (
                        self.heartbeat_failures.get(bootstrap_addr, 0)
                        >= self.max_failures
                    ):
                        # Too many consecutive misses: treat the node as dead.
                        self._handle_node_failure(bootstrap_addr)
                        with self.session_pool_lock:
                            if bootstrap_addr in self.session_pool:
                                del self.session_pool[bootstrap_addr]

        threading.Thread(target=heartbeat_checker, daemon=True).start()
+
+ def _handle_node_failure(self, failed_bootstrap_addr):
+ """Handle failure of a prefill node."""
+ with self.connection_lock:
+ keys_to_remove = [
+ k for k in self.connection_pool if k.startswith(failed_bootstrap_addr)
+ ]
+ for k in keys_to_remove:
+ del self.connection_pool[k]
+ self.prefill_info_table.pop(failed_bootstrap_addr, None)
+
+ possible_affected_rooms = self.addr_to_rooms_tracker.get(
+ failed_bootstrap_addr, []
+ )
+ self.addr_to_rooms_tracker.pop(failed_bootstrap_addr, None)
+
+ # Mark all pending transfers associated with the failed node as failed
+ affected_rooms = []
+ for room in possible_affected_rooms:
+ if (
+ room in self.transfer_statuses
+ and not self.transfer_statuses[room].is_done()
+ ):
+ # Mark the transfer as failed
+ self.transfer_statuses[room].is_failure = True
+ affected_rooms.append(room)
+
+ logger.error(
+ f"Lost connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr}), "
+ f"{len(affected_rooms)} transfers affected"
+ )
+ for room in possible_affected_rooms:
+ logger.error(f"Let room {room} be failed due to prefill down")
+ self.update_status(room, KVPoll.Failed)
+
+ def register_buffer_to_engine(self):
+ kv_addrs = []
+ for kv_data_ptr, kv_data_len in zip(
+ self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
+ ):
+ kv_addrs.append((kv_data_ptr, kv_data_len, self.kv_args.gpu_id, ""))
+ self.kv_descs = self.agent.register_memory(kv_addrs, "VRAM")
+ logger.debug(f"Register kv tensors, len(kv_addr)= {len(kv_addrs)}")
+ if not self.kv_descs:
+ raise Exception("NIXL memory registration failed for kv tensors")
+ aux_addrs = []
+ for aux_data_ptr, aux_data_len in zip(
+ self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
+ ):
+ aux_addrs.append((aux_data_ptr, aux_data_len, 0, ""))
+ self.aux_descs = self.agent.register_memory(aux_addrs, "DRAM")
+ logger.debug(f"Register aux tensors, len(aux_addrs)= {len(aux_addrs)}")
+ if not self.aux_descs:
+ raise Exception("NIXL memory registration failed for aux tensors")
+
+ # Register state/extra pool data buffers if present
+ if self.kv_args.state_data_ptrs and self.kv_args.state_data_lens:
+ state_addrs = []
+ for state_data_ptr, state_data_len in zip(
+ self.kv_args.state_data_ptrs, self.kv_args.state_data_lens
+ ):
+ state_addrs.append(
+ (state_data_ptr, state_data_len, self.kv_args.gpu_id, "")
+ )
+ self.state_descs = self.agent.register_memory(state_addrs, "VRAM")
+ logger.debug(
+ f"Register state tensors, len(state_addrs)= {len(state_addrs)}"
+ )
+ if not self.state_descs:
+ raise Exception("NIXL memory registration failed for state tensors")
+
+ def _add_remote_peer(self, decode_kv_args: KVArgsRegisterInfo):
+ agent_name = decode_kv_args.agent_name
+ if agent_name in self.decode_kv_args_table:
+ logger.info(f"Peer {agent_name} was already registered, ignoring.")
+ return
+ self.decode_kv_args_table[agent_name] = decode_kv_args
+ self.agent.add_remote_agent(decode_kv_args.agent_metadata)
+
    def _send_kvcache_generic(
        self,
        peer_name: str,
        src_data_ptrs: list[int],
        dst_data_ptrs: list[int],
        item_lens: list[int],
        prefill_data_indices: npt.NDArray[np.int32],
        dst_data_indices: npt.NDArray[np.int32],
        dst_gpu_id: int,
        notif: str,
    ):
        """Generic KV cache transfer supporting both MHA and MLA architectures.
        Used by both send_kvcache and maybe_send_extra.

        Builds one NIXL WRITE transfer that copies every layer's data for the
        given indices from this rank's buffers to the peer's, posts it with
        ``notif`` as the completion notification, and returns the handle.

        Raises:
            Exception: if the NIXL transfer cannot be created or posted.
        """
        # group by indices
        # Coalesce runs of contiguous indices so each run becomes a single
        # (address, length) descriptor instead of one per index.
        prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
            prefill_data_indices, dst_data_indices
        )

        logger.debug(f"sending kvcache to {peer_name} with notif {notif}")
        # Make descs
        if self.is_mla_backend:
            # MLA path: one fused KV pointer per layer.
            src_kv_ptrs, dst_kv_ptrs, layers_current_pp_stage = (
                self.get_mla_kv_ptrs_with_pp(src_data_ptrs, dst_data_ptrs)
            )
            layers_params = [
                (
                    src_kv_ptrs[layer_id],
                    dst_kv_ptrs[layer_id],
                    item_lens[layer_id],
                )
                for layer_id in range(layers_current_pp_stage)
            ]
        else:
            # MHA path: separate K and V pointers per layer.
            src_k_ptrs, src_v_ptrs, dst_k_ptrs, dst_v_ptrs, layers_current_pp_stage = (
                self.get_mha_kv_ptrs_with_pp(src_data_ptrs, dst_data_ptrs)
            )

            layers_params = [
                (
                    src_k_ptrs[layer_id],
                    dst_k_ptrs[layer_id],
                    item_lens[layer_id],
                )
                for layer_id in range(layers_current_pp_stage)
            ] + [
                (
                    src_v_ptrs[layer_id],
                    dst_v_ptrs[layer_id],
                    item_lens[layer_id],
                )
                for layer_id in range(layers_current_pp_stage)
            ]

        src_addrs = []
        src_lens = []
        dst_addrs = []
        dst_lens = []

        # Precompute block starts/lengths to reduce Python-level loops.
        prefill_starts = np.fromiter(
            (block[0] for block in prefill_kv_blocks), dtype=np.int64
        )
        dst_starts = np.fromiter((block[0] for block in dst_kv_blocks), dtype=np.int64)
        block_lens = np.fromiter(
            (len(block) for block in prefill_kv_blocks), dtype=np.int64
        )

        # Vectorized address arithmetic: one array entry per contiguous block,
        # for every layer (and, in MHA, for K and V separately).
        for src_ptr, dst_ptr, item_len in layers_params:
            lengths = item_len * block_lens
            src_addrs.append(src_ptr + prefill_starts * item_len)
            src_lens.append(lengths)
            dst_addrs.append(dst_ptr + dst_starts * item_len)
            dst_lens.append(lengths)

        def make_req_array(addr_chunks, len_chunks, gpu):
            # Stack into (N, 3) rows of (address, length, device id).
            if not addr_chunks:
                return np.empty((0, 3), dtype=np.int64)
            flat_addrs = np.concatenate(addr_chunks)
            flat_lens = np.concatenate(len_chunks)
            return np.column_stack(
                (
                    flat_addrs,
                    flat_lens,
                    np.full_like(flat_addrs, gpu),
                )
            )

        src_reqs = make_req_array(src_addrs, src_lens, self.kv_args.gpu_id)
        dst_reqs = make_req_array(dst_addrs, dst_lens, dst_gpu_id)

        logger.debug(
            f"len(src_addrs): before group: {len(prefill_data_indices)}, after group: {len(src_addrs)}"
        )
        src_descs = self.agent.get_xfer_descs(src_reqs, "VRAM")
        dst_descs = self.agent.get_xfer_descs(dst_reqs, "VRAM")
        # Transfer data
        xfer_handle = self.agent.initialize_xfer(
            "WRITE",
            src_descs,
            dst_descs,
            peer_name,
            notif.encode("ascii"),  # type: ignore
        )
        if not xfer_handle:
            raise Exception("KVSender failed to create transfer")
        state = self.agent.transfer(xfer_handle)
        if state == "ERR":
            raise Exception("KVSender failed to post transfer")
        return xfer_handle
+
+ def send_kvcache(
+ self,
+ peer_name: str,
+ prefill_kv_indices: npt.NDArray[np.int32],
+ dst_kv_ptrs: list[int],
+ dst_kv_indices: npt.NDArray[np.int32],
+ dst_gpu_id: int,
+ notif: str,
+ ):
+ return self._send_kvcache_generic(
+ peer_name=peer_name,
+ src_data_ptrs=self.kv_args.kv_data_ptrs,
+ dst_data_ptrs=dst_kv_ptrs,
+ item_lens=self.kv_args.kv_item_lens,
+ prefill_data_indices=prefill_kv_indices,
+ dst_data_indices=dst_kv_indices,
+ dst_gpu_id=dst_gpu_id,
+ notif=notif,
+ )
+
    def send_kvcache_slice(
        self,
        peer_name: str,
        prefill_kv_indices: npt.NDArray[np.int32],
        dst_kv_ptrs: list[int],
        dst_kv_indices: npt.NDArray[np.int32],
        dst_gpu_id: int,
        notif: str,
        prefill_tp_size: int,
        decode_tp_size: int,
        decode_tp_rank: int,
        dst_kv_item_len: int,
    ):
        """Send a per-head slice of the KV cache when prefill and decode use
        different TP sizes.

        Each side holds a different number of KV heads per rank, so this
        computes which byte sub-range of every token's KV data this rank must
        write into the destination rank's pages, then posts one NIXL WRITE.
        Returns the transfer handle.

        Raises:
            Exception: if the NIXL transfer cannot be created or posted.
        """
        # Get configuration from kv_args
        local_tp_rank_in_group = self.kv_args.engine_rank % prefill_tp_size
        dst_tp_rank_in_group = decode_tp_rank % decode_tp_size
        num_kv_heads = self.kv_args.kv_head_num

        # Calculate head distribution
        src_heads_per_rank = num_kv_heads
        dst_heads_per_rank = num_kv_heads * prefill_tp_size // decode_tp_size

        src_kv_item_len = self.kv_args.kv_item_lens[0]
        page_size = self.kv_args.page_size

        # Bytes occupied by one head's data for one token on the decode side.
        bytes_per_head_slice_to_send = (
            dst_kv_item_len // page_size // dst_heads_per_rank
        )

        # Determine which heads to send
        if prefill_tp_size > decode_tp_size:
            # Multiple prefill ranks to one decode rank
            src_head_start_offset = 0
            num_heads_to_send = src_heads_per_rank
            dst_head_start_offset = local_tp_rank_in_group * src_heads_per_rank
        else:
            # Send KVCache from 1 prefill instance to multiple decode instances
            src_head_start_offset = (
                dst_tp_rank_in_group * dst_heads_per_rank
            ) % src_heads_per_rank
            num_heads_to_send = dst_heads_per_rank
            dst_head_start_offset = 0

        src_k_ptrs, src_v_ptrs, dst_k_ptrs, dst_v_ptrs, layers_current_pp_stage = (
            self.get_mha_kv_ptrs_with_pp(self.kv_args.kv_data_ptrs, dst_kv_ptrs)
        )
        # Calculate precise byte offset and length for the sub-slice within the token
        src_head_slice_offset = src_head_start_offset * bytes_per_head_slice_to_send
        dst_head_slice_offset = dst_head_start_offset * bytes_per_head_slice_to_send
        heads_bytes_per_token_to_send = num_heads_to_send * bytes_per_head_slice_to_send

        # One (src, dst) pointer pair per layer for K, then per layer for V.
        src_dst_ptr_pairs = [
            (
                src_k_ptrs[layer_id],
                dst_k_ptrs[layer_id],
            )
            for layer_id in range(layers_current_pp_stage)
        ] + [
            (
                src_v_ptrs[layer_id],
                dst_v_ptrs[layer_id],
            )
            for layer_id in range(layers_current_pp_stage)
        ]

        prefill_indices = np.asarray(prefill_kv_indices, dtype=np.int64)
        dst_indices = np.asarray(dst_kv_indices, dtype=np.int64)
        # Page indices address whole pages; token offsets address within a page.
        bytes_per_token_prefill = src_kv_item_len // page_size
        bytes_per_token_decode = dst_kv_item_len // page_size
        token_offsets = np.arange(page_size, dtype=np.int64)

        src_addrs = []
        dst_addrs = []

        for src_ptr, dst_ptr in src_dst_ptr_pairs:
            src_page_bases = src_ptr + prefill_indices * src_kv_item_len
            dst_page_bases = dst_ptr + dst_indices * dst_kv_item_len

            # Broadcast page bases against token offsets: one address per
            # (page, token) pair, shifted to the head sub-slice.
            src_all = (
                src_page_bases[:, None]
                + token_offsets[None, :] * bytes_per_token_prefill
                + src_head_slice_offset
            ).ravel()
            dst_all = (
                dst_page_bases[:, None]
                + token_offsets[None, :] * bytes_per_token_decode
                + dst_head_slice_offset
            ).ravel()

            src_addrs.append(src_all)
            dst_addrs.append(dst_all)

        def make_req_array(addr_chunks, size, gpu):
            # Stack into (N, 3) rows of (address, length, device id).
            if not addr_chunks:
                return np.empty((0, 3), dtype=np.int64)
            flat_addrs = np.concatenate(addr_chunks)
            return np.column_stack(
                (
                    flat_addrs,
                    np.full_like(flat_addrs, size),
                    np.full_like(flat_addrs, gpu),
                )
            )

        src_reqs = make_req_array(
            src_addrs, heads_bytes_per_token_to_send, self.kv_args.gpu_id
        )
        dst_reqs = make_req_array(dst_addrs, heads_bytes_per_token_to_send, dst_gpu_id)

        # Use NIXL agent for transfer
        src_descs = self.agent.get_xfer_descs(src_reqs, "VRAM")
        dst_descs = self.agent.get_xfer_descs(dst_reqs, "VRAM")

        xfer_handle = self.agent.initialize_xfer(
            "WRITE", src_descs, dst_descs, peer_name, notif.encode("ascii")
        )
        if not xfer_handle:
            raise Exception("Failed to create sliced KV transfer")

        state = self.agent.transfer(xfer_handle)
        if state == "ERR":
            raise Exception("Failed to post sliced KV transfer")

        return xfer_handle
+
+    def send_aux(
+        self,
+        peer_name: str,
+        prefill_aux_index: int,
+        dst_aux_ptrs: list[int],
+        dst_aux_index: int,
+        notif: str,
+    ):
+        """Send per-request auxiliary buffers to the decode peer via NIXL.
+
+        One (addr, length, device) descriptor is built per aux pool on both
+        sides, then a single WRITE transfer is posted, tagged with ``notif``
+        so the receiver can match the completion to its room.
+
+        Args:
+            peer_name: NIXL agent name of the decode peer.
+            prefill_aux_index: Slot index into the local aux pools.
+            dst_aux_ptrs: Base pointers of the peer's aux pools, one per pool,
+                in the same order as ``self.kv_args.aux_data_ptrs``.
+            dst_aux_index: Slot index into the peer's aux pools.
+            notif: ASCII notification string delivered on completion.
+
+        Returns:
+            The NIXL transfer handle, for later ``check_xfer_state`` polling.
+
+        Raises:
+            Exception: If the transfer could not be created or posted.
+        """
+        src_addrs = []
+        dst_addrs = []
+
+        prefill_aux_ptrs = self.kv_args.aux_data_ptrs
+        prefill_aux_item_lens = self.kv_args.aux_item_lens
+
+        for i, _ in enumerate(dst_aux_ptrs):
+            # Both sides use the same per-item length for pool i; the slot
+            # index scales the base pointer to the request's slot.
+            length = prefill_aux_item_lens[i]
+            src_addr = prefill_aux_ptrs[i] + length * prefill_aux_index
+            dst_addr = dst_aux_ptrs[i] + length * dst_aux_index
+            # Device id 0: aux buffers are host-resident ("DRAM" below).
+            src_addrs.append((src_addr, length, 0))
+            dst_addrs.append((dst_addr, length, 0))
+
+        src_descs = self.agent.get_xfer_descs(src_addrs, "DRAM")
+        dst_descs = self.agent.get_xfer_descs(dst_addrs, "DRAM")
+        # Transfer data
+        xfer_handle = self.agent.initialize_xfer(
+            "WRITE",
+            src_descs,
+            dst_descs,
+            peer_name,
+            notif.encode("ascii"),  # type: ignore
+        )
+        if not xfer_handle:
+            raise Exception("KVSender failed to create transfer")
+        state = self.agent.transfer(xfer_handle)
+        if state == "ERR":
+            raise Exception("KVSender failed to post transfer")
+        return xfer_handle
+
+    def _send_mamba_state(
+        self,
+        peer_name: str,
+        prefill_state_indices: List[int],
+        dst_state_data_ptrs: list[int],
+        dst_state_indices: List[int],
+        dst_gpu_id: int,
+        notif: str,
+    ):
+        """Transfer Mamba states via RDMA.
+
+        Mamba keeps exactly one state slot per request, so both index lists
+        must have length 1; the single slot index is applied to every state
+        pool on both sides.
+
+        Args:
+            peer_name: NIXL agent name of the decode peer.
+            prefill_state_indices: Single-element list with the local slot index.
+            dst_state_data_ptrs: Base pointers of the peer's state pools.
+            dst_state_indices: Single-element list with the peer's slot index.
+            dst_gpu_id: GPU device id on the decode side.
+            notif: ASCII notification string delivered on completion.
+
+        Returns:
+            The NIXL transfer handle.
+
+        Raises:
+            Exception: If the transfer could not be created or posted.
+        """
+        assert len(prefill_state_indices) == 1, "Mamba should have single state index"
+        assert len(dst_state_indices) == len(
+            prefill_state_indices
+        ), "State indices count mismatch between Prefill and Decode"
+
+        src_addrs = []
+        dst_addrs = []
+
+        prefill_state_data_ptrs = self.kv_args.state_data_ptrs
+        prefill_state_item_lens = self.kv_args.state_item_lens
+
+        for i, dst_state_ptr in enumerate(dst_state_data_ptrs):
+            # Same per-pool item length on both sides; index 0 is the only slot.
+            length = prefill_state_item_lens[i]
+            src_addr = prefill_state_data_ptrs[i] + length * int(
+                prefill_state_indices[0]
+            )
+            dst_addr = dst_state_ptr + length * int(dst_state_indices[0])
+            src_addrs.append((src_addr, length, self.kv_args.gpu_id))
+            dst_addrs.append((dst_addr, length, dst_gpu_id))
+
+        src_descs = self.agent.get_xfer_descs(src_addrs, "VRAM")
+        dst_descs = self.agent.get_xfer_descs(dst_addrs, "VRAM")
+
+        xfer_handle = self.agent.initialize_xfer(
+            "WRITE",
+            src_descs,
+            dst_descs,
+            peer_name,
+            notif.encode("ascii"),
+        )
+        if not xfer_handle:
+            raise Exception("Failed to create Mamba state transfer")
+        state = self.agent.transfer(xfer_handle)
+        if state == "ERR":
+            raise Exception("Failed to post Mamba state transfer")
+        return xfer_handle
+
+    def maybe_send_extra(
+        self,
+        peer_name: str,
+        prefill_state_indices: List[int],
+        dst_state_data_ptrs: list[int],
+        dst_state_indices: List[int],
+        dst_gpu_id: int,
+        notif: str,
+        decode_tp_size: int,
+    ):
+        """Send state or extra pool data with type-specific handling.
+
+        Dispatches on ``self.kv_args.state_type``:
+        - "mamba": single-slot state transfer (requires equal P/D TP sizes);
+        - "swa"/"nsa": generic paged transfer of the state pools
+          (requires equal TP sizes unless the backend is MLA);
+        - "none": nothing to send; any other value is rejected.
+
+        Returns:
+            A NIXL transfer handle, or None when there is no state to send.
+
+        Raises:
+            RuntimeError: On unsupported TP-size combinations, mismatched
+                index lists, or an unknown state type.
+        """
+        state_type = getattr(self.kv_args, "state_type", "none")
+
+        if state_type == "mamba":
+            if self.attn_tp_size != decode_tp_size:
+                raise RuntimeError(
+                    "PD Disaggregation does NOT support PD different TP sizes for hybrid mamba models yet."
+                )
+            return self._send_mamba_state(
+                peer_name,
+                prefill_state_indices,
+                dst_state_data_ptrs,
+                dst_state_indices,
+                dst_gpu_id,
+                notif,
+            )
+        elif state_type in ["swa", "nsa"]:
+            if not self.is_mla_backend and self.attn_tp_size != decode_tp_size:
+                raise RuntimeError(
+                    f"PD Disaggregation does NOT support PD different TP sizes for non-MLA {state_type.upper()} hybrid models yet."
+                )
+            if len(prefill_state_indices) != len(dst_state_indices):
+                raise RuntimeError(
+                    f"State index length mismatch: prefill={len(prefill_state_indices)}, "
+                    f"dst={len(dst_state_indices)}"
+                )
+            # NOTE(review): _send_kvcache_generic is defined elsewhere in this
+            # class; presumably it mirrors send_kvcache over the state pools.
+            return self._send_kvcache_generic(
+                peer_name=peer_name,
+                src_data_ptrs=self.kv_args.state_data_ptrs,
+                dst_data_ptrs=dst_state_data_ptrs,
+                item_lens=self.kv_args.state_item_lens,
+                prefill_data_indices=np.array(prefill_state_indices, dtype=np.int32),
+                dst_data_indices=np.array(dst_state_indices, dtype=np.int32),
+                dst_gpu_id=dst_gpu_id,
+                notif=notif,
+            )
+        else:
+            if state_type != "none":
+                raise RuntimeError(
+                    f"PD Disaggregation via NIXL does NOT support {state_type} hybrid models yet."
+                )
+            return None
+
+    def add_transfer_request(
+        self,
+        bootstrap_room: int,
+        kv_indices: npt.NDArray[np.int32],
+        index_slice: slice,
+        is_last: bool,
+        chunk_id: int,
+        aux_index: Optional[int] = None,
+        state_indices: Optional[List[int]] = None,
+    ):
+        """Post RDMA writes of one KV-cache chunk to every registered decode peer.
+
+        Called on the prefill side, once per chunk. For each non-dummy peer in
+        ``self.transfer_infos[bootstrap_room]`` it sends the chunk's KV pages —
+        full pages when TP sizes match (or MLA), head slices otherwise — and,
+        on the last chunk only, the optional state pools and the aux data.
+        The room's transfer info is dropped after the last chunk is queued.
+
+        Args:
+            bootstrap_room: Room id identifying the request.
+            kv_indices: Page indices of this chunk in the local KV pool.
+            index_slice: Slice selecting the matching destination indices.
+            is_last: Whether this is the final chunk (requires ``aux_index``).
+            chunk_id: Monotonic chunk counter, embedded in the notification.
+            aux_index: Local aux slot; required when ``is_last`` is True.
+            state_indices: Optional local state slots to send with the last chunk.
+
+        Returns:
+            List of NIXL transfer handles for all posted transfers.
+        """
+        assert self.disaggregation_mode == DisaggregationMode.PREFILL
+        assert not is_last or (is_last and aux_index is not None)
+
+        reqs_to_be_processed = self.transfer_infos[bootstrap_room].values()
+        handles = []
+        for req in reqs_to_be_processed:
+            assert bootstrap_room == req.room
+            # Dummy peers registered for rendezvous only; nothing to send.
+            if req.is_dummy():
+                continue
+
+            chunked_dst_kv_indice = req.dst_kv_indices[index_slice]
+            assert len(chunked_dst_kv_indice) == len(kv_indices)
+            assert req.agent_name in self.decode_kv_args_table
+
+            # Notification format parsed by update_transfer_status():
+            # "{room}_kv_{chunk_id}_{is_last}_{pp_rank}".
+            notif = f"{req.room}_kv_{chunk_id}_{int(is_last)}_{self.kv_args.pp_rank}"
+            decode_tp_size = self.decode_kv_args_table[req.agent_name].decode_tp_size
+
+            if self.is_mla_backend or (decode_tp_size == self.attn_tp_size):
+                # Same TP layout on both sides (or MLA): copy whole pages.
+                kv_xfer_handle = self.send_kvcache(
+                    req.agent_name,
+                    kv_indices,
+                    self.decode_kv_args_table[req.agent_name].dst_kv_ptrs,
+                    chunked_dst_kv_indice,
+                    self.decode_kv_args_table[req.agent_name].gpu_id,
+                    notif,
+                )
+            else:
+                # Different TP sizes: copy only the head slice this pair of
+                # ranks shares.
+                kv_xfer_handle = self.send_kvcache_slice(
+                    req.agent_name,
+                    kv_indices,
+                    self.decode_kv_args_table[req.agent_name].dst_kv_ptrs,
+                    chunked_dst_kv_indice,
+                    self.decode_kv_args_table[req.agent_name].gpu_id,
+                    notif,
+                    prefill_tp_size=self.attn_tp_size,
+                    decode_tp_size=decode_tp_size,
+                    decode_tp_rank=self.decode_kv_args_table[
+                        req.agent_name
+                    ].decode_tp_rank,
+                    dst_kv_item_len=self.decode_kv_args_table[
+                        req.agent_name
+                    ].dst_kv_item_len,
+                )
+
+            handles.append(kv_xfer_handle)
+            # Only the last chunk we need to send the aux data.
+            if is_last:
+                if state_indices is not None:
+                    dst_info = self.decode_kv_args_table[req.agent_name]
+                    state_xfer_handle = self.maybe_send_extra(
+                        req.agent_name,
+                        state_indices,
+                        dst_info.dst_state_data_ptrs,
+                        req.dst_state_indices,
+                        dst_info.gpu_id,
+                        f"{req.room}_state_{self.kv_args.pp_rank}",
+                        decode_tp_size,
+                    )
+                    # None means this model has no extra state to transfer.
+                    if state_xfer_handle is not None:
+                        handles.append(state_xfer_handle)
+
+                assert aux_index is not None
+                aux_xfer_handle = self.send_aux(
+                    req.agent_name,
+                    aux_index,
+                    self.decode_kv_args_table[req.agent_name].dst_aux_ptrs,
+                    req.dst_aux_index,
+                    f"{req.room}_aux",
+                )
+                handles.append(aux_xfer_handle)
+        # All chunks queued; forget the room's transfer bookkeeping.
+        if is_last:
+            del self.transfer_infos[bootstrap_room]
+        return handles
+
+    def update_transfer_status(self):
+        """Drain NIXL notifications and fold them into per-room transfer status.
+
+        Recognized message formats (produced by the prefill side):
+        - "{room}_kv_{chunk_id}_{is_last}_{pp_rank}" — one KV chunk arrived;
+        - "{room}_aux" — aux data arrived;
+        - "{room}_state_{pp_rank}" — state pools arrived.
+        """
+        # Process notifications from received transfers.
+        notif_map = self.agent.get_new_notifs()
+        for peer_name, messages in notif_map.items():
+            # We could also check that self.bootstrap_info['agent_name'] matches
+            # the message sender. But the bootstrap room alone should be
+            # sufficient to map the status.
+            for msg in messages:
+                # maxsplit=4 keeps at most 5 components, matching the longest
+                # ("kv") format above.
+                components = msg.decode("ascii").split("_", 4)
+                room = int(components[0])
+                if components[1] == "kv":
+                    chunk_id = int(components[2])
+                    is_last = bool(int(components[3]))
+                    # Older senders may omit the pp_rank suffix.
+                    pp_rank = int(components[4]) if len(components) > 4 else 0
+                    # Track received chunks per pp_rank
+                    self.transfer_statuses[room].received_kvs_per_pp[pp_rank].add(
+                        chunk_id
+                    )
+                    if is_last:
+                        # Record expected chunk count for this pp_rank
+                        self.transfer_statuses[room].expected_kvs_per_pp[pp_rank] = (
+                            chunk_id + 1
+                        )
+                        # Set num_pp_ranks_expected from table (or default to 1)
+                        if self.transfer_statuses[room].num_pp_ranks_expected is None:
+                            self.transfer_statuses[room].num_pp_ranks_expected = (
+                                self.required_prefill_response_num_table.get(room, 1)
+                            )
+                elif components[1] == "aux":
+                    self.transfer_statuses[room].received_aux = True
+                elif components[1] == "state":
+                    pp_rank = int(components[2]) if len(components) > 2 else 0
+                    self.transfer_statuses[room].received_state_per_pp.add(pp_rank)
+
+    def check_transfer_done(self, room: int):
+        """Return True iff status exists for ``room`` and reports completion."""
+        if room not in self.transfer_statuses:
+            return False
+        return self.transfer_statuses[room].is_done()
+
+ def _start_bootstrap_thread(self):
+ def bootstrap_thread():
+ """This thread recvs transfer info from the decode engine"""
+ while True:
+ waiting_req_bytes = self.server_socket.recv_multipart()
+ logger.debug(
+ f"Received multipart with total byte size {sum(len(x) for x in waiting_req_bytes)}"
+ )
+ assert (
+ waiting_req_bytes[0] == GUARD
+ ), f"First message should be {GUARD}. Foreign traffic?"
+ waiting_req_bytes = waiting_req_bytes[1:]
+ room = waiting_req_bytes[0].decode("ascii")
+ agent_name = waiting_req_bytes[3].decode("ascii")
+ if room == "None":
+ # Register new peer and save KV base pointers.
+ self._add_remote_peer(
+ KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
+ )
+ logger.debug(f"Register KVArgs from {agent_name} successfully")
+ continue
+ room = int(room)
+ if room not in self.transfer_infos:
+ self.transfer_infos[room] = {}
+ self.transfer_infos[room][agent_name] = TransferInfo.from_zmq(
+ waiting_req_bytes
+ )
+ required_dst_info_num = self.transfer_infos[room][
+ agent_name
+ ].required_dst_info_num
+ logger.debug(f"got info {room=} {agent_name=} {required_dst_info_num=}")
+ if len(self.transfer_infos[room]) == required_dst_info_num:
+ logger.debug(f"{room=} is bootstrapped")
+ self.update_status(room, KVPoll.WaitingForInput)
+
+ threading.Thread(target=bootstrap_thread).start()
+
+
+class NixlKVSender(CommonKVSender):
+    """Prefill-side sender that streams KV chunks to decode peers via NIXL."""
+
+    def __init__(
+        self,
+        mgr: NixlKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
+    ):
+        super().__init__(mgr, bootstrap_addr, bootstrap_room, dest_tp_ranks, pp_rank)
+        # Handles of all posted transfers; polled together in poll().
+        self.xfer_handles = []
+        # True once the final chunk has been queued.
+        self.has_sent = False
+        # Monotonic chunk counter embedded in notification strings.
+        self.chunk_id = 0
+
+    def send(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        state_indices: Optional[List[int]] = None,
+    ):
+        """Queue one chunk of KV pages (and, with the last chunk, aux/state).
+
+        Args:
+            kv_indices: Page indices of this chunk in the local KV pool.
+            state_indices: Optional state slots, forwarded on the last chunk.
+        """
+        # Destination indices for this chunk occupy [curr_idx, curr_idx + n).
+        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
+        self.curr_idx += len(kv_indices)
+        is_last = self.curr_idx == self.num_kv_indices
+
+        # Special handling for cp
+        if self.kv_mgr.enable_all_cp_ranks_for_transfer:
+            kv_indices, index_slice = filter_kv_indices_for_cp_rank(
+                self.kv_mgr,
+                kv_indices,
+                index_slice,
+            )
+        elif self.kv_mgr.is_dummy_cp_rank:
+            # Dummy context-parallel ranks transfer nothing; the last chunk
+            # just flips the room to Success.
+            if not is_last:
+                return
+            else:
+                self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Success)
+                return
+
+        new_xfer_handles = self.kv_mgr.add_transfer_request(
+            self.bootstrap_room,
+            kv_indices,
+            index_slice,
+            is_last,
+            self.chunk_id,
+            self.aux_index,
+            state_indices,
+        )
+        self.xfer_handles.extend(new_xfer_handles)
+        self.chunk_id += 1
+        if is_last:
+            self.has_sent = True
+            del self.kv_mgr.request_status[self.bootstrap_room]
+
+    def poll(self) -> KVPoll:
+        """Report progress: bootstrap status before the last chunk, then the
+        aggregate state of all posted NIXL transfers.
+
+        Raises:
+            Exception: If any transfer reports "ERR".
+        """
+        if not self.has_sent:
+            return self.kv_mgr.check_status(self.bootstrap_room)
+        states = [self.kv_mgr.agent.check_xfer_state(x) for x in self.xfer_handles]
+        if all([x == "DONE" for x in states]):
+            return KVPoll.Success  # type: ignore
+        if any([x == "ERR" for x in states]):
+            raise Exception("KVSender transfer encountered an error.")
+        return KVPoll.WaitingForInput  # type: ignore
+
+    def failure_exception(self):
+        raise RuntimeError("NIXL KVSender Exception")
+
+
+class NixlKVReceiver(CommonKVReceiver):
+    """Decode-side receiver: registers with prefill, then awaits KV/aux/state."""
+
+    def __init__(
+        self,
+        mgr: NixlKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: Optional[int] = None,
+        prefill_dp_rank: Optional[int] = None,
+    ):
+        # Set before super().__init__ in case the base class triggers polling.
+        self.started_transfer = False
+        # Terminal KVPoll state once reached; poll() then short-circuits.
+        self.conclude_state = None
+        super().__init__(mgr, bootstrap_addr, bootstrap_room, prefill_dp_rank)
+
+        # Track this room with its bootstrap address for heartbeat monitoring
+        if hasattr(self.kv_mgr, "addr_to_rooms_tracker"):
+            self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].add(
+                self.bootstrap_room
+            )
+        # Timestamp of init(); used for the waiting timeout in poll().
+        self.init_time = None
+
+    def init(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+        aux_index: Optional[int] = None,
+        state_indices: Optional[List[int]] = None,
+    ):
+        """Announce this request's destination slots to every prefill peer.
+
+        Sends one multipart ZMQ message per bootstrap peer; the frame order
+        must match what the prefill bootstrap thread parses (room at frame 1,
+        agent name at frame 4 after GUARD).
+        """
+        if self.bootstrap_infos is None:
+            logger.error(
+                f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
+            )
+            self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+            return
+
+        for bootstrap_info in self.bootstrap_infos:
+            logger.debug(
+                f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
+            )
+            sock, lock = self._connect_to_bootstrap_server(bootstrap_info)
+            is_dummy = bootstrap_info["is_dummy"]
+            logger.debug(
+                f"Sending to prefill server with bootstrap room {self.bootstrap_room} {is_dummy=}"
+            )
+            with lock:
+                sock.send_multipart(
+                    [
+                        GUARD,
+                        str(self.bootstrap_room).encode("ascii"),
+                        self.kv_mgr.local_ip.encode("ascii"),
+                        str(self.kv_mgr.rank_port).encode("ascii"),
+                        self.kv_mgr.agent.name.encode("ascii"),
+                        # Dummy peers send empty indices: rendezvous only.
+                        kv_indices.tobytes() if not is_dummy else b"",
+                        str(aux_index).encode("ascii"),
+                        str(self.required_dst_info_num).encode("ascii"),
+                        (
+                            np.array(state_indices, dtype=np.int32).tobytes()
+                            if not is_dummy and state_indices is not None
+                            else b""
+                        ),
+                    ]
+                )
+
+        # Mark that we expect state data if state_indices was provided
+        if state_indices is not None:
+            self.kv_mgr.transfer_statuses[self.bootstrap_room].expects_state = True
+
+        self.started_transfer = True
+        self.init_time = time.time()
+
+    def poll(self) -> KVPoll:
+        """Advance and report this request's receive status.
+
+        Checks cached terminal state, manager-side status, the waiting
+        timeout, and finally the per-room transfer completion tracked by
+        update_transfer_status().
+        """
+        if self.conclude_state is not None:
+            return self.conclude_state
+        status = self.kv_mgr.check_status(self.bootstrap_room)
+        if status in (KVPoll.Success, KVPoll.Failed):
+            self.conclude_state = status
+            return status
+        if not self.started_transfer:
+            return KVPoll.WaitingForInput  # type: ignore
+
+        now = time.time()
+        elapsed = now - self.init_time
+
+        if elapsed >= self.kv_mgr.waiting_timeout:
+            logger.error(f"Request {self.bootstrap_room} waiting_timeout")
+            self.kv_mgr.record_failure(
+                self.bootstrap_room,
+                f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.WaitingForInput",
+            )
+            self.conclude_state = KVPoll.Failed
+            return KVPoll.Failed
+
+        self.kv_mgr.update_transfer_status()
+        if self.kv_mgr.check_transfer_done(self.bootstrap_room):  # type: ignore
+            # Done (success or failure): stop heartbeat tracking for this room.
+            self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].discard(
+                self.bootstrap_room
+            )
+            # Check if the transfer failed
+            if self.kv_mgr.transfer_statuses[self.bootstrap_room].is_failed():
+                self.conclude_state = KVPoll.Failed
+                logger.error(
+                    f"Transfer for room {self.bootstrap_room} failed due to node failure"
+                )
+            else:
+                self.conclude_state = KVPoll.Success
+            del self.kv_mgr.transfer_statuses[self.bootstrap_room]
+            return self.conclude_state  # type: ignore
+        return KVPoll.WaitingForInput  # type: ignore
+
+    def _register_kv_args(self):
+        """Register this rank's agent metadata and pool base pointers with
+        every prefill peer (room "None" marks a registration message).
+
+        NOTE(review): the frame order after the agent metadata must match
+        KVArgsRegisterInfo.from_zmq on the prefill side — confirm when that
+        class changes.
+        """
+        for bootstrap_info in self.bootstrap_infos:
+            sock, lock = self._connect_to_bootstrap_server(bootstrap_info)
+            # Pointers are packed as unsigned 64-bit little-endian-native ints.
+            packed_kv_data_ptrs = b"".join(
+                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.kv_data_ptrs
+            )
+            packed_aux_data_ptrs = b"".join(
+                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.aux_data_ptrs
+            )
+            packed_state_data_ptrs = b"".join(
+                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.state_data_ptrs
+            )
+
+            with lock:
+                sock.send_multipart(
+                    [
+                        GUARD,
+                        "None".encode("ascii"),
+                        self.kv_mgr.local_ip.encode("ascii"),
+                        str(self.kv_mgr.rank_port).encode("ascii"),
+                        self.kv_mgr.agent.name.encode("ascii"),
+                        self.kv_mgr.agent.get_agent_metadata(),
+                        packed_kv_data_ptrs,
+                        packed_aux_data_ptrs,
+                        packed_state_data_ptrs,
+                        str(self.kv_mgr.kv_args.gpu_id).encode("ascii"),
+                        str(self.kv_mgr.attn_tp_size).encode("ascii"),
+                        str(self.kv_mgr.kv_args.engine_rank).encode("ascii"),
+                        str(self.kv_mgr.kv_args.kv_item_lens[0]).encode("ascii"),
+                    ]
+                )
+
+    def failure_exception(self):
+        raise RuntimeError("NIXL KVReceiver Exception")
+
+
+class NixlKVBootstrapServer(CommonKVBootstrapServer):
+    # NIXL needs no transport-specific behavior beyond the common bootstrap
+    # server implementation.
+    pass
diff --git a/sglang/python/sglang/srt/dllm/__pycache__/config.cpython-311.pyc b/sglang/python/sglang/srt/dllm/__pycache__/config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9c932c7680c3a2db18dc49b72b83fd8727e284f
Binary files /dev/null and b/sglang/python/sglang/srt/dllm/__pycache__/config.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/dllm/algorithm/__init__.py b/sglang/python/sglang/srt/dllm/algorithm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d4ae9a4f7599b24369beb9c0f0a62d741657cf1
--- /dev/null
+++ b/sglang/python/sglang/srt/dllm/algorithm/__init__.py
@@ -0,0 +1,39 @@
+import importlib
+import logging
+import pkgutil
+
+from sglang.srt.dllm.config import DllmConfig
+
+logger = logging.getLogger(__name__)
+
+
+def import_algorithms():
+    """Discover diffusion-LLM algorithm classes in this package.
+
+    Imports every non-package module under ``sglang.srt.dllm.algorithm`` and
+    collects each module's ``Algorithm`` attribute, keyed by the class name.
+    Modules that fail to import are logged and skipped, so one broken
+    algorithm does not disable the rest.
+
+    Returns:
+        Dict mapping algorithm class name to the class itself.
+    """
+    mapping = {}
+    package_name = "sglang.srt.dllm.algorithm"
+    package = importlib.import_module(package_name)
+    for _, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
+        if ispkg:
+            continue
+        try:
+            module = importlib.import_module(name)
+        except Exception as e:
+            logger.warning(f"Ignore import error when loading {name}: {e}")
+            continue
+        # Only modules that export an ``Algorithm`` symbol are registered.
+        if not hasattr(module, "Algorithm"):
+            continue
+
+        algo = module.Algorithm
+        mapping[algo.__name__] = algo
+
+    return mapping
+
+
+def get_algorithm(config: DllmConfig):
+ try:
+ name = config.algorithm
+ return algo_name_to_cls[name](config)
+ except:
+ raise RuntimeError(f"Unknown diffusion LLM algorithm: {name}")
+
+
+algo_name_to_cls = import_algorithms()
diff --git a/sglang/python/sglang/srt/dllm/algorithm/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/dllm/algorithm/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc7f12ef4ae4f18ab72db7273a59f0a52a461ec1
Binary files /dev/null and b/sglang/python/sglang/srt/dllm/algorithm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/dllm/algorithm/__pycache__/base.cpython-311.pyc b/sglang/python/sglang/srt/dllm/algorithm/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f622cc1ff2d0de0b2f50dbf162a8c1a2f6ebaf11
Binary files /dev/null and b/sglang/python/sglang/srt/dllm/algorithm/__pycache__/base.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/dllm/algorithm/__pycache__/joint_threshold.cpython-311.pyc b/sglang/python/sglang/srt/dllm/algorithm/__pycache__/joint_threshold.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f30e951e38f78344d492838353774ba02723459c
Binary files /dev/null and b/sglang/python/sglang/srt/dllm/algorithm/__pycache__/joint_threshold.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/dllm/algorithm/__pycache__/low_confidence.cpython-311.pyc b/sglang/python/sglang/srt/dllm/algorithm/__pycache__/low_confidence.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7c5751711e2bad5b4c9ce3d300db6f8c15f95c6
Binary files /dev/null and b/sglang/python/sglang/srt/dllm/algorithm/__pycache__/low_confidence.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/dllm/algorithm/base.py b/sglang/python/sglang/srt/dllm/algorithm/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..349ddf4cd9e9c14ce3f32332f23c0c819cabb853
--- /dev/null
+++ b/sglang/python/sglang/srt/dllm/algorithm/base.py
@@ -0,0 +1,18 @@
+from sglang.srt.dllm.algorithm import get_algorithm
+from sglang.srt.dllm.config import DllmConfig
+from sglang.srt.server_args import ServerArgs
+
+
+class DllmAlgorithm:
+    """Base class for diffusion-LLM block-decoding algorithms.
+
+    Subclasses live in this package and expose themselves via a module-level
+    ``Algorithm`` attribute, which ``get_algorithm`` resolves by class name.
+    """
+
+    def __init__(
+        self,
+        config: DllmConfig,
+    ):
+        # Number of token positions denoised together per diffusion block.
+        self.block_size = config.block_size
+        # Token id marking a not-yet-decoded (masked) position.
+        self.mask_id = config.mask_id
+
+    @staticmethod
+    def from_server_args(server_args: ServerArgs):
+        """Build the configured algorithm instance from server arguments."""
+        config = DllmConfig.from_server_args(server_args)
+        return get_algorithm(config)
diff --git a/sglang/python/sglang/srt/dllm/algorithm/joint_threshold.py b/sglang/python/sglang/srt/dllm/algorithm/joint_threshold.py
new file mode 100644
index 0000000000000000000000000000000000000000..a572fda384722a77aab073ea1ac5e1d4d433c04b
--- /dev/null
+++ b/sglang/python/sglang/srt/dllm/algorithm/joint_threshold.py
@@ -0,0 +1,139 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from sglang.srt.dllm.algorithm.base import DllmAlgorithm
+from sglang.srt.dllm.config import DllmConfig
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class JointThreshold(DllmAlgorithm):
+    """Joint-threshold diffusion decoding: unmask confident tokens (M2T) and
+    re-edit already-decoded tokens whose argmax changed (T2T), iterating until
+    no change or the step budget is exhausted."""
+
+    def __init__(
+        self,
+        config: DllmConfig,
+    ):
+        super().__init__(config)
+        # Confidence required to commit a masked position (M2T).
+        self.threshold = config.algorithm_config.get("threshold", 0.5)
+        # Confidence required to overwrite an already-decoded token (T2T).
+        self.edit_threshold = config.algorithm_config.get("edit_threshold", 0)
+        # Extra iterations allowed after all masks are resolved.
+        self.max_post_edit_steps = config.algorithm_config.get(
+            "max_post_edit_steps", 16
+        )
+        # Repetition penalty subtracted from the previous token's logit.
+        self.penalty_lambda = config.algorithm_config.get("penalty_lambda", 0)
+
+    def run(
+        self,
+        model_runner: ModelRunner,
+        forward_batch: ForwardBatch,
+    ) -> tuple[LogitsProcessorOutput | torch.Tensor, list[torch.Tensor], bool]:
+        """Iteratively denoise one block per request in ``forward_batch``.
+
+        ``forward_batch.input_ids`` is mutated in place; layout is
+        ``batch_size`` contiguous blocks of ``self.block_size`` tokens.
+
+        Returns:
+            (logits_output, per-request new-token tensors, can_run_cuda_graph).
+        """
+        batch_size = forward_batch.batch_size
+        device = forward_batch.input_ids.device
+
+        mask_index = forward_batch.input_ids == self.mask_id
+        # Fast path: nothing masked — one forward to persist KV cache.
+        if not mask_index.any():
+            out = model_runner.forward(forward_batch, pp_proxy_tensors=None)
+            return out.logits_output, [], out.can_run_graph
+
+        # Per request: where its block starts emitting new tokens, and which
+        # positions were given in the prompt (never editable).
+        start_list = []
+        prompt_masks = []
+        for i in range(batch_size):
+            block_start = i * self.block_size
+            block_end = block_start + self.block_size
+            block_input_ids = forward_batch.input_ids[block_start:block_end]
+
+            prompt_mask = block_input_ids != self.mask_id
+            prompt_masks.append(prompt_mask)
+            start_list.append(prompt_mask.sum().item())
+
+        post_edit_steps = torch.zeros(batch_size, dtype=torch.int32, device=device)
+
+        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
+        # Controls whether to perform an additional forward pass for KV cache persistence.
+        # For certain decoding rounds where the terminal step yields no state change,
+        # this can be set to False to bypass the overhead of an idle forward pass.
+        any_changed_in_last_step = False
+
+        max_iterations = self.block_size + self.max_post_edit_steps
+        for _ in range(max_iterations):
+            if finished.all():
+                break
+
+            out = model_runner.forward(forward_batch, pp_proxy_tensors=None)
+            logits_output, can_run_cuda_graph = out.logits_output, out.can_run_graph
+
+            any_changed_in_last_step = False
+
+            for i in range(batch_size):
+                if finished[i]:
+                    continue
+
+                block_start = i * self.block_size
+                block_end = block_start + self.block_size
+
+                curr_input_ids = forward_batch.input_ids[block_start:block_end]
+                curr_logits = logits_output.full_logits[block_start:block_end]
+                curr_prompt_mask = prompt_masks[i]
+
+                if self.penalty_lambda > 0:
+                    # Penalize repeating the immediately preceding token:
+                    # subtract lambda from each position's previous-token logit.
+                    prev_ids = curr_input_ids[:-1]
+                    curr_logits[1:, :].scatter_(
+                        1, prev_ids.unsqueeze(-1), -self.penalty_lambda, reduce="add"
+                    )
+
+                # x: greedy candidate per position; p: its softmax probability.
+                x = torch.argmax(curr_logits, dim=-1)
+                p = torch.squeeze(
+                    torch.gather(
+                        F.softmax(curr_logits, dim=-1),
+                        dim=-1,
+                        index=torch.unsqueeze(x, -1),
+                    ),
+                    -1,
+                )
+
+                mask_index = curr_input_ids == self.mask_id
+                has_mask = mask_index.any()
+
+                # Mask to token (M2T)
+                mask_transfer_index = torch.zeros_like(mask_index)
+                if has_mask:
+                    confidence = torch.where(mask_index, p, -np.inf)
+                    mask_transfer_index = confidence > self.threshold
+
+                    # Guarantee progress: commit at least the single most
+                    # confident masked position.
+                    if not mask_transfer_index.any():
+                        _, select_index = torch.topk(confidence, k=1)
+                        mask_transfer_index[select_index] = True
+                else:
+                    # All masks resolved: remaining iterations only post-edit,
+                    # bounded by max_post_edit_steps.
+                    post_edit_steps[i] += 1
+                    if post_edit_steps[i] > self.max_post_edit_steps:
+                        finished[i] = True
+                        continue
+
+                # Token to token (T2T)
+                edit_mask = ~mask_index & ~curr_prompt_mask
+                edit_transfer_index = (
+                    (p > self.edit_threshold) & (curr_input_ids != x) & edit_mask
+                )
+
+                transfer_index = mask_transfer_index | edit_transfer_index
+                if not transfer_index.any():
+                    finished[i] = True
+                    continue
+
+                curr_input_ids[transfer_index] = x[transfer_index]
+                any_changed_in_last_step = True
+
+        # If the final iteration mutated input_ids, run one more forward so the
+        # KV cache reflects the committed tokens.
+        if any_changed_in_last_step:
+            out = model_runner.forward(forward_batch, pp_proxy_tensors=None)
+            logits_output, can_run_cuda_graph = out.logits_output, out.can_run_graph
+
+        next_token_ids = torch.reshape(forward_batch.input_ids, (batch_size, -1))
+        next_token_ids_list = [
+            next_token_ids[i, start_list[i] :] for i in range(batch_size)
+        ]
+
+        return logits_output, next_token_ids_list, can_run_cuda_graph
+
+
+# Registered by import_algorithms() under this module's Algorithm symbol.
+Algorithm = JointThreshold
diff --git a/sglang/python/sglang/srt/dllm/algorithm/low_confidence.py b/sglang/python/sglang/srt/dllm/algorithm/low_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..6acdcc7f8b683d2785c84ea6c371266e06576e98
--- /dev/null
+++ b/sglang/python/sglang/srt/dllm/algorithm/low_confidence.py
@@ -0,0 +1,104 @@
+from typing import List, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from sglang.srt.dllm.algorithm.base import DllmAlgorithm
+from sglang.srt.dllm.config import DllmConfig
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class LowConfidence(DllmAlgorithm):
+    """Confidence-threshold diffusion decoding: per step, commit every masked
+    position whose greedy probability exceeds the threshold (at least one per
+    block), until no masks remain."""
+
+    def __init__(
+        self,
+        config: DllmConfig,
+    ):
+        super().__init__(config)
+        # Minimum greedy-token probability required to unmask a position.
+        self.threshold = config.algorithm_config.get("threshold", 0.95)
+
+    def run(
+        self,
+        model_runner: ModelRunner,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[Union[LogitsProcessorOutput, torch.Tensor], List[torch.Tensor], bool]:
+        """Iteratively denoise one block per request in ``forward_batch``.
+
+        ``forward_batch.input_ids`` is mutated in place; layout is
+        ``batch_size`` contiguous blocks of ``self.block_size`` tokens.
+
+        Returns:
+            (logits_output, per-request new-token tensors, can_run_cuda_graph).
+        """
+        batch_size = forward_batch.batch_size
+        # Here, the forward_batch full logits contains all the blocks
+        # such as [dllm_block_size * batch_size, hidden_size]
+        start_list = []
+        mask_index = forward_batch.input_ids == self.mask_id
+
+        # Fast path: if there is no mask token, forward and save kv cache
+        if torch.sum(mask_index).item() == 0:
+            out = model_runner.forward(forward_batch, pp_proxy_tensors=None)
+            logits_output, can_run_cuda_graph = out.logits_output, out.can_run_graph
+
+            next_token_ids = []
+            return logits_output, next_token_ids, can_run_cuda_graph
+
+        # Calculate start positions for each block
+        for block_id in range(batch_size):
+            block_start = block_id * self.block_size
+            block_end = block_start + self.block_size
+            block_input_ids = forward_batch.input_ids[block_start:block_end]
+            block_mask_index = block_input_ids == self.mask_id
+            # New tokens start where the non-masked (prompt) prefix ends.
+            start = self.block_size - torch.sum(block_mask_index).item()
+            start_list.append(start)
+
+        # At least one position is committed per block per step, so block_size
+        # iterations always suffice.
+        for _ in range(self.block_size):
+            mask_index = forward_batch.input_ids == self.mask_id
+            if torch.sum(mask_index).item() == 0:
+                break
+
+            out = model_runner.forward(forward_batch, pp_proxy_tensors=None)
+            logits_output, can_run_cuda_graph = out.logits_output, out.can_run_graph
+            assert batch_size == forward_batch.input_ids.shape[0] // self.block_size
+            for batch_id in range(batch_size):
+                curr_block_start = batch_id * self.block_size
+                curr_block_end = curr_block_start + self.block_size
+                block_input_ids = forward_batch.input_ids[
+                    curr_block_start:curr_block_end,
+                ]
+                block_mask_index = block_input_ids == self.mask_id
+                # This block is already fully decoded.
+                if torch.sum(block_mask_index).item() == 0:
+                    continue
+                curr_logits = logits_output.full_logits[
+                    curr_block_start:curr_block_end,
+                ]
+
+                # x: greedy candidate per position; p: its softmax probability.
+                x = torch.argmax(curr_logits, dim=-1)
+                p = torch.squeeze(
+                    torch.gather(
+                        F.softmax(curr_logits, dim=-1),
+                        dim=-1,
+                        index=torch.unsqueeze(x, -1),
+                    ),
+                    -1,
+                )
+                # Keep already-decoded tokens; only masked positions may change.
+                x = torch.where(block_mask_index, x, block_input_ids)
+                confidence = torch.where(block_mask_index, p, -np.inf)
+
+                transfer_index = confidence > self.threshold
+
+                # Guarantee progress: commit at least the single most
+                # confident masked position.
+                if transfer_index.sum().item() == 0:
+                    _, select_index = torch.topk(confidence, k=1)
+                    transfer_index[select_index] = True
+
+                block_input_ids[transfer_index] = x[transfer_index]
+
+        # One more forward so the KV cache reflects the final committed tokens.
+        out = model_runner.forward(forward_batch, pp_proxy_tensors=None)
+        logits_output, can_run_cuda_graph = out.logits_output, out.can_run_graph
+        # Here next token ids is tricky to implement the dynamic lengths,
+        # so we return a list of tensors
+        next_token_ids = torch.reshape(forward_batch.input_ids, (batch_size, -1))
+        next_token_ids_list = [
+            next_token_ids[i, start_list[i] :] for i in range(batch_size)
+        ]
+
+        return logits_output, next_token_ids_list, can_run_cuda_graph
+
+
+# Registered by import_algorithms() under this module's Algorithm symbol.
+Algorithm = LowConfidence
diff --git a/sglang/python/sglang/srt/dllm/mixin/__pycache__/req.cpython-311.pyc b/sglang/python/sglang/srt/dllm/mixin/__pycache__/req.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1fc5feaf47d5238c02cf7d4d2e0bd23eaef39286
Binary files /dev/null and b/sglang/python/sglang/srt/dllm/mixin/__pycache__/req.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/dllm/mixin/__pycache__/scheduler.cpython-311.pyc b/sglang/python/sglang/srt/dllm/mixin/__pycache__/scheduler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..086733ce3a378fa1cabae7ccffccbdc77fc4e627
Binary files /dev/null and b/sglang/python/sglang/srt/dllm/mixin/__pycache__/scheduler.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/dllm/mixin/req.py b/sglang/python/sglang/srt/dllm/mixin/req.py
new file mode 100644
index 0000000000000000000000000000000000000000..720b9d1db162fcef48eee7e0e4593cd6bb008084
--- /dev/null
+++ b/sglang/python/sglang/srt/dllm/mixin/req.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import enum
+from typing import TYPE_CHECKING, Optional
+
+from sglang.srt.dllm.config import DllmConfig
+
+if TYPE_CHECKING:
+ from sglang.srt.managers.schedule_batch import Req
+
+
+class DllmReqPhase(str, enum.Enum):
+    """Lifecycle phase of a diffusion-LLM request.
+
+    INCOMING_* phases apply while the request's tokens do not yet fill a
+    whole diffusion block; STAGING_* phases apply once they do (see
+    ReqDllmMixin.determine_dllm_phase).
+    """
+
+    STAGING_PREFILL = "staging_prefill"
+    STAGING_DECODE = "staging_decode"
+    INCOMING_PREFILL = "incoming_prefill"
+    INCOMING_DECODE = "incoming_decode"
+
+
+class ReqDllmMixin:
+    """Mixin adding diffusion-LLM phase/block bookkeeping to ``Req``."""
+
+    def init_diffusion_llm(self: Req, dllm_config: DllmConfig):
+        """Attach DLLM state; a None config means DLLM is disabled for this req."""
+        self.dllm_phase: Optional[DllmReqPhase] = None
+        # Token offset of the block currently being denoised.
+        self.dllm_block_offset = 0
+        self.dllm_config = dllm_config
+
+        if self.dllm_config is not None:
+            # Prompts shorter than one block go straight to decode handling.
+            if len(self.origin_input_ids) < self.dllm_config.block_size:
+                self.dllm_phase = DllmReqPhase.INCOMING_DECODE
+            else:
+                self.dllm_phase = DllmReqPhase.INCOMING_PREFILL
+
+    def is_dllm(self: Req) -> bool:
+        """Whether this request runs under diffusion-LLM decoding."""
+        return self.dllm_config is not None
+
+    def is_dllm_prefill(self: Req) -> bool:
+        """Whether the request is in a prefill phase (staging or incoming)."""
+        return self.dllm_phase in [
+            DllmReqPhase.STAGING_PREFILL,
+            DllmReqPhase.INCOMING_PREFILL,
+        ]
+
+    def determine_dllm_phase(self: Req):
+        """Promote the request from INCOMING_* to STAGING_* once a full block
+        past the cached prefix is available; a mask token in that block means
+        it must be decoded rather than prefilled."""
+        prefix_length = len(self.prefix_indices)
+        min_required_length = prefix_length + self.dllm_config.block_size
+
+        if len(self.fill_ids) < min_required_length:
+            # still incoming stage
+            return
+
+        input_block = self.fill_ids[prefix_length:min_required_length]
+        is_prefill_phase = self.dllm_config.mask_id not in input_block
+
+        if is_prefill_phase:
+            self.dllm_phase = DllmReqPhase.STAGING_PREFILL
+        else:
+            self.dllm_phase = DllmReqPhase.STAGING_DECODE
+
+    def _init_fill_ids_for_dllm(self: Req):
+        """Extend fill_ids with one fresh block of mask tokens and advance the
+        block offset (first call, with empty fill_ids, starts at offset 0)."""
+        self.dllm_block_offset = (
+            0
+            if not self.fill_ids
+            else self.dllm_block_offset + self.dllm_config.block_size
+        )
+        self.fill_ids = (
+            self.origin_input_ids
+            + self.output_ids
+            + [self.dllm_config.mask_id] * self.dllm_config.block_size
+        )
+
+    def _update_block_offset_for_dllm(self: Req):
+        """Fast-forward the block offset to the cached prefix boundary, which
+        must be block-aligned."""
+        prefix_len = len(self.prefix_indices)
+        assert (
+            prefix_len % self.dllm_config.block_size == 0
+        ), f"Unexpected prefix len: {prefix_len}"
+        if prefix_len > self.dllm_block_offset:
+            self.dllm_block_offset = prefix_len
diff --git a/sglang/python/sglang/srt/dllm/mixin/scheduler.py b/sglang/python/sglang/srt/dllm/mixin/scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b852507c788b6ec68b884e05450a3cf0d840167
--- /dev/null
+++ b/sglang/python/sglang/srt/dllm/mixin/scheduler.py
@@ -0,0 +1,353 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, List, Optional, Set, Union
+
+from sglang.srt.dllm.config import DllmConfig
+from sglang.srt.dllm.mixin.req import DllmReqPhase
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.managers.schedule_policy import AddReqResult, PrefillAdder
+from sglang.srt.mem_cache.common import release_kv_cache
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.observability.req_time_stats import set_time_batch
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+ from sglang.srt.managers.scheduler import GenerationBatchResult, Scheduler
+
+
class SchedulerDllmMixin:
    """Scheduler mixin implementing batch construction and forward-result
    processing for diffusion LLM (dLLM) requests."""

    def init_diffusion_llm(self: Scheduler):
        """Build the dLLM config (only when a dLLM algorithm is selected) and
        the request manager used by get_new_batch_dllm."""
        self.dllm_config = (
            DllmConfig.from_server_args(self.server_args)
            if self.server_args.dllm_algorithm is not None
            else None
        )
        # The manager is constructed even when dllm_config is None; its
        # queries then report the disabled/empty state.
        self.dllm_manager = DllmManager(dllm_config=self.dllm_config)

    def get_new_batch_dllm(self: Scheduler) -> Optional[ScheduleBatch]:
        """Generate a new batch for DLLM (Diffusion LLM) scheduling.

        Returns None when nothing can be scheduled this round; otherwise a
        prepared ScheduleBatch whose forward_mode was chosen by
        _process_dllm_batches.
        """
        if self.enable_priority_preemption:
            self.running_batch.batch_is_full = False

        # Early exit if batch is full or no requests available
        if self._should_skip_prefill():
            return None

        running_bs = len(self.running_batch.reqs)
        self.policy.calc_priority(self.waiting_queue)

        # Create prefill adder with resource constraints
        adder = self._create_dllm_prefill_adder(running_bs)

        # Initialize DLLM manager and transfer requests
        self.dllm_manager.init_next_round()
        self._fetch_waiting_reqs()

        # Process batches
        forward_mode = self._process_dllm_batches(adder)

        can_run_list = adder.can_run_list
        if not can_run_list:
            return None

        # Record metrics and update state
        set_time_batch(can_run_list, "set_forward_entry_time")
        self._update_state_for_batch(can_run_list, adder, running_bs)

        # Create and prepare batch
        new_batch = self._create_dllm_batch(can_run_list, forward_mode)
        return new_batch

    def process_batch_result_dllm(
        self: Scheduler,
        batch: ScheduleBatch,
        result: GenerationBatchResult,
    ):
        """Fold a completed forward pass back into the requests of `batch`:
        write the newly decoded tokens, update finish state, release KV cache
        of finished requests, and stream output."""
        if result.copy_done is not None:
            # Wait for the asynchronous device->host copy of the results.
            result.copy_done.synchronize()

        if result.next_token_ids:
            self.token_to_kv_pool_allocator.free_group_begin()

        # NOTE(review): free_group_end below runs unconditionally while
        # free_group_begin above is guarded by `result.next_token_ids` —
        # confirm an unmatched end is a no-op for the allocator.
        for idx in range(batch.batch_size()):
            req = batch.reqs[idx]

            next_token_ids = result.next_token_ids[idx].tolist()
            new_tokens = len(next_token_ids)
            if new_tokens == 0:
                continue

            # Overwrite the trailing mask slots of fill_ids with the tokens
            # decoded this round.
            req.fill_ids[-new_tokens:] = next_token_ids[:]
            self.num_generated_tokens += new_tokens

            req.output_ids.extend(next_token_ids)
            req.check_finished(new_accepted_len=new_tokens)

            if req.finished():
                release_kv_cache(req, self.tree_cache)
                req.time_stats.set_completion_time()

        self.stream_output(batch.reqs, batch.return_logprob)
        self.token_to_kv_pool_allocator.free_group_end()

        if self.current_scheduler_metrics_enabled:
            can_run_cuda_graph = getattr(result, "can_run_cuda_graph", False)
            self.log_prefill_stats(
                prefill_stats=batch.prefill_stats,
                can_run_cuda_graph=can_run_cuda_graph,
                dp_cooperation_info=batch.dp_cooperation_info,
            )

    def _fetch_waiting_reqs(self: Scheduler):
        """Move requests from the scheduler's waiting queue into the DLLM
        manager, up to the manager's remaining configured capacity."""
        # Calculate how many requests can be added to DLLM manager
        max_dllm_capacity = self.dllm_config.max_running_requests - len(
            self.dllm_manager.waiting_queue
        )
        num_requests_to_add = min(max_dllm_capacity, len(self.waiting_queue))

        if num_requests_to_add > 0:
            requests_to_add = self.waiting_queue[:num_requests_to_add]
            self.dllm_manager.add_waiting_reqs(requests_to_add)
            self.waiting_queue = self.waiting_queue[num_requests_to_add:]

    def _should_skip_prefill(self: Scheduler) -> bool:
        """Check if DLLM prefill should be skipped.

        Skips when there is no schedulable work (batch full / nothing
        waiting) and the DLLM manager itself has no pending requests.
        """
        if (
            self.running_batch.batch_is_full or not self.waiting_queue
        ) and self.dllm_manager.is_empty():
            return True

        running_bs = len(self.running_batch.reqs)
        if (
            self.get_num_allocatable_reqs(running_bs) <= 0
            and self.dllm_manager.is_empty()
            and not self.enable_priority_preemption
        ):
            # No request slots left and preemption cannot free any.
            self.running_batch.batch_is_full = True
            return True

        return False

    def _create_dllm_prefill_adder(self: Scheduler, running_bs: int) -> PrefillAdder:
        """Create a prefill adder configured for DLLM scheduling."""
        return PrefillAdder(
            self.page_size,
            self.tree_cache,
            self.token_to_kv_pool_allocator,
            self.running_batch,
            self.new_token_ratio,
            self.max_prefill_tokens,
            self.chunked_prefill_size,
            running_bs if self.is_mixed_chunk else 0,
            self.priority_scheduling_preemption_threshold,
            prefill_max_requests=self.server_args.prefill_max_requests,
            dllm_config=self.dllm_config,
        )

    def _process_dllm_batches(self: Scheduler, adder: PrefillAdder) -> ForwardMode:
        """Process prefill or decode batches for DLLM.

        Prefill requests take priority; decode requests are considered only
        when there is no prefill work. Always returns DLLM_EXTEND.
        """
        forward_mode = ForwardMode.DLLM_EXTEND

        # Try prefill batch first
        prefill_reqs = self.dllm_manager.get_prefill_requests()
        if prefill_reqs:
            self._process_batch_by_phase(
                adder,
                prefill_reqs,
                DllmReqPhase.STAGING_PREFILL,
                DllmReqPhase.INCOMING_PREFILL,
            )
        else:
            # Fall back to decode batch
            decode_reqs = self.dllm_manager.get_decode_requests()
            self._process_batch_by_phase(
                adder,
                decode_reqs,
                DllmReqPhase.STAGING_DECODE,
                DllmReqPhase.INCOMING_DECODE,
            )

        return forward_mode

    def _process_batch_by_phase(
        self: Scheduler,
        adder: PrefillAdder,
        batch: List[Req],
        staging_phase: DllmReqPhase,
        incoming_phase: DllmReqPhase,
    ) -> None:
        """Process a batch, separating staging and incoming requests.

        Staged requests (resources already allocated) are added first;
        incoming requests are only attempted when staging did not exhaust
        resources.
        """
        staging_reqs = [req for req in batch if req.dllm_phase == staging_phase]
        if staging_reqs:
            staging_result = self.process_dllm_staging_reqs(adder, staging_reqs)
            if staging_result != AddReqResult.CONTINUE:
                return

        incoming_reqs = [req for req in batch if req.dllm_phase == incoming_phase]
        if incoming_reqs:
            self.process_dllm_incoming_reqs(adder, incoming_reqs)

    def _update_state_for_batch(
        self: Scheduler, can_run_list: List[Req], adder: PrefillAdder, running_bs: int
    ) -> None:
        """Update state for the batch.

        Re-queues preempted requests, registers the runnable requests with
        the DLLM manager, and stashes `adder`/`can_run_list` on self for use
        by _create_dllm_batch.
        """

        if adder.preempt_list:
            for req in adder.preempt_list:
                self._add_request_to_queue(req)

        if can_run_list:
            self.dllm_manager.add_staging_reqs(can_run_list)
            self.dllm_manager.increment_chunked_count()

        self.adder = adder
        self.can_run_list = can_run_list
        self.running_bs = len(self.running_batch.reqs)

    def _create_dllm_batch(
        self: Scheduler, can_run_list: List[Req], forward_mode: ForwardMode
    ) -> ScheduleBatch:
        """Create and prepare a new DLLM batch."""
        new_batch = ScheduleBatch.init_new(
            can_run_list,
            self.req_to_token_pool,
            self.token_to_kv_pool_allocator,
            self.tree_cache,
            self.model_config,
            self.enable_overlap,
            self.spec_algorithm,
            dllm_config=self.dllm_config,
        )
        new_batch.prepare_for_extend()
        new_batch.forward_mode = forward_mode
        new_batch.decoding_reqs = None

        # Record prefill stats for logging after forward
        from sglang.srt.observability.scheduler_metrics_mixin import PrefillStats

        new_batch.prefill_stats = PrefillStats.from_adder(
            self.adder, self.running_batch.reqs, self.enable_priority_scheduling
        )

        return new_batch

    def process_dllm_incoming_reqs(
        self: Scheduler, adder: PrefillAdder, reqs: List[Req]
    ) -> AddReqResult:
        """Process incoming DLLM requests with resource allocation and preemption.

        Returns the last AddReqResult from the adder; stops early when the
        batch fills up and preemption cannot make room.
        """
        res = AddReqResult.CONTINUE
        for req in reqs:
            # Check if batch is full
            running_bs = len(self.running_batch.reqs)
            if len(adder.can_run_list) >= self.get_num_allocatable_reqs(running_bs):
                self.running_batch.batch_is_full = True

            # Try preemption if batch is full
            if self.running_batch.batch_is_full:
                if (
                    not self.enable_priority_preemption
                    or not adder.preempt_to_schedule(req, self.server_args)
                ):
                    break

            # Prepare and add request
            req.init_next_round_input(self.tree_cache)
            res = adder.add_one_req(
                req,
                has_chunked_req=True,
                truncation_align_size=self.truncation_align_size,
            )

            if res != AddReqResult.CONTINUE:
                if res == AddReqResult.NO_TOKEN:
                    self.running_batch.batch_is_full = True
                break

        return res

    def process_dllm_staging_reqs(
        self: Scheduler, adder: PrefillAdder, reqs: List[Req]
    ) -> AddReqResult:
        """Process staging DLLM requests with resource allocation.

        Stops at the first NO_TOKEN result; otherwise reports CONTINUE.
        """
        for req in reqs:
            res = adder.add_dllm_staging_req(req)
            if res == AddReqResult.NO_TOKEN:
                return res

        return AddReqResult.CONTINUE
+
+
class DllmManager:
    """
    Scheduling manager for diffusion LLM requests.

    Two queues are tracked:
    - waiting_queue: requests waiting to be scheduled, capped by the
      configured maximum number of running requests
    - staging_queue: requests that PrefillAdder has allocated resources for
    """

    def __init__(self, dllm_config: Optional[DllmConfig] = None):
        self.dllm_config = dllm_config
        # Fall back to a capacity of 1 when dLLM is not configured.
        if dllm_config is None:
            self.max_running_reqs = 1
        else:
            self.max_running_reqs = dllm_config.max_running_requests
        self.waiting_queue: List[Req] = []
        self.staging_queue: List[Req] = []

    def get_prefill_requests(self) -> List[Req]:
        """Return the waiting requests that are in a prefill phase."""
        prefill: List[Req] = []
        for request in self.waiting_queue:
            if request.is_dllm_prefill():
                prefill.append(request)
        return prefill

    def get_decode_requests(self) -> List[Req]:
        """Return the waiting requests that are in a decode phase."""
        decode: List[Req] = []
        for request in self.waiting_queue:
            if not request.is_dllm_prefill():
                decode.append(request)
        return decode

    def add_waiting_reqs(self, reqs: Union[Req, List[Req]]) -> None:
        """Append requests to the waiting queue, rejecting duplicate rids."""
        assert self.dllm_config is not None, "Diffusion LLM config is not set."

        incoming = list(reqs) if isinstance(reqs, list) else [reqs]

        if self._has_duplicate_reqs(incoming):
            raise RuntimeError("Redundant requests detected in dLLM requests.")

        self.waiting_queue.extend(incoming)

    def add_staging_reqs(self, reqs: Union[Req, List[Req]]) -> None:
        """Append requests to the staging queue (allocated by PrefillAdder)."""
        if isinstance(reqs, list):
            self.staging_queue.extend(reqs)
        else:
            self.staging_queue.append(reqs)

    def _has_duplicate_reqs(self, reqs: List[Req]) -> bool:
        """True when any incoming rid already exists in the waiting queue."""
        known: Set[str] = set()
        for queued in self.waiting_queue:
            known.add(queued.rid)
        return any(candidate.rid in known for candidate in reqs)

    def any_staging_reqs(self) -> bool:
        """True when dLLM is configured and the staging queue is non-empty."""
        return bool(self.staging_queue) and self.dllm_config is not None

    def is_empty(self) -> bool:
        """True when dLLM is disabled or the waiting queue has no requests."""
        if self.dllm_config is None:
            return True
        return not self.waiting_queue

    def increment_chunked_count(self) -> None:
        """Bump the is_chunked counter on every staged request."""
        for staged in self.staging_queue:
            staged.is_chunked += 1

    def filter_finished_reqs(self) -> None:
        """Drop finished requests from both queues."""
        self.waiting_queue = [
            pending for pending in self.waiting_queue if not pending.finished()
        ]
        self.staging_queue = [
            staged for staged in self.staging_queue if not staged.finished()
        ]

    def init_next_round(self) -> None:
        """Re-initialize each staged request's next-round input and reset the
        staging queue."""
        for staged in self.staging_queue:
            staged.init_next_round_input()
        self.staging_queue = []
diff --git a/sglang/python/sglang/srt/elastic_ep/__pycache__/elastic_ep.cpython-311.pyc b/sglang/python/sglang/srt/elastic_ep/__pycache__/elastic_ep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f68862ed1204eb7e4f7a865438063042b5328280
Binary files /dev/null and b/sglang/python/sglang/srt/elastic_ep/__pycache__/elastic_ep.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/elastic_ep/__pycache__/expert_backup_client.cpython-311.pyc b/sglang/python/sglang/srt/elastic_ep/__pycache__/expert_backup_client.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f24ee7a54fe2a1d9d9c7c750b06a68ff0de124bc
Binary files /dev/null and b/sglang/python/sglang/srt/elastic_ep/__pycache__/expert_backup_client.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/elastic_ep/__pycache__/expert_backup_manager.cpython-311.pyc b/sglang/python/sglang/srt/elastic_ep/__pycache__/expert_backup_manager.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a00ac1585e59d4e7827596a7c3784d450c1cd606
Binary files /dev/null and b/sglang/python/sglang/srt/elastic_ep/__pycache__/expert_backup_manager.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/eplb/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb630e7142d21ded625bde2c9aac8b17a8417c1e
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/__pycache__/eplb_manager.cpython-311.pyc b/sglang/python/sglang/srt/eplb/__pycache__/eplb_manager.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2adbe9cdf30cbd9870145be327fe01d2fd73e56b
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/__pycache__/eplb_manager.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/__pycache__/expert_distribution.cpython-311.pyc b/sglang/python/sglang/srt/eplb/__pycache__/expert_distribution.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61fb0e0d536f171f2d45b8ee7c311c4f30af0dd6
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/__pycache__/expert_distribution.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/__pycache__/expert_location.cpython-311.pyc b/sglang/python/sglang/srt/eplb/__pycache__/expert_location.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c76007a99a5d922bed44c2c983fdc388ecdc6c5
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/__pycache__/expert_location.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/__pycache__/expert_location_dispatch.cpython-311.pyc b/sglang/python/sglang/srt/eplb/__pycache__/expert_location_dispatch.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08eda536d6421a0d9c5785e9586b67453be81202
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/__pycache__/expert_location_dispatch.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/__pycache__/expert_location_updater.cpython-311.pyc b/sglang/python/sglang/srt/eplb/__pycache__/expert_location_updater.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b251f4100ff68653cb06c8fdaf04e2366a53833a
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/__pycache__/expert_location_updater.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/eplb_algorithms/__init__.py b/sglang/python/sglang/srt/eplb/eplb_algorithms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b09a1417574c85ae2c9894fdad1b11e8561731c3
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/eplb_algorithms/__init__.py
@@ -0,0 +1,80 @@
+from enum import Enum, auto
+from typing import Optional
+
+import torch
+
+from sglang.srt.elastic_ep.elastic_ep import ElasticEPStateManager
+from sglang.srt.eplb.eplb_algorithms import deepseek, deepseek_vec, elasticity_aware
+
+
class EplbAlgorithm(Enum):
    """Selectable EPLB (expert-parallel load balancing) rebalancing algorithms."""

    deepseek = auto()
    deepseek_hierarchical = auto()
    deepseek_vec = auto()
    deepseek_vec_hierarchical = auto()
    elasticity_aware = auto()
    # TODO may have more algorithm later
+
+
def rebalance_experts(
    tokens_per_expert: torch.Tensor,
    num_physical_experts: int,
    num_local_physical_experts: int,
    num_groups: Optional[int],
    num_nodes: int,
    algorithm: EplbAlgorithm,
):
    """Dispatch to the rebalancing implementation selected by `algorithm`.

    The deepseek and elasticity_aware variants operate on per-expert loads
    summed over dim 0 of `tokens_per_expert`; the deepseek_vec variants
    receive the full tensor unchanged.

    Raises:
        NotImplementedError: if `algorithm` matches no known variant.
    """
    if algorithm in [EplbAlgorithm.deepseek, EplbAlgorithm.deepseek_hierarchical]:
        return deepseek.rebalance_experts(
            weight=tokens_per_expert.sum(dim=0),
            num_replicas=num_physical_experts,
            num_groups=num_groups,
            num_nodes=num_nodes,
            num_gpus=num_physical_experts // num_local_physical_experts,
            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_hierarchical,
        )

    if algorithm in [
        EplbAlgorithm.deepseek_vec,
        EplbAlgorithm.deepseek_vec_hierarchical,
    ]:
        return deepseek_vec.rebalance_experts(
            tokens_per_expert=tokens_per_expert,
            num_physical_experts=num_physical_experts,
            num_local_physical_experts=num_local_physical_experts,
            num_groups=num_groups,
            num_nodes=num_nodes,
            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
        )

    if algorithm == EplbAlgorithm.elasticity_aware:
        return elasticity_aware.rebalance_experts(
            weight=tokens_per_expert.sum(dim=0),
            num_replicas=num_physical_experts,
            num_groups=num_groups,
            num_nodes=num_nodes,
            num_gpus=num_physical_experts // num_local_physical_experts,
            enable_hierarchical=False,
            # Use live rank state when the elastic-EP manager exists,
            # otherwise assume all ranks are healthy.
            active_ranks=(
                ElasticEPStateManager.instance().active_ranks
                if ElasticEPStateManager.instance() is not None
                else ElasticEPStateManager.healthy_rank_state()
            ),
        )

    raise NotImplementedError
+
+
def compute_algorithm(
    raw_algorithm: str,
    num_groups: Optional[int],
    num_nodes: int,
) -> EplbAlgorithm:
    """Resolve the EPLB algorithm from its config string.

    A non-"auto" string is looked up directly as an enum member name. For
    "auto", the hierarchical deepseek variant is picked when the expert
    groups divide evenly across the nodes.
    """
    if raw_algorithm != "auto":
        return EplbAlgorithm[raw_algorithm]

    # TODO test on real scenarios and know which ones perform better
    hierarchical_ok = num_groups is not None and num_groups % num_nodes == 0
    return (
        EplbAlgorithm.deepseek_hierarchical
        if hierarchical_ok
        else EplbAlgorithm.deepseek
    )
diff --git a/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f169073ea269317349863d3a834076447b4262e5
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/deepseek.cpython-311.pyc b/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/deepseek.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d5793f91f4dd8cae67064ac281f135eaf837022
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/deepseek.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/deepseek_vec.cpython-311.pyc b/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/deepseek_vec.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0715e42d94a933b0688e7aa9423f6513b706fe6
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/deepseek_vec.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/elasticity_aware.cpython-311.pyc b/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/elasticity_aware.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d1b066f687389d4baf3960b27a4128474f6f092
Binary files /dev/null and b/sglang/python/sglang/srt/eplb/eplb_algorithms/__pycache__/elasticity_aware.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/eplb/eplb_algorithms/deepseek.py b/sglang/python/sglang/srt/eplb/eplb_algorithms/deepseek.py
new file mode 100644
index 0000000000000000000000000000000000000000..34bbc491027bed262830a01b2d599517dd609f45
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/eplb_algorithms/deepseek.py
@@ -0,0 +1,221 @@
+# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
+from typing import Tuple
+
+import torch
+
+
def balanced_packing(
    weight: torch.Tensor, num_packs: int
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Distribute n weighted items into m packs so that each pack receives
    exactly n/m items while keeping pack weights as balanced as possible.

    Parameters:
        weight: [X, n], the weight of each item
        num_packs: number of packs

    Returns:
        pack_index: [X, n], the pack index of each item
        rank_in_pack: [X, n], the rank of the item in the pack
    """
    num_layers, num_items = weight.shape
    assert num_items % num_packs == 0
    items_per_pack = num_items // num_packs

    # Trivial case: one item per pack -> identity assignment.
    if items_per_pack == 1:
        pack_index = torch.arange(
            weight.size(-1), dtype=torch.int64, device=weight.device
        ).expand(weight.shape)
        return pack_index, torch.zeros_like(weight, dtype=torch.int64)

    # Greedy packing: visit items from heaviest to lightest and drop each
    # into the lightest pack that still has room.
    order = weight.float().sort(-1, descending=True).indices.cpu()
    pack_index = torch.full_like(weight, fill_value=-1, dtype=torch.int64, device="cpu")
    rank_in_pack = torch.full_like(pack_index, fill_value=-1)
    for layer in range(num_layers):
        load = [0] * num_packs
        count = [0] * num_packs
        for item in order[layer]:
            # First (lowest-id) pack among the lightest packs with room.
            target = min(
                (p for p in range(num_packs) if count[p] < items_per_pack),
                key=load.__getitem__,
            )
            assert count[target] < items_per_pack
            pack_index[layer, item] = target
            rank_in_pack[layer, item] = count[target]
            load[target] += weight[layer, item]
            count[target] += 1
    return pack_index, rank_in_pack
+
+
def replicate_experts(
    weight: torch.Tensor, num_phy: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Grow `num_log` logical experts into `num_phy` physical replicas, greedily
    duplicating whichever expert has the highest per-replica load.

    Parameters:
        weight: [X, num_log]
        num_phy: total number of experts after replication

    Returns:
        phy2log: [X, num_phy], logical expert id of each physical expert
        rank: [X, num_phy], the replica rank
        logcnt: [X, num_log], number of replicas for each logical expert
    """
    num_rows, num_log = weight.shape
    assert num_phy - num_log >= 0
    device = weight.device
    # Start from the identity mapping; the redundant slots past num_log are
    # overwritten one by one in the loop below.
    phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(
        num_rows, 1
    )
    rank = torch.zeros(num_rows, num_phy, dtype=torch.int64, device=device)
    logcnt = torch.ones(num_rows, num_log, dtype=torch.int64, device=device)
    row_ids = torch.arange(num_rows, dtype=torch.int64, device=device)
    for slot in range(num_log, num_phy):
        # Per row: the expert whose average load per replica is largest.
        chosen = (weight / logcnt).max(dim=-1).indices
        phy2log[:, slot] = chosen
        rank[:, slot] = logcnt[row_ids, chosen]
        logcnt[row_ids, chosen] += 1
    return phy2log, rank, logcnt
+
+
def rebalance_experts_hierarchical(
    weight: torch.Tensor,
    num_physical_experts: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
):
    """
    Hierarchical load balancing: pack expert groups onto nodes, replicate hot
    experts within each node, then pack physical experts onto GPUs.

    Parameters:
        weight: [num_moe_layers, num_logical_experts]
        num_physical_experts: number of physical experts after replication
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
        physical_to_logical_map: [num_moe_layers, num_physical_experts]
        logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
        logical_count: [num_moe_layers, num_logical_experts]
    """
    num_layers, num_logical_experts = weight.shape
    assert num_logical_experts % num_groups == 0
    group_size = num_logical_experts // num_groups
    assert num_groups % num_nodes == 0
    groups_per_node = num_groups // num_nodes
    assert num_gpus % num_nodes == 0
    assert num_physical_experts % num_gpus == 0
    phy_experts_per_gpu = num_physical_experts // num_gpus

    def inverse(perm: torch.Tensor) -> torch.Tensor:
        # Row-wise inverse permutation: inv[row, perm[row, j]] = j.
        inv = torch.empty_like(perm)
        inv.scatter_(
            1,
            perm,
            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
                perm.shape
            ),
        )
        return inv

    # Step 1: pack groups to nodes
    tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
    # log2mlog renumbers logical experts so that experts of groups packed on
    # the same node become contiguous ("meta-logical" ids).
    log2mlog = (
        (
            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
        ).unsqueeze(-1)
        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
    ).flatten(-2)
    mlog2log = inverse(log2mlog)

    # Step 2: construct redundant experts within nodes
    # [num_layers * num_nodes, num_logical_experts // num_nodes]
    tokens_per_mlog = weight.gather(-1, mlog2log).view(
        -1, num_logical_experts // num_nodes
    )
    phy2mlog, phyrank, mlogcnt = replicate_experts(
        tokens_per_mlog, num_physical_experts // num_nodes
    )

    # Step 3: pack physical_experts to GPUs
    # [num_layers * num_nodes, num_physical_experts // num_nodes]
    # Expected per-replica load: total expert load divided by replica count.
    tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
    phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
    pphy2phy = inverse(phy2pphy)

    pphy2mlog = phy2mlog.gather(
        -1, pphy2phy
    )  # [num_layers * num_nodes, num_log_per_nodes]
    # Re-offset the per-node meta-logical ids back into the global id space.
    pphy2mlog = (
        pphy2mlog.view(num_layers, num_nodes, -1)
        + torch.arange(
            0,
            num_logical_experts,
            num_logical_experts // num_nodes,
            device=group_pack_index.device,
        ).view(1, -1, 1)
    ).flatten(-2)
    pphy2log = mlog2log.gather(-1, pphy2mlog)
    pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
    logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
    return pphy2log, pphyrank, logcnt
+
+
def rebalance_experts(
    weight: torch.Tensor,
    num_replicas: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
    enable_hierarchical: bool,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Entry point for expert-parallelism load balancer.

    Parameters:
        weight: [layers, num_logical_experts], the load statistics for all logical experts
        num_replicas: number of physical experts, must be a multiple of `num_gpus`
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`
        enable_hierarchical: use the node-aware hierarchical policy instead of
            the global (single virtual node) policy

    Returns:
        physical_to_logical_map: [layers, num_replicas], the expert index of each replica
        logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
        expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
    """

    num_layers, num_logical_experts = weight.shape
    # Packing/replication run on CPU with float weights.
    weight = weight.float().cpu()
    if enable_hierarchical:
        # use hierarchical load-balance policy
        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
            weight, num_replicas, num_groups, num_nodes, num_gpus
        )
    else:
        # use global load-balance policy (treat everything as one group on
        # one node)
        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
            weight, num_replicas, 1, 1, num_gpus
        )
    maxlogcnt = logcnt.max().item()
    # Invert phy2log: for each logical expert, list the physical slots that
    # replicate it, padded with -1 up to the maximum replica count.
    log2phy: torch.Tensor = torch.full(
        (num_layers, num_logical_experts, maxlogcnt),
        -1,
        dtype=torch.int64,
        device=logcnt.device,
    )
    log2phy.view(num_layers, -1).scatter_(
        -1,
        phy2log * maxlogcnt + phyrank,
        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
            num_layers, -1
        ),
    )
    return phy2log, log2phy, logcnt
+
+
# Only the entry point is public; the packing/replication helpers above are
# internal building blocks.
__all__ = ["rebalance_experts"]
diff --git a/sglang/python/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py b/sglang/python/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb165448a7c2053bf705b90550565e380aa7b5e9
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py
@@ -0,0 +1,276 @@
+# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
+from typing import Optional, Tuple
+
+import torch
+
+
+def pack_groups(tokens_per_group: torch.Tensor, num_nodes: int) -> torch.Tensor:
+    """Greedily pack expert groups onto nodes, balancing per-node token load.
+
+    Args:
+        tokens_per_group: [num_layers, num_groups] load statistic per group.
+        num_nodes: number of nodes; must evenly divide ``num_groups``.
+
+    Returns:
+        [num_layers, num_groups] int64 CPU tensor mapping each group to its
+        packed slot (``node_rank * groups_per_rank + slot_within_node``).
+    """
+    num_layers, num_groups = tokens_per_group.shape
+    assert num_groups % num_nodes == 0
+    groups_per_rank = num_groups // num_nodes
+
+    # Visit groups from heaviest to lightest so large loads get spread first.
+    indices = tokens_per_group.float().sort(-1, descending=True).indices.cpu()
+    ret = torch.full_like(
+        tokens_per_group, fill_value=-1, dtype=torch.int64, device="cpu"
+    )
+    for layer in range(num_layers):
+        node_tokens = [0] * num_nodes
+        node_groups = [0] * num_nodes
+        for group in indices[layer]:
+
+            def key_func(rank: int) -> Tuple[int, int]:
+                # Full nodes sort last; among non-full nodes, prefer the one
+                # with the least accumulated token load.
+                if node_groups[rank] >= groups_per_rank:
+                    return 1, 0
+                else:
+                    return 0, node_tokens[rank]
+
+            rank = min(range(num_nodes), key=key_func)
+            assert node_groups[rank] < groups_per_rank
+            ret[layer, group] = rank * groups_per_rank + node_groups[rank]
+            node_tokens[rank] += tokens_per_group[layer, group]
+            node_groups[rank] += 1
+    return ret
+
+
+def make_redundant_experts_chunkwise(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_physical_experts_per_chunk: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Place redundant expert replicas chunk-by-chunk onto the hottest experts.
+
+    Args:
+        tokens_per_expert: [num_steps, num_moe_layers, num_logical_experts]
+            load statistics collected over ``num_steps`` recording steps.
+        num_physical_experts: total physical expert slots per layer.
+        num_local_physical_experts: physical experts per GPU; when > 1 a final
+            pass rearranges slots to balance load across GPUs within a chunk.
+        num_physical_experts_per_chunk: chunk granularity (e.g. slots per node).
+
+    Returns:
+        Tuple of int32 tensors:
+        - physical_to_logical_map: [num_moe_layers, num_physical_experts]
+        - logical_to_physical_map: [num_moe_layers, num_logical_experts,
+          num_redundancy_experts + 1], -1 padded
+        - logical_count: [num_moe_layers, num_logical_experts] replica counts
+    """
+    num_steps, num_moe_layers, num_logical_experts = tokens_per_expert.shape
+    num_redundancy_experts = num_physical_experts - num_logical_experts
+
+    physical_to_logical_map = torch.empty(
+        num_moe_layers,
+        num_physical_experts,
+        dtype=torch.int,
+        device=tokens_per_expert.device,
+    )
+    logical_to_physical_map = torch.full(
+        (num_moe_layers, num_logical_experts, num_redundancy_experts + 1),
+        -1,
+        dtype=torch.int,
+        device=tokens_per_expert.device,
+    )
+    # Every logical expert starts with exactly one replica (identity placement).
+    logical_count = torch.ones(
+        num_moe_layers,
+        num_logical_experts,
+        dtype=torch.int,
+        device=tokens_per_expert.device,
+    )
+
+    # Chunk layout: each chunk holds an equal share of logical experts followed
+    # by an equal share of redundancy slots.
+    assert num_physical_experts % num_physical_experts_per_chunk == 0
+    num_chunks = num_physical_experts // num_physical_experts_per_chunk
+    assert num_logical_experts % num_chunks == 0
+    num_logical_experts_per_group = num_logical_experts // num_chunks
+    assert num_redundancy_experts % num_chunks == 0
+    num_redundancy_experts_per_group = num_redundancy_experts // num_chunks
+
+    arange_num_moe_layers_num_groups = torch.arange(
+        num_moe_layers * num_chunks, dtype=torch.int, device=tokens_per_expert.device
+    )
+    arange_num_logical_experts = torch.arange(
+        num_logical_experts, dtype=torch.int, device=tokens_per_expert.device
+    )
+    arange_num_logical_experts_per_group = torch.arange(
+        num_logical_experts_per_group, dtype=torch.int, device=tokens_per_expert.device
+    )
+    arange_num_groups = torch.arange(
+        num_chunks, dtype=torch.int, device=tokens_per_expert.device
+    )
+    # Identity placement: the first num_logical_experts_per_group slots of each
+    # chunk hold that chunk's logical experts in order.
+    physical_to_logical_map.view(
+        num_moe_layers, num_chunks, num_physical_experts_per_chunk
+    )[:, :, :num_logical_experts_per_group] = arange_num_logical_experts.view(
+        num_chunks, num_logical_experts_per_group
+    )
+    logical_to_physical_map[:, :, 0] = (
+        arange_num_logical_experts_per_group.expand(
+            num_chunks, num_logical_experts_per_group
+        )
+        + arange_num_groups[:, None] * num_physical_experts_per_chunk
+    ).view(num_logical_experts)
+
+    # Add a tiny index-proportional epsilon so all scores are pairwise distinct
+    # and argmax/argmin picks are unambiguous.
+    tokens_per_expert_all_diff = tokens_per_expert + arange_num_logical_experts * 1e-4
+    for i in range(num_redundancy_experts_per_group):
+        score = (
+            tokens_per_expert_all_diff / logical_count
+        )  # NOTE: Values in score must be different from each other
+        # score1: per-replica load each expert would carry with one extra replica.
+        score1 = tokens_per_expert / (logical_count + 1)
+        score = score.view(
+            num_steps, num_moe_layers, num_chunks, num_logical_experts_per_group
+        )
+        score1 = score1.view_as(score)
+        # Replace the current chunk-peak expert's score by its hypothetical
+        # post-replication score, then pick via argmin of summed peaks.
+        # NOTE(review): intricate heuristic — it appears to replicate the
+        # hottest expert of each chunk when that lowers the peak; verify
+        # before relying on this description.
+        values, indices = score.max(-1, keepdim=True)
+        values = values.expand_as(score).contiguous()
+        score.scatter_(-1, indices, score1.gather(-1, indices))
+        values.scatter_(-1, indices, score.max(-1, keepdim=True).values)
+        redundancy_indices = values.sum(0).argmin(-1)
+        # Write the chosen expert into this round's redundancy slot of each chunk.
+        physical_to_logical_map.view(
+            num_moe_layers, num_chunks, num_physical_experts_per_chunk
+        )[:, :, num_logical_experts_per_group + i] = (
+            redundancy_indices + arange_num_groups * num_logical_experts_per_group
+        )
+        # Current replica count of the chosen expert = index of its next free
+        # slot in logical_to_physical_map.
+        redundancy_count = (
+            logical_count.view(
+                num_moe_layers * num_chunks, num_logical_experts_per_group
+            )
+            .gather(-1, redundancy_indices.view(num_moe_layers * num_chunks, 1))
+            .squeeze(1)
+        )
+        physical_redundancy_indices = (
+            (
+                arange_num_groups * num_physical_experts_per_chunk
+                + num_logical_experts_per_group
+                + i
+            )
+            .expand(num_moe_layers, num_chunks)
+            .flatten()
+        )
+        logical_to_physical_map.view(
+            num_moe_layers * num_chunks,
+            num_logical_experts_per_group,
+            num_redundancy_experts + 1,
+        )[
+            arange_num_moe_layers_num_groups,
+            redundancy_indices.view(num_moe_layers * num_chunks),
+            redundancy_count,
+        ] = physical_redundancy_indices
+        logical_count.view(num_moe_layers * num_chunks, num_logical_experts_per_group)[
+            arange_num_moe_layers_num_groups,
+            redundancy_indices.view(num_moe_layers * num_chunks),
+        ] += 1
+
+    if num_local_physical_experts > 1:
+        # Load-balancing between GPUs
+        physical_to_logical_map_int64 = physical_to_logical_map.to(torch.int64)
+        # Per-slot average load = expert load divided by its replica count.
+        counts = logical_count.gather(-1, physical_to_logical_map_int64)
+        score = tokens_per_expert.sum(0).gather(-1, physical_to_logical_map_int64)
+        score = score / counts
+        score = score.view(num_moe_layers, num_chunks, num_physical_experts_per_chunk)
+        indices = score.argsort(-1, descending=True)
+        # Convert chunk-local sort indices to absolute physical slot indices.
+        indices += torch.arange(
+            0,
+            num_physical_experts,
+            num_physical_experts_per_chunk,
+            dtype=indices.dtype,
+            device=indices.device,
+        )[None, :, None]
+
+        assert num_physical_experts_per_chunk % num_local_physical_experts == 0
+        num_local_groups = num_physical_experts_per_chunk // num_local_physical_experts
+        indices = indices.view(
+            num_moe_layers, num_chunks, num_local_physical_experts, num_local_groups
+        )
+        # Serpentine assignment: reverse every other row so heavy and light
+        # slots interleave across GPUs within a chunk.
+        indices[:, :, 1::2, :] = indices[:, :, 1::2, :].flip(-1)
+        indices = indices.transpose(2, 3)
+        indices = indices.reshape(num_moe_layers, num_physical_experts)
+        physical_to_logical_map = physical_to_logical_map.gather(-1, indices)
+        # Remap logical_to_physical_map through the permutation; -1 padding is
+        # temporarily zeroed so gather stays in range, then restored.
+        mask = logical_to_physical_map == -1
+        logical_to_physical_map[mask] = 0
+        logical_to_physical_map = (
+            indices.argsort(-1)
+            .gather(
+                -1, logical_to_physical_map.view(num_moe_layers, -1).to(torch.int64)
+            )
+            .view_as(logical_to_physical_map)
+            .to(torch.int)
+        )
+        logical_to_physical_map[mask] = -1
+
+    return physical_to_logical_map, logical_to_physical_map, logical_count
+
+
+def decode_rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+):
+    """Rebalance experts for the decode phase (global policy).
+
+    Uses a single chunk spanning all physical experts, i.e. redundancy is
+    assigned globally rather than per node.
+
+    Args:
+        tokens_per_expert: [num_steps, num_moe_layers, num_logical_experts].
+        num_physical_experts: total physical expert slots per layer.
+        num_local_physical_experts: physical experts per GPU.
+
+    Returns:
+        Same triple as ``make_redundant_experts_chunkwise``.
+    """
+    return make_redundant_experts_chunkwise(
+        tokens_per_expert,
+        num_physical_experts,
+        num_local_physical_experts,
+        num_physical_experts,
+    )
+
+
<br>

+def prefill_rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_groups: int,
+    num_nodes: int,
+):
+    """Rebalance experts for the prefill phase (hierarchical policy).
+
+    First packs expert groups onto nodes (``pack_groups``), then assigns
+    redundancy per node chunk, and finally maps the permuted ("mlog") expert
+    indexing back to the original logical indexing.
+
+    Args:
+        tokens_per_expert: [num_steps, num_moe_layers, num_logical_experts].
+        num_physical_experts: total physical expert slots per layer.
+        num_local_physical_experts: physical experts per GPU.
+        num_groups: number of expert groups; must divide num_logical_experts.
+        num_nodes: number of nodes; must divide num_groups.
+
+    Returns:
+        (phy2log, log2phy, log_count) in original logical-expert indexing.
+    """
+    tokens_per_expert = tokens_per_expert.float().cpu()
+
+    num_steps, _, num_logical_experts = tokens_per_expert.shape
+    assert num_logical_experts % num_groups == 0
+    group_size = num_logical_experts // num_groups
+    assert num_groups % num_nodes == 0, f"{num_groups=} {num_nodes=}"
+
+    # Aggregate loads over steps and experts within each group.
+    tokens_per_group = tokens_per_expert.sum(0).unflatten(-1, (num_groups, -1)).sum(-1)
+    group_perm = pack_groups(
+        tokens_per_group, num_nodes
+    )  # [num_moe_layers, num_groups] => [num_moe_layers, num_nodes]
+
+    # log2mlog [layers, #logexp] -> [layers, #logexp]
+    log2mlog = (
+        (group_perm * group_size).unsqueeze(-1)
+        + torch.arange(group_size, dtype=torch.int64, device=group_perm.device)
+    ).flatten(-2)
+
+    # mlog2log [layers, #logexp] -> [layers, #logexp], inverse of log2mlog
+    mlog2log = torch.empty_like(log2mlog)
+    arange = torch.arange(
+        num_logical_experts, dtype=torch.int64, device=mlog2log.device
+    )
+    mlog2log.scatter_(1, log2mlog, arange.expand(log2mlog.size(0), -1))
+
+    # tokens_per_mlog[i][j][k] = tokens_per_expert[i][j][mlog2log[j][k]]
+    tokens_per_mlog = tokens_per_expert.gather(
+        2, mlog2log.unsqueeze(0).expand(num_steps, -1, -1)
+    )
+
+    # One chunk per node: redundancy slots are distributed node-locally.
+    phy2mlog, mlog2phy, mlog_count = make_redundant_experts_chunkwise(
+        tokens_per_mlog,
+        num_physical_experts,
+        num_local_physical_experts,
+        num_physical_experts // num_nodes,
+    )
+
+    # phy2log[i][j] = mlog2log[i][phy2mlog[i][j]]
+    phy2log = mlog2log.gather(1, phy2mlog.to(torch.int64))
+
+    # mlog2phy: [num_moe_layers, num_logical_experts, ...]
+    # log2phy[i][j][k] = mlog2phy[i][log2mlog[i][j]][k]
+    log2phy = mlog2phy.gather(
+        1, log2mlog.unsqueeze(-1).expand(-1, -1, mlog2phy.size(-1)).to(torch.int64)
+    )
+
+    # log_count[i][j] = mlog_count[i][log2mlog[i][j]]
+    log_count = mlog_count.gather(1, log2mlog)
+    return phy2log, log2phy, log_count
+
+
+def rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_groups: Optional[int],
+    num_nodes: int,
+    enable_hierarchical: bool,
+):
+    """Dispatch to the hierarchical (prefill) or global (decode) balancer.
+
+    Args:
+        tokens_per_expert: [num_steps, num_moe_layers, num_logical_experts].
+        num_physical_experts: total physical expert slots per layer.
+        num_local_physical_experts: physical experts per GPU.
+        num_groups: number of expert groups; only used when
+            ``enable_hierarchical`` is True (may be None otherwise).
+        num_nodes: number of nodes; only used in the hierarchical path.
+        enable_hierarchical: True selects the node-aware prefill policy.
+
+    Returns:
+        (phy2log, log2phy, log_count) — see the per-phase functions.
+    """
+    if enable_hierarchical:
+        return prefill_rebalance_experts(
+            tokens_per_expert=tokens_per_expert,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+        )
+    else:
+        return decode_rebalance_experts(
+            tokens_per_expert=tokens_per_expert,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+        )
diff --git a/sglang/python/sglang/srt/eplb/eplb_algorithms/elasticity_aware.py b/sglang/python/sglang/srt/eplb/eplb_algorithms/elasticity_aware.py
new file mode 100644
index 0000000000000000000000000000000000000000..c781c444ae3b7671a680ef6c8982c0cde9e960f5
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/eplb_algorithms/elasticity_aware.py
@@ -0,0 +1,87 @@
+from typing import Tuple
+
+import torch
+
+from sglang.srt.eplb.eplb_algorithms.deepseek import rebalance_experts_hierarchical
+
+
+def rebalance_experts(
+    weight: torch.Tensor,
+    num_replicas: int,
+    num_groups: int,
+    num_nodes: int,
+    num_gpus: int,
+    enable_hierarchical: bool,
+    active_ranks: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Entry point for the elasticity-aware expert-parallelism load balancer.
+
+    Parameters:
+        weight: [layers, num_logical_experts], the load statistics for all logical experts
+        num_replicas: number of physical experts, must be a multiple of `num_gpus`
+        num_groups: number of expert groups
+        num_nodes: number of server nodes, where the intra-node network (e.g., NVLink) is faster
+        num_gpus: number of GPUs, must be a multiple of `num_nodes`
+        active_ranks: boolean/int tensor of length `num_gpus`; ranks marked
+            inactive are excluded from balancing and receive placeholder slots
+
+    Returns:
+        physical_to_logical_map: [layers, num_replicas], the expert index of each replica
+        logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
+        expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
+    """
+
+    num_layers, num_logical_experts = weight.shape
+    weight = weight.float().cpu()
+    num_active_ranks = active_ranks.sum().item()
+    num_local_experts = num_replicas // num_gpus
+    if num_active_ranks < num_gpus:
+        # Must fall back to global load-balance policy
+        # and fix some params: balance only the slots owned by active ranks.
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight,
+            num_local_experts * num_active_ranks,
+            1,
+            1,
+            num_active_ranks,
+        )
+    elif enable_hierarchical:
+        # use hierarchical load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, num_groups, num_nodes, num_gpus
+        )
+    else:
+        # use global load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, 1, 1, num_gpus
+        )
+    maxlogcnt = logcnt.max().item()
+    log2phy: torch.Tensor = torch.full(
+        (num_layers, num_logical_experts, maxlogcnt),
+        -1,
+        dtype=torch.int64,
+        device=logcnt.device,
+    )
+    # Invert phy2log: scatter each physical slot index into the (expert,
+    # replica-rank) cell it serves. When all ranks are active,
+    # num_local_experts * num_active_ranks == num_replicas.
+    log2phy.view(num_layers, -1).scatter_(
+        -1,
+        phy2log * maxlogcnt + phyrank,
+        torch.arange(
+            num_local_experts * num_active_ranks,
+            dtype=torch.int64,
+            device=log2phy.device,
+        ).expand(num_layers, -1),
+    )
+    if num_active_ranks < num_gpus:
+        # Re-insert placeholder slices for inactive ranks so the returned maps
+        # are indexed by the full set of num_gpus ranks.
+        phy2log_slices = list(
+            phy2log.view(num_layers, num_active_ranks, -1).unbind(dim=1)
+        )
+        active_ranks_list = active_ranks.tolist()
+        for idx, active_rank in enumerate(active_ranks_list):
+            if not active_rank:
+                # Placeholder experts for the missing rank.
+                phy2log_slices.insert(idx, torch.zeros_like(phy2log_slices[0]))
+                # Shift physical indices at or beyond the inserted rank's slots.
+                log2phy = torch.where(
+                    log2phy >= idx * num_local_experts,
+                    log2phy + num_local_experts,
+                    log2phy,
+                )
+        phy2log = torch.stack(phy2log_slices, dim=1).contiguous().view(num_layers, -1)
+    return phy2log, log2phy, logcnt
diff --git a/sglang/python/sglang/srt/eplb/eplb_simulator/__init__.py b/sglang/python/sglang/srt/eplb/eplb_simulator/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1fcbdf00e0dc48f5d89f857c150e9f34e584e6a
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/eplb_simulator/__init__.py
@@ -0,0 +1 @@
+from . import reader
diff --git a/sglang/python/sglang/srt/eplb/eplb_simulator/reader.py b/sglang/python/sglang/srt/eplb/eplb_simulator/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..97405c3194777144d9e8bea92f24e87a7629263f
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/eplb_simulator/reader.py
@@ -0,0 +1,51 @@
+from collections import defaultdict
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from sglang.srt.eplb.expert_distribution import (
+ _convert_global_physical_count_to_logical_count,
+)
+
+# Public alias so simulator users don't have to import the underscore-prefixed
+# helper from expert_distribution directly.
+convert_global_physical_count_to_logical_count = (
+    _convert_global_physical_count_to_logical_count
+)
+
+
+def read_mode_per_pass(dir_data: Path):
+    """Read data from ExpertDistributionRecorder when recorded with mode `per_pass`.
+
+    Loads every ``*.pt`` file in ``dir_data``, groups the recorded
+    ``global_physical_count`` tensors by (forward_pass_id, rank), sums them
+    over ranks, and stacks them over forward passes.
+
+    Returns:
+        dict with keys ``global_physical_count_of_forward_pass`` (stacked
+        tensor, one row per forward pass in sorted id order),
+        ``last_physical_to_logical_map`` and ``forward_pass_ids``.
+
+    NOTE(review): if ``dir_data`` contains no ``*.pt`` files,
+    ``last_physical_to_logical_map`` is never assigned and the return below
+    raises NameError — confirm whether an empty directory should be an error.
+    """
+
+    # gpc := global_physical_count
+    gpc_of_forward_pass_and_rank = defaultdict(lambda: defaultdict())
+    for path in tqdm(list(dir_data.glob("*.pt"))):
+        data_pack = torch.load(path, weights_only=True)
+        # Keeps the value from the last file processed.
+        last_physical_to_logical_map = data_pack["last_physical_to_logical_map"]
+        for record in data_pack["records"]:
+            forward_pass_id = record["forward_pass_id"]
+            rank = record["rank"]
+            assert (
+                gpc_of_forward_pass_and_rank[forward_pass_id].get(rank) is None
+            ), f"Duplicated {forward_pass_id=} {rank=}"
+            gpc_of_forward_pass_and_rank[forward_pass_id][rank] = record[
+                "global_physical_count"
+            ]
+
+    forward_pass_ids = sorted(gpc_of_forward_pass_and_rank.keys())
+    print(f"Make {forward_pass_ids=} into array")
+
+    items = []
+    # Sum each forward pass's counts over ranks (sorted for determinism).
+    for forward_pass_id, gpc_of_rank in sorted(gpc_of_forward_pass_and_rank.items()):
+        gpc_of_rank_tensor = torch.stack(
+            [gpc for rank, gpc in sorted(gpc_of_rank.items())]
+        ).sum(dim=0)
+        items.append(gpc_of_rank_tensor)
+
+    gpc_of_forward_pass = torch.stack(items)
+    print(f"{gpc_of_forward_pass.shape=}")
+
+    return dict(
+        global_physical_count_of_forward_pass=gpc_of_forward_pass,
+        last_physical_to_logical_map=last_physical_to_logical_map,
+        forward_pass_ids=forward_pass_ids,
+    )
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/hardware_backend/npu/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91964fae8f9eb24411907b4a64d5f50f8c615397
Binary files /dev/null and b/sglang/python/sglang/srt/hardware_backend/npu/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/allocator_npu.py b/sglang/python/sglang/srt/hardware_backend/npu/allocator_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..01842218ebafad3b077e64312bef173bb5199abd
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/allocator_npu.py
@@ -0,0 +1,151 @@
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.mem_cache.allocator import (
+ PagedTokenToKVPoolAllocator,
+ alloc_extend_naive,
+)
+from sglang.srt.utils import get_num_new_pages, next_power_of_2
+
+if TYPE_CHECKING:
+ from sglang.srt.mem_cache.memory_pool import KVCache
+
+
+class NPUPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
+    """Paged KV-cache allocator specialized for Ascend NPU.
+
+    Differs from the base allocator by dispatching small extend batches to the
+    ``sgl_kernel_npu`` Triton kernel and by doing page-dedup on CPU in ``free``.
+    """
+
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: "KVCache",
+        need_sort: bool,
+    ):
+        super().__init__(size, page_size, dtype, device, kvcache, need_sort)
+        # Pre-computed addend for ceil-division by page_size.
+        self.roundup = page_size - 1
+
+    def alloc_extend(
+        self,
+        prefix_lens: torch.Tensor,
+        prefix_lens_cpu: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        last_loc: torch.Tensor,
+        extend_num_tokens: int,
+    ):
+        """Allocate KV slots for extend (prefill) tokens.
+
+        Returns int32 slot indices of shape [extend_num_tokens], or None when
+        not enough free pages are available.
+        """
+        if self.debug_mode:
+            assert torch.all(
+                (last_loc + 1) % self.page_size == prefix_lens % self.page_size
+            )
+
+        # New pages needed = ceil(seq/page) - ceil(prefix/page), per request.
+        num_new_pages = (
+            (seq_lens + self.roundup) // self.page_size
+            - (prefix_lens + self.roundup) // self.page_size
+        ).sum()
+        num_new_pages_item = num_new_pages.item()
+        if self.need_sort and num_new_pages_item > len(self.free_pages):
+            self.merge_and_sort_free()
+
+        if num_new_pages_item > len(self.free_pages):
+            return None
+
+        # NOTE(review): 200 is an empirical kernel/naive crossover threshold —
+        # confirm against benchmarks.
+        if num_new_pages_item < 200:
+            from sgl_kernel_npu.mem_cache.allocator import alloc_extend_kernel
+
+            out_indices = torch.empty(
+                (extend_num_tokens,),
+                dtype=torch.int64,
+                device=self.device,
+            )
+            max_num_extend_tokens = next_power_of_2(extend_num_tokens)
+            bs = prefix_lens.shape[0]
+            alloc_extend_kernel[(bs,)](
+                prefix_lens,
+                seq_lens,
+                last_loc,
+                self.free_pages,
+                out_indices,
+                next_power_of_2(bs),
+                self.page_size,
+                max_num_extend_tokens,
+            )
+
+        else:
+            out_indices = torch.empty(
+                (extend_num_tokens,),
+                dtype=torch.int32,
+                device=self.device,
+            )
+            alloc_extend_naive(
+                prefix_lens,
+                seq_lens,
+                last_loc,
+                self.free_pages,
+                out_indices,
+                self.page_size,
+                self.device,
+            )
+
+        if self.debug_mode:
+            assert len(torch.unique(out_indices)) == len(out_indices)
+
+        # Consume the pages we just handed out.
+        self.free_pages = self.free_pages[num_new_pages_item:]
+        return out_indices.int()
+
+    def alloc_decode(
+        self,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        last_loc: torch.Tensor,
+    ):
+        """Allocate one KV slot per request for decode.
+
+        A request needs a fresh page only when its new token starts a page
+        (seq_len % page_size == 1); otherwise the slot after last_loc is used.
+        Returns int32 slot indices of shape [bs], or None when out of pages.
+        """
+        if self.debug_mode:
+            assert torch.all(
+                (last_loc + 2) % self.page_size == seq_lens % self.page_size
+            )
+
+        num_new_pages = get_num_new_pages(
+            seq_lens=seq_lens_cpu,
+            page_size=self.page_size,
+            decode=True,
+        )
+
+        if num_new_pages > len(self.free_pages):
+            self.merge_and_sort_free()
+
+        if num_new_pages > len(self.free_pages):
+            return None
+
+        need_new_pages = (seq_lens % self.page_size == 1).int()
+        # Prefix-sum assigns the i-th requesting sequence the i-th free page.
+        end_new_pages = torch.cumsum(need_new_pages, 0)
+        start_new_pages = end_new_pages - need_new_pages
+        if num_new_pages == 0:
+            out_indices = last_loc + 1
+        else:
+            out_indices = (last_loc + 1) * (1 - need_new_pages) + self.free_pages[
+                start_new_pages
+            ] * self.page_size * need_new_pages
+
+        if self.debug_mode:
+            assert len(torch.unique(out_indices)) == len(out_indices)
+
+        self.free_pages = self.free_pages[num_new_pages:]
+        return out_indices.int()
+
+    def free(self, free_index: torch.Tensor):
+        """Return the pages covering ``free_index`` slots to the free pool.
+
+        Inside a free group the indices are only queued; otherwise slot indices
+        are converted to unique page indices and appended to the free (or
+        release) list.
+        """
+        if free_index.numel() == 0:
+            return
+
+        if self.is_not_in_free_group:
+            device = free_index.device
+            # Dedup on CPU, then move back.
+            # NOTE(review): presumably torch.unique is slow/unsupported on the
+            # NPU device — confirm the reason for the CPU round-trip.
+            free_page_indices = torch.unique(free_index.cpu() // self.page_size)
+            free_page_indices = free_page_indices.to(device)
+            if self.need_sort:
+                self.release_pages = torch.cat((free_page_indices, self.release_pages))
+            else:
+                self.free_pages = torch.cat((free_page_indices, self.free_pages))
+        else:
+            self.free_group.append(free_index)
+
+        if self.debug_mode:
+            assert len(torch.unique(self.free_pages)) == len(self.free_pages)
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py b/sglang/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a5644177faad7f19e83d8ae134134f1606823cd
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py
@@ -0,0 +1,1800 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import torch_npu
+from sgl_kernel_npu.attention.sinks_attention import (
+ attention_sinks_prefill_triton,
+ attention_sinks_triton,
+)
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.hardware_backend.npu.attention.ascend_torch_native_backend import (
+ AscendTorchNativeAttnBackend,
+)
+from sglang.srt.hardware_backend.npu.attention.mla_preprocess import (
+ is_fia_nz,
+ is_mla_preprocess_enabled,
+)
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.speculative.spec_info import SpecInput
+from sglang.srt.utils import get_bool_env_var
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.model_executor.model_runner import ModelRunner
+
+import logging
+
+import numpy as np
+
+
+def _reshape_kv_for_fia_nz(
+    tensor: torch.Tensor, num_heads: int, head_dim: int, page_size: int
+) -> torch.Tensor:
+    """Reshapes a tensor for FIA NZ format.
+
+    Views the input as [-1, 1, num_heads * head_dim // 16, page_size, 16].
+    NOTE(review): assumes num_heads * head_dim is divisible by 16 and that the
+    input is contiguous with a compatible element count — confirm at call sites.
+    """
+    return tensor.view(-1, 1, num_heads * head_dim // 16, page_size, 16)
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ForwardMetadata:
+    """Per-forward-pass metadata consumed by the Ascend attention backend.
+
+    NOTE(review): init_forward_metadata also sets a dynamic
+    ``extend_seq_lens`` attribute that is not declared as a field here —
+    consider declaring it for consistency.
+    """
+
+    # calculated map for kv positions [bs * maxseqlen]
+    block_tables: Optional[torch.Tensor] = None
+
+    # seq len inputs
+    extend_seq_lens_cpu_int: Optional[torch.Tensor] = None  # per-request extend lengths (CPU, int)
+    seq_lens_cpu_int: Optional[torch.Tensor] = None  # per-request total lengths (CPU, int)
+    seq_lens_cpu_list: Optional[List[int]] = None  # lengths as a Python list (graph capture)
+    seq_lens_list_cumsum: Optional[List[int]] = None  # cumulative extend lengths
+    seq_lens: Optional[torch.Tensor] = None  # per-request total lengths (device)
+    actual_seq_lengths_q: Optional[torch.Tensor] = None  # query lengths fed to NPU kernels
+    actual_seq_lengths_kv: Optional[torch.Tensor] = None  # kv lengths fed to NPU kernels
+
+    # prefix cache
+    prefix_lens: Optional[torch.Tensor] = None  # cached-prefix lengths (CPU)
+    flatten_prefix_block_tables: Optional[torch.Tensor] = None  # flattened prefix page table
+
+
+class AscendAttnMaskBuilder:
+    """Builds and caches the attention masks used by the Ascend backend."""
+
+    def __init__(self, model_runner: ModelRunner, device, use_fia, use_mla):
+        """
+        Initialize the AscendAttnMaskBuilder class.
+
+        :param model_runner: ModelRunner instance for model execution.
+        :param device: Device to run the model on (e.g., 'cuda', 'npu').
+        :param use_fia: Boolean flag to indicate if environment variable ASCEND_USE_FIA is set to 1.
+        :param use_mla: Whether the model uses MLA attention; if so, an extra
+            512x512 bfloat16 mask is pre-built for the RingMLA path.
+        """
+        self.use_fia = use_fia
+        self.model_runner = model_runner
+        self.device = device
+
+        # Initialize mask
+        mask_len = 128
+        self.mask = self.generate_attn_mask(mask_len, "norm", model_runner.dtype).to(
+            self.device
+        )
+
+        # Initialize FIA mask
+        fia_mask_len = 2048
+        self.fia_mask = self.generate_mask_flag(fia_mask_len).to(self.device)
+
+        # Initialize MTP mask
+        mtp_mask_len = 2048
+        self.mtp_mask = self.generate_mask_flag(mtp_mask_len).to(self.device)
+
+        # Initialize mixed chunk mask cache
+        mixed_mask_len = 2048
+        self.mixed_chunk_attn_mask = self.get_splitfuse_attn_mask(mixed_mask_len)
+
+        if use_mla:
+            # Initialize RingMla mask
+            ringmla_mask_len = 512
+            self.ringmla_mask = self.generate_attn_mask(
+                ringmla_mask_len, "norm", torch.bfloat16
+            ).to(self.device)
+
+    @staticmethod
+    def generate_mask_flag(max_seq_len):
+        """
+        Generate a mask flag for attention masks.
+
+        :param max_seq_len: Maximum sequence length for the mask.
+        :return: A boolean tensor where True marks masked (non-causal) positions,
+            i.e. the strict upper triangle.
+        """
+        # Construct lower triangle matrix.
+        mask_flag = torch.ones((max_seq_len, max_seq_len), dtype=torch.bool).tril_()
+        # Create upper triangle matrix used to mark mask positions.
+        mask_flag = ~mask_flag
+        return mask_flag
+
+    @staticmethod
+    def generate_attn_mask(max_seq_len, mode, dtype=torch.float16):
+        """
+        Generate an attention mask.
+
+        :param max_seq_len: Maximum sequence length for the mask.
+        :param mode: Mode of the mask ('mix' or 'norm').
+        :param dtype: Data type of the mask tensor.
+        :return: A tensor with mask positions set to a large-negative/flag
+            value and allowed positions set to 0.
+
+        NOTE(review): in 'norm' mode, bfloat16 falls into the `else` branch and
+        gets mask_value 1 (a flag, not -inf) — confirm this is intended for the
+        consuming kernels.
+        """
+        mask_flag = AscendAttnMaskBuilder.generate_mask_flag(max_seq_len)
+        if mode == "mix":
+            mask_value = (
+                float("-inf") if dtype in [torch.float16, torch.bfloat16] else 1
+            )
+        else:
+            mask_value = torch.finfo(torch.float32).min if dtype == torch.float16 else 1
+        attn_mask = (
+            torch.zeros(size=(max_seq_len, max_seq_len))
+            .masked_fill_(mask_flag, mask_value)
+            .to(dtype)
+        )
+        return attn_mask
+
+    @staticmethod
+    def get_attention_mask_id(seq_lens, extend_lens):
+        """
+        Generate attention mask IDs based on sequence lengths and extended lengths.
+
+        :param seq_lens: Sequence lengths.
+        :param extend_lens: Extended lengths.
+        :return: A tensor containing the attention mask IDs, i.e. the position
+            range [seq_len - extend_len, seq_len) of each request, concatenated.
+        """
+        starts = seq_lens - extend_lens
+        ends = seq_lens
+
+        # Use torch.stack to stack the start and end indices together
+        ranges = torch.stack((starts, ends), dim=-1)
+
+        # Use list comprehension to generate tensors for each range and concatenate them
+        attn_mask_id = torch.cat([torch.arange(start, end) for start, end in ranges])
+        return attn_mask_id
+
+    def update_attn_cache(
+        self,
+        seqlen: int,
+        mask_cache: torch.Tensor,
+        seq_len_cached: int,
+        dtype: torch.dtype,
+        mode,
+    ):
+        """
+        Update the attention mask cache.
+
+        Regenerates the mask only when the requested length exceeds the cached
+        one; always coerces the cache to the requested dtype.
+
+        :param seqlen: Maximum sequence length.
+        :param mask_cache: Current attention mask cache.
+        :param seq_len_cached: Cached sequence length.
+        :param dtype: Data type of the mask tensor.
+        :param mode: Mode of the mask ('mix' or 'norm').
+        :return: Updated mask cache and sequence length cache.
+        """
+        if seqlen > seq_len_cached:
+            seq_len_cached = seqlen
+            mask_cache = self.generate_attn_mask(seqlen, mode, dtype)
+        if mask_cache.dtype != dtype:
+            mask_cache = mask_cache.to(dtype)
+        return mask_cache, seq_len_cached
+
+    def get_splitfuse_attn_mask(
+        self,
+        seq_lens: int = None,
+    ) -> torch.Tensor:
+        """
+        Generate a splitfuse attention mask.
+
+        :param seq_lens: Side length of the square mask (an int, used as a
+            size for torch.ones — previously mis-annotated as a Tensor).
+        :return: An int8 strict-upper-triangular tensor on self.device.
+        """
+        attn_mask = (
+            torch.triu(torch.ones(seq_lens, seq_lens), diagonal=1)
+            .to(torch.int8)
+            .to(self.device)
+        )
+        return attn_mask
+
+
+class AscendAttnBackend(AttentionBackend):
+
+ def __init__(self, model_runner: ModelRunner):
+ super().__init__()
+ self.forward_metadata = None
+ self.device = model_runner.device
+ self.page_size = model_runner.page_size
+ self.use_mla = model_runner.model_config.attention_arch == AttentionArch.MLA
+ if self.use_mla:
+ self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+ self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+ if (
+ "MiniCPM3ForCausalLM"
+ in model_runner.model_config.hf_config.architectures
+ ):
+ self.qk_nope_head_dim = (
+ model_runner.model_config.hf_config.qk_nope_head_dim
+ )
+ else:
+ self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+ self.q_head_dim = self.qk_rope_head_dim + self.qk_nope_head_dim
+ else:
+ self.use_alibi = getattr(model_runner.model_config, "use_alibi", False)
+ if (
+ "Gemma2ForSequenceClassification"
+ in model_runner.model_config.hf_config.architectures
+ ):
+ self.use_native_sdpa = True
+ self.native_attn = AscendTorchNativeAttnBackend()
+ self.graph_metadata = {}
+ self.max_context_len = model_runner.model_config.context_len
+ self.req_to_token = model_runner.req_to_token_pool.req_to_token
+ self.graph_mode = False
+ self.use_fia = get_bool_env_var("ASCEND_USE_FIA", "False")
+ self.enable_torch_compile = model_runner.server_args.enable_torch_compile
+ self.speculative_num_draft_tokens = (
+ model_runner.server_args.speculative_num_draft_tokens
+ )
+ self.ascend_attn_mask_builder = AscendAttnMaskBuilder(
+ model_runner, self.device, self.use_fia, self.use_mla
+ )
+ self.mask, self.fia_mask, self.mtp_mask, self.mix_mask = (
+ self.ascend_attn_mask_builder.mask,
+ self.ascend_attn_mask_builder.fia_mask,
+ self.ascend_attn_mask_builder.mtp_mask,
+ self.ascend_attn_mask_builder.mixed_chunk_attn_mask,
+ )
+ if self.use_mla:
+ self.ringmla_mask = self.ascend_attn_mask_builder.ringmla_mask
+
+ def get_verify_buffers_to_fill_after_draft(self):
+ """
+ Return buffers for verify attention kernels that needs to be filled after draft.
+
+ Typically, these are tree mask and position buffers.
+ """
+ return [None, None]
+
+ def update_verify_buffers_to_fill_after_draft(
+ self, spec_info: SpecInput, cuda_graph_bs: Optional[int]
+ ):
+ pass
+
+ def init_forward_metadata(self, forward_batch: ForwardBatch):
+ """Init the metadata for a forward pass."""
+ self.forward_metadata = ForwardMetadata()
+ seq_lens_max = forward_batch.seq_lens.max()
+ if forward_batch.forward_mode.is_target_verify():
+ seq_lens_max += self.speculative_num_draft_tokens
+ self.forward_metadata.block_tables = (
+ forward_batch.req_to_token_pool.req_to_token[
+ forward_batch.req_pool_indices, :seq_lens_max
+ ][:, :: self.page_size]
+ // self.page_size
+ )
+ if forward_batch.extend_seq_lens is not None:
+ self.forward_metadata.extend_seq_lens = forward_batch.extend_seq_lens
+ self.forward_metadata.extend_seq_lens_cpu_int = (
+ forward_batch.extend_seq_lens.cpu().int()
+ )
+ if forward_batch.seq_lens is not None:
+ self.forward_metadata.seq_lens = forward_batch.seq_lens.int()
+ else:
+ self.forward_metadata.seq_lens = forward_batch.seq_lens_cpu.to(
+ self.device
+ ).int()
+
+ self.forward_metadata.seq_lens_cpu_int = forward_batch.seq_lens_cpu.int()
+ if (
+ not forward_batch.forward_mode.is_draft_extend_v2()
+ and not forward_batch.forward_mode.is_draft_extend()
+ and not forward_batch.forward_mode.is_target_verify()
+ ):
+ seq_lens_list_cumsum = np.cumsum(forward_batch.extend_seq_lens_cpu)
+ self.forward_metadata.seq_lens_list_cumsum = seq_lens_list_cumsum
+
+ if forward_batch.forward_mode.is_target_verify():
+ self.forward_metadata.seq_lens_cpu_int += self.speculative_num_draft_tokens
+
+ if (
+ self.use_mla
+ and forward_batch.forward_mode.is_extend()
+ and not forward_batch.forward_mode.is_draft_extend(include_v2=True)
+ and not forward_batch.forward_mode.is_target_verify()
+ and sum(forward_batch.extend_prefix_lens_cpu) > 0
+ ):
+ self.forward_metadata.prefix_lens = forward_batch.extend_prefix_lens.to(
+ "cpu"
+ )
+ seq_prefix_lens = self.forward_metadata.prefix_lens.tolist()
+ self.forward_metadata.flatten_prefix_block_tables = torch.empty(
+ 0, dtype=torch.int32
+ ).to(self.device)
+ for req_idx, seq_len in zip(
+ forward_batch.req_pool_indices.tolist(), seq_prefix_lens
+ ):
+ req_indices = forward_batch.req_to_token_pool.req_to_token[req_idx]
+ req_prefix_block_tables = (
+ req_indices[:seq_len][:: self.page_size] // self.page_size
+ )
+ self.forward_metadata.flatten_prefix_block_tables = torch.cat(
+ (
+ self.forward_metadata.flatten_prefix_block_tables,
+ torch.flatten(req_prefix_block_tables),
+ )
+ )
+
+ self.graph_mode = False
+
+ def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+ self.graph_metadata = {
+ "block_tables": torch.empty(
+ (max_bs, (self.max_context_len + self.page_size - 1) // self.page_size),
+ dtype=torch.int32,
+ device=self.device,
+ ),
+ }
+
+ def init_forward_metadata_capture_cuda_graph(
+ self,
+ bs: int,
+ num_tokens: int,
+ req_pool_indices: torch.Tensor,
+ seq_lens: torch.Tensor,
+ encoder_lens: Optional[torch.Tensor],
+ forward_mode: ForwardMode,
+ spec_info: Optional[SpecInput],
+ ):
+ metadata = ForwardMetadata()
+
+ metadata.block_tables = self.graph_metadata["block_tables"][:bs, :]
+ metadata.seq_lens_cpu_list = seq_lens.cpu().int().tolist()
+ metadata.seq_lens = seq_lens
+ if (
+ forward_mode.is_target_verify()
+ or forward_mode.is_draft_extend_v2()
+ or forward_mode.is_draft_extend()
+ ):
+ metadata.actual_seq_lengths_q = torch.arange(
+ self.speculative_num_draft_tokens,
+ self.speculative_num_draft_tokens
+ + bs * self.speculative_num_draft_tokens,
+ self.speculative_num_draft_tokens,
+ dtype=torch.int32,
+ device=seq_lens.device,
+ )
+ else:
+ metadata.actual_seq_lengths_q = torch.tensor(
+ [1 + i * 1 for i in range(bs)],
+ dtype=torch.int32,
+ device=seq_lens.device,
+ )
+
+ self.graph_metadata[bs] = metadata
+ self.forward_metadata = metadata
+
+ self.graph_mode = True
+
    def init_forward_metadata_replay_cuda_graph(
        self,
        bs: int,
        req_pool_indices: torch.Tensor,
        seq_lens: torch.Tensor,
        seq_lens_sum: int,
        encoder_lens: Optional[torch.Tensor],
        forward_mode: ForwardMode,
        spec_info: Optional[SpecInput],
        seq_lens_cpu: Optional[torch.Tensor],
    ):
        """Refresh the captured graph's static buffers in place before replay.

        All writes are in-place (`copy_` / `fill_`) because the captured graph
        holds references to these exact tensors; reallocating would break it.
        """
        metadata = self.graph_metadata[bs]
        max_len = seq_lens_cpu[:bs].max().item()
        if forward_mode.is_target_verify():
            # Verify runs score extra draft tokens beyond the committed length.
            max_len += self.speculative_num_draft_tokens
        max_seq_pages = (max_len + self.page_size - 1) // self.page_size

        # Rebuild the page-granular block table from the token map: take one
        # token slot per page and convert its KV index to a page index.
        metadata.block_tables[:bs, :max_seq_pages].copy_(
            self.req_to_token[req_pool_indices[:bs], :max_len][:, :: self.page_size]
            // self.page_size
        )
        # Zero the unused tail columns and the unused batch rows so stale
        # entries from a previous replay cannot leak in.
        metadata.block_tables[:bs, max_seq_pages:].fill_(0)
        metadata.block_tables[bs:, :].fill_(0)
        if forward_mode.is_target_verify():
            seq_lens = seq_lens + self.speculative_num_draft_tokens
        metadata.seq_lens[:bs].copy_(seq_lens[:bs])

        self.forward_metadata = metadata

        self.graph_mode = True
+
+ def get_cuda_graph_seq_len_fill_value(self):
+ return 0
+
+ def _generate_alibi_bias(
+ self,
+ seq_len: int,
+ slopes: torch.Tensor,
+ num_heads: int,
+ device: torch.device,
+ dtype: torch.dtype = torch.bfloat16,
+ ) -> torch.Tensor:
+ position_point = (
+ torch.arange(seq_len).view(1, 1, -1).expand(num_heads, -1, -1).to(device)
+ )
+ alibi = slopes.view(-1, 1, 1) * position_point
+ alibi_bias = alibi.view(num_heads, 1, seq_len).to(device).to(dtype)
+ return alibi_bias
+
+ def generate_alibi_bias(
+ self,
+ q_seq_len: int,
+ kv_seq_len: int,
+ slopes: torch.Tensor,
+ num_heads: int,
+ device: torch.device,
+ is_extend: bool = True,
+ dtype: torch.dtype = torch.bfloat16,
+ ) -> torch.Tensor:
+ MAX_LEN_ALB = 5000
+ max_seq_len = max(kv_seq_len, q_seq_len, MAX_LEN_ALB)
+ if getattr(self, "alibi_bias", None) is None:
+ self.alibi_bias = self._generate_alibi_bias(
+ max_seq_len, slopes, num_heads, device, dtype
+ )
+
+ if getattr(self, "super_mask", None) is None:
+ super_mask = torch.ones(size=(1, max_seq_len, max_seq_len), dtype=dtype)
+ super_mask = super_mask.float().fill_(float("-inf")).type_as(super_mask)
+ super_mask = torch.triu(super_mask, 1).to(device)
+ self.super_mask = super_mask
+ if is_extend:
+ return (
+ self.alibi_bias[:, :q_seq_len, :kv_seq_len]
+ + self.super_mask[:, :q_seq_len, :kv_seq_len]
+ )
+ else:
+ return self.alibi_bias[:, :q_seq_len, :kv_seq_len]
+
+ def attn_alibi(
+ self,
+ q,
+ k_cache,
+ v_cache,
+ block_tables,
+ seq_lens,
+ query_lens,
+ scale_value,
+ num_heads,
+ slopes,
+ is_extend,
+ ):
+ curr = 0
+ num_prompts = query_lens.shape[0]
+ head_size = k_cache.shape[3]
+ head_size_v = v_cache.shape[3]
+ block_size = k_cache.shape[1]
+ attn_output = []
+ for i in range(num_prompts):
+ seq_len = seq_lens[i].item()
+ block_table = block_tables[i]
+
+ j = torch.arange(seq_len, device=block_table.device)
+
+ block_number = block_table[j // block_size]
+ block_offset = j % block_size
+
+ k = k_cache[block_number, block_offset]
+ v = v_cache[block_number, block_offset]
+ k = k.view(seq_len, num_heads, head_size)
+ v = v.view(seq_len, num_heads, head_size_v)
+
+ if is_extend:
+ q_len = query_lens[i].item()
+ query = q[curr : curr + q_len]
+ else:
+ q_len = 1
+ query = q[curr : curr + 1]
+
+ query = query.to(torch.float32)
+ query = query * scale_value
+ query = query.permute(1, 0, 2)
+ k = k.permute(1, 2, 0)
+
+ score = torch.bmm(query, k)
+ score = score.to(torch.float32)
+ if slopes is not None:
+ alibi_bias = self.generate_alibi_bias(
+ q_seq_len=q_len,
+ kv_seq_len=seq_len,
+ slopes=slopes,
+ num_heads=num_heads,
+ device=q.device,
+ is_extend=is_extend,
+ dtype=query.dtype,
+ )
+ score = score + alibi_bias
+ score = torch.max(score, torch.tensor(torch.finfo(score.dtype).min))
+ p = torch.nn.functional.softmax(score, dim=-1)
+ v = v.permute(1, 0, 2)
+ out = torch.bmm(p, v)
+ out = out.permute(1, 0, 2)
+ out = out.reshape(-1, num_heads * head_size_v)
+ attn_output.append(out)
+ curr += q_len
+ attn_output = torch.cat(attn_output, dim=0).to(q.dtype).to(q.device)
+ attn_output = attn_output.view(-1, num_heads * head_size)
+ return attn_output
+
+ def do_cp_balance_attn(
+ self,
+ q_nope,
+ k_nope,
+ q_pe,
+ k_pe,
+ topk_indices,
+ layer,
+ actual_seq_qlen,
+ actual_seq_lengths_kv,
+ ):
+ seq_len = q_nope.shape[0]
+ split_len = (seq_len + 1) // 2
+ q_nope_prev, q_nope_next = torch.split(q_nope, split_len, dim=0)
+ q_rope_prev, q_rope_next = torch.split(q_pe, split_len, dim=0)
+ q_nope_prev = q_nope_prev.contiguous()
+ q_nope_next = q_nope_next.contiguous()
+ q_rope_prev = q_rope_prev.contiguous()
+ q_rope_next = q_rope_next.contiguous()
+ topk_indices_prev, topk_indices_next = topk_indices
+
+ actual_seq_qlen_prev, actual_seq_qlen_next = actual_seq_qlen
+ actual_seq_lengths_kv_prev, actual_seq_lengths_kv_next = actual_seq_lengths_kv
+
+ attn_out_prev, _, _ = torch_npu.npu_sparse_flash_attention(
+ query=q_nope_prev,
+ key=k_nope,
+ value=k_nope,
+ query_rope=q_rope_prev,
+ key_rope=k_pe,
+ sparse_indices=topk_indices_prev,
+ scale_value=layer.scaling,
+ actual_seq_lengths_query=actual_seq_qlen_prev.to(
+ device=q_nope.device, dtype=torch.int32
+ ),
+ actual_seq_lengths_kv=actual_seq_lengths_kv_prev.to(
+ device=q_nope.device, dtype=torch.int32
+ ),
+ block_table=self.forward_metadata.block_tables,
+ sparse_block_size=1,
+ layout_query="TND",
+ layout_kv="PA_BSND",
+ sparse_mode=3,
+ attention_mode=2,
+ return_softmax_lse=False,
+ )
+ attn_out_next, _, _ = torch_npu.npu_sparse_flash_attention(
+ query=q_nope_next,
+ key=k_nope,
+ value=k_nope,
+ query_rope=q_rope_next,
+ key_rope=k_pe,
+ sparse_indices=topk_indices_next,
+ scale_value=layer.scaling,
+ actual_seq_lengths_query=actual_seq_qlen_next.to(
+ device=q_nope.device, dtype=torch.int32
+ ),
+ actual_seq_lengths_kv=actual_seq_lengths_kv_next.to(
+ device=q_nope.device, dtype=torch.int32
+ ),
+ block_table=self.forward_metadata.block_tables,
+ sparse_block_size=1,
+ layout_query="TND",
+ layout_kv="PA_BSND",
+ sparse_mode=3,
+ attention_mode=2,
+ return_softmax_lse=False,
+ )
+ return torch.cat([attn_out_prev, attn_out_next], dim=0)
+
+ def forward_sparse(
+ self,
+ q: torch.Tensor,
+ k: torch.Tensor,
+ v: torch.Tensor,
+ layer: RadixAttention,
+ forward_batch: ForwardBatch,
+ save_kv_cache: bool = True,
+ # For multi_head latent attention
+ q_rope: Optional[torch.Tensor] = None,
+ k_rope: Optional[torch.Tensor] = None,
+ topk_indices: torch.Tensor = None,
+ ):
+
+ is_prefill = (
+ forward_batch.forward_mode.is_extend()
+ and not forward_batch.forward_mode.is_draft_extend_v2()
+ and not forward_batch.forward_mode.is_draft_extend()
+ and not forward_batch.forward_mode.is_target_verify()
+ )
+
+ if save_kv_cache:
+ k = k.view(-1, layer.tp_k_head_num, self.kv_lora_rank)
+ k_rope = k_rope.view(-1, layer.tp_k_head_num, self.qk_rope_head_dim)
+ forward_batch.token_to_kv_pool.set_kv_buffer(
+ layer, forward_batch.out_cache_loc, k, k_rope
+ )
+ q_nope, q_pe = q, q_rope
+ k_nope, k_pe = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+
+ if is_prefill:
+ if self.forward_metadata.actual_seq_lengths_q is not None:
+ actual_seq_qlen = self.forward_metadata.actual_seq_lengths_q
+ else:
+ actual_seq_qlen = torch.cumsum(forward_batch.extend_seq_lens, dim=0)
+ else:
+ if self.forward_metadata.actual_seq_lengths_q is None:
+ if (
+ forward_batch.forward_mode.is_draft_extend_v2()
+ or forward_batch.forward_mode.is_target_verify()
+ ):
+ actual_seq_qlen = (
+ torch.arange(
+ self.speculative_num_draft_tokens,
+ self.speculative_num_draft_tokens + q.shape[0],
+ self.speculative_num_draft_tokens,
+ dtype=torch.int32,
+ )
+ .to(q.device)
+ .to(torch.int32)
+ )
+ elif forward_batch.forward_mode.is_draft_extend():
+ actual_seq_qlen = (
+ forward_batch.extend_seq_lens.cumsum()
+ .to(q.device)
+ .to(torch.int32)
+ )
+ else:
+ actual_seq_qlen = (
+ torch.arange(1, q.shape[0] + 1).to(q.device).to(torch.int32)
+ )
+ else:
+ actual_seq_qlen = self.forward_metadata.actual_seq_lengths_q
+
+ if self.forward_metadata.actual_seq_lengths_kv is not None:
+ actual_seq_lengths_kv = self.forward_metadata.actual_seq_lengths_kv
+ elif self.forward_metadata.seq_lens_cpu_int is not None:
+ actual_seq_lengths_kv = self.forward_metadata.seq_lens_cpu_int
+ else:
+ actual_seq_lengths_kv = self.forward_metadata.seq_lens
+
+ if (
+ is_prefill
+ and is_nsa_enable_prefill_cp()
+ and forward_batch.nsa_cp_metadata is not None
+ ):
+ attn_out = self.do_cp_balance_attn(
+ q_nope,
+ k_nope,
+ q_pe,
+ k_pe,
+ topk_indices,
+ layer,
+ actual_seq_qlen,
+ actual_seq_lengths_kv,
+ )
+ else:
+ attn_out, _, _ = torch_npu.npu_sparse_flash_attention(
+ query=q_nope,
+ key=k_nope,
+ value=k_nope,
+ query_rope=q_pe,
+ key_rope=k_pe,
+ sparse_indices=topk_indices,
+ scale_value=layer.scaling,
+ actual_seq_lengths_query=actual_seq_qlen.to(
+ device=q_nope.device, dtype=torch.int32
+ ),
+ actual_seq_lengths_kv=actual_seq_lengths_kv.to(
+ device=q_nope.device, dtype=torch.int32
+ ),
+ block_table=self.forward_metadata.block_tables,
+ sparse_block_size=1,
+ layout_query="TND",
+ layout_kv="PA_BSND",
+ sparse_mode=3,
+ attention_mode=2,
+ return_softmax_lse=False,
+ )
+
+ return attn_out
+
    def forward_extend(
        self,
        q,
        k,
        v,
        layer: RadixAttention,
        forward_batch: ForwardBatch,
        save_kv_cache: bool = True,
        # For multi_head latent attention
        q_rope: Optional[torch.Tensor] = None,
        k_rope: Optional[torch.Tensor] = None,
        topk_indices: Optional[torch.Tensor] = None,
        sinks: Optional[torch.Tensor] = None,
        slopes: Optional[torch.Tensor] = None,
    ):
        """Extend (prefill) attention dispatcher.

        Routes to: sparse attention (topk_indices set), MTP/spec-decode paths,
        the non-MLA paged-attention paths, or the MLA ring/FIA paths.
        Returns a (tokens, heads * v_head_dim)-shaped output (or the MLA
        latent-rank equivalent, depending on the branch taken).
        """
        if is_mla_preprocess_enabled():
            # MLAPO and MLAPROLOG do save kv_cache
            save_kv_cache = False
        # NSA top-k sparse attention path.
        if topk_indices is not None:
            return self.forward_sparse(
                q,
                k,
                v,
                layer,
                forward_batch,
                save_kv_cache,
                q_rope,
                k_rope,
                topk_indices,
            )
        # Speculative-decoding modes use the dedicated MTP path.
        if (
            forward_batch.forward_mode.is_target_verify()
            or forward_batch.forward_mode.is_draft_extend()
            or forward_batch.forward_mode.is_draft_extend_v2()
        ):
            return self.forward_mtp(
                q,
                k,
                v,
                layer,
                forward_batch,
                save_kv_cache,
                q_rope=q_rope,
                k_rope=k_rope,
            )

        if not self.use_mla:
            # In cross attention layer, when there is no vision input,the values of k and v is None
            if save_kv_cache and k is not None and v is not None:
                # support cross attention
                cache_loc = (
                    forward_batch.out_cache_loc
                    if not layer.is_cross_attention
                    else forward_batch.encoder_out_cache_loc
                )
                forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)

            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)

            # Attention-sink models (e.g. sliding-window with sink tokens).
            if sinks is not None:
                attn_out = attention_sinks_prefill_triton(
                    q,
                    k_cache,
                    v_cache,
                    sinks,
                    self.forward_metadata.extend_seq_lens,
                    self.forward_metadata.block_tables,
                    self.forward_metadata.seq_lens,
                    layer.scaling,
                    layer.sliding_window_size,
                    layer.tp_q_head_num,
                    layer.tp_k_head_num,
                )
                return attn_out

            if self.use_fia:
                """FIA will support multi-bs in the later version of CANN"""
                # One fused-infer-attention call per request (batch loop).
                # NOTE(review): k/v slices here are the freshly-extended
                # tokens only — presumably this path assumes no prefix cache;
                # confirm against the callers that enable use_fia.
                q = q.reshape(-1, layer.tp_q_head_num, layer.qk_head_dim)
                attn_output = torch.empty(
                    (q.size(0), layer.tp_q_head_num, layer.v_head_dim),
                    device=q.device,
                    dtype=q.dtype,
                )
                q_len_offset = 0
                for q_len in forward_batch.extend_seq_lens_cpu:
                    attn_output[q_len_offset : q_len_offset + q_len] = (
                        torch.ops.npu.npu_fused_infer_attention_score(
                            q[None, q_len_offset : q_len_offset + q_len],
                            k[None, q_len_offset : q_len_offset + q_len],
                            v[None, q_len_offset : q_len_offset + q_len],
                            num_heads=layer.tp_q_head_num,
                            num_key_value_heads=layer.tp_k_head_num,
                            input_layout="BSND",  # todo, TND not supports q_heads!=k_heads
                            atten_mask=self.fia_mask.unsqueeze(0),
                            sparse_mode=3 if q_len != 1 else 0,
                            scale=layer.scaling,
                            next_tokens=0,
                        )[0]
                    )
                    q_len_offset += q_len
                attn_output = attn_output.view(
                    -1, layer.tp_q_head_num * layer.v_head_dim
                )

            else:
                # Causal unless this is cross attention or an encoder-only layer.
                causal = True
                if (
                    layer.is_cross_attention
                    or layer.attn_type == AttentionType.ENCODER_ONLY
                ):
                    causal = False

                # there are some accuracy issues in cross attention scene to use torch_npu._npu_flash_attention_qlens
                # forward_batch.encoder_lens is not None in cross attention scend, we add native attn to solve accuracy issues
                # Model skywork-reward-gemma2-2-27B also suffers from precision anomalies, thus the torch native backend becomes beneficial approach.
                if (
                    layer.qk_head_dim <= 128
                    and causal
                    and forward_batch.encoder_lens is None
                    and layer.logit_cap == 0
                    and not getattr(self, "use_native_sdpa", False)
                ):
                    if not self.use_alibi:
                        # Fast path: fused paged flash attention with qlens.
                        query = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
                        attn_output = torch.empty(
                            (query.shape[0], layer.tp_q_head_num * layer.v_head_dim),
                            dtype=query.dtype,
                            device=query.device,
                        )

                        torch_npu._npu_flash_attention_qlens(
                            query=query,
                            key_cache=k_cache,
                            value_cache=v_cache,
                            mask=self.mask,
                            block_table=self.forward_metadata.block_tables,
                            seq_len=self.forward_metadata.extend_seq_lens_cpu_int,
                            context_lens=self.forward_metadata.seq_lens_cpu_int,
                            scale_value=layer.scaling,
                            num_heads=layer.tp_q_head_num,
                            num_kv_heads=layer.tp_k_head_num,
                            out=attn_output,
                        )
                    else:
                        # ALiBi models go through the reference per-request loop.
                        attn_output = self.attn_alibi(
                            q=q.reshape(-1, layer.tp_q_head_num, layer.qk_head_dim),
                            k_cache=k_cache,
                            v_cache=v_cache,
                            block_tables=self.forward_metadata.block_tables,
                            seq_lens=self.forward_metadata.seq_lens_cpu_int,
                            query_lens=self.forward_metadata.extend_seq_lens_cpu_int,
                            scale_value=layer.scaling,
                            num_heads=layer.tp_q_head_num,
                            slopes=slopes,
                            is_extend=True,
                        )
                else:
                    # Torch-native SDPA fallback (cross attention / logit cap /
                    # large head dim / explicitly requested).
                    if layer.qk_head_dim != layer.v_head_dim:
                        attn_output = q.new_empty(
                            (q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
                        )
                    else:
                        attn_output = torch.empty_like(q)

                    use_gqa = layer.tp_q_head_num != layer.tp_k_head_num

                    q_ = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
                    o_ = attn_output.view(-1, layer.tp_q_head_num, layer.v_head_dim)

                    # add forward_batch.encoder_lens and is_cross_attention arguments for cross attention scene
                    attn_output = self.native_attn.run_sdpa_forward_extend(
                        q_,
                        o_,
                        k_cache.view(-1, layer.tp_k_head_num, layer.qk_head_dim),
                        v_cache.view(-1, layer.tp_v_head_num, layer.v_head_dim),
                        forward_batch.req_to_token_pool.req_to_token,
                        forward_batch.req_pool_indices,
                        forward_batch.seq_lens,
                        forward_batch.extend_prefix_lens,
                        forward_batch.extend_seq_lens,
                        forward_batch.encoder_lens,
                        is_cross_attention=layer.is_cross_attention,
                        scaling=layer.scaling,
                        enable_gqa=use_gqa,
                        causal=causal,
                        logit_cap=layer.logit_cap,
                        logit_capping_method=layer.logit_capping_method,
                    )
                    attn_output = attn_output.view(
                        -1, layer.tp_q_head_num * layer.v_head_dim
                    )
        # MLA with a non-empty prefix cache: two-pass ring attention that
        # merges new-token attention with attention over the cached prefix.
        elif sum(forward_batch.extend_prefix_lens_cpu) > 0:
            # Drop padded tail tokens before the compute; re-pad at the end.
            num_token_padding = q.shape[0]
            q, k, v = [
                data[: forward_batch.num_token_non_padded_cpu] for data in [q, k, v]
            ]
            q_nope, q_rope = q.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
            k_nope, k_rope = k.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)

            # 1st, compute extend tokens to get attn_output and attn_lse
            num_tokens = q_nope.size(0)
            attn_output = torch.zeros(
                num_tokens,
                layer.tp_q_head_num,
                layer.v_head_dim,
                dtype=q_nope.dtype,
                device=q_nope.device,
            )
            attn_lse = torch.zeros(
                layer.tp_q_head_num,
                num_tokens,
                dtype=torch.float32,
                device=q_nope.device,
            )
            torch_npu.atb.npu_ring_mla(
                q_nope=q_nope,
                q_rope=q_rope,
                k_nope=k_nope,
                k_rope=k_rope,
                value=v,
                mask=self.ringmla_mask,
                seqlen=self.forward_metadata.extend_seq_lens_cpu_int,
                head_num=layer.tp_q_head_num,
                kv_head_num=layer.tp_k_head_num,
                pre_out=None,
                prev_lse=None,
                qk_scale=layer.scaling,
                kernel_type="kernel_type_high_precision",
                mask_type="mask_type_triu",
                calc_type="calc_type_first_ring",
                output=attn_output,
                softmax_lse=attn_lse,
            )

            # 2nd, load history kvcache(kv_a and k_pe) and calculate k_nope
            k_buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
            v_buffer = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
            kv_cached = torch.index_select(
                k_buffer, 0, self.forward_metadata.flatten_prefix_block_tables
            )
            k_rope_cached = torch.index_select(
                v_buffer, 0, self.forward_metadata.flatten_prefix_block_tables
            ).flatten(0, 1)

            # Up-project the cached latent KV to per-head nope keys and values.
            assert layer.kv_b_proj is not None
            kv = layer.kv_b_proj(kv_cached)[0].view(
                -1, layer.tp_k_head_num, self.qk_nope_head_dim + layer.v_head_dim
            )
            k_nope, v = kv.split([self.qk_nope_head_dim, layer.v_head_dim], dim=-1)

            # 3rd, compute history kv to attn_out
            k_rope = k_rope_cached.expand(-1, layer.tp_k_head_num, -1)
            # Rows: [extend lengths, prefix lengths] per request.
            seq_len = torch.stack(
                [
                    self.forward_metadata.extend_seq_lens_cpu_int,
                    self.forward_metadata.prefix_lens,
                ]
            )
            torch_npu.atb.npu_ring_mla(
                q_nope=q_nope,
                q_rope=q_rope,
                k_nope=k_nope,
                k_rope=k_rope,
                value=v,
                mask=self.ringmla_mask,
                seqlen=seq_len,
                head_num=layer.tp_q_head_num,
                kv_head_num=layer.tp_k_head_num,
                pre_out=attn_output,
                prev_lse=attn_lse,
                qk_scale=layer.scaling,
                kernel_type="kernel_type_high_precision",
                mask_type="no_mask",
                calc_type="calc_type_default",
                output=attn_output,
                softmax_lse=attn_lse,
            )
            attn_output = attn_output.reshape(
                [-1, layer.tp_q_head_num, layer.v_head_dim]
            )
            # Restore the padded batch shape expected by the caller.
            if num_token_padding != forward_batch.num_token_non_padded_cpu:
                attn_output = torch.cat(
                    [
                        attn_output,
                        attn_output.new_zeros(
                            num_token_padding - attn_output.shape[0],
                            *attn_output.shape[1:],
                        ),
                    ],
                    dim=0,
                )
        else:
            assert (
                layer.qk_head_dim != layer.v_head_dim
            ), "FIA only supports qk_head_dim != v_head_dim"
            if layer.v_head_dim in [256]:
                """Currently, in NO_QUANT situation, qk_nope_head_dim == v_head_dim, and rope exists, v_head_dim only support 512 and 128"""
                kv_lora_rank = k.shape[-1] - self.qk_rope_head_dim
                kv_c, k_rope = k.split([kv_lora_rank, self.qk_rope_head_dim], dim=-1)
                if save_kv_cache:
                    forward_batch.token_to_kv_pool.set_kv_buffer(
                        layer, forward_batch.out_cache_loc, kv_c, k_rope
                    )
                attn_output = q.new_empty(
                    (q.shape[0], layer.tp_q_head_num, kv_lora_rank)
                )
                use_gqa = layer.tp_q_head_num != layer.tp_k_head_num

                k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
                v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
                    layer.layer_id
                )
                kv_cache = torch.cat([k_cache, v_cache], dim=-1)
                # NOTE(review): the key argument is the concatenated kv_cache
                # viewed with qk_head_dim and the value argument is k_cache
                # viewed with v_head_dim — the value looks like it may have
                # been intended to come from v_cache; confirm before touching.
                attn_output = self.native_attn.run_sdpa_forward_extend(
                    q,
                    attn_output,
                    kv_cache.view(-1, layer.tp_k_head_num, layer.qk_head_dim),
                    k_cache.view(-1, layer.tp_v_head_num, layer.v_head_dim),
                    forward_batch.req_to_token_pool.req_to_token,
                    forward_batch.req_pool_indices,
                    forward_batch.seq_lens,
                    forward_batch.extend_prefix_lens,
                    forward_batch.extend_seq_lens,
                    scaling=layer.scaling,
                    enable_gqa=use_gqa,
                    causal=True,
                )
            else:
                # MLA prefill without prefix: single fused FIA call in TND
                # layout using cumulative sequence lengths.
                num_token_padding = q.shape[0]
                q, k, v = [
                    data[: forward_batch.num_token_non_padded_cpu] for data in [q, k, v]
                ]

                q_nope, q_rope = q.split(
                    [layer.v_head_dim, self.qk_rope_head_dim], dim=-1
                )
                k_nope, k_rope = k.split(
                    [layer.v_head_dim, self.qk_rope_head_dim], dim=-1
                )

                attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
                    q_nope,
                    k_nope,
                    v,
                    query_rope=q_rope,
                    key_rope=k_rope,
                    num_heads=layer.tp_q_head_num,
                    input_layout="TND",
                    atten_mask=self.fia_mask,
                    sparse_mode=3,
                    actual_seq_lengths=self.forward_metadata.seq_lens_list_cumsum,
                    actual_seq_lengths_kv=self.forward_metadata.seq_lens_list_cumsum,
                    scale=layer.scaling,
                    next_tokens=0,
                )

                attn_output = attn_output.reshape(
                    -1, layer.tp_q_head_num, layer.v_head_dim
                )
                # Restore the padded batch shape expected by the caller.
                if num_token_padding != forward_batch.num_token_non_padded_cpu:
                    attn_output = torch.cat(
                        [
                            attn_output,
                            attn_output.new_zeros(
                                num_token_padding - attn_output.shape[0],
                                *attn_output.shape[1:],
                            ),
                        ],
                        dim=0,
                    )

        return attn_output
+
    def forward_mtp(
        self,
        q,
        k,
        v,
        layer: RadixAttention,
        forward_batch: ForwardBatch,
        save_kv_cache: bool,
        q_rope: Optional[torch.Tensor] = None,
        k_rope: Optional[torch.Tensor] = None,
    ):
        """Attention for speculative-decoding modes (target-verify /
        draft-extend), for both the plain and MLA cache layouts.

        Non-graph mode truncates padded tokens before compute and re-pads the
        result; graph mode uses the precomputed replay metadata as-is.
        """
        if save_kv_cache:
            if self.use_mla:
                # MLA stores the latent kv (k) and the rope part separately.
                k = k.view(-1, layer.tp_k_head_num, self.kv_lora_rank)
                k_rope = k_rope.view(-1, layer.tp_k_head_num, self.qk_rope_head_dim)
                forward_batch.token_to_kv_pool.set_kv_buffer(
                    layer, forward_batch.out_cache_loc, k, k_rope
                )
            else:
                forward_batch.token_to_kv_pool.set_kv_buffer(
                    layer, forward_batch.out_cache_loc, k, v
                )

        if not self.use_mla:
            # Paged caches viewed as (pages, page_size, heads*dim).
            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(
                layer.layer_id
            ).view(-1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim)
            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
                layer.layer_id
            ).view(-1, self.page_size, layer.tp_v_head_num * layer.v_head_dim)
            query = q.reshape(-1, layer.tp_q_head_num, layer.qk_head_dim).contiguous()
            if not self.graph_mode:
                # Drop padded tail tokens; re-padded after the kernel call.
                num_token_padding = query.shape[0]
                query = query[: forward_batch.num_token_non_padded_cpu]
            if self.forward_metadata.seq_lens_cpu_int is None:
                actual_seq_lengths_kv = self.forward_metadata.seq_lens_cpu_list
            else:
                actual_seq_lengths_kv = (
                    self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
                )
            if forward_batch.forward_mode.is_draft_extend():
                # Variable query lengths: cumulative extend lengths.
                actual_seq_lengths = (
                    np.array(forward_batch.extend_seq_lens_cpu).cumsum().tolist()
                )
            else:
                # Fixed k draft tokens per request: [k, 2k, ...].
                actual_seq_lengths = np.arange(
                    self.speculative_num_draft_tokens,
                    self.speculative_num_draft_tokens + query.shape[0],
                    self.speculative_num_draft_tokens,
                )

            attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
                query,
                k_cache,
                v_cache,
                block_table=self.forward_metadata.block_tables,
                block_size=self.page_size,
                num_heads=layer.tp_q_head_num,
                num_key_value_heads=layer.tp_k_head_num,
                input_layout="TND",
                atten_mask=self.mtp_mask,
                scale=layer.scaling,
                actual_seq_lengths=actual_seq_lengths,
                actual_seq_lengths_kv=actual_seq_lengths_kv,
                sparse_mode=3,
            )
            attn_output = attn_output.view(-1, layer.tp_q_head_num * layer.v_head_dim)
            # Restore the padded batch shape expected by the caller.
            if (
                not self.graph_mode
                and forward_batch.num_token_non_padded_cpu != num_token_padding
            ):
                attn_output = torch.cat(
                    [
                        attn_output,
                        attn_output.new_zeros(
                            num_token_padding - forward_batch.num_token_non_padded_cpu,
                            *attn_output.shape[1:],
                        ),
                    ],
                    dim=0,
                )
            return attn_output
        else:
            c_kv, k_rope = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
            # NZ layouts need a dedicated reshape for the FIA kernel.
            if is_fia_nz():
                k_rope_cache = _reshape_kv_for_fia_nz(
                    k_rope, layer.tp_k_head_num, self.qk_rope_head_dim, self.page_size
                )
                c_kv_cache = _reshape_kv_for_fia_nz(
                    c_kv, layer.tp_v_head_num, self.kv_lora_rank, self.page_size
                )
            else:
                k_rope_cache = k_rope.view(
                    -1, layer.tp_k_head_num, self.page_size, self.qk_rope_head_dim
                )
                c_kv_cache = c_kv.view(
                    -1, layer.tp_v_head_num, self.page_size, self.kv_lora_rank
                )

            q_nope = q.view(-1, layer.tp_q_head_num, self.kv_lora_rank).contiguous()
            q_rope = q_rope.view(-1, layer.tp_q_head_num, self.qk_rope_head_dim)
            if not self.graph_mode:
                # Drop padded tail tokens; re-padded after the kernel call.
                num_token_padding = q.shape[0]
                q_nope = q_nope[: forward_batch.num_token_non_padded_cpu]
                q_rope = q_rope[: forward_batch.num_token_non_padded_cpu]
            if self.forward_metadata.seq_lens_cpu_int is None:
                actual_seq_lengths_kv = self.forward_metadata.seq_lens_cpu_list
            else:
                actual_seq_lengths_kv = (
                    self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
                )
            if forward_batch.forward_mode.is_draft_extend():
                actual_seq_lengths = (
                    np.array(forward_batch.extend_seq_lens_cpu).cumsum().tolist()
                )
            else:
                actual_seq_lengths = np.arange(
                    self.speculative_num_draft_tokens,
                    self.speculative_num_draft_tokens + q_nope.shape[0],
                    self.speculative_num_draft_tokens,
                )

            # Pre-size the workspace, then run the out-variant kernel with the
            # exact same arguments (the pairing is required by the NPU API).
            workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
                q_nope,
                c_kv_cache,
                c_kv_cache,
                query_rope=q_rope,
                key_rope=k_rope_cache,
                num_heads=layer.tp_q_head_num,
                num_key_value_heads=layer.tp_k_head_num,
                input_layout="TND",
                scale=layer.scaling,
                antiquant_mode=0,
                antiquant_scale=None,
                block_table=self.forward_metadata.block_tables,
                block_size=self.page_size,
                sparse_mode=3,
                atten_mask=self.mtp_mask,
                actual_seq_lengths=actual_seq_lengths,
                actual_seq_lengths_kv=actual_seq_lengths_kv,
            )
            attn_output = torch.empty_like(q_nope, dtype=q.dtype, device=q.device)
            softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
            torch_npu.npu_fused_infer_attention_score.out(
                q_nope,
                c_kv_cache,
                c_kv_cache,
                query_rope=q_rope,
                key_rope=k_rope_cache,
                num_heads=layer.tp_q_head_num,
                num_key_value_heads=layer.tp_k_head_num,
                input_layout="TND",
                scale=layer.scaling,
                antiquant_mode=0,
                antiquant_scale=None,
                block_table=self.forward_metadata.block_tables,
                block_size=self.page_size,
                sparse_mode=3,
                atten_mask=self.mtp_mask,
                actual_seq_lengths=actual_seq_lengths,
                actual_seq_lengths_kv=actual_seq_lengths_kv,
                workspace=workspace,
                out=[attn_output, softmax_lse],
            )
            attn_output = attn_output.view(-1, layer.tp_q_head_num * layer.v_head_dim)
            # Restore the padded batch shape expected by the caller.
            if (
                not self.graph_mode
                and forward_batch.num_token_non_padded_cpu != num_token_padding
            ):
                attn_output = torch.cat(
                    [
                        attn_output,
                        attn_output.new_zeros(
                            num_token_padding - attn_output.shape[0],
                            *attn_output.shape[1:],
                        ),
                    ],
                    dim=0,
                )
            return attn_output
+
    def forward_decode_graph(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        layer: RadixAttention,
        forward_batch: ForwardBatch,
        save_kv_cache: bool = True,
        q_rope: Optional[torch.Tensor] = None,
        k_rope: Optional[torch.Tensor] = None,
        sinks: Optional[torch.Tensor] = None,
    ):
        """Decode attention used while running under a captured graph.

        Uses the workspace + `.out` variant of the fused kernel so that all
        output tensors are pre-allocated (required for graph capture/replay).
        """
        if save_kv_cache:
            if self.use_mla:
                # MLA stores the latent kv (k) and the rope part separately.
                k = k.view(-1, layer.tp_k_head_num, self.kv_lora_rank)
                k_rope = k_rope.view(-1, layer.tp_k_head_num, self.qk_rope_head_dim)
                forward_batch.token_to_kv_pool.set_kv_buffer(
                    layer, forward_batch.out_cache_loc, k, k_rope
                )
            else:
                forward_batch.token_to_kv_pool.set_kv_buffer(
                    layer, forward_batch.out_cache_loc, k, v
                )

        # Attention-sink models take the dedicated Triton kernel path.
        if sinks is not None:
            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)

            attn_out = attention_sinks_triton(
                q,
                k_cache,
                v_cache,
                sinks,
                self.forward_metadata.block_tables,
                self.forward_metadata.seq_lens,
                layer.scaling,
                layer.sliding_window_size,
                layer.tp_q_head_num,
                layer.tp_k_head_num,
            )
            return attn_out

        if not self.use_mla:
            # Paged caches viewed as (pages, page_size, heads*dim).
            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(
                layer.layer_id
            ).view(-1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim)
            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
                layer.layer_id
            ).view(-1, self.page_size, layer.tp_v_head_num * layer.v_head_dim)
            query = q.reshape(-1, 1, layer.tp_q_head_num * layer.qk_head_dim)
            if self.forward_metadata.seq_lens_cpu_int is None:
                actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_list
            else:
                actual_seq_len_kv = (
                    self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
                )
            num_tokens = query.shape[0]
            # Pre-size the workspace, then run the out-variant kernel with the
            # exact same arguments (pairing required by the NPU API).
            workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
                query,
                k_cache,
                v_cache,
                block_table=self.forward_metadata.block_tables,
                block_size=self.page_size,
                num_heads=layer.tp_q_head_num,
                num_key_value_heads=layer.tp_k_head_num,
                input_layout="BSH",
                scale=layer.scaling,
                actual_seq_lengths_kv=actual_seq_len_kv,
            )
            output = torch.empty(
                (num_tokens, 1, layer.tp_q_head_num * layer.v_head_dim),
                dtype=q.dtype,
                device=q.device,
            )
            softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
            torch_npu.npu_fused_infer_attention_score.out(
                query,
                k_cache,
                v_cache,
                block_table=self.forward_metadata.block_tables,
                block_size=self.page_size,
                num_heads=layer.tp_q_head_num,
                num_key_value_heads=layer.tp_k_head_num,
                input_layout="BSH",
                scale=layer.scaling,
                actual_seq_lengths_kv=actual_seq_len_kv,
                workspace=workspace,
                out=[output, softmax_lse],
            )
            return output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
        else:
            c_kv, k_rope = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
            # NZ layouts need a dedicated reshape for the FIA kernel.
            if is_fia_nz():
                k_rope_cache = _reshape_kv_for_fia_nz(
                    k_rope, layer.tp_k_head_num, self.qk_rope_head_dim, self.page_size
                )
                c_kv_cache = _reshape_kv_for_fia_nz(
                    c_kv, layer.tp_v_head_num, self.kv_lora_rank, self.page_size
                )
            else:
                k_rope_cache = k_rope.view(
                    -1, self.page_size, layer.tp_k_head_num * self.qk_rope_head_dim
                )
                c_kv_cache = c_kv.view(
                    -1, self.page_size, layer.tp_k_head_num * self.kv_lora_rank
                )

            q_nope = q.view(-1, 1, layer.tp_q_head_num, self.kv_lora_rank).contiguous()
            q_rope = q_rope.view(-1, 1, layer.tp_q_head_num, self.qk_rope_head_dim)

            if self.forward_metadata.seq_lens_cpu_int is None:
                actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_list
            else:
                actual_seq_len_kv = (
                    self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
                )

            workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
                q_nope,
                c_kv_cache,
                c_kv_cache,
                query_rope=q_rope,
                key_rope=k_rope_cache,
                num_heads=layer.tp_q_head_num,
                num_key_value_heads=layer.tp_k_head_num,
                block_table=self.forward_metadata.block_tables,
                block_size=self.page_size,
                input_layout="BSND",
                scale=layer.scaling,
                actual_seq_lengths_kv=actual_seq_len_kv,
                antiquant_mode=0,
                antiquant_scale=None,
                sparse_mode=0,
            )
            output = torch.empty_like(q_nope, dtype=q.dtype, device=q.device)
            softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)

            torch_npu.npu_fused_infer_attention_score.out(
                q_nope,
                c_kv_cache,
                c_kv_cache,
                query_rope=q_rope,
                key_rope=k_rope_cache,
                num_heads=layer.tp_q_head_num,
                num_key_value_heads=layer.tp_k_head_num,
                block_table=self.forward_metadata.block_tables,
                block_size=self.page_size,
                input_layout="BSND",
                scale=layer.scaling,
                actual_seq_lengths_kv=actual_seq_len_kv,
                antiquant_mode=0,
                antiquant_scale=None,
                sparse_mode=0,
                workspace=workspace,
                out=[output, softmax_lse],
            )
            return output.view(-1, layer.tp_q_head_num * self.kv_lora_rank)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        topk_indices: Optional[torch.Tensor] = None,
+        sinks: Optional[torch.Tensor] = None,
+        slopes: Optional[torch.Tensor] = None,
+    ):
+        """Single-token (decode) attention on Ascend NPU.
+
+        Dispatches, in priority order, to:
+          1. sparse (top-k) attention when ``topk_indices`` is given,
+          2. the captured-graph path when running in graph mode,
+          3. an MHA path (attention sinks / fused-infer-attention /
+             paged attention / ALiBi / native SDPA fallback), or
+          4. an MLA path (fused-infer-attention or paged MLA kernel).
+        Returns the attention output flattened to
+        [num_tokens, tp_q_head_num * per-head output dim].
+        """
+        if is_mla_preprocess_enabled():
+            # MLAPO does saving kv_cache
+            save_kv_cache = False
+        if topk_indices is not None:
+            # Sparse attention: only the selected top-k KV positions are used.
+            return self.forward_sparse(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache,
+                q_rope,
+                k_rope,
+                topk_indices,
+            )
+
+        if self.graph_mode and (not self.enable_torch_compile):
+            # Replay path for captured NPU graphs.
+            return self.forward_decode_graph(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache,
+                q_rope=q_rope,
+                k_rope=k_rope,
+                sinks=sinks,
+            )
+
+        if not self.use_mla:
+            # In a cross-attention layer, when there is no vision input, k and v are None.
+            if save_kv_cache and k is not None and v is not None:
+                # support cross attention
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+            num_tokens = q.shape[0]
+            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+
+            if sinks is not None:
+                # Attention-sinks kernel (also handles sliding-window via
+                # layer.sliding_window_size).
+                attn_out = attention_sinks_triton(
+                    q,
+                    k_cache,
+                    v_cache,
+                    sinks,
+                    self.forward_metadata.block_tables,
+                    self.forward_metadata.seq_lens,
+                    layer.scaling,
+                    layer.sliding_window_size,
+                    layer.tp_q_head_num,
+                    layer.tp_k_head_num,
+                )
+                return attn_out
+
+            if self.use_fia:
+                # Fused-infer-attention expects per-request KV lengths as a
+                # Python list on the host.
+                if self.forward_metadata.seq_lens_cpu_int is None:
+                    actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_list
+                else:
+                    actual_seq_len_kv = (
+                        self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
+                    )
+                attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
+                    q.view(
+                        forward_batch.batch_size,
+                        -1,
+                        layer.tp_q_head_num,
+                        layer.qk_head_dim,
+                    ),
+                    # KV caches are flattened to paged [num_pages, page, H*D].
+                    k_cache.view(
+                        -1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim
+                    ),
+                    v_cache.view(
+                        -1, self.page_size, layer.tp_v_head_num * layer.qk_head_dim
+                    ),
+                    num_heads=layer.tp_q_head_num,
+                    num_key_value_heads=layer.tp_k_head_num,
+                    input_layout="BSND",
+                    atten_mask=None,
+                    block_size=self.page_size,
+                    block_table=self.forward_metadata.block_tables,
+                    actual_seq_lengths_kv=actual_seq_len_kv,
+                    scale=layer.scaling,
+                )
+            # There are some accuracy issues in the cross-attention scene with
+            # torch_npu._npu_flash_attention_qlens.
+            # forward_batch.encoder_lens is not None in the cross-attention scene;
+            # we fall through to native attention below to solve the accuracy issues.
+            elif forward_batch.encoder_lens is None and layer.logit_cap == 0:
+                query = q.reshape(-1, layer.tp_q_head_num, layer.qk_head_dim)
+                num_tokens = query.shape[0]
+                if not self.use_alibi:
+                    attn_output = torch.empty(
+                        (num_tokens, layer.tp_q_head_num, layer.v_head_dim),
+                        dtype=query.dtype,
+                        device=query.device,
+                    )
+
+                    torch_npu._npu_paged_attention(
+                        query=query,
+                        key_cache=k_cache,
+                        value_cache=v_cache,
+                        num_heads=layer.tp_q_head_num,
+                        num_kv_heads=layer.tp_k_head_num,
+                        scale_value=layer.scaling,
+                        block_table=self.forward_metadata.block_tables,
+                        context_lens=self.forward_metadata.seq_lens_cpu_int,
+                        out=attn_output,
+                    )
+                else:
+                    # ALiBi positional bias path; every decode query has length 1.
+                    attn_output = self.attn_alibi(
+                        q=query,
+                        k_cache=k_cache,
+                        v_cache=v_cache,
+                        block_tables=self.forward_metadata.block_tables,
+                        seq_lens=self.forward_metadata.seq_lens_cpu_int,
+                        query_lens=torch.ones(num_tokens, dtype=torch.int32),
+                        scale_value=layer.scaling,
+                        num_heads=layer.tp_q_head_num,
+                        slopes=slopes,
+                        is_extend=False,
+                    )
+            else:
+                # Fallback: torch-native SDPA (cross attention or logit capping).
+                if layer.qk_head_dim != layer.v_head_dim:
+                    attn_output = q.new_empty(
+                        (q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
+                    )
+                else:
+                    attn_output = torch.empty_like(q)
+
+                use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
+
+                q_ = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+                o_ = attn_output.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+
+                # causal=False: a decode query is the newest token, so all
+                # cached positions are legitimately visible.
+                attn_output = self.native_attn.run_sdpa_forward_decode(
+                    q_,
+                    o_,
+                    k_cache.view(-1, layer.tp_k_head_num, layer.qk_head_dim),
+                    v_cache.view(-1, layer.tp_v_head_num, layer.v_head_dim),
+                    forward_batch.req_to_token_pool.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    forward_batch.encoder_lens,
+                    is_cross_attention=layer.is_cross_attention,
+                    scaling=layer.scaling,
+                    enable_gqa=use_gqa,
+                    causal=False,
+                    logit_cap=layer.logit_cap,
+                    logit_capping_method=layer.logit_capping_method,
+                )
+            return attn_output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
+        else:
+            # MLA path: cache stores the compressed KV (kv_c) and rope part (k_pe).
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, forward_batch.out_cache_loc, k, k_rope
+                )
+            num_tokens = q.shape[0]
+            kv_c = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            k_pe = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+
+            if self.use_fia and (layer.tp_q_head_num // layer.tp_k_head_num) >= 8:
+                # layer.tp_q_head_num // layer.tp_k_head_num < 8 will be
+                # supported in a later version of CANN.
+                if is_fia_nz():
+                    # NZ (fractal) memory layout required by some CANN versions.
+                    kv_c = _reshape_kv_for_fia_nz(
+                        kv_c, layer.tp_k_head_num, self.kv_lora_rank, self.page_size
+                    )
+                    k_pe = _reshape_kv_for_fia_nz(
+                        k_pe, layer.tp_k_head_num, self.qk_rope_head_dim, self.page_size
+                    )
+                else:
+                    kv_c = kv_c.view(
+                        -1, self.page_size, layer.tp_k_head_num * self.kv_lora_rank
+                    )
+                    k_pe = k_pe.view(
+                        -1, self.page_size, layer.tp_k_head_num * self.qk_rope_head_dim
+                    )
+                # q here is the absorbed (latent) query: last dim is kv_lora_rank.
+                q = q.view(
+                    forward_batch.batch_size, -1, layer.tp_q_head_num, self.kv_lora_rank
+                )
+                q_rope = q_rope.view(
+                    forward_batch.batch_size,
+                    -1,
+                    layer.tp_q_head_num,
+                    self.qk_rope_head_dim,
+                )
+                # kv_c doubles as both key and value in absorbed MLA.
+                attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
+                    q,
+                    kv_c,
+                    kv_c,
+                    query_rope=q_rope,
+                    key_rope=k_pe,
+                    num_heads=layer.tp_q_head_num,
+                    num_key_value_heads=layer.tp_k_head_num,
+                    input_layout="BSND",
+                    atten_mask=None,
+                    sparse_mode=0,
+                    scale=layer.scaling,
+                    antiquant_mode=0,
+                    antiquant_scale=None,
+                    block_table=self.forward_metadata.block_tables,
+                    block_size=self.page_size,
+                    actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_int,
+                )
+            else:
+                assert (
+                    self.graph_mode == False
+                )  # _npu_paged_attention_mla not support graph mode
+                if q_rope is not None:
+                    q = torch.cat([q, q_rope], dim=-1)
+                query = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+                # Pack [kv_c | k_pe] into one paged cache tensor for the kernel.
+                kv_c_and_k_pe_cache = torch.cat([kv_c, k_pe], dim=-1)
+                kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.view(
+                    -1,
+                    self.page_size,
+                    layer.tp_k_head_num,
+                    self.kv_lora_rank + self.qk_rope_head_dim,
+                )
+                attn_output = torch.empty(
+                    [num_tokens, layer.tp_q_head_num, self.kv_lora_rank],
+                    dtype=q.dtype,
+                    device=q.device,
+                )
+                torch_npu._npu_paged_attention_mla(
+                    query=query,
+                    key_cache=kv_c_and_k_pe_cache,
+                    num_kv_heads=layer.tp_k_head_num,
+                    num_heads=layer.tp_q_head_num,
+                    scale_value=layer.scaling,
+                    block_table=self.forward_metadata.block_tables,
+                    context_lens=self.forward_metadata.seq_lens_cpu_int,
+                    mla_vheadsize=self.kv_lora_rank,
+                    out=attn_output,
+                )
+            return attn_output.view(num_tokens, layer.tp_q_head_num * self.kv_lora_rank)
+
+    def forward_mixed(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        topk_indices: Optional[torch.Tensor] = None,
+    ):
+        """Mixed-chunk attention (prefill and decode tokens in one batch).
+
+        Uses the fused-infer-attention kernel in "TND" layout with a causal
+        mask (self.mix_mask, sparse_mode=3). Only supported for non-MLA,
+        non-sparse layers; other combinations raise NotImplementedError.
+        Returns output flattened to [num_tokens, tp_q_head_num * v_head_dim].
+        """
+        if (
+            topk_indices is not None
+            or self.use_mla
+            or (not self.use_fia and layer.qk_head_dim > 128)
+        ):
+            raise NotImplementedError(
+                "The 'enable-mixed-chunk' feature is currently unsupported in the following scenarios: "
+                "1. When using the MLA backend on Ascend NPU devices, "
+                "2. When using the deepseekv3.2 model on Ascend NPU devices, "
+                "3. When the environment variable ASCEND_USE_FIA is set to 0 and qk_head_dim exceeds 128 on Ascend NPU devices."
+            )
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+        v_cache = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+        # Flatten the paged caches to [num_pages, page_size, heads*dim].
+        num_block, block_size, _, _ = k_cache.shape
+        key = k_cache.view(num_block, block_size, -1)
+        value = v_cache.view(num_block, block_size, -1)
+
+        query = q.reshape(-1, layer.tp_q_head_num, layer.qk_head_dim)
+
+        attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
+            query,
+            key,
+            value,
+            num_heads=layer.tp_q_head_num,
+            num_key_value_heads=layer.tp_k_head_num,
+            input_layout="TND",
+            block_size=block_size,
+            block_table=self.forward_metadata.block_tables,
+            atten_mask=self.mix_mask,
+            sparse_mode=3,
+            # Cumulative query lengths and per-request KV lengths distinguish
+            # the prefill chunks from the decode tokens inside one batch.
+            actual_seq_lengths=self.forward_metadata.seq_lens_list_cumsum,
+            actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_int,
+            scale=layer.scaling,
+        )
+
+        return attn_output.view(
+            attn_output.shape[0], layer.tp_q_head_num * layer.v_head_dim
+        )
+
+
+class AscendAttnMultiStepDraftBackend:
+ """
+ Wrap multiple Ascend attention backends as one for multiple consecutive
+ draft decoding steps
+ """
+
+ def __init__(
+ self,
+ model_runner: ModelRunner,
+ topk: int,
+ speculative_num_steps: int,
+ ):
+ self.topk = topk
+ self.speculative_num_steps = speculative_num_steps
+
+ self.attn_backends = []
+ for _ in range(self.speculative_num_steps):
+ self.attn_backends.append(AscendAttnBackend(model_runner))
+
+ def common_template(self, forward_batch: ForwardBatch, call_fn: int):
+ assert forward_batch.spec_info is not None
+
+ for i in range(self.speculative_num_steps - 1):
+ call_fn(i, forward_batch)
+
+ def init_forward_metadata(self, forward_batch: ForwardBatch):
+ def call_fn(i, forward_batch):
+ assert forward_batch.spec_info is not None
+ self.attn_backends[i].init_forward_metadata(forward_batch)
+
+ self.common_template(forward_batch, call_fn)
+
+ def init_cuda_graph_state(self, max_bs, max_num_tokens):
+ for i in range(self.speculative_num_steps):
+ self.attn_backends[i].init_cuda_graph_state(max_bs, max_num_tokens)
+
+ def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+ def call_fn(i, forward_batch):
+ self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+ forward_batch.batch_size,
+ forward_batch.batch_size * self.topk,
+ forward_batch.req_pool_indices,
+ forward_batch.seq_lens,
+ encoder_lens=None,
+ forward_mode=ForwardMode.DECODE,
+ spec_info=forward_batch.spec_info,
+ )
+
+ self.common_template(forward_batch, call_fn)
+
+ def init_forward_metadata_replay_cuda_graph(
+ self, forward_batch: ForwardBatch, bs: int
+ ):
+ def call_fn(i, forward_batch):
+ self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+ bs,
+ forward_batch.req_pool_indices,
+ forward_batch.seq_lens,
+ seq_lens_sum=-1,
+ encoder_lens=None,
+ forward_mode=ForwardMode.DECODE,
+ spec_info=forward_batch.spec_info,
+ seq_lens_cpu=forward_batch.seq_lens_cpu,
+ )
+
+ self.common_template(forward_batch, call_fn)
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/attention/ascend_torch_native_backend.py b/sglang/python/sglang/srt/hardware_backend/npu/attention/ascend_torch_native_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..34bbfc67f2ddc6299e669c9215dd33d7d38a96f9
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/attention/ascend_torch_native_backend.py
@@ -0,0 +1,282 @@
+from __future__ import annotations
+
+import math
+
+import torch
+from torch.nn.functional import scaled_dot_product_attention
+
+
+class AscendTorchNativeAttnBackend:
+ def __init__(self):
+ pass
+
+ def scaled_dot_product_attention_with_softcapping(
+ self,
+ query,
+ key,
+ value,
+ attn_mask=None,
+ is_causal=False,
+ scale=None,
+ enable_gqa=False,
+ logit_cap=0.0,
+ logit_capping_method="tanh",
+ ) -> torch.Tensor:
+ L, S = query.size(-2), key.size(-2)
+ scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+ attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
+ if is_causal:
+ assert attn_mask is None
+ temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(
+ diagonal=0
+ )
+ attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+ attn_bias.to(query.dtype)
+
+ if attn_mask is not None:
+ if attn_mask.dtype == torch.bool:
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+ else:
+ attn_bias = attn_mask + attn_bias
+
+ if enable_gqa:
+ key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
+ value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)
+
+ attn_weight = query @ key.transpose(-2, -1) * scale_factor
+
+ if logit_cap > 0:
+ if logit_capping_method == "tanh":
+ attn_weight = logit_cap * torch.tanh(attn_weight / logit_cap)
+
+ attn_weight += attn_bias
+ attn_weight = torch.softmax(attn_weight, dim=-1)
+ return attn_weight @ value
+
+ def run_sdpa_forward_extend(
+ self,
+ query: torch.Tensor,
+ output: torch.Tensor,
+ k_cache: torch.Tensor,
+ v_cache: torch.Tensor,
+ req_to_token: torch.Tensor,
+ req_pool_indices: torch.Tensor,
+ seq_lens: torch.Tensor,
+ extend_prefix_lens: torch.Tensor,
+ extend_seq_lens: torch.Tensor,
+ encoder_lens: torch.Tensor = None,
+ is_cross_attention: bool = False,
+ scaling=None,
+ enable_gqa=False,
+ causal=False,
+ logit_cap: float = 0.0,
+ logit_capping_method: str = "tanh",
+ ):
+ """Run the extend forward by using torch native sdpa op.
+
+ Args:
+ query: [num_tokens, num_heads, head_size]
+ output: [num_tokens, num_heads, head_size]
+ k_cache: [max_total_num_tokens, num_heads, head_size]
+ v_cache: [max_total_num_tokens, num_heads, head_size]
+ req_to_token: [max_num_reqs, max_context_len]
+ req_pool_indices: [num_seqs]
+ seq_lens: [num_seqs]
+ extend_prefix_lens: [num_seqs]
+ extend_seq_lens: [num_seqs]
+ encoder_lens: [num_seqs]
+ is_cross_attention: [bool]
+ scaling: float or None
+ enable_gqa: bool
+ causal: bool
+
+ Returns:
+ output: [num_tokens, num_heads, head_size]
+ """
+
+ assert seq_lens.shape[0] == extend_prefix_lens.shape[0]
+ assert seq_lens.shape[0] == extend_seq_lens.shape[0]
+
+ # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+ query = query.movedim(0, query.dim() - 2)
+
+ start_q, start_kv = 0, 0
+ for seq_idx in range(seq_lens.shape[0]):
+ # Need optimize the performance later.
+
+ extend_seq_len_q = extend_seq_lens[seq_idx]
+ prefill_seq_len_q = extend_prefix_lens[seq_idx]
+
+ seq_len_kv = seq_lens[seq_idx]
+ end_q = start_q + extend_seq_len_q
+ end_kv = start_kv + seq_len_kv
+ atten_start_kv = 0
+ atten_end_kv = seq_lens[seq_idx]
+ # support cross attention
+ if encoder_lens is not None:
+ if is_cross_attention:
+ atten_end_kv = encoder_lens[seq_idx]
+ else:
+ atten_start_kv = encoder_lens[seq_idx]
+ atten_end_kv = encoder_lens[seq_idx] + extend_seq_len_q
+
+ per_req_query = query[:, start_q:end_q, :]
+ per_req_query_redudant = torch.empty(
+ (per_req_query.shape[0], seq_len_kv, per_req_query.shape[2]),
+ dtype=per_req_query.dtype,
+ device=per_req_query.device,
+ )
+
+ per_req_query_redudant[:, prefill_seq_len_q:, :] = per_req_query
+
+ # get key and value from cache. per_req_tokens contains the kv cache
+ # index for each token in the sequence.
+ req_pool_idx = req_pool_indices[seq_idx]
+ per_req_tokens = req_to_token[req_pool_idx, atten_start_kv:atten_end_kv]
+ per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+ per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+ if not (per_req_query.dtype == per_req_key.dtype == per_req_value.dtype):
+ # scaled_dot_product_attention() expects query, key, and value to have the same dtype
+ per_req_key = per_req_key.to(per_req_query.dtype)
+ per_req_value = per_req_value.to(per_req_query.dtype)
+
+ if logit_cap > 0:
+ per_req_out_redudant = (
+ self.scaled_dot_product_attention_with_softcapping(
+ per_req_query_redudant.unsqueeze(0),
+ per_req_key.unsqueeze(0),
+ per_req_value.unsqueeze(0),
+ enable_gqa=enable_gqa,
+ scale=scaling,
+ is_causal=causal,
+ logit_cap=logit_cap,
+ logit_capping_method=logit_capping_method,
+ )
+ .squeeze(0)
+ .movedim(query.dim() - 2, 0)
+ )
+ else:
+ per_req_out_redudant = (
+ scaled_dot_product_attention(
+ per_req_query_redudant.unsqueeze(0),
+ per_req_key.unsqueeze(0),
+ per_req_value.unsqueeze(0),
+ enable_gqa=enable_gqa,
+ scale=scaling,
+ is_causal=causal,
+ )
+ .squeeze(0)
+ .movedim(query.dim() - 2, 0)
+ )
+ output[start_q:end_q, :, :] = per_req_out_redudant[prefill_seq_len_q:, :, :]
+ start_q, start_kv = end_q, end_kv
+ return output
+
+ def run_sdpa_forward_decode(
+ self,
+ query: torch.Tensor,
+ output: torch.Tensor,
+ k_cache: torch.Tensor,
+ v_cache: torch.Tensor,
+ req_to_token: torch.Tensor,
+ req_pool_indices: torch.Tensor,
+ seq_lens: torch.Tensor,
+ encoder_lens: torch.Tensor = None,
+ is_cross_attention: bool = False,
+ scaling=None,
+ enable_gqa=False,
+ causal=False,
+ logit_cap: float = 0.0,
+ logit_capping_method: str = "tanh",
+ ):
+ """Run the decode forward by using torch native sdpa op.
+
+ Args:
+ query: [num_tokens, num_heads, head_size]
+ output: [num_tokens, num_heads, head_size]
+ k_cache: [max_total_num_tokens, num_heads, head_size]
+ v_cache: [max_total_num_tokens, num_heads, head_size]
+ req_to_token: [max_num_reqs, max_context_len]
+ req_pool_indices: [num_seqs]
+ seq_lens: [num_seqs]
+ encoder_lens: [num_seqs]
+ is_cross_attention: [bool]
+ scaling: float or None
+ enable_gqa: bool
+ causal: bool
+
+ Returns:
+ output: [num_tokens, num_heads, head_size]
+ """
+
+ # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+ query = query.movedim(0, query.dim() - 2)
+
+ start_q, start_kv = 0, 0
+ for seq_idx in range(seq_lens.shape[0]):
+ # Need optimize the performance later.
+
+ seq_len_q = 1
+ seq_len_kv = seq_lens[seq_idx]
+ end_q = start_q + seq_len_q
+ end_kv = start_kv + seq_len_kv
+ atten_start_kv = 0
+ atten_end_kv = seq_lens[seq_idx]
+ # support cross attention
+ if encoder_lens is not None:
+ if is_cross_attention:
+ atten_end_kv = encoder_lens[seq_idx]
+ else:
+ atten_start_kv = encoder_lens[seq_idx]
+ atten_end_kv = encoder_lens[seq_idx] + seq_len_kv
+
+ per_req_query = query[:, start_q:end_q, :]
+
+ # get key and value from cache. per_req_tokens contains the kv cache
+ # index for each token in the sequence.
+ req_pool_idx = req_pool_indices[seq_idx]
+ per_req_tokens = req_to_token[req_pool_idx, atten_start_kv:atten_end_kv]
+ per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+ per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+ if not (per_req_query.dtype == per_req_key.dtype == per_req_value.dtype):
+ # scaled_dot_product_attention() expects query, key, and value to have the same dtype
+ per_req_key = per_req_key.to(per_req_query.dtype)
+ per_req_value = per_req_value.to(per_req_query.dtype)
+
+ if logit_cap > 0:
+ per_req_out = (
+ self.scaled_dot_product_attention_with_softcapping(
+ per_req_query.unsqueeze(0),
+ per_req_key.unsqueeze(0),
+ per_req_value.unsqueeze(0),
+ enable_gqa=enable_gqa,
+ scale=scaling,
+ is_causal=causal,
+ logit_cap=logit_cap,
+ logit_capping_method=logit_capping_method,
+ )
+ .squeeze(0)
+ .movedim(query.dim() - 2, 0)
+ )
+ else:
+ per_req_out = (
+ scaled_dot_product_attention(
+ per_req_query.unsqueeze(0),
+ per_req_key.unsqueeze(0),
+ per_req_value.unsqueeze(0),
+ enable_gqa=enable_gqa,
+ scale=scaling,
+ is_causal=causal,
+ )
+ .squeeze(0)
+ .movedim(query.dim() - 2, 0)
+ )
+ output[start_q:end_q, :, :] = per_req_out
+ start_q, start_kv = end_q, end_kv
+
+ return output
+
+ def support_triton(self):
+ return False
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/cmo.py b/sglang/python/sglang/srt/hardware_backend/npu/cmo.py
new file mode 100644
index 0000000000000000000000000000000000000000..40f3b4f1696b4778d3e2d7b4acb25d85535bd6bb
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/cmo.py
@@ -0,0 +1,54 @@
+import torch
+
+cmo_stream = None
+
+
+def get_cmo_stream():
+ """
+ Cache Management Operation(CMO).
+ Launch a new stream to prefetch the weight of matmul when running other
+ AIV or communication kernels, aiming to overlap the memory access time.
+ """
+ global cmo_stream
+ return cmo_stream
+
+
+def set_cmo_stream(stream):
+    # Install the stream used for weight-prefetch (CMO) operations; `global`
+    # is required because the module-level variable is rebound.
+    global cmo_stream
+    cmo_stream = stream
+
+
+def prepare_weight_cache(handle, cache, PREFETCH_MAX_SIZE=1000000000):
+ """
+ PREFETCH_MAX_SIZE: maximum size (bytes) for each prefetch operation.
+ This affects the time spent in prefetch:
+ time ≈ PREFETCH_MAX_SIZE / system_bandwidth
+ """
+ import torch_npu
+
+ stream = get_cmo_stream()
+ if stream is None:
+ stream = torch.npu.Stream()
+ set_cmo_stream(stream)
+ stream.wait_stream(torch.npu.current_stream())
+ with torch.npu.stream(stream):
+ if isinstance(cache, list):
+ for weight in cache:
+ torch_npu.npu_prefetch(
+ weight,
+ handle,
+ PREFETCH_MAX_SIZE,
+ )
+ else:
+ torch_npu.npu_prefetch(
+ cache,
+ handle,
+ PREFETCH_MAX_SIZE,
+ )
+
+
+def wait_cmo_stream():
+ stream = get_cmo_stream()
+ if stream is not None:
+ cur_stream = torch.npu.current_stream()
+ cur_stream.wait_stream(stream)
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/__pycache__/eagle_draft_npu_graph_runner.cpython-311.pyc b/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/__pycache__/eagle_draft_npu_graph_runner.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c683ef158d5c7b47cbc59ea62d2dc25dbef1ee1
Binary files /dev/null and b/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/__pycache__/eagle_draft_npu_graph_runner.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/eagle_draft_extend_npu_graph_runner.py b/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/eagle_draft_extend_npu_graph_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..92308ca46c89ed222bcefeb1d145ebfb8acac09c
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/eagle_draft_extend_npu_graph_runner.py
@@ -0,0 +1,71 @@
+# Copyright 2024-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with npu graph and torch.compile."""
+
+from __future__ import annotations
+
+import threading
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.configs.model_config import is_deepseek_nsa
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.speculative.eagle_draft_extend_cuda_graph_runner import (
+ EAGLEDraftExtendCudaGraphRunner,
+)
+
+if TYPE_CHECKING:
+ from sglang.srt.speculative.eagle_worker import EAGLEWorker
+
+
+class EAGLEDraftExtendNpuGraphRunner(EAGLEDraftExtendCudaGraphRunner):
+ def __init__(self, eagle_worker: EAGLEWorker):
+ super().__init__(eagle_worker)
+
+ def _create_graph(self):
+ return torch.npu.NPUGraph()
+
+ def _cache_loc_dtype(self):
+ return torch.int32
+
+ def _capture_init(self, run_once_fn):
+ for _ in range(2):
+ torch.npu.synchronize()
+ self.model_runner.tp_group.barrier()
+ run_once_fn()
+
+ def _capture_graph(self, graph, pool, stream, run_once_fn):
+ with torch.npu.graph(
+ graph, pool=pool, stream=stream, auto_dispatch_capture=True
+ ):
+ out = run_once_fn()
+ return out
+
+ def _replay_update(self, seq_lens):
+ self.graphs[self.bs].update(
+ cpu_update_input=[{"actual_seq_lengths_kv": seq_lens}]
+ )
+
+ def _replay(self, forward_batch: ForwardBatch):
+ if not is_deepseek_nsa(self.model_runner.model_config.hf_config):
+ seq_lens = forward_batch.seq_lens_cpu.tolist() + [0] * (
+ self.bs - self.raw_bs
+ )
+ thread = threading.Thread(target=self._replay_update, args=(seq_lens,))
+ thread.start()
+ self.graphs[self.bs].replay()
+ thread.join()
+ else:
+ self.graphs[self.bs].replay()
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/eagle_draft_npu_graph_runner.py b/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/eagle_draft_npu_graph_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ffc5fdd2d57ef9e022bffc20b14223eee77a6f0
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/eagle_draft_npu_graph_runner.py
@@ -0,0 +1,109 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with npu graph and torch.compile"""
+
+from __future__ import annotations
+
+import logging
+import threading
+from typing import TYPE_CHECKING, Dict, Union
+
+import numpy as np
+import torch
+
+from sglang.srt.configs.model_config import AttentionArch, is_deepseek_nsa
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
+ EAGLEDraftCudaGraphRunner,
+)
+
+if TYPE_CHECKING:
+ from sglang.srt.speculative.eagle_worker import EAGLEWorker
+
+from sglang.srt.utils import is_npu
+
+logger = logging.getLogger(__name__)
+
+if is_npu():
+    # WARNING: process-wide monkeypatch. Redirects the torch.cuda graph API to
+    # the Ascend torch.npu equivalents so CUDA-graph code paths in the base
+    # runner work unchanged on NPU. Affects every module in this process.
+    torch.cuda.CUDAGraph = torch.npu.NPUGraph
+    torch.cuda.synchronize = torch.npu.synchronize
+    torch.cuda.graph = torch.npu.graph
+    torch.cuda.stream = torch.npu.stream
+    torch.cuda.Stream = torch.npu.Stream
+    torch.cuda.current_stream = torch.npu.current_stream
+
+
+class EAGLEDraftNpuGraphRunner(EAGLEDraftCudaGraphRunner):
+    """EAGLE draft-decode graph runner specialized for Ascend NPU graphs."""
+
+    def __init__(self, eagle_worker: EAGLEWorker):
+        super().__init__(eagle_worker)
+        # Resolved before each replay: name/container type of the graph input
+        # that must be refreshed on the CPU side.
+        self.update_attr_name = None
+        self.update_attr_type = None
+        self._init_arch_map()
+
+    def _init_arch_map(self):
+        # Maps attention architecture -> name (and container type) of the
+        # sequence-length graph input updated at replay time.
+        self.attr_name: Dict[AttentionArch, str] = {
+            AttentionArch.MLA: "actual_seq_lengths_kv",
+            AttentionArch.MHA: "context_lens",
+        }
+        self.attr_type: Dict[AttentionArch, Union[list, torch.Tensor]] = {
+            AttentionArch.MLA: [],
+            AttentionArch.MHA: torch.Tensor(),
+        }
+
+    def _create_graph(self):
+        # Ascend counterpart of torch.cuda.CUDAGraph.
+        return torch.npu.NPUGraph()
+
+    def _capture_init(self, run_once_fn):
+        # Warm up twice so lazy initialization completes before capture.
+        for _ in range(2):
+            torch.npu.synchronize()
+            self.model_runner.tp_group.barrier()
+            run_once_fn()
+
+    def _capture_graph(self, graph, pool, stream, run_once_fn):
+        with torch.npu.graph(
+            graph, pool=pool, stream=stream, auto_dispatch_capture=True
+        ):
+            out = run_once_fn()
+        return out
+
+    def _get_update_attr_name(self):
+        # NOTE(review): always returns the MLA entry regardless of the model's
+        # architecture — confirm the MHA case is handled elsewhere.
+        return self.attr_name[AttentionArch.MLA]
+
+    def _get_update_attr_type(self):
+        # NOTE(review): see _get_update_attr_name — MLA entry only.
+        return self.attr_type[AttentionArch.MLA]
+
+    def _replay_update(self, seq_lens):
+        # Tensor-typed inputs must be updated with an int32 tensor; list-typed
+        # inputs take the Python list as-is.
+        if isinstance(self.update_attr_type, torch.Tensor):
+            seq_lens = torch.from_numpy(np.array(seq_lens).astype(np.int32))
+
+        self.graphs[self.bs].update(
+            cpu_update_input=[{self.update_attr_name: seq_lens}]
+        )
+
+    def _replay(self, forward_batch: ForwardBatch):
+        self.update_attr_name = self._get_update_attr_name()
+        self.update_attr_type = self._get_update_attr_type()
+        if not is_deepseek_nsa(self.model_runner.model_config.hf_config):
+            # Pad lengths to the captured batch size; push the CPU-side update
+            # concurrently with the graph replay.
+            seq_lens = forward_batch.seq_lens_cpu.tolist() + [0] * (
+                self.bs - self.raw_bs
+            )
+            thread = threading.Thread(target=self._replay_update, args=(seq_lens,))
+            thread.start()
+            self.graphs[self.bs].replay()
+            thread.join()
+        else:
+            self.graphs[self.bs].replay()
+
+    def _cache_loc_dtype(self):
+        # Ascend kernels index the KV cache with 32-bit integers.
+        return torch.int32
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/npu_graph_runner.py b/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/npu_graph_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..896667d33c9d7219103e3c8868d1bc2a0aeeb8ca
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/graph_runner/npu_graph_runner.py
@@ -0,0 +1,201 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with npu graph and torch.compile."""
+
+from __future__ import annotations
+
+import logging
+import os
+import threading
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, Optional, Union
+
+import numpy as np
+import torch
+
+import sglang
+from sglang.srt.configs.model_config import AttentionArch, is_deepseek_nsa
+from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
+from sglang.srt.utils import (
+ empty_context,
+ get_bool_env_var,
+ get_compiler_backend,
+ is_npu,
+)
+
+# NOTE(review): rebinds the imported function name to its result — after this
+# line `is_npu` is a bool in this module, shadowing sglang.srt.utils.is_npu.
+is_npu = is_npu()
+
+if is_npu:
+    # torch_npu and its profiler are only importable on Ascend hosts.
+    import torch_npu
+    from torch_npu.profiler import ProfilerActivity, profile
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+ from sglang.srt.model_executor.model_runner import ModelRunner
+
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+
+
+@contextmanager
+def patch_model_npu(
+    model: torch.nn.Module,
+    enable_compile: bool,
+    num_tokens: int,
+    tp_group: GroupCoordinator,
+):
+    """Yield the forward callable used for NPU graph capture.
+
+    When ``enable_compile`` is set, yields ``model.forward`` wrapped in
+    no_grad and compiled with the Ascend ``npugraph_ex`` backend (fullgraph,
+    static shapes); otherwise yields the eager ``model.forward`` unchanged.
+
+    NOTE(review): ``num_tokens`` and ``tp_group`` are unused here; presumably
+    kept to mirror the CUDA ``patch_model`` signature it replaces — confirm
+    before removing.
+    """
+    if enable_compile:
+        backend = get_compiler_backend("npugraph_ex")
+        yield torch.compile(
+            torch.no_grad()(model.forward),
+            fullgraph=True,
+            dynamic=False,
+            backend=backend,
+        )
+    else:
+        yield model.forward
+
+
+class NPUGraphRunner(CudaGraphRunner):
+    """A NPUGraphRunner runs the forward pass of a model with npu graph and torch.compile."""
+
+    def __init__(self, model_runner: ModelRunner):
+        # Swap the CUDA patch_model for the NPU variant BEFORE the base-class
+        # constructor runs (it captures graphs through patch_model).
+        sglang.srt.model_executor.cuda_graph_runner.patch_model = patch_model_npu
+        super().__init__(model_runner)
+        # Resolved before each replay: name/container type of the graph input
+        # that must be refreshed on the CPU side.
+        self.update_attr_name = None
+        self.update_attr_type = None
+        self.model_runner = model_runner
+        self._init_arch_map()
+        self.use_fia = get_bool_env_var("ASCEND_USE_FIA", "False")
+
+    def _init_arch_map(self):
+        # Maps attention architecture -> name (and container type) of the
+        # sequence-length graph input updated at replay time.
+        self.attr_name: Dict[AttentionArch, str] = {
+            AttentionArch.MLA: "actual_seq_lengths_kv",
+            AttentionArch.MHA: "context_lens",
+        }
+        self.attr_type: Dict[AttentionArch, Union[list, torch.Tensor]] = {
+            AttentionArch.MLA: [],
+            AttentionArch.MHA: torch.Tensor(),
+        }
+
+    def _create_device_graph(self):
+        # Ascend counterpart of torch.cuda.CUDAGraph.
+        return torch.npu.NPUGraph()
+
+    def _capture_graph(self, graph, pool, stream, run_once_fn):
+        # Skip guard re-evaluation while capturing compiled graphs; guards
+        # were already validated during warm-up.
+        if self.enable_torch_compile:
+            skip_guard_context = torch.compiler.set_stance(skip_guard_eval_unsafe=True)
+        else:
+            skip_guard_context = empty_context()
+
+        with skip_guard_context, torch.npu.graph(
+            graph,
+            pool=pool,
+            stream=stream,
+            auto_dispatch_capture=True,
+        ):
+            out = run_once_fn()
+        return out
+
+    def _get_update_attr_name(self):
+        # NOTE(review): always returns the MLA entry regardless of the model's
+        # architecture — confirm the MHA case is handled elsewhere.
+        return self.attr_name[AttentionArch.MLA]
+
+    def _get_update_attr_type(self):
+        # NOTE(review): see _get_update_attr_name — MLA entry only.
+        return self.attr_type[AttentionArch.MLA]
+
+    def _update_inputs(self, seq_lens):
+        # Tensor-typed inputs must be updated with an int32 tensor; list-typed
+        # inputs take the Python list as-is.
+        if isinstance(self.update_attr_type, torch.Tensor):
+            seq_lens = torch.from_numpy(np.array(seq_lens).astype(np.int32))
+
+        self.graphs[self.bs].update(
+            cpu_update_input=[{self.update_attr_name: seq_lens}]
+        )
+
+    def _cache_loc_dtype(self):
+        # Ascend kernels index the KV cache with 32-bit integers.
+        return torch.int32
+
+    def _init_profile_context_and_memory_record(self):
+        # Build a torch_npu profiler context that dumps traces (with shapes
+        # and memory) under $SGLANG_TORCH_PROFILER_DIR/graph_capture_profile.
+        output_dir = os.path.join(
+            os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp"), "graph_capture_profile"
+        )
+        if not Path(output_dir).exists():
+            Path(output_dir).mkdir(parents=True, exist_ok=True)
+        logger.info(
+            f"Profiling starts for graph capture for NPU. Traces will be saved to: {output_dir}"
+        )
+        experimental_config = torch_npu.profiler._ExperimentalConfig(
+            export_type=[torch_npu.profiler.ExportType.Text],
+            profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
+        )
+        profile_context = profile(
+            activities=[ProfilerActivity.CPU, ProfilerActivity.NPU],
+            record_shapes=True,
+            profile_memory=True,
+            on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(
+                output_dir, async_mode=True
+            ),
+            experimental_config=experimental_config,
+        )
+        return profile_context
+
+    def _post_process_after_profile(self, prof_context):
+        # for NPU, profile data will be saved to disk for further analysis.
+        pass
+
+    def replay(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        """Replay the captured graph for this batch and return its output.
+
+        Refreshes the CPU-side sequence-length input concurrently with the
+        device replay (non-NSA models), then slices the padded output buffers
+        back down to the real token count.
+        """
+        if not skip_attn_backend_init:
+            self.replay_prepare(forward_batch, pp_proxy_tensors)
+        else:
+            # In speculative decoding, these two fields are still needed.
+            self.buffers.input_ids[: self.raw_num_token].copy_(forward_batch.input_ids)
+            self.buffers.positions[: self.raw_num_token].copy_(forward_batch.positions)
+
+        self.update_attr_name = self._get_update_attr_name()
+        self.update_attr_type = self._get_update_attr_type()
+        # Replay
+        if not is_deepseek_nsa(self.model_runner.model_config.hf_config):
+            if forward_batch.forward_mode.is_target_verify():
+                # Target-verify processes num_tokens_per_bs extra tokens per request.
+                seq_lens_cpu = forward_batch.seq_lens.cpu() + self.num_tokens_per_bs
+                seq_lens = seq_lens_cpu.tolist() + [0] * (self.bs - self.raw_bs)
+            else:
+                seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (
+                    self.bs - self.raw_bs
+                )
+            thread = threading.Thread(target=self._update_inputs, args=(seq_lens,))
+            thread.start()
+            self.graphs[self.bs].replay()
+            thread.join()
+        else:
+            self.graphs[self.bs].replay()
+
+        # Trim padded outputs back to the actual token count.
+        output = self.output_buffers[self.bs]
+        if isinstance(output, LogitsProcessorOutput):
+            return LogitsProcessorOutput(
+                next_token_logits=output.next_token_logits[: self.raw_num_token],
+                hidden_states=(
+                    output.hidden_states[: self.raw_num_token]
+                    if output.hidden_states is not None
+                    else None
+                ),
+            )
+        else:
+            assert isinstance(output, PPProxyTensors)
+            return PPProxyTensors({k: v[: self.bs] for k, v in output.tensors.items()})
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/memory_pool_npu.py b/sglang/python/sglang/srt/hardware_backend/npu/memory_pool_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..82e7ce693e33839a991fc3a7a63daf034f9c0605
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/memory_pool_npu.py
@@ -0,0 +1,362 @@
+from typing import TYPE_CHECKING, Optional
+
+import torch
+import torch_npu
+
+from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE
+from sglang.srt.mem_cache.memory_pool import (
+ MHATokenToKVPool,
+ MLATokenToKVPool,
+ get_tensor_size_bytes,
+)
+from sglang.srt.utils import get_bool_env_var
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.radix_attention import RadixAttention
+
+
class NPUMHATokenToKVPool(MHATokenToKVPool):
    """MHA token-to-KV-cache pool for Ascend NPUs.

    Stores K and V for all layers in one contiguous page-aligned buffer and
    writes new entries with torch_npu scatter / reshape-and-cache operators.
    """

    def __init__(
        self,
        size: int,
        page_size: int,
        dtype: torch.dtype,
        head_num: int,
        head_dim: int,
        layer_num: int,
        device: str,
        enable_memory_saver: bool,
        start_layer: Optional[int] = None,
        end_layer: Optional[int] = None,
        enable_alt_stream: bool = True,
        enable_kv_cache_copy: bool = False,
    ):
        # ASCEND_USE_FIA switches to the fused-infer-attention layout
        # (per-layer flat views) instead of the paged layout consumed by
        # _npu_reshape_and_cache. Must be set before super().__init__ runs,
        # because the base ctor calls _create_buffers().
        self.use_fia = get_bool_env_var("ASCEND_USE_FIA", "False")
        super().__init__(
            size=size,
            page_size=page_size,
            dtype=dtype,
            head_num=head_num,
            head_dim=head_dim,
            layer_num=layer_num,
            device=device,
            enable_memory_saver=enable_memory_saver,
            start_layer=start_layer,
            end_layer=end_layer,
            enable_alt_stream=enable_alt_stream,
            enable_kv_cache_copy=enable_kv_cache_copy,
        )

    def _create_buffers(self):
        """Allocate the combined K/V buffer (and FIA per-layer views)."""
        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
            # [size, head_num, head_dim] for each layer.
            # The padded slot 0 is used for writing dummy outputs from padded tokens.
            # Continuous memory improves the efficiency of Ascend's transmission
            # backend, while other backends remain unchanged.
            self.kv_buffer = torch.zeros(
                (
                    2,
                    self.layer_num,
                    self.size // self.page_size + 1,
                    self.page_size,
                    self.head_num,
                    self.head_dim,
                ),
                dtype=self.store_dtype,
                device=self.device,
            )
            self.k_buffer = self.kv_buffer[0]
            self.v_buffer = self.kv_buffer[1]

            if self.use_fia:
                # FIA consumes per-layer flat [tokens, 1, H, D] views; keep
                # them as lists so layer lookup is O(1) without reshaping.
                self.k_buffer = []
                self.v_buffer = []
                for i in range(self.layer_num):
                    k_buffer_layer = self.kv_buffer[0][i].view(
                        -1, 1, self.head_num, self.head_dim
                    )
                    v_buffer_layer = self.kv_buffer[1][i].view(
                        -1, 1, self.head_num, self.head_dim
                    )
                    self.k_buffer.append(k_buffer_layer)
                    self.v_buffer.append(v_buffer_layer)

    # for disagg
    def get_contiguous_buf_infos(self):
        """Return (data_ptrs, byte_lengths, per-item byte_lengths) for all
        K then V layer buffers, used by the disaggregated transfer backend."""
        # layer_num x [seq_len, head_num, head_dim]
        # layer_num x [page_num, page_size, head_num, head_dim]
        kv_data_ptrs = [
            self.get_key_buffer(i).data_ptr()
            for i in range(self.start_layer, self.start_layer + self.layer_num)
        ] + [
            self.get_value_buffer(i).data_ptr()
            for i in range(self.start_layer, self.start_layer + self.layer_num)
        ]
        kv_data_lens = [
            self.get_key_buffer(i).nbytes
            for i in range(self.start_layer, self.start_layer + self.layer_num)
        ] + [
            self.get_value_buffer(i).nbytes
            for i in range(self.start_layer, self.start_layer + self.layer_num)
        ]
        kv_item_lens = [
            self.get_key_buffer(i)[0].nbytes
            for i in range(self.start_layer, self.start_layer + self.layer_num)
        ] + [
            self.get_value_buffer(i)[0].nbytes
            for i in range(self.start_layer, self.start_layer + self.layer_num)
        ]
        return kv_data_ptrs, kv_data_lens, kv_item_lens

    def set_kv_buffer(
        self,
        layer: "RadixAttention",
        loc: torch.Tensor,
        cache_k: torch.Tensor,
        cache_v: torch.Tensor,
        k_scale: Optional[float] = None,
        v_scale: Optional[float] = None,
        layer_id_override: Optional[int] = None,
    ):
        """Scatter new K/V entries into the cache at token slots ``loc``.

        NOTE(review): when scales are given, ``div_`` mutates the caller's
        tensors in place — presumably intentional (they are transient
        per-step activations); confirm no caller reuses them afterwards.
        """
        if layer_id_override is not None:
            layer_id = layer_id_override
        else:
            layer_id = layer.layer_id
        if cache_k.dtype != self.dtype:
            if k_scale is not None:
                cache_k.div_(k_scale)
            if v_scale is not None:
                cache_v.div_(v_scale)
            cache_k = cache_k.to(self.dtype)
            cache_v = cache_v.to(self.dtype)

        if self.store_dtype != self.dtype:
            # Reinterpret (not convert) to the storage dtype, e.g. fp8 stored
            # as uint8.
            cache_k = cache_k.view(self.store_dtype)
            cache_v = cache_v.view(self.store_dtype)

        if self.use_fia:
            k_buffer_layer = self.k_buffer[layer_id - self.start_layer]
            v_buffer_layer = self.v_buffer[layer_id - self.start_layer]

            torch_npu.npu_scatter_nd_update_(
                k_buffer_layer,
                loc.view(-1, 1),
                cache_k.view(-1, 1, self.head_num, self.head_dim),
            )
            torch_npu.npu_scatter_nd_update_(
                v_buffer_layer,
                loc.view(-1, 1),
                cache_v.view(-1, 1, self.head_num, self.head_dim),
            )
        else:
            # _npu_reshape_and_cache requires int32 slot indices.
            loc = loc.to(torch.int32)
            torch_npu._npu_reshape_and_cache(
                key=cache_k,
                value=cache_v,
                key_cache=self.k_buffer[layer_id - self.start_layer].view(
                    -1, self.page_size, self.head_num, self.head_dim
                ),
                value_cache=self.v_buffer[layer_id - self.start_layer].view(
                    -1, self.page_size, self.head_num, self.head_dim
                ),
                slot_indices=loc,
            )
+
+
class NPUMLATokenToKVPool(MLATokenToKVPool):
    """MLA token-to-KV-cache pool for Ascend NPUs.

    Keeps three page-aligned buffers: ``k_buffer`` for the compressed KV
    latent (kv_lora_rank), ``v_buffer`` for the rotary part
    (qk_rope_head_dim), and an optional ``index_k_buffer`` for the NSA
    indexer keys. Entries are written with ``npu_scatter_nd_update_``.
    """

    def __init__(
        self,
        size: int,
        page_size: int,
        dtype: torch.dtype,
        kv_lora_rank: int,
        qk_rope_head_dim: int,
        index_head_dim: Optional[int],
        layer_num: int,
        device: str,
        enable_memory_saver: bool,
        start_layer: Optional[int] = None,
        end_layer: Optional[int] = None,
    ):
        # Deliberately skip MLATokenToKVPool.__init__ (which allocates the
        # CUDA-style combined buffer) and initialize the grandparent
        # bookkeeping only; the NPU layout is allocated below.
        super(MLATokenToKVPool, self).__init__(
            size=size,
            page_size=page_size,
            dtype=dtype,
            layer_num=layer_num,
            device=device,
            enable_memory_saver=enable_memory_saver,
            start_layer=start_layer,
            end_layer=end_layer,
        )

        self.kv_lora_rank = kv_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.index_head_dim = index_head_dim

        self.custom_mem_pool = None

        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
            # The padded slot 0 is used for writing dummy outputs from padded tokens.
            self.k_buffer = torch.zeros(
                (
                    layer_num,
                    self.size // self.page_size + 1,
                    self.page_size,
                    1,
                    self.kv_lora_rank,
                ),
                dtype=self.store_dtype,
                device=self.device,
            )
            self.v_buffer = torch.zeros(
                (
                    layer_num,
                    self.size // self.page_size + 1,
                    self.page_size,
                    1,
                    self.qk_rope_head_dim,
                ),
                dtype=self.store_dtype,
                device=self.device,
            )
            self.index_k_buffer = None
            if self.index_head_dim is not None:
                self.index_k_buffer = torch.zeros(
                    (
                        layer_num,
                        self.size // self.page_size + 1,
                        self.page_size,
                        1,
                        self.index_head_dim,
                    ),
                    dtype=self.store_dtype,
                    device=self.device,
                )

        self._finalize_allocation_log(size)

    def get_kv_size_bytes(self):
        """Total bytes allocated across k/v (and index, if present) buffers."""
        assert hasattr(self, "k_buffer")
        assert hasattr(self, "v_buffer")
        kv_size_bytes = 0
        for k_cache in self.k_buffer:
            kv_size_bytes += get_tensor_size_bytes(k_cache)
        for v_cache in self.v_buffer:
            kv_size_bytes += get_tensor_size_bytes(v_cache)
        if self.index_head_dim is not None:
            assert hasattr(self, "index_k_buffer")
            for index_k_cache in self.index_k_buffer:
                kv_size_bytes += get_tensor_size_bytes(index_k_cache)
        return kv_size_bytes

    def get_kv_buffer(self, layer_id: int):
        """Return (latent, rope) buffers for ``layer_id`` (storage dtype)."""
        if self.layer_transfer_counter is not None:
            # Layer-wise transfer: block until this layer's cache has arrived.
            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
        return (
            self.k_buffer[layer_id - self.start_layer],
            self.v_buffer[layer_id - self.start_layer],
        )

    def get_key_buffer(self, layer_id: int):
        """Latent buffer for ``layer_id``, reinterpreted to logical dtype."""
        if self.layer_transfer_counter is not None:
            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)

        if self.store_dtype != self.dtype:
            return self.k_buffer[layer_id - self.start_layer].view(self.dtype)
        return self.k_buffer[layer_id - self.start_layer]

    def get_value_buffer(self, layer_id: int):
        """Rope buffer for ``layer_id``, reinterpreted to logical dtype."""
        if self.layer_transfer_counter is not None:
            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)

        if self.store_dtype != self.dtype:
            return self.v_buffer[layer_id - self.start_layer].view(self.dtype)
        return self.v_buffer[layer_id - self.start_layer]

    def get_index_k_buffer(self, layer_id: int):
        """NSA indexer key buffer for ``layer_id`` (logical dtype)."""
        if self.layer_transfer_counter is not None:
            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)

        if self.store_dtype != self.dtype:
            return self.index_k_buffer[layer_id - self.start_layer].view(self.dtype)
        return self.index_k_buffer[layer_id - self.start_layer]

    # for disagg
    def get_contiguous_buf_infos(self):
        """Return (data_ptrs, byte_lengths, per-item byte_lengths) for the
        latent, rope, and (optional) index buffers of every layer."""
        kv_data_ptrs = [self.k_buffer[i].data_ptr() for i in range(self.layer_num)] + [
            self.v_buffer[i].data_ptr() for i in range(self.layer_num)
        ]
        kv_data_lens = [self.k_buffer[i].nbytes for i in range(self.layer_num)] + [
            self.v_buffer[i].nbytes for i in range(self.layer_num)
        ]
        kv_item_lens = [self.k_buffer[i][0].nbytes for i in range(self.layer_num)] + [
            self.v_buffer[i][0].nbytes for i in range(self.layer_num)
        ]
        if self.index_head_dim is not None:
            kv_data_ptrs += [
                self.index_k_buffer[i].data_ptr() for i in range(self.layer_num)
            ]
            kv_data_lens += [
                self.index_k_buffer[i].nbytes for i in range(self.layer_num)
            ]
            kv_item_lens += [
                self.index_k_buffer[i][0].nbytes for i in range(self.layer_num)
            ]
        return kv_data_ptrs, kv_data_lens, kv_item_lens

    def set_kv_buffer(
        self,
        layer: "RadixAttention",
        loc: torch.Tensor,
        cache_k: torch.Tensor,
        cache_v: torch.Tensor,
    ):
        """Scatter the latent (``cache_k``) and rope (``cache_v``) parts into
        the cache at slots ``loc``.

        ``cache_v`` may be None, in which case ``cache_k`` holds the combined
        latent+rope tensor and is split here.
        """
        layer_id = layer.layer_id
        # BUGFIX: the combined-latent split must happen before any dtype
        # handling. The previous order called cache_v.to()/cache_v.view()
        # first, which raises AttributeError when cache_v is None.
        if cache_v is None:
            cache_k, cache_v = cache_k.split(
                [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
            )

        if cache_k.dtype != self.dtype:
            cache_k = cache_k.to(self.dtype)
            cache_v = cache_v.to(self.dtype)

        if self.store_dtype != self.dtype:
            # Reinterpret (not convert) to the storage dtype.
            cache_k = cache_k.view(self.store_dtype)
            cache_v = cache_v.view(self.store_dtype)

        torch_npu.npu_scatter_nd_update_(
            self.k_buffer[layer_id - self.start_layer].view(-1, 1, self.kv_lora_rank),
            loc.view(-1, 1),
            cache_k.view(-1, 1, self.kv_lora_rank),
        )
        torch_npu.npu_scatter_nd_update_(
            self.v_buffer[layer_id - self.start_layer].view(
                -1, 1, self.qk_rope_head_dim
            ),
            loc.view(-1, 1),
            cache_v.view(-1, 1, self.qk_rope_head_dim),
        )

    def set_index_k_buffer(
        self,
        layer_id: int,
        loc: torch.Tensor,
        index_k: torch.Tensor,
    ):
        """Scatter NSA indexer keys for ``layer_id`` into slots ``loc``."""
        if index_k.dtype != self.dtype:
            index_k = index_k.to(self.dtype)

        if self.store_dtype != self.dtype:
            index_k = index_k.view(self.store_dtype)

        torch_npu.npu_scatter_nd_update_(
            self.index_k_buffer[layer_id - self.start_layer].view(
                -1, 1, self.index_head_dim
            ),
            loc.view(-1, 1),
            index_k.view(-1, 1, self.index_head_dim),
        )
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/modules/deepseek_v2_attention_mla_npu.py b/sglang/python/sglang/srt/hardware_backend/npu/modules/deepseek_v2_attention_mla_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fd8d97886728b2272c34a58573f29477725600a
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/modules/deepseek_v2_attention_mla_npu.py
@@ -0,0 +1,551 @@
+import re
+from typing import TYPE_CHECKING
+
+import torch
+import torch_npu
+
+from sglang.srt.environ import envs
+from sglang.srt.hardware_backend.npu.attention.mla_preprocess import (
+ NPUFusedMLAPreprocess,
+ is_fia_nz,
+ is_mla_preprocess_enabled,
+)
+from sglang.srt.layers.attention.nsa.nsa_indexer import scattered_to_tp_attn_full
+from sglang.srt.layers.attention.nsa.utils import (
+ cp_split_and_rebuild_position,
+ nsa_use_prefill_cp,
+)
+from sglang.srt.layers.communicator import ScatterMode, get_attn_tp_context
+
+if TYPE_CHECKING:
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+ from sglang.srt.utils import BumpAllocator
+_use_ag_after_qlora = envs.SGLANG_USE_AG_AFTER_QLORA.get()
+
+
+# region MHA
# region MHA
def forward_mha_prepare_npu(
    m: "DeepseekV2AttentionMLA",
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    forward_batch: "ForwardBatch",
    zero_allocator: "BumpAllocator",
    layer_scatter_modes,
):
    """Build the q, k, v tensors for the MHA path of DeepSeek MLA on NPU.

    Projects hidden states through the (optionally LoRA-compressed) q path,
    applies rotary embeddings, writes the KV latent into the cache, and
    expands the latent to full per-head K/V via ``kv_b_proj``.

    Returns:
        (q, k, v, forward_batch) ready for ``forward_mha_core_npu``.
    """
    if m.q_lora_rank is not None:
        # Fused qkv+latent projection was computed upstream; split it back.
        q, latent_cache = (
            get_attn_tp_context()
            .fetch_qkv_latent()
            .split(
                [m.q_lora_rank, m.kv_lora_rank + m.qk_rope_head_dim],
                dim=-1,
            )
        )

        # NSA Indexer: cache quantized keys, auto-skip topk for sequences <= nsa_index_topk

        if m.use_nsa:
            q_lora = m.q_a_layernorm(q)
            q = m.q_b_proj(q_lora)[0].view(-1, m.num_local_heads, m.qk_head_dim)
            # Indexer is invoked for its caching side effect only.
            _ = m.indexer(
                x=hidden_states,
                q_lora=q_lora,
                positions=positions,
                forward_batch=forward_batch,
                layer_id=m.layer_id,
                return_indices=False,
            )

        else:
            q = m.q_a_layernorm(q)
            if (
                _use_ag_after_qlora
                and layer_scatter_modes.layer_input_mode == ScatterMode.SCATTERED
                and layer_scatter_modes.attn_mode == ScatterMode.TP_ATTN_FULL
            ):
                # All-gather the scattered activations to full TP-attention
                # layout before the q_b projection.
                q = scattered_to_tp_attn_full(q, forward_batch)
                latent_cache = scattered_to_tp_attn_full(latent_cache, forward_batch)
            q = m.q_b_proj(q)[0].view(-1, m.num_local_heads, m.qk_head_dim)

    else:
        # No q LoRA: direct projections.
        q = m.q_proj(hidden_states)[0].view(-1, m.num_local_heads, m.qk_head_dim)
        latent_cache = m.kv_a_proj_with_mqa(hidden_states)[0]

    _, q_pe = q.split([m.qk_nope_head_dim, m.qk_rope_head_dim], dim=-1)
    kv_a, _ = latent_cache.split([m.kv_lora_rank, m.qk_rope_head_dim], dim=-1)
    latent_cache = latent_cache.unsqueeze(1)

    if m.use_deepseek_yarn_rope:
        # Fused path: npu_kv_rmsnorm_rope_cache normalizes the latent,
        # applies RoPE to k_pe, and writes both into the KV cache in one op.
        B, S = q.shape[0], 1
        cos, sin = m.rotary_emb.get_cos_sin_cache(
            positions, hidden_states.dtype, offsets=None
        )
        q_pe = torch_npu.npu_interleave_rope(
            q_pe.reshape(B, -1, S, m.qk_rope_head_dim),
            cos,
            sin,
        )
        q_pe = q_pe.reshape(B, -1, m.qk_rope_head_dim)

        ckv_cache, k_rope_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
            m.layer_id
        )
        _, _, k_pe, kv_a = torch_npu.npu_kv_rmsnorm_rope_cache(
            latent_cache.view(-1, 1, 1, m.kv_lora_rank + m.qk_rope_head_dim),  # bnsd
            m.kv_a_layernorm.weight,
            cos,
            sin,
            forward_batch.out_cache_loc.to(torch.int64),
            k_rope_cache,
            ckv_cache,
            k_rope_scale=None,
            c_kv_scale=None,
            k_rope_offset=None,
            c_kv_offset=None,
            epsilon=m.kv_a_layernorm.variance_epsilon,
            cache_mode="PA_NZ" if is_fia_nz() else "PA_BNSD",
            is_output_kv=True,
        )  # adapted for the NZ cache layout

        k_pe = k_pe.reshape(B, -1, m.qk_rope_head_dim)
    else:
        # Unfused path: normalize, rope, and cache explicitly.
        kv_a = m.kv_a_layernorm(kv_a)
        k_pe = latent_cache[:, :, m.kv_lora_rank :]
        if m.rotary_emb is not None:
            q_pe, k_pe = m.rotary_emb(positions, q_pe, k_pe)
        # this is for model kimi-vl-a3B-instruct
        forward_batch.token_to_kv_pool.set_kv_buffer(
            m, forward_batch.out_cache_loc, kv_a.unsqueeze(1), k_pe
        )

    # Write the rotated rope part back into q's tail slice.
    q[..., m.qk_nope_head_dim :] = q_pe

    # Expand the compressed latent into per-head nope-K and V.
    kv = m.kv_b_proj(kv_a)[0]
    kv = kv.view(-1, m.num_local_heads, m.qk_nope_head_dim + m.v_head_dim)
    k_nope = kv[..., : m.qk_nope_head_dim]
    v = kv[..., m.qk_nope_head_dim :]

    k = m._concat_and_cast_mha_k(k_nope, k_pe, forward_batch)
    return q, k, v, forward_batch
+
+
def forward_mha_core_npu(
    m: "DeepseekV2AttentionMLA",
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    forward_batch: "ForwardBatch",
) -> torch.Tensor:
    """Run the MHA attention core and output projection on NPU.

    KV was already written to the cache by the prepare step, so
    ``save_kv_cache=False`` here.
    """
    context = m.attn_mha(q, k, v, forward_batch, save_kv_cache=False)
    # Flatten heads before the output projection.
    flat_context = context.reshape(-1, m.num_local_heads * m.v_head_dim)
    projected, _bias = m.o_proj(flat_context)
    return projected
+
+
+# endregion
+
+
+# region MLA
# region MLA
def forward_mla_prepare_npu(
    m: "DeepseekV2AttentionMLA",
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    forward_batch: "ForwardBatch",
    zero_allocator: "BumpAllocator",
    layer_scatter_modes,
):
    """Prepare absorbed-MLA inputs (q/k rope and nope parts) on NPU.

    Uses the fused ``NPUFusedMLAPreprocess`` kernel when enabled; otherwise
    runs the unfused projection + layernorm + RoPE pipeline.

    Returns:
        (q_pe, k_pe, q_nope_out, k_nope, forward_batch, zero_allocator,
        positions, topk_indices) — topk_indices is None unless the NSA
        indexer ran.
    """
    if is_mla_preprocess_enabled():
        if not hasattr(m, "mla_preprocess"):
            # Lazily build the fused preprocessor on first use.
            # NOTE(review): this ctor call omits m.v_head_dim while the one in
            # npu_mla_preprocess passes it — confirm which signature is right.
            m.mla_preprocess = NPUFusedMLAPreprocess(
                m.fused_qkv_a_proj_with_mqa,
                m.q_a_layernorm,
                m.kv_a_layernorm,
                m.q_b_proj,
                m.w_kc,
                m.rotary_emb,
                m.layer_id,
                m.num_local_heads,
                m.qk_nope_head_dim,
                m.qk_rope_head_dim,
                m.quant_config,
            )
        (
            q_pe,
            k_pe,
            q_nope_out,
            k_nope,
            forward_batch,
            zero_allocator,
            positions,
        ) = m.mla_preprocess.forward(
            positions, hidden_states, forward_batch, zero_allocator
        )
        topk_indices = None
    else:
        q_lora = None
        if m.q_lora_rank is not None:
            # Fused qkv+latent projection was computed upstream; split it.
            q, latent_cache = (
                get_attn_tp_context()
                .fetch_qkv_latent()
                .split(
                    [m.q_lora_rank, m.kv_lora_rank + m.qk_rope_head_dim],
                    dim=-1,
                )
            )
            k_nope = latent_cache[..., : m.kv_lora_rank]

            q = m.q_a_layernorm(q)
            if (
                _use_ag_after_qlora
                and layer_scatter_modes.layer_input_mode == ScatterMode.SCATTERED
                and layer_scatter_modes.attn_mode == ScatterMode.TP_ATTN_FULL
            ):
                # All-gather scattered activations to full TP-attention layout.
                q = scattered_to_tp_attn_full(q, forward_batch)
                latent_cache = scattered_to_tp_attn_full(latent_cache, forward_batch)
            k_nope = m.kv_a_layernorm(k_nope)

            # q_lora needed by indexer
            if m.use_nsa:
                q_lora = q

            k_nope = k_nope.unsqueeze(1)
            q = m.q_b_proj(q)[0].view(-1, m.num_local_heads, m.qk_head_dim)
        else:
            q = m.q_proj(hidden_states)[0].view(-1, m.num_local_heads, m.qk_head_dim)
            latent_cache = m.kv_a_proj_with_mqa(hidden_states)[0]
            k_nope = latent_cache[..., : m.kv_lora_rank]
            k_nope = m.kv_a_layernorm(k_nope).unsqueeze(1)

        q_nope, q_pe = q.split([m.qk_nope_head_dim, m.qk_rope_head_dim], dim=-1)
        k_pe = latent_cache[..., m.kv_lora_rank :].unsqueeze(1)

        # Absorb W_kc into q (per-head bmm), i.e. q_nope @ W_kc.
        q_nope_out = torch.bmm(q_nope.transpose(0, 1), m.w_kc)

        q_nope_out = q_nope_out.transpose(0, 1)

        if nsa_use_prefill_cp(forward_batch, m.nsa_enable_prefill_cp):
            # Context-parallel prefill: split and rebuild positions per rank.
            positions = cp_split_and_rebuild_position(forward_batch, positions)

        q_pe, k_pe = m.rotary_emb(positions, q_pe, k_pe)

        if nsa_use_prefill_cp(forward_batch, m.nsa_enable_prefill_cp):
            # support allgather + rearrange of the CP-sharded KV cache
            k_nope, k_pe = m.rebuild_cp_kv_cache(
                latent_cache, forward_batch, k_nope, k_pe
            )
        topk_indices = None
        if q_lora is not None:
            topk_indices = m.indexer(
                x=hidden_states,
                q_lora=q_lora,
                positions=positions,
                forward_batch=forward_batch,
                layer_id=m.layer_id,
            )

    return (
        q_pe,
        k_pe,
        q_nope_out,
        k_nope,
        forward_batch,
        zero_allocator,
        positions,
        topk_indices,
    )
+
+
def forward_mla_core_npu(
    m: "DeepseekV2AttentionMLA",
    q_pe: torch.Tensor,
    k_pe: torch.Tensor,
    q_nope_out: torch.Tensor,
    k_nope: torch.Tensor,
    forward_batch: "ForwardBatch",
    zero_allocator: "BumpAllocator",
    positions: torch.Tensor,
    topk_indices: torch.Tensor,
) -> torch.Tensor:
    """Run the absorbed-MLA attention core and output projection on NPU.

    The latent attention output is expanded back to per-head value space
    with the absorbed W_vc via the fused batch_matmul_transpose op.
    """
    # Forward topk_indices only when the NSA indexer produced them.
    optional_kwargs = {}
    if topk_indices is not None:
        optional_kwargs["topk_indices"] = topk_indices

    latent_out = m.attn_mqa(
        q_nope_out,
        k_nope,
        k_nope,
        forward_batch,
        q_rope=q_pe,
        k_rope=k_pe,
        **optional_kwargs,
    )
    latent_out = latent_out.view(-1, m.num_local_heads, m.kv_lora_rank)

    # Preallocate the destination for the fused transpose-bmm expansion.
    expanded = torch.empty(
        (latent_out.shape[0], m.num_local_heads, m.v_head_dim),
        dtype=latent_out.dtype,
        device=latent_out.device,
    )
    torch.ops.npu.batch_matmul_transpose(latent_out.contiguous(), m.w_vc, expanded)

    flat = expanded.reshape(-1, m.num_local_heads * m.v_head_dim)
    projected, _bias = m.o_proj(flat)
    return projected
+
+
+# endregion
+
+
+# region DSA
# region DSA
def forward_dsa_prepare_npu(
    m: "DeepseekV2AttentionMLA",
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    forward_batch: "ForwardBatch",
    zero_allocator: "BumpAllocator",
    layer_scatter_modes,
):
    """Prepare DeepSeek sparse-attention (DSA) inputs on NPU.

    Like the MLA prepare path, but always runs the NSA indexer to get
    ``topk_indices``, and may overlap ``q_b_proj`` with the KV-side work on
    an alternate NPU stream.

    Returns:
        (q_pe, k_pe, q_nope_out, k_nope, topk_indices, forward_batch,
        zero_allocator, positions).
    """
    dynamic_scale = None
    if is_mla_preprocess_enabled() and forward_batch.forward_mode.is_decode():
        # Fused decode path.
        (
            q_pe,
            k_pe,
            q_nope_out,
            k_nope,
            q_lora,
            forward_batch,
            zero_allocator,
            positions,
            dynamic_scale,
        ) = npu_mla_preprocess(
            m,
            hidden_states,
            positions,
            forward_batch,
            zero_allocator,
        )
    else:
        fused_qkv_a_proj_out = m.fused_qkv_a_proj_with_mqa(hidden_states)[0]
        q, latent_cache = fused_qkv_a_proj_out.split(
            [m.q_lora_rank, m.kv_lora_rank + m.qk_rope_head_dim], dim=-1
        )

        # overlap qk norm
        q = m.q_a_layernorm(q)
        if (
            _use_ag_after_qlora
            and layer_scatter_modes.layer_input_mode == ScatterMode.SCATTERED
            and layer_scatter_modes.attn_mode == ScatterMode.TP_ATTN_FULL
        ):
            q = scattered_to_tp_attn_full(q, forward_batch)
            latent_cache = scattered_to_tp_attn_full(latent_cache, forward_batch)
        q_lora = q.clone()  # required for topk_indices

        q_event = None
        if m.alt_stream is not None:
            # Run q_b_proj on the alternate stream, overlapped with the
            # KV-side layernorm below on the main stream.
            m.alt_stream.wait_stream(torch.npu.current_stream())
            with torch.npu.stream(m.alt_stream):
                q = m.q_b_proj(q_lora)[0].view(-1, m.num_local_heads, m.qk_head_dim)
                # record q to ensure memory space will not be released
                q.record_stream(m.alt_stream)
                q_event = m.alt_stream.record_event()
        else:
            q = m.q_b_proj(q_lora)[0].view(-1, m.num_local_heads, m.qk_head_dim)

        k_nope, k_pe = latent_cache.unsqueeze(1).split(
            [m.kv_lora_rank, m.qk_rope_head_dim], dim=-1
        )
        k_nope = m.kv_a_layernorm(k_nope)
        # main stream waits for the completion of the event on the alt stream
        # to ensure data dependency is complete
        if q_event is not None:
            torch.npu.current_stream().wait_event(q_event)

        q_nope, q_pe = q.split([m.qk_nope_head_dim, m.qk_rope_head_dim], dim=-1)

        # Absorb W_kc into q (per-head bmm).
        q_nope_out = torch.bmm(q_nope.transpose(0, 1), m.w_kc)

        q_nope_out = q_nope_out.transpose(0, 1)

        if nsa_use_prefill_cp(forward_batch, m.nsa_enable_prefill_cp):
            positions = cp_split_and_rebuild_position(forward_batch, positions)

        q_pe, k_pe = m.rotary_emb(positions, q_pe, k_pe)

        if nsa_use_prefill_cp(forward_batch, m.nsa_enable_prefill_cp):
            # support allgather + rearrange of the CP-sharded KV cache
            k_nope, k_pe = m.rebuild_cp_kv_cache(
                latent_cache, forward_batch, k_nope, k_pe
            )

    # Always run the indexer: DSA needs topk_indices for sparse attention.
    topk_indices = m.indexer(
        hidden_states,
        q_lora,
        positions,
        forward_batch,
        m.layer_id,
        layer_scatter_modes,
        dynamic_scale,
    )

    return (
        q_pe,
        k_pe,
        q_nope_out,
        k_nope,
        topk_indices,
        forward_batch,
        zero_allocator,
        positions,
    )
+
+
def forward_dsa_core_npu(
    m: "DeepseekV2AttentionMLA",
    q_pe: torch.Tensor,
    k_pe: torch.Tensor,
    q_nope_out: torch.Tensor,
    k_nope: torch.Tensor,
    topk_indices: torch.Tensor,
    forward_batch: "ForwardBatch",
    zero_allocator: "BumpAllocator",
    positions: torch.Tensor,
) -> torch.Tensor:
    """Run the DSA attention core and output projection on NPU.

    Prefill (plain extend) uses a plain transposed ``torch.bmm``; decode /
    draft-extend / target-verify use the fused batch_matmul_transpose op.
    """
    attn_output = m.attn_mqa(
        q_nope_out.contiguous(),
        k_nope.contiguous(),
        k_nope.contiguous(),
        forward_batch,
        save_kv_cache=True,  # False if forward_batch.forward_mode.is_extend() else True,
        q_rope=q_pe.contiguous(),
        k_rope=k_pe.contiguous(),
        topk_indices=topk_indices,
    )
    attn_output = attn_output.view(-1, m.num_local_heads, m.kv_lora_rank)

    # Destination for the W_vc expansion back to per-head value space.
    attn_bmm_output = torch.empty(
        (attn_output.shape[0], m.num_local_heads, m.v_head_dim),
        dtype=attn_output.dtype,
        device=attn_output.device,
    )

    if (
        forward_batch.forward_mode.is_extend()
        and not forward_batch.forward_mode.is_draft_extend(include_v2=True)
        and not forward_batch.forward_mode.is_target_verify()
    ):
        # Prefill: write through a transposed view so the bmm result lands
        # directly in attn_bmm_output's layout.
        attn_output = attn_output.transpose(0, 1)
        torch.bmm(
            attn_output,
            m.w_vc,
            out=attn_bmm_output.view(-1, m.num_local_heads, m.v_head_dim).transpose(
                0, 1
            ),
        )
    else:
        attn_output = attn_output.contiguous()
        torch.ops.npu.batch_matmul_transpose(attn_output, m.w_vc, attn_bmm_output)

    attn_bmm_output = attn_bmm_output.reshape(-1, m.num_local_heads * m.v_head_dim)

    output, _ = m.o_proj(attn_bmm_output)
    return output
+
+
def npu_mla_preprocess(
    m: "DeepseekV2AttentionMLA",
    hidden_states: torch.Tensor,
    positions: torch.Tensor,
    forward_batch: "ForwardBatch",
    zero_allocator: "BumpAllocator",
):
    """Run the fused NPU MLA preprocess kernel and also produce ``q_lora``.

    The "mlaprolog" variant (detected from the quant config ignore list)
    returns q_lora directly; otherwise q_lora is recomputed from the fused
    qkv projection, optionally overlapped on the alternate stream.

    Returns:
        (q_pe, k_pe, q_nope_out, k_nope, q_lora, forward_batch,
        zero_allocator, positions, dynamic_scale).
    """
    dynamic_scale = None
    if not hasattr(m, "mla_preprocess"):
        # Lazily build the fused preprocessor on first use.
        # NOTE(review): this ctor passes m.v_head_dim while the construction
        # in forward_mla_prepare_npu omits it — confirm which is correct.
        m.mla_preprocess = NPUFusedMLAPreprocess(
            m.fused_qkv_a_proj_with_mqa,
            m.q_a_layernorm,
            m.kv_a_layernorm,
            m.q_b_proj,
            m.w_kc,
            m.rotary_emb,
            m.layer_id,
            m.num_local_heads,
            m.qk_nope_head_dim,
            m.qk_rope_head_dim,
            m.v_head_dim,
            m.quant_config,
        )
    # mlaprolog does not require additional calculation of q_lora
    _is_mlaprolog = hasattr(m.quant_config, "ignore") and any(
        re.fullmatch(r".*kv_b_proj", l) for l in m.quant_config.ignore
    )
    if _is_mlaprolog:
        # NOTE(review): this branch unpacks an 8-tuple (with q_lora and
        # dynamic_scale, without zero_allocator) while the branches below
        # unpack a 7-tuple — mla_preprocess.forward apparently returns
        # different shapes depending on configuration; confirm.
        (
            q_pe,
            k_pe,
            q_nope_out,
            k_nope,
            q_lora,
            forward_batch,
            positions,
            dynamic_scale,
        ) = m.mla_preprocess.forward(
            positions, hidden_states, forward_batch, zero_allocator
        )
    else:
        if m.alt_stream is not None:
            mla_event = torch.npu.Event()
            mla_event.record()
            with torch.npu.stream(m.alt_stream):
                # alt stream waits for the completion of the event on the main
                # stream to ensure data dependency is complete
                torch.npu.current_stream().wait_event(mla_event)
                (
                    q_pe,
                    k_pe,
                    q_nope_out,
                    k_nope,
                    forward_batch,
                    zero_allocator,
                    positions,
                ) = m.mla_preprocess.forward(
                    positions, hidden_states, forward_batch, zero_allocator
                )

            # Meanwhile compute q_lora on the main stream, then join.
            fused_qkv_a_proj_out = m.fused_qkv_a_proj_with_mqa(hidden_states)[0]
            q, _ = fused_qkv_a_proj_out.split(
                [m.q_lora_rank, m.kv_lora_rank + m.qk_rope_head_dim], dim=-1
            )
            q_lora = m.q_a_layernorm(q)
            torch.npu.current_stream().wait_event(m.alt_stream)
        else:
            (
                q_pe,
                k_pe,
                q_nope_out,
                k_nope,
                forward_batch,
                zero_allocator,
                positions,
            ) = m.mla_preprocess.forward(
                positions, hidden_states, forward_batch, zero_allocator
            )
            fused_qkv_a_proj_out = m.fused_qkv_a_proj_with_mqa(hidden_states)[0]
            q, _ = fused_qkv_a_proj_out.split(
                [m.q_lora_rank, m.kv_lora_rank + m.qk_rope_head_dim], dim=-1
            )
            q_lora = m.q_a_layernorm(q)

    return (
        q_pe,
        k_pe,
        q_nope_out,
        k_nope,
        q_lora,
        forward_batch,
        zero_allocator,
        positions,
        dynamic_scale,
    )
+
+
+# endregion
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/moe/topk.py b/sglang/python/sglang/srt/hardware_backend/npu/moe/topk.py
new file mode 100644
index 0000000000000000000000000000000000000000..920787ed230f935f63ffb51ea2ea92df654ce9a5
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/moe/topk.py
@@ -0,0 +1,79 @@
+from typing import TYPE_CHECKING, Optional
+
+import torch
+from sgl_kernel_npu.norm.l1_norm import l1_norm
+
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location_dispatch import topk_ids_logical_to_physical
+from sglang.srt.layers.moe.routed_experts_capturer import get_global_experts_capturer
+from sglang.srt.layers.moe.topk import StandardTopKOutput, select_experts
+
+if TYPE_CHECKING:
+ from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+ from sglang.srt.layers.moe.topk import TopKConfig, TopKOutput
+
+
def fused_topk_npu(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    topk_config: "TopKConfig",
    num_token_non_padded: Optional[torch.Tensor] = None,
    expert_location_dispatch_info: Optional["ExpertLocationDispatchInfo"] = None,
    layer_id: Optional[int] = None,
) -> "TopKOutput":
    """Select top-k experts per token using fused Ascend NPU operators.

    Three paths: softmax top-k (no grouping), grouped sigmoid top-k with a
    correction bias, or a torch-native fallback via ``select_experts``.

    Returns:
        A ``StandardTopKOutput`` of (topk_weights, topk_ids, router_logits).
    """

    use_grouped_topk = topk_config.use_grouped_topk
    renormalize = topk_config.renormalize
    correction_bias = topk_config.correction_bias

    if not use_grouped_topk:
        topk_weights, topk_ids, _ = torch.ops.npu.npu_moe_gating_top_k_softmax(
            router_logits,
            k=topk_config.top_k,
        )

        if renormalize:
            # NOTE(review): with fused shared experts the last column is
            # excluded from normalization AND dropped from the result —
            # confirm the shared-expert weight is meant to be discarded here.
            topk_weights = l1_norm(
                topk_weights
                if topk_config.num_fused_shared_experts == 0
                else topk_weights[:, :-1]
            )
        topk_weights = topk_weights.to(torch.float32)

    elif use_grouped_topk and correction_bias is not None:
        # Force set routed_scaling_factor = 1 to optimize renormalize
        topk_weights, topk_ids, _ = torch.ops.npu.npu_moe_gating_top_k(
            router_logits.to(torch.float32),
            k=topk_config.top_k,
            bias=correction_bias.to(torch.float32),
            k_group=topk_config.topk_group,
            group_count=topk_config.num_expert_group,
            group_select_mode=1,
            renorm=0,
            norm_type=1,
            routed_scaling_factor=(
                1 if renormalize else topk_config.routed_scaling_factor
            ),
            eps=float(1e-20),
        )

    else:
        # Fallback: no fused NPU op covers this config; early return —
        # dispatch mapping and recording are handled inside select_experts.
        topk_config.torch_native = True
        return select_experts(
            hidden_states=hidden_states,
            layer_id=layer_id,
            router_logits=router_logits,
            topk_config=topk_config,
            num_token_non_padded=num_token_non_padded,
            expert_location_dispatch_info=expert_location_dispatch_info,
        )

    if expert_location_dispatch_info is not None:
        # Map logical expert ids to physical locations (EPLB).
        topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
    get_global_expert_distribution_recorder().on_select_experts(topk_ids=topk_ids)
    get_global_experts_capturer().capture(
        layer_id=layer_id,
        topk_ids=topk_ids,
    )

    return StandardTopKOutput(topk_weights, topk_ids, router_logits)
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/quantization/__pycache__/linear_method_npu.cpython-311.pyc b/sglang/python/sglang/srt/hardware_backend/npu/quantization/__pycache__/linear_method_npu.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71866726166907e928a34bf6163554dca6d3a3d7
Binary files /dev/null and b/sglang/python/sglang/srt/hardware_backend/npu/quantization/__pycache__/linear_method_npu.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py b/sglang/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..924d45d47f87899a38a70645c39a3c435688bf2e
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/quantization/fused_moe_method_npu.py
@@ -0,0 +1,774 @@
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+import torch
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ StandardDispatchOutput,
+ )
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
+
+def npu_fused_experts(
+    hidden_states: torch.Tensor,
+    w13: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w2: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    top_k: int,
+    **kwargs,
+):
+    """Fused MoE forward on Ascend NPU: route tokens to experts, run grouped
+    gate_up/down matmuls, apply swiglu, then combine expert outputs per token.
+
+    Args:
+        hidden_states: Activations, 2-D ``(tokens, hidden)`` or 3-D (flattened
+            internally and restored before returning).
+        w13: Stacked gate+up projection weights, one slice per expert.
+        w13_scale: Quantization scale for ``w13``.
+        w2: Down projection weights, one slice per expert.
+        w2_scale: Quantization scale for ``w2``.
+        topk_weights: Per-token routing weights, shape ``(tokens, top_k)``.
+        topk_ids: Per-token selected expert indices, shape ``(tokens, top_k)``.
+        top_k: Number of experts selected per token.
+        **kwargs: ``w13_offset`` / ``w2_offset`` (antiquant offsets, WNA16 path
+            only) and ``use_wna16`` (weight-only quant: skips activation
+            dynamic quantization).
+
+    Returns:
+        Tensor with the same shape and dtype as the input ``hidden_states``.
+    """
+    w13_offset = kwargs.get("w13_offset", None)
+    w2_offset = kwargs.get("w2_offset", None)
+    use_wna16 = kwargs.get("use_wna16", False)
+
+    original_shape = hidden_states.shape
+    original_dtype = hidden_states.dtype
+    # NOTE(review): scales stay bf16 only for bf16 activations, fp32 otherwise —
+    # presumably a requirement of npu_grouped_matmul; confirm against op docs.
+    scale_dtype = original_dtype if original_dtype == torch.bfloat16 else torch.float32
+    if len(original_shape) == 3:
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+    num_tokens = hidden_states.shape[0]
+    num_experts = w13.shape[0]
+    row_idx_len = num_tokens * top_k
+    # Token-major row index consumed by npu_moe_init_routing.
+    row_idx = (
+        torch.arange(0, row_idx_len, dtype=torch.int32, device=topk_weights.device)
+        .view(top_k, -1)
+        .permute(1, 0)
+        .contiguous()
+    )
+    hidden_states, expanded_row_idx, expanded_expert_idx = (
+        torch.ops.npu.npu_moe_init_routing(
+            hidden_states, row_idx=row_idx, expert_idx=topk_ids, active_num=num_tokens
+        )
+    )
+    # Per-expert token counts, used as the grouped-matmul group list.
+    expert_tokens = torch.ops.npu.npu_moe_compute_expert_tokens(
+        expanded_expert_idx, num_experts
+    )
+    expert_tokens = expert_tokens.to(torch.int64)
+    # gmm1: gate_up_proj
+    if not use_wna16:
+        # W8A8: dynamically quantize activations; pass per-token scales.
+        hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
+        scale_args13 = {
+            "scale": [w13_scale.to(scale_dtype)],
+            "per_token_scale": [pertoken_scale],
+        }
+    else:
+        # WNA16: weights are dequantized inside the matmul (antiquant args).
+        scale_args13 = {
+            "antiquant_scale": [w13_scale],
+            "antiquant_offset": [w13_offset],
+        }
+
+    hidden_states = torch.ops.npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[w13],
+        **scale_args13,
+        split_item=2,
+        group_list_type=0,
+        group_type=0,
+        group_list=expert_tokens,
+        output_dtype=original_dtype,
+    )[0]
+    # act_fn: swiglu
+    hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+    if not use_wna16:
+        # Re-quantize the activation output before the down projection.
+        hidden_states, pertoken_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
+
+        scale_args2 = {
+            "scale": [w2_scale.to(scale_dtype)],
+            "per_token_scale": [pertoken_scale],
+        }
+    else:
+        scale_args2 = {"antiquant_scale": [w2_scale], "antiquant_offset": [w2_offset]}
+    # gmm2: down_proj
+    hidden_states = torch.ops.npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[w2],
+        **scale_args2,
+        split_item=2,
+        group_list_type=0,
+        group_type=0,
+        group_list=expert_tokens,
+        output_dtype=original_dtype,
+    )[0]
+
+    # Scatter expert outputs back to original token order, weighting by
+    # the routing probabilities.
+    final_hidden_states = torch.ops.npu.npu_moe_finalize_routing(
+        hidden_states,
+        skip1=None,
+        skip2=None,
+        bias=None,
+        scales=topk_weights,
+        expanded_src_to_dst_row=expanded_row_idx,
+        export_for_source_row=topk_ids,
+    )
+    if len(original_shape) == 3:
+        final_hidden_states = final_hidden_states.view(original_shape)
+    return final_hidden_states
+
+
+def npu_fused_moe_without_routing_weights_bf16(
+    layer, hidden_states, group_list_type, group_list, output_dtype
+):
+    """Unquantized expert compute: grouped gate_up matmul -> swiglu -> grouped
+    down matmul. Tokens are assumed already permuted per expert and routing
+    weights are NOT applied here — the caller combines the outputs.
+
+    Args:
+        layer: Module holding ``w13_weight`` and ``w2_weight``.
+        hidden_states: Expert-permuted activations.
+        group_list_type: Group-list encoding forwarded to npu_grouped_matmul.
+        group_list: Per-expert token counts/offsets.
+        output_dtype: Dtype of the matmul outputs.
+    """
+    # gmm1: gate_up_proj
+    hidden_states = torch.ops.npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[layer.w13_weight],
+        split_item=2,
+        group_list_type=group_list_type,
+        group_type=0,
+        group_list=group_list,
+        output_dtype=output_dtype,
+    )[0]
+    hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+    # gmm2: down_proj
+    hidden_states = torch.ops.npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[layer.w2_weight],
+        split_item=2,
+        group_list_type=group_list_type,
+        group_type=0,
+        group_list=group_list,
+        output_dtype=output_dtype,
+    )[0]
+    return hidden_states
+
+
+def fused_moe_npu(
+    x,
+    w1,
+    w2,
+    topk_output,
+    moe_runner_config,
+):
+    """Unquantized fused MoE forward on NPU.
+
+    Routes tokens with ``npu_moe_init_routing``, runs grouped gate_up/down
+    matmuls (weights transposed on the fly), applies a silu- or gelu-gated
+    activation per ``moe_runner_config.activation``, and combines outputs
+    with ``npu_moe_finalize_routing``.
+
+    Args:
+        x: 2-D input activations ``(tokens, hidden)``.
+        w1: Gate+up projection weights, shape ``(experts, n, k)``.
+        w2: Down projection weights, shape ``(experts, n, k)``.
+        topk_output: ``(topk_weights, topk_ids, ...)`` routing tuple.
+        moe_runner_config: Runner config; only ``activation`` is read here.
+
+    Returns:
+        Combined expert outputs, same leading shape as ``x``.
+    """
+    # TODO: reuse the codes of UnquantizedFusedMoEMethod-forward_npu
+    topk_weights, topk_ids, _ = topk_output
+    original_dtype = x.dtype
+    num_tokens = x.shape[0]
+    topk_weights = topk_weights.to(x.dtype)
+    topk_ids = topk_ids.to(torch.int32)
+    num_experts = w1.shape[0]
+    top_k = topk_weights.shape[-1]
+    row_idx_len = num_tokens * top_k
+    # Token-major row index consumed by npu_moe_init_routing.
+    row_idx = (
+        torch.arange(0, row_idx_len, dtype=torch.int32, device=topk_weights.device)
+        .view(top_k, -1)
+        .permute(1, 0)
+        .contiguous()
+    )
+
+    hidden_states, expanded_row_idx, expanded_expert_idx = (
+        torch.ops.npu.npu_moe_init_routing(
+            x, row_idx=row_idx, expert_idx=topk_ids, active_num=num_tokens
+        )
+    )
+
+    # Per-expert token counts, used as the grouped-matmul group list.
+    expert_tokens = torch.ops.npu.npu_moe_compute_expert_tokens(
+        expanded_expert_idx, num_experts
+    )
+
+    expert_tokens = expert_tokens.to(torch.int64)
+
+    # gmm1: gate_up_proj
+    hidden_states = torch.ops.npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[w1.permute(0, 2, 1)],
+        bias=None,
+        split_item=2,
+        group_list_type=0,
+        group_type=0,
+        group_list=expert_tokens,
+        output_dtype=original_dtype,
+    )[0]
+
+    # act_fn:
+    if moe_runner_config.activation == "silu":
+        hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+    else:
+        # Non-silu falls back to gelu-and-mul (imported lazily to avoid a cycle).
+        from sglang.srt.layers.activation import GeluAndMul
+
+        hidden_states = GeluAndMul()(hidden_states)
+
+    # gmm2: down_proj
+    hidden_states = torch.ops.npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[w2.permute(0, 2, 1)],
+        bias=None,
+        split_item=2,
+        group_list_type=0,
+        group_type=0,
+        group_list=expert_tokens,
+        output_dtype=original_dtype,
+    )[0]
+
+    # Scatter expert outputs back to token order, weighted by routing probs.
+    final_hidden_states = torch.ops.npu.npu_moe_finalize_routing(
+        hidden_states,
+        skip1=None,
+        skip2=None,
+        bias=None,
+        scales=topk_weights,
+        expanded_src_to_dst_row=expanded_row_idx,
+        export_for_source_row=topk_ids,
+    )
+    return final_hidden_states
+
+
+class _NPUFusedMoEMethodBase(FusedMoEMethodBase):
+
+ def __init__(
+ self,
+ quant_config: Optional["QuantizationConfig"] = None,
+ ):
+ self.quant_config = quant_config
+
+
+class NPUW8A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase):
+    """MoE method for int8 weights with dynamically int8-quantized activations (W8A8)."""
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """One-time weight layout fix-up: transpose experts' weights and cast to
+        the NPU-preferred ACL format; flatten per-channel scales/offsets."""
+        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data.transpose(1, 2))
+        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data.transpose(1, 2))
+        layer.w13_weight_scale = torch.nn.Parameter(
+            layer.w13_weight_scale.data.squeeze(-1), requires_grad=False
+        )
+        layer.w2_weight_scale = torch.nn.Parameter(
+            layer.w2_weight_scale.data.squeeze(-1), requires_grad=False
+        )
+        # Compressed-tensors format doesn't have this field
+        if hasattr(layer, "w13_weight_offset"):
+            layer.w13_weight_offset = torch.nn.Parameter(
+                layer.w13_weight_offset.data.squeeze(-1),
+                requires_grad=False,
+            )
+        if hasattr(layer, "w2_weight_offset"):
+            layer.w2_weight_offset = torch.nn.Parameter(
+                layer.w2_weight_offset.data.squeeze(-1),
+                requires_grad=False,
+            )
+
+    def apply(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        """Run the full fused W8A8 MoE forward for one dispatch batch."""
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, _ = topk_output
+        topk_ids = topk_ids.to(torch.int32)
+        topk_weights = topk_weights.to(x.dtype)
+        output = npu_fused_experts(
+            hidden_states=x,
+            w13=layer.w13_weight,
+            w13_scale=layer.w13_weight_scale,
+            w2=layer.w2_weight,
+            w2_scale=layer.w2_weight_scale,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            top_k=topk_ids.shape[1],
+        )
+        return StandardCombineInput(hidden_states=output)
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        """Expert compute for pre-permuted, pre-quantized tokens (routing weights
+        applied by the caller). Uses the fused dequant+swiglu+quant kernel
+        between the two grouped matmuls."""
+        # gmm1: gate_up_proj
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[layer.w13_weight],
+            split_item=2,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=torch.int32,
+        )[0]
+
+        # act_fn: swiglu
+        # Fused dequant -> swiglu -> requant in one kernel; emits int8 plus
+        # per-token scales for the down projection.
+        hidden_states, swiglu_out_scale = torch.ops.npu.npu_dequant_swiglu_quant(
+            x=hidden_states,
+            weight_scale=layer.w13_weight_scale,
+            activation_scale=hidden_states_scale,
+            bias=None,
+            quant_scale=None,
+            quant_offset=None,
+            group_index=group_list,
+            activate_left=True,
+            quant_mode=1,
+        )
+
+        # gmm2: down_proj
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[layer.w2_weight],
+            scale=[layer.w2_weight_scale.to(output_dtype)],
+            per_token_scale=[swiglu_out_scale],
+            split_item=2,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=output_dtype,
+        )[0]
+        return hidden_states
+
+
+class NPUW4A8Int8DynamicMoEMethod(_NPUFusedMoEMethodBase):
+
+ def _process_scale(
+ self, weight: torch.Tensor, scale, per_group_scale, is_per_channel_weight
+ ):
+ scale = scale.transpose(1, 2).contiguous()
+
+ if is_per_channel_weight:
+ scale_np = scale.cpu().numpy()
+ scale_np.dtype = np.uint32
+ scale_uint64_tensor = torch.from_numpy(scale_np.astype(np.int64)).npu()
+ return scale_uint64_tensor, None
+
+ per_group_scale = per_group_scale.transpose(1, 2).contiguous()
+ group_num, k, n = weight.shape
+ # the weight of the new version is reduced by half by pack n, so it needs to be restored
+ n = n * 2
+ per_group_scale = per_group_scale.reshape(group_num, -1, n)
+ group_num, quantgroup_num, n = per_group_scale.shape
+ bias = None
+
+ scale_fp32 = (scale * per_group_scale).to(torch.float16).to(torch.float32)
+ scale_fp32_np = scale_fp32.cpu().numpy()
+ scale_fp32_np.dtype = np.uint32
+ sscale_uint64 = np.zeros((group_num, quantgroup_num, n * 2), dtype=np.uint32)
+
+ sscale_uint64[..., ::2] = scale_fp32_np
+
+ sscale_uint64_buffer = np.frombuffer(
+ sscale_uint64.tobytes(), dtype=np.int64
+ ).copy()
+ sscale_uint64_tensor = torch.from_numpy(sscale_uint64_buffer).reshape(
+ group_num, quantgroup_num, n
+ )
+ sscale_uint64_tensor = sscale_uint64_tensor.npu()
+ return sscale_uint64_tensor, bias
+
+ def _update_bias(self, layer, w13_bias, w2_bias):
+ layer.w13_scale_bias.data = (
+ layer.w13_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
+ )
+ layer.w2_scale_bias.data = (
+ layer.w2_scale_bias.data.transpose(1, 2).contiguous().sum(axis=1)
+ )
+
+ def _pack_to_int32(self, weight: torch.Tensor):
+ # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
+ assert (
+ weight.shape[-1] % 4 == 0
+ ), "the last dim of weight needs to be divided by 4"
+ return weight.view(torch.int32).contiguous()
+
+ def process_weights_after_loading(
+ self, layer: torch.nn.Module, is_per_channel_weight, activation_use_clip
+ ) -> None:
+ if not activation_use_clip:
+ self._process_weights_without_clip(layer, is_per_channel_weight)
+ else:
+ self._process_weights_with_clip(layer)
+
+ layer.w13_weight = torch.nn.Parameter(
+ layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
+ )
+ layer.w2_weight = torch.nn.Parameter(
+ layer.w2_weight.data.transpose(1, 2).contiguous(), requires_grad=False
+ )
+
+ layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)
+ layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)
+
+ layer.w13_weight.data = self._pack_to_int32(layer.w13_weight.data)
+ layer.w2_weight.data = self._pack_to_int32(layer.w2_weight.data)
+
+ def _process_weights_without_clip(
+ self, layer: torch.nn.Module, is_per_channel_weight
+ ) -> None:
+ w13_weight_scale_second = (
+ layer.w13_weight_scale_second.data
+ if hasattr(layer, "w13_weight_scale_second")
+ else None
+ )
+ w2_weight_scale_second = (
+ layer.w2_weight_scale_second.data
+ if hasattr(layer, "w2_weight_scale_second")
+ else None
+ )
+ layer.w13_weight_scale.data, w13_bias = self._process_scale(
+ layer.w13_weight,
+ layer.w13_weight_scale.data,
+ w13_weight_scale_second,
+ is_per_channel_weight,
+ )
+ layer.w2_weight_scale.data, w2_bias = self._process_scale(
+ layer.w2_weight,
+ layer.w2_weight_scale.data,
+ w2_weight_scale_second,
+ is_per_channel_weight,
+ )
+ if hasattr(layer, "w13_weight_scale_second"):
+ # scale_second is no longer used, release this part of the memory
+ del layer.w13_weight_scale_second
+ del layer.w2_weight_scale_second
+ del layer.w13_weight_offset_second
+ del layer.w2_weight_offset_second
+
+ self._update_bias(layer, w13_bias, w2_bias)
+
+ def _process_weights_with_clip(self, layer: torch.nn.Module) -> None:
+ w13_weight_scale = (
+ layer.w13_weight_scale.data.squeeze(-1).contiguous().unsqueeze(1)
+ )
+ w2_weight_scale = (
+ layer.w2_weight_scale.data.squeeze(-1).contiguous().unsqueeze(1)
+ )
+ layer.w13_weight_scale = torch.nn.Parameter(
+ w13_weight_scale, requires_grad=False
+ )
+ layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, requires_grad=False)
+ layer.w13_scale_bias = layer.w13_bias
+ layer.w2_scale_bias = layer.w2_bias
+
+ def apply(
+ self,
+ layer,
+ dispatch_output: "StandardDispatchOutput",
+ ) -> "CombineInput":
+ from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+ hidden_states = dispatch_output.hidden_states
+ topk_output = dispatch_output.topk_output
+
+ topk_weights, topk_ids, _ = topk_output
+ top_k = topk_ids.shape[1]
+ group_list_type = 1
+ original_shape = hidden_states.shape
+ topk_weights = topk_weights
+
+ num_tokens = hidden_states.shape[:-1].numel()
+
+ first_expert_idx = 0
+ last_expert_idx = layer.num_experts
+ global_num_experts = layer.num_experts
+
+ sorted_hidden_states, expanded_row_idx, expert_tokens, pertoken_scale = (
+ torch.ops.npu.npu_moe_init_routing_v2(
+ hidden_states,
+ topk_ids,
+ active_num=num_tokens * top_k,
+ expert_num=global_num_experts,
+ expert_tokens_num_type=1,
+ expert_tokens_num_flag=True,
+ active_expert_range=[first_expert_idx, last_expert_idx],
+ quant_mode=1,
+ )
+ )
+
+ expert_tokens = expert_tokens.to(torch.int64)
+
+ bias1 = [layer.w13_scale_bias]
+ bias2 = [layer.w2_scale_bias]
+ w1_scale = [layer.w13_weight_scale]
+ w2_scale = [layer.w2_weight_scale]
+ _output_dtype = torch.bfloat16
+
+ hidden_states = torch.ops.npu.npu_grouped_matmul(
+ x=[sorted_hidden_states],
+ weight=[layer.w13_weight],
+ scale=w1_scale,
+ bias=bias1,
+ per_token_scale=[pertoken_scale],
+ group_list=expert_tokens,
+ split_item=2,
+ group_type=0,
+ group_list_type=group_list_type,
+ output_dtype=_output_dtype,
+ )[0]
+
+ # act_fn: swiglu
+ hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+ hidden_states, swiglu_out_scale = torch.ops.npu.npu_dynamic_quant(hidden_states)
+
+ output = torch.ops.npu.npu_grouped_matmul(
+ x=[hidden_states],
+ weight=[layer.w2_weight],
+ scale=w2_scale,
+ bias=bias2,
+ per_token_scale=[swiglu_out_scale],
+ group_list=expert_tokens,
+ split_item=2,
+ group_type=0,
+ group_list_type=group_list_type,
+ output_dtype=_output_dtype,
+ )[0]
+
+ assert original_shape is not None
+ final_hidden_states = torch.ops.npu.npu_moe_token_unpermute(
+ permuted_tokens=output,
+ sorted_indices=torch.abs(expanded_row_idx),
+ probs=topk_weights,
+ )
+ if len(original_shape) == 3:
+ final_hidden_states = final_hidden_states.view(original_shape)
+
+ return StandardCombineInput(hidden_states=final_hidden_states)
+
+ def apply_without_routing_weights(
+ self,
+ layer,
+ hidden_states,
+ hidden_states_scale,
+ group_list_type,
+ group_list,
+ output_dtype,
+ ):
+ from sgl_kernel_npu.activation.swiglu_quant import swiglu_quant
+
+ hidden_states = torch.ops.npu.npu_grouped_matmul(
+ x=[hidden_states],
+ weight=[layer.w13_weight],
+ scale=[layer.w13_weight_scale],
+ bias=[layer.w13_scale_bias],
+ per_token_scale=[hidden_states_scale],
+ group_list=group_list,
+ split_item=2,
+ group_type=0,
+ group_list_type=group_list_type,
+ output_dtype=output_dtype,
+ )[0]
+
+ hidden_states, swiglu_out_scale = swiglu_quant(
+ hidden_states, group_list, group_list_type
+ )
+
+ hidden_states = torch.ops.npu.npu_grouped_matmul(
+ x=[hidden_states],
+ weight=[layer.w2_weight],
+ scale=[layer.w2_weight_scale],
+ bias=[layer.w2_scale_bias],
+ per_token_scale=[swiglu_out_scale],
+ group_list=group_list,
+ split_item=2,
+ group_type=0,
+ group_list_type=group_list_type,
+ output_dtype=output_dtype,
+ )[0]
+
+ return hidden_states
+
+
+class NPUW4A16Int4DynamicMoEMethod(_NPUFusedMoEMethodBase):
+    """MoE method for int4 weights with unquantized (fp16/bf16) activations (W4A16)."""
+
+    def _pack_to_int32(self, weight: torch.Tensor):
+        """Pack int4 values (carried in int32 or int8 containers) into the
+        int32 layout expected by NPU grouped matmul."""
+        assert weight.dim() == 3
+        if weight.dtype == torch.int32:
+            # pack 8 int4 to int32, we use a int32 to represent a int4
+            assert (
+                weight.shape[-1] % 8 == 0
+            ), "the last dim of weight needs to be divided by 8"
+            new_weight = torch.ops.npu.npu_convert_weight_to_int4pack(
+                weight.flatten(0, 1)
+            )
+            new_weight = new_weight.view(weight.shape[0], weight.shape[1], -1)
+        elif weight.dtype == torch.int8:
+            # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
+            assert (
+                weight.shape[-1] % 4 == 0
+            ), "the last dim of weight needs to be divided by 4"
+            new_weight = weight.view(torch.int32).contiguous()
+        else:
+            raise ValueError(f"{weight.dtype=} is not supported !")
+        return new_weight
+
+    def _unpack_from_int32(
+        self,
+        value: torch.Tensor,
+        num_bits: int,
+        shape: torch.Size = None,
+        packed_dim=1,
+    ) -> torch.Tensor:
+        """
+        Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
+        original bit range.
+
+        Return tensors in int8
+
+        :param value: tensor to unpack
+        :param num_bits: number of bits to unpack each data point into (at most 8;
+            larger values raise)
+        :param shape: shape to unpack into, used to remove padding; required when
+            ``packed_dim`` is 0 (the row count is read from ``shape[0]``)
+        :param packed_dim: which dimension the values are packed along (1 = columns,
+            otherwise rows)
+        :returns: unpacked int8 tensor
+        """
+        if value.dtype is not torch.int32:
+            raise ValueError(
+                f"Expected {torch.int32} but got {value.dtype}, Aborting unpack."
+            )
+
+        if num_bits > 8:
+            raise ValueError("Unpacking is only supported for less than 8 bits")
+
+        pack_factor = 32 // num_bits
+
+        # unpack
+        mask = (1 << num_bits) - 1
+
+        if packed_dim == 1:
+            unpacked = torch.zeros(
+                (value.shape[0], value.shape[1] * pack_factor),
+                device=value.device,
+                dtype=torch.int32,
+            )
+            # Fan each int32 out into pack_factor interleaved columns.
+            for i in range(pack_factor):
+                unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
+
+            # remove padding
+            if shape is not None:
+                original_row_size = int(shape[1])
+                unpacked = unpacked[:, :original_row_size]
+        else:
+            unpacked = torch.zeros(
+                (value.shape[0] * pack_factor, value.shape[1]),
+                device=value.device,
+                dtype=torch.int32,
+            )
+            for i in range(pack_factor):
+                unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask
+
+            # remove padding
+            original_row_size = int(shape[0])
+            unpacked = unpacked[:original_row_size, :]
+
+        # bits are packed in unsigned format, reformat to signed
+        # update the value range from unsigned to signed
+        offset = pow(2, num_bits) // 2
+        unpacked = (unpacked - offset).to(torch.int8)
+
+        return unpacked
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """One-time fix-up: transpose scales/offsets, then unpack the loaded int4
+        weights, transpose them, and re-pack into the NPU int4 layout."""
+        w13_weight_scale = layer.w13_weight_scale.data.transpose(-1, -2).contiguous()
+        w2_weight_scale = layer.w2_weight_scale.data.transpose(-1, -2).contiguous()
+        layer.w13_weight_scale = torch.nn.Parameter(
+            w13_weight_scale, requires_grad=False
+        )
+        layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, requires_grad=False)
+
+        layer.w13_weight_offset = torch.nn.Parameter(
+            layer.w13_weight_offset.data.transpose(-1, -2).contiguous(),
+            requires_grad=False,
+        )
+        layer.w2_weight_offset = torch.nn.Parameter(
+            layer.w2_weight_offset.data.transpose(-1, -2).contiguous(),
+            requires_grad=False,
+        )
+
+        # w = [n, k // 8] --> [k, n // 8]
+        # w13_weight = layer.w13_weight.data.transpose(1, 2).contiguous()
+        # w2_weight = layer.w2_weight.data.transpose(1, 2).contiguous()
+        unpacked_w13_weight = (
+            self._unpack_from_int32(layer.w13_weight.data.flatten(0, 1), 4)
+            .view(layer.w13_weight.data.shape[0], layer.w13_weight.data.shape[1], -1)
+            .transpose(1, 2)
+            .contiguous()
+            .int()
+        )
+        unpacked_w2_weight = (
+            self._unpack_from_int32(layer.w2_weight.data.flatten(0, 1), 4)
+            .view(layer.w2_weight.data.shape[0], layer.w2_weight.data.shape[1], -1)
+            .transpose(1, 2)
+            .contiguous()
+            .int()
+        )
+
+        w13_weight = self._pack_to_int32(unpacked_w13_weight)
+        w2_weight = self._pack_to_int32(unpacked_w2_weight)
+
+        layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+        layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+
+    def apply(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        """Run the full fused W4A16 MoE forward (weight-only quant path)."""
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, _ = topk_output
+        topk_ids = topk_ids.to(torch.int32)
+        topk_weights = topk_weights.to(x.dtype)
+        output = npu_fused_experts(
+            hidden_states=x,
+            w13=layer.w13_weight,
+            w13_scale=layer.w13_weight_scale,
+            w13_offset=layer.w13_weight_offset,
+            w2=layer.w2_weight,
+            w2_scale=layer.w2_weight_scale,
+            w2_offset=layer.w2_weight_offset,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            top_k=topk_ids.shape[1],
+            use_wna16=True,
+        )
+        return StandardCombineInput(hidden_states=output)
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        """Expert compute for pre-permuted tokens; int4 weights require
+        unquantized activations, so a non-None scale is rejected."""
+        if hidden_states_scale is None:
+            # gmm1: gate_up_proj
+            hidden_states = torch.ops.npu.npu_grouped_matmul(
+                x=[hidden_states],
+                weight=[layer.w13_weight],
+                antiquant_scale=[layer.w13_weight_scale],
+                antiquant_offset=[layer.w13_weight_offset],
+                split_item=2,
+                group_list_type=group_list_type,
+                group_type=0,
+                group_list=group_list,
+                output_dtype=output_dtype,
+            )[0]
+
+            # act_fn: swiglu
+            hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+
+            # gmm2: down_proj
+            out_hidden = torch.ops.npu.npu_grouped_matmul(
+                x=[hidden_states],
+                weight=[layer.w2_weight],
+                antiquant_scale=[layer.w2_weight_scale],
+                antiquant_offset=[layer.w2_weight_offset],
+                split_item=2,
+                group_list_type=group_list_type,
+                group_type=0,
+                group_list=group_list,
+                output_dtype=output_dtype,
+            )[0]
+        else:
+            raise ValueError(
+                "when weight is int4, hidden_states only supports non-quant dtype!"
+            )
+
+        return out_hidden
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py b/sglang/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fe703a08250fc8544d17314fc23a60dd889f448
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/quantization/linear_method_npu.py
@@ -0,0 +1,143 @@
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.layers.quantization.base_config import LinearMethodBase
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
+
+class _NPULinearMethodBase(LinearMethodBase):
+
+ def __init__(
+ self,
+ quant_config: Optional["QuantizationConfig"] = None,
+ ):
+ self.quant_config = quant_config
+
+
+class NPUW8A8Int8LinearMethod(_NPULinearMethodBase):
+    """Linear method for int8 weights with statically int8-quantized activations (W8A8)."""
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """One-time fix-up: transpose and format-cast the weight, flatten
+        scales/offsets, and pre-expand per-tensor input quant params to
+        per-channel vectors for npu_quantize."""
+        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight.data = npu_format_cast(layer.weight.data)
+
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        # Compressed-tensors format doesn't have this field
+        if hasattr(layer, "weight_offset"):
+            layer.weight_offset.data = layer.weight_offset.data.flatten()
+
+        expanding_factor = layer.weight.data.shape[0]
+        layer.aclnn_input_scale = torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        # NOTE(review): `1 / Parameter` yields a plain Tensor, not a Parameter;
+        # presumably intentional since it never needs gradients. Confirm.
+        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        layer.aclnn_input_offset = torch.nn.Parameter(
+            layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Statically quantize ``x`` (unless already int8), then run the int8
+        matmul with dequantization (uses ``layer.deq_scale`` / ``layer.quant_bias``,
+        set elsewhere during loading)."""
+        from sglang.srt.layers.linear import RowParallelLinear
+
+        original_dtype = x.dtype
+        if original_dtype != torch.int8:
+            x = torch.ops.npu.npu_quantize(
+                x,
+                layer.aclnn_input_scale_reciprocal,
+                layer.aclnn_input_offset,
+                torch.qint8,
+                -1,
+                False,
+            )
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in Attention TP>1 case)
+        if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0:
+            quant_bias = None
+        else:
+            quant_bias = layer.quant_bias
+        return torch.ops.npu.npu_quant_matmul(
+            x,
+            layer.weight,
+            layer.deq_scale,
+            bias=quant_bias,
+            output_dtype=original_dtype,
+        )
+
+
+class NPUW8A8Int8DynamicLinearMethod(_NPULinearMethodBase):
+    """Linear method for int8 weights with dynamically int8-quantized activations."""
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """One-time fix-up: transpose and format-cast the weight; flatten scales."""
+        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight.data = npu_format_cast(layer.weight.data)
+
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        # Compressed-tensors format doesn't have this field
+        if hasattr(layer, "weight_offset"):
+            layer.weight_offset.data = layer.weight_offset.data.flatten()
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Dynamic-quant matmul. ``x`` may be a pre-quantized
+        ``(quant_out, dynamic_scale)`` tuple from an upstream kernel, in which
+        case the output dtype is assumed bf16 (TODO confirm against callers)."""
+
+        if isinstance(x, tuple):
+            """dynamic_scale is calculated in malprolog kernel"""
+            original_dtype = torch.bfloat16
+            quant_out, dynamic_scale = x
+        else:
+            original_dtype = x.dtype
+            quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(x)
+        return torch.ops.npu.npu_quant_matmul(
+            quant_out,
+            layer.weight,
+            layer.weight_scale,
+            pertoken_scale=dynamic_scale,
+            bias=bias,
+            output_dtype=original_dtype,
+        )
+
+
+class NPU_W4A4DynamicLinearMethod(_NPULinearMethodBase):
+    """Linear method for int4 weights with dynamically int4-quantized activations (W4A4)."""
+
+    def process_weights_after_loading(self, layer):
+        """One-time fix-up: transpose, flatten quant params, keep an fp32 copy of
+        the scale (presumably for consumers elsewhere — not read in apply()),
+        and pack the weight into the NPU int4 layout."""
+        layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+        layer.weight.data = torch.ops.npu.npu_convert_weight_to_int4pack(
+            layer.weight.data.to(torch.int32)
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+        tp_rank: Optional[int] = 0,
+    ) -> torch.Tensor:
+        """Dynamically quantize ``x`` to int4 pairs and run the quantized matmul.
+
+        NOTE(review): ``tp_rank`` is accepted but unused here — kept for
+        signature compatibility with other linear methods.
+        """
+        original_dtype = x.dtype
+        quant_out, dynamic_scale = torch.ops.npu.npu_dynamic_quant(
+            x, dst_type=torch.quint4x2
+        )
+        return torch.ops.npu.npu_quant_matmul(
+            quant_out,
+            layer.weight,
+            layer.weight_scale,
+            pertoken_scale=dynamic_scale,
+            bias=bias,
+            output_dtype=original_dtype,
+        )
diff --git a/sglang/python/sglang/srt/hardware_backend/npu/utils.py b/sglang/python/sglang/srt/hardware_backend/npu/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e09841bdfc58197bdf155a14c6e98f6ec241892
--- /dev/null
+++ b/sglang/python/sglang/srt/hardware_backend/npu/utils.py
@@ -0,0 +1,135 @@
+import functools
+import logging
+from enum import IntEnum
+from typing import TYPE_CHECKING, Callable
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.utils import get_npu_memory_capacity, is_npu
+
+if TYPE_CHECKING:
+ from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+_is_npu = is_npu()
+indexer_weight_stream = None
+
+
+class NPUACLFormat(IntEnum):
+    """Ascend ACL tensor memory formats (integer values are the ACL format codes)."""
+
+    ACL_FORMAT_UNDEFINED = -1
+    # Plain n-dimensional layout.
+    ACL_FORMAT_ND = 2
+    # Blocked "fractal NZ" layout used by NPU matmul engines.
+    ACL_FORMAT_FRACTAL_NZ = 29
+
+
+def _call_once(fn: Callable):
+
+ @functools.wraps(fn)
+ def wrapper(*args, **kwargs):
+ if getattr(fn, "_has_been_called", False):
+ logger.debug("Function {} has already been called.", fn.__name__)
+ return
+
+ fn._has_been_called = True
+ return fn(*args, **kwargs)
+
+ return wrapper
+
+
+def set_default_server_args(args: "ServerArgs"):
+    """
+    Set default server arguments for NPU backend.
+
+    Mutates ``args`` in place: forces the ascend attention backend, picks
+    memory-capacity-dependent defaults for chunked prefill and graph batch
+    size (only where the user left them unset), disables custom all-reduce,
+    and configures hierarchical-cache IO/layout when enabled.
+    """
+
+    # NPU only works with "ascend" attention backend for now
+    args.attention_backend = "ascend"
+    args.prefill_attention_backend = "ascend"
+    args.decode_attention_backend = "ascend"
+    if args.page_size is None:
+        args.page_size = 128
+
+    # NPU memory settings
+    npu_mem = get_npu_memory_capacity()
+    if npu_mem <= 32 * 1024:
+        # Ascend 910B4,910B4_1
+        # (chunked_prefill_size 4k, cuda_graph_max_bs 16 if tp < 4 else 64)
+        if args.chunked_prefill_size is None:
+            args.chunked_prefill_size = 4 * 1024
+        if args.cuda_graph_max_bs is None:
+            if args.tp_size < 4:
+                args.cuda_graph_max_bs = 16
+            else:
+                args.cuda_graph_max_bs = 64
+    elif npu_mem <= 64 * 1024:
+        # Ascend 910B1,910B2,910B2C,910B3,910_9391,910_9392,910_9381,910_9382,910_9372,910_9362
+        # (chunked_prefill_size 8k, cuda_graph_max_bs 64 if tp < 4 else 256)
+        if args.chunked_prefill_size is None:
+            args.chunked_prefill_size = 8 * 1024
+        if args.cuda_graph_max_bs is None:
+            if args.tp_size < 4:
+                args.cuda_graph_max_bs = 64
+            else:
+                args.cuda_graph_max_bs = 256
+
+    # NPU does not support CustomAllReduce
+    args.disable_custom_all_reduce = True
+
+    # handles hierarchical cache configs
+    if args.enable_hierarchical_cache:
+        args.hicache_io_backend = "kernel_ascend"
+        # MLA models split KV; others use the direct page-first layout.
+        if args.use_mla_backend():
+            args.hicache_mem_layout = "page_first_kv_split"
+        else:
+            args.hicache_mem_layout = "page_first_direct"
+
+
+@_call_once
+def init_npu_backend():
+    """
+    Initialize NPU backend. This function should be called only once.
+
+    Imports the NPU kernel/runtime packages (their import has side effects),
+    patches CUDA-to-NPU transfer shims, and sets torch_npu runtime flags.
+    """
+
+    assert _is_npu, "NPU backend initialization called on non-NPU device."
+
+    import sgl_kernel_npu  # noqa: F401
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu  # noqa: F401
+
+    # Re-mock torch.cuda.is_available cuz transfer_to_npu mocks it True
+    torch.cuda.is_available = lambda: False
+
+    # Allow private/internal ACL tensor formats and disable JIT op compilation.
+    torch_npu.npu.config.allow_internal_format = True
+    torch_npu.npu.set_compile_mode(jit_compile=False)
+
+
+def npu_format_cast(
+    tensor: torch.Tensor,
+    acl_format: NPUACLFormat = NPUACLFormat.ACL_FORMAT_FRACTAL_NZ,
+) -> torch.Tensor:
+    """
+    Cast a tensor to a specific NPU ACL format.
+
+    No-op (returns the input unchanged) off-NPU or when the
+    SGLANG_NPU_DISABLE_ACL_FORMAT_WEIGHT environment switch is set.
+
+    Args:
+        tensor (torch.Tensor): The input tensor.
+        acl_format (NPUACLFormat): The target NPU ACL format.
+
+    Returns:
+        torch.Tensor: The tensor cast to the specified NPU ACL format.
+    """
+
+    if not _is_npu:
+        return tensor
+
+    if envs.SGLANG_NPU_DISABLE_ACL_FORMAT_WEIGHT.get():
+        return tensor
+
+    # Imported lazily so this module stays importable on non-NPU hosts.
+    import torch_npu
+
+    return torch_npu.npu_format_cast(tensor, acl_format.value)
+
+
+def get_indexer_weight_stream():
+    """Lazily create and return a process-wide NPU stream for indexer weights.
+
+    NOTE(review): not thread-safe — concurrent first calls could each create a
+    stream; assumes single-threaded initialization. TODO confirm.
+    """
+    global indexer_weight_stream
+    if indexer_weight_stream is None:
+        indexer_weight_stream = torch.npu.Stream()
+    return indexer_weight_stream
diff --git a/sglang/python/sglang/srt/layers/moe/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e9ca12810abd40e71b76d6c08ad4eb7ae0f6ba4
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/__pycache__/cutlass_moe_params.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/__pycache__/cutlass_moe_params.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..afb3cec831b650bbf89e1bec2a344353d0296ed9
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/__pycache__/cutlass_moe_params.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/__pycache__/kt_ep_wrapper.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/__pycache__/kt_ep_wrapper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4264aaf3398204d8e9b4a449f84c1c9bac6b9252
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/__pycache__/kt_ep_wrapper.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/__pycache__/rocm_moe_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/__pycache__/rocm_moe_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc7343e1bfa3558b79db356c4c63a0ef39387132
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/__pycache__/rocm_moe_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/__pycache__/routed_experts_capturer.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/__pycache__/routed_experts_capturer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a6862a51ed88142cdf2aba80b0880fa15a4a67d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/__pycache__/routed_experts_capturer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/__pycache__/router.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/__pycache__/router.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5fa605157f83302b1fdb6fa1383ded18221b2b0
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/__pycache__/router.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/__pycache__/topk.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/__pycache__/topk.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..418b82bde860300825f67217023b6834f2a953ec
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/__pycache__/topk.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..806abf1effdbf40670e9e9469946f106d0c6dc76
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/ep_moe/__init__.py b/sglang/python/sglang/srt/layers/moe/ep_moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/layers/moe/ep_moe/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/ep_moe/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..703a093b5ba5324db22eda3225f299f0f64ffd6c
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/ep_moe/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/ep_moe/__pycache__/kernels.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/ep_moe/__pycache__/kernels.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc57bdaf1729468b12034f25d96bcb89aac75510
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/ep_moe/__pycache__/kernels.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/ep_moe/__pycache__/layer.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/ep_moe/__pycache__/layer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43fa875d7121328cf5c0b2511e4b391136ec0ca9
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/ep_moe/__pycache__/layer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/ep_moe/kernels.py b/sglang/python/sglang/srt/layers/moe/ep_moe/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..044c590f2200a8f48a25090435a4725c693d65f5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/ep_moe/kernels.py
@@ -0,0 +1,1383 @@
+import logging
+
+import torch
+import triton
+
+from sglang.srt.utils import ceil_div, is_cuda
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+if _is_cuda:
+ from sglang.srt.layers.quantization.fp8_kernel import (
+ sglang_per_token_group_quant_fp8 as per_token_group_quant_fp8,
+ )
+
+import triton.language as tl
+
+
def _get_launch_config_1d(device, numel):
    """Pick a (grid, block) launch shape for a 1-D elementwise kernel.

    Starts from a 1024-thread block and halves it (down to at most 512)
    while the halved launch still fits within the blocks the device can
    hold resident at once, then caps the grid at MAX_WAVES full waves.

    Returns:
        ((grid_dim,), block_dim) suitable for a Triton launch.
    """
    max_block = 1024
    min_block = 512
    max_waves = 8  # empirical numbers

    device_props = torch.cuda.get_device_properties(device)
    # Number of max_block-sized blocks the whole GPU can hold at once.
    resident_blocks = (
        device_props.multi_processor_count
        * device_props.max_threads_per_multi_processor
        // max_block
    )

    def blocks_for(width):
        return triton.cdiv(numel, width)

    block_dim = max_block
    # Halve the block while the halved launch still fits on the device.
    while block_dim > min_block and blocks_for(block_dim // 2) <= resident_blocks:
        block_dim //= 2

    grid_dim = min(blocks_for(block_dim), resident_blocks * max_waves)
    return (grid_dim,), block_dim
+
+
def _get_launch_config_2d(device, m, n):
    """Pick a ((grid_y, grid_x), block) launch shape for an m-by-n kernel.

    The x dimension tiles the n axis with the chosen block width; the y
    dimension covers rows, capped so the launch stays within MAX_WAVES
    full waves of the GPU (and is at least 1).

    Returns:
        ((grid_dim_y, grid_dim_x), block_dim).
    """
    max_block = 1024
    min_block = 512
    max_waves = 8  # empirical numbers

    device_props = torch.cuda.get_device_properties(device)
    # Number of max_block-sized blocks the whole GPU can hold at once.
    resident_blocks = (
        device_props.multi_processor_count
        * device_props.max_threads_per_multi_processor
        // max_block
    )

    def blocks_for(width):
        return m * triton.cdiv(n, width)

    block_dim = max_block
    # Halve the block while the halved launch still fits on the device.
    while block_dim > min_block and blocks_for(block_dim // 2) <= resident_blocks:
        block_dim //= 2

    grid_x = triton.cdiv(n, block_dim)
    grid_y = max(min(m, resident_blocks * max_waves // grid_x), 1)
    return (grid_y, grid_x), block_dim
+
+
@triton.jit
def deepep_permute_triton_kernel(
    input_ptr,
    gateup_input_ptr,
    src2dst_ptr,
    topk_ids_ptr,  # advanced but not read; kept for a uniform launch signature
    a1_scales_ptr,  # unused in this kernel
    topk,
    hidden_size,
    BLOCK_SIZE: tl.constexpr,
):
    # Scatter one source token's hidden vector into every one of its top-k
    # destination rows of gateup_input, casting to the output dtype.
    # One program handles one source token.
    OutDtype = gateup_input_ptr.dtype.element_ty

    src_idx = tl.program_id(0)
    src2dst_ptr = src2dst_ptr + src_idx * topk
    topk_ids_ptr = topk_ids_ptr + src_idx * topk

    src_ptr = input_ptr + src_idx * hidden_size

    for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
        offset = start_offset + tl.arange(0, BLOCK_SIZE)
        mask = offset < hidden_size
        in_data = tl.load(src_ptr + offset, mask=mask).to(OutDtype)

        for idx in range(topk):
            dst_idx = tl.load(src2dst_ptr + idx)
            # A negative mapping marks a slot with no local destination.
            if dst_idx >= 0:
                dst_ptr = gateup_input_ptr + dst_idx * hidden_size
                tl.store(dst_ptr + offset, in_data, mask=mask)
+
+
@triton.jit
def deepep_post_reorder_triton_kernel(
    down_output_ptr,
    output_ptr,
    src2dst_ptr,
    topk_ids_ptr,  # advanced but not read; kept for a uniform launch signature
    topk_weights_ptr,
    topk,
    hidden_size,
    BLOCK_SIZE: tl.constexpr,
):
    # Gather step: for one source token (one program), accumulate its top-k
    # expert outputs, each scaled by its routing weight, into output.
    # Accumulation happens in the down_output dtype.
    InDtype = down_output_ptr.dtype.element_ty

    src_idx = tl.program_id(0)
    src2dst_ptr = src2dst_ptr + src_idx * topk
    topk_ids_ptr = topk_ids_ptr + src_idx * topk
    topk_weights_ptr = topk_weights_ptr + src_idx * topk

    store_ptr = output_ptr + src_idx * hidden_size
    for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
        offset = start_offset + tl.arange(0, BLOCK_SIZE)
        mask = offset < hidden_size
        sum_vec = tl.zeros([BLOCK_SIZE], dtype=InDtype)
        for idx in range(topk):
            dst_idx = tl.load(src2dst_ptr + idx)
            # Negative dst marks a slot with no local expert output.
            if dst_idx >= 0:
                weigh_scale = tl.load(topk_weights_ptr + idx).to(InDtype)
                load_ptr = down_output_ptr + dst_idx * hidden_size
                in_data = tl.load(load_ptr + offset, mask=mask)
                sum_vec += in_data * weigh_scale
        tl.store(store_ptr + offset, sum_vec, mask=mask)
+
+
@triton.jit
def compute_src2dst_triton_kernel(
    reorder_ids, src2dst, num_toks, BLOCK_SIZE: tl.constexpr
):
    # Invert the sort permutation: reorder_ids maps sorted position (dst)
    # back to the original position (src), so write src2dst[src] = dst.
    pid = tl.program_id(axis=0)
    dst_id = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = dst_id < num_toks
    src_id = tl.load(reorder_ids + dst_id, mask=mask)
    tl.store(src2dst + src_id, dst_id, mask=mask)
+
+
@triton.jit
def deepep_compute_src2dst_triton_kernel(
    reorder_ids, src2dst, num_toks, num_minus_one, BLOCK_SIZE: tl.constexpr
):
    # Permutation inversion (src2dst[src] = dst) with a shift: destinations
    # are lowered by the number of invalid entries that sorted to the front
    # (read from num_minus_one on device), so valid destinations start at 0.
    pid = tl.program_id(axis=0)
    dst_id = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = dst_id < num_toks
    src_id = tl.load(reorder_ids + dst_id, mask=mask)
    num_invalid = tl.load(num_minus_one)
    tl.store(src2dst + src_id, dst_id - num_invalid, mask=mask)
+
+
def deepep_run_moe_deep_preprocess(topk_ids: torch.Tensor, num_experts: int):
    """Sort token->expert assignments and build DeepEP routing metadata.

    Returns:
        reorder_topk_ids: sorted expert ids with the leading invalid
            (negative) entries stripped.
        src2dst: per-assignment destination index (int64), shifted so
            valid destinations start at 0.
        seg_indptr: per-expert segment offsets into the sorted order,
            rebased past the invalid entries.
    """
    num_assignments = topk_ids.numel()
    device = topk_ids.device

    sorted_ids, sort_order = torch.sort(topk_ids.view(-1), stable=True)
    seg_indptr = torch.empty(num_experts + 1, device=device, dtype=torch.int64)
    src2dst = torch.empty(num_assignments, device=device, dtype=torch.int64)

    # Segment boundaries: first sorted position holding each expert id.
    expert_ids = torch.arange(
        num_experts + 1, device=device, dtype=sorted_ids.dtype
    )
    torch.searchsorted(sorted_ids, expert_ids, out=seg_indptr)

    # Entries sorting before expert 0 are invalid (< 0); rebase past them.
    num_invalid = seg_indptr[0]
    seg_indptr = seg_indptr - num_invalid

    BLOCK_SIZE = 512
    launch_grid = (triton.cdiv(num_assignments, BLOCK_SIZE),)
    deepep_compute_src2dst_triton_kernel[launch_grid](
        sort_order, src2dst, num_assignments, num_invalid, BLOCK_SIZE
    )
    return sorted_ids[num_invalid:], src2dst, seg_indptr
+
+
@triton.jit
def compute_seg_indptr_triton_kernel(reorder_topk_ids, seg_indptr, num_toks):
    # One program per seg_indptr slot. Program p binary-searches the sorted
    # expert ids for the last position whose id is <= p - 1; that position
    # + 1 (i.e. the count of such ids) is stored at seg_indptr[p].
    expert_id_minus_1 = tl.program_id(0) - 1
    low = 0
    high = num_toks - 1
    target_location = -1  # stays -1 when no id is <= expert_id_minus_1
    while low <= high:
        mid = (low + high) // 2

        if tl.load(reorder_topk_ids + mid) > expert_id_minus_1:
            high = mid - 1
        else:
            low = mid + 1
            target_location = mid
    tl.store(seg_indptr + expert_id_minus_1 + 1, target_location + 1)
+
+
def cutlass_w4_run_moe_ep_preproess(topk_ids: torch.Tensor):
    """Build the src->dst permutation for the sorted token/expert order.

    Stable-sorts the flattened topk ids and returns, for every assignment,
    the position it occupies in the sorted order (int32 tensor).
    """
    _, sort_order = torch.sort(topk_ids.view(-1), stable=True)

    total = topk_ids.numel()
    src2dst = torch.empty(total, device=topk_ids.device, dtype=torch.int32)

    BLOCK_SIZE = 512
    launch_grid = (triton.cdiv(total, BLOCK_SIZE),)
    compute_src2dst_triton_kernel[launch_grid](
        sort_order, src2dst, total, BLOCK_SIZE
    )

    return src2dst
+
+
@triton.jit
def pre_reorder_triton_kernel_for_cutlass_moe(
    input_ptr,
    gateup_input_ptr,
    src2dst_ptr,
    topk_ids_ptr,
    a1_scales_ptr,
    num_local_experts,
    topk,
    num_tokens,
    hidden_size,
    BLOCK_SIZE: tl.constexpr,
    NUM_STAGES: tl.constexpr,
):
    # Scatter tokens into the expert-grouped layout for the cutlass MoE
    # path, applying the optional static input scale on the way. Grid:
    # axis 0 strides over tokens, axis 1 tiles the hidden dimension.
    OutDtype = gateup_input_ptr.dtype.element_ty

    # Static tensor-wise scale; multiplying by the reciprocal quantizes on
    # store. With no scale pointer the copy is scale-free.
    if a1_scales_ptr is not None:
        a1_scale = 1.0 / tl.load(a1_scales_ptr)
    else:
        a1_scale = 1.0

    offset = BLOCK_SIZE * tl.program_id(1) + tl.arange(0, BLOCK_SIZE)
    mask = offset < hidden_size

    start_src_idx = tl.program_id(0)
    step = tl.num_programs(0)

    for src_idx_int32 in tl.range(
        start_src_idx, num_tokens, step, num_stages=NUM_STAGES
    ):
        # 64-bit token index keeps pointer arithmetic safe for large tensors.
        src_idx = src_idx_int32.to(tl.int64)
        token_src2dst_ptr = src2dst_ptr + src_idx * topk
        token_topk_ids_ptr = topk_ids_ptr + src_idx * topk

        src_ptr_offs = input_ptr + src_idx * hidden_size + offset
        dst_ptr_offs = gateup_input_ptr + offset
        in_data = tl.load(src_ptr_offs, mask=mask).to(tl.float32)
        out_data = (in_data * a1_scale).to(OutDtype)
        for idx in range(topk):
            expert_id = tl.load(token_topk_ids_ptr + idx)
            # num_local_experts acts as the sentinel for "not a local expert".
            if expert_id != num_local_experts:
                dst_idx = tl.load(token_src2dst_ptr + idx)
                tl.store(dst_ptr_offs + dst_idx * hidden_size, out_data, mask=mask)
+
+
def pre_reorder_for_cutlass_moe(
    input,
    gateup_input,
    src2dst,
    topk_ids,
    a1_scales,
    num_local_experts,
    topk,
    num_tokens,
    hidden_size,
):
    """Launch the pre-reorder scatter kernel for the cutlass MoE path.

    Sizes the launch with the shared 2-D heuristic and forwards all routing
    tensors to pre_reorder_triton_kernel_for_cutlass_moe.
    """
    launch_grid, block_dim = _get_launch_config_2d(
        input.device, num_tokens, hidden_size
    )

    # Arguments are passed positionally in the kernel's parameter order.
    pre_reorder_triton_kernel_for_cutlass_moe[launch_grid](
        input,
        gateup_input,
        src2dst,
        topk_ids,
        a1_scales,
        num_local_experts,
        topk,
        num_tokens,
        hidden_size,
        BLOCK_SIZE=block_dim,
        NUM_STAGES=3,
    )
+
+
# copy from https://github.com/ModelTC/lightllm/blob/a000ab69098654df4731f5b12587dd4e7f0a4f41/lightllm/common/fused_moe/moe_silu_and_mul_mix_quant_ep.py
@triton.jit
def _silu_and_mul_post_quant_kernel(
    input_ptr,
    stride_input_0,
    stride_input_1,
    stride_input_2,
    output_ptr,
    stride_output_0,
    stride_output_1,
    stride_output_2,
    output_scale_ptr,
    stride_output_scale_0,
    stride_output_scale_1,
    stride_output_scale_2,
    masked_m_ptr,
    size_n,
    fp8_max,
    fp8_min,
    BLOCK_N: tl.constexpr,
    NUM_STAGE: tl.constexpr,
    SCALE_UE8M0: tl.constexpr,
):
    # Fused silu(gate) * up followed by group-wise FP8 quantization, applied
    # only to the first masked_m[expert] tokens of each expert's slab.
    # Grid: (hidden-dim groups, token blocks per expert, experts).
    expert_id = tl.program_id(2)
    token_id = tl.program_id(1)
    hidden_dim_block_index = tl.program_id(0)

    block_num_per_expert = tl.num_programs(1)

    token_num_cur_expert = tl.load(masked_m_ptr + expert_id)

    # Promote expert/token strides to 64-bit to keep offsets safe on
    # large slabs.
    stride_input_0 = tl.cast(stride_input_0, dtype=tl.int64)
    stride_output_0 = tl.cast(stride_output_0, dtype=tl.int64)
    stride_input_1 = tl.cast(stride_input_1, dtype=tl.int64)
    stride_output_1 = tl.cast(stride_output_1, dtype=tl.int64)

    offs_in_d = hidden_dim_block_index * BLOCK_N + tl.arange(0, BLOCK_N)
    input_ptr_offs = input_ptr + expert_id * stride_input_0 + offs_in_d
    output_ptr_offs = output_ptr + expert_id * stride_output_0 + offs_in_d
    output_scale_offs = (
        output_scale_ptr
        + expert_id * stride_output_scale_0
        + hidden_dim_block_index * stride_output_scale_2
    )

    # Programs along axis 1 stride over this expert's valid tokens.
    for token_index in tl.range(
        token_id, token_num_cur_expert, block_num_per_expert, num_stages=NUM_STAGE
    ):
        gate = tl.load(
            input_ptr_offs + token_index * stride_input_1,
            mask=offs_in_d < size_n,
            other=0.0,
        ).to(tl.float32)
        # The up half lives size_n elements past the gate half.
        up = tl.load(
            input_ptr_offs + token_index * stride_input_1 + size_n,
            mask=offs_in_d < size_n,
            other=0.0,
        )
        gate = gate / (1 + tl.exp(-gate))  # SiLU computed in fp32
        gate = gate.to(input_ptr.dtype.element_ty)
        gate_up = up * gate
        # One scale per BLOCK_N group; floor avoids a zero divisor.
        _absmax = tl.maximum(tl.max(tl.abs(gate_up)), 1e-10)
        output_s = _absmax / fp8_max
        if SCALE_UE8M0:
            # Round the scale up to the nearest power of two (UE8M0).
            output_s = tl.exp2(tl.ceil(tl.log2(tl.abs(output_s))))
        output_q = tl.clamp(gate_up / output_s, fp8_min, fp8_max).to(
            output_ptr.dtype.element_ty
        )
        tl.store(
            output_ptr_offs + token_index * stride_output_1,
            output_q,
            mask=offs_in_d < size_n,
        )
        tl.store(
            output_scale_offs + token_index * stride_output_scale_1,
            output_s,
        )
+
+
def silu_and_mul_masked_post_quant_fwd(
    input: torch.Tensor,
    output: torch.Tensor,
    output_scale: torch.Tensor,
    quant_group_size: int,
    masked_m: torch.Tensor,
    scale_ue8m0: bool = False,
):
    """SiLU-and-mul over masked grouped input, fused with group FP8 quant.

    Args:
        input: [expert_num, token_num_padded, hidden_dim]; the last dim
            holds the gate and up halves concatenated.
        output: [expert_num, token_num_padded, hidden_dim // 2], dtype fp8.
        output_scale: [expert_num, token_num_padded, hidden_dim // 2 // group]
            float32 per-group scales.
        quant_group_size: quantization group width along the hidden dim.
        masked_m: [expert_num] valid token count per expert; rows past the
            count are left untouched.
        scale_ue8m0: round scales up to a power of two (UE8M0 packing).
    """
    assert input.is_contiguous()
    assert output.dtype == torch.float8_e4m3fn
    assert output.is_contiguous()
    assert len(input.shape) == 3
    assert input.shape[0] == masked_m.shape[0]
    assert input.shape[-1] % 2 == 0

    size_n = input.shape[-1] // 2
    assert size_n % quant_group_size == 0

    expert_num = len(masked_m)

    # Fewer experts -> give each expert more blocks along the token axis.
    if expert_num < 4:
        BLOCK_NUM_PER_EXPERT = 64
    else:
        BLOCK_NUM_PER_EXPERT = 32

    # One block per quantization group along the hidden dim, so the
    # divisibility of BLOCK_N by quant_group_size holds by construction.
    BLOCK_N = quant_group_size
    num_warps = 1
    NUM_STAGES = 6
    hidden_dim_split_block_num = triton.cdiv(size_n, BLOCK_N)

    grid = (
        hidden_dim_split_block_num,
        BLOCK_NUM_PER_EXPERT,
        expert_num,
    )

    finfo = torch.finfo(torch.float8_e4m3fn)
    fp8_max = finfo.max
    fp8_min = -fp8_max

    _silu_and_mul_post_quant_kernel[grid](
        input,
        *input.stride(),
        output,
        *output.stride(),
        output_scale,
        *output_scale.stride(),
        masked_m,
        size_n,
        fp8_max,
        fp8_min,
        BLOCK_N=BLOCK_N,
        NUM_STAGE=NUM_STAGES,
        num_warps=num_warps,
        SCALE_UE8M0=scale_ue8m0,
    )
    return
+
+
@triton.jit
def silu_mul_static_tensorwise_quant_triton_kernel_for_cutlass_moe(
    input_ptr,
    output_ptr,
    scale_ptr,
    num_tokens_tensor_ptr,
    intermediate_size,
    BLOCK_SIZE: tl.constexpr,
    NUM_STAGES: tl.constexpr,
):
    # silu(gate) * up with a static tensor-wise quantization scale. Input
    # rows hold [gate | up] (2 * intermediate_size wide); the token count is
    # read from device memory so the launch shape need not know it exactly.
    OutDtype = output_ptr.dtype.element_ty

    num_tokens = tl.load(num_tokens_tensor_ptr)
    numel = num_tokens * intermediate_size
    gate_ptr = input_ptr
    up_ptr = input_ptr + intermediate_size
    # Static scale applied as a reciprocal multiply on the way out.
    scale = 1.0 / tl.load(scale_ptr)

    start_idx = tl.program_id(0) * BLOCK_SIZE
    step = tl.num_programs(0) * BLOCK_SIZE

    for id in tl.range(start_idx, numel, step, num_stages=NUM_STAGES):
        ids = id + tl.arange(0, BLOCK_SIZE)
        token_ids = ids // intermediate_size
        mask = ids < numel

        # Input row stride is 2 * intermediate_size, so the flat output
        # index maps to input offset ids + token_ids * intermediate_size.
        offs = ids + token_ids * intermediate_size
        gate = tl.load(gate_ptr + offs, mask=mask, other=0.0).to(tl.float32)
        up = tl.load(up_ptr + offs, mask=mask, other=0.0).to(tl.float32)
        output = gate / (1 + tl.exp(-gate)) * up * scale
        tl.store(output_ptr + ids, output.to(OutDtype), mask=mask)
+
+
def silu_mul_static_tensorwise_quant_for_cutlass_moe(
    input: torch.Tensor,
    output: torch.Tensor,
    scale: torch.Tensor,
    num_tokens_tensor: torch.Tensor,
    expected_num_tokens: int,
    intermediate_size: int,
):
    """Launch the fused silu-mul + static-scale quant kernel.

    The launch shape is sized from expected_num_tokens; the kernel reads
    the true token count from num_tokens_tensor on the device.
    """
    launch_grid, block_dim = _get_launch_config_1d(
        input.device, expected_num_tokens * intermediate_size
    )

    # Arguments are passed positionally in the kernel's parameter order.
    silu_mul_static_tensorwise_quant_triton_kernel_for_cutlass_moe[launch_grid](
        input,
        output,
        scale,
        num_tokens_tensor,
        intermediate_size,
        BLOCK_SIZE=block_dim,
        NUM_STAGES=3,
    )
+
+
@triton.jit
def post_reorder_triton_kernel_for_cutlass_moe(
    down_output_ptr,
    output_ptr,
    src2dst_ptr,
    topk_ids_ptr,
    topk_weights_ptr,
    num_local_experts,
    topk,
    num_tokens,
    hidden_size,
    routed_scaling_factor: float,
    BLOCK_SIZE: tl.constexpr,
    NUM_STAGES: tl.constexpr,
):
    # Gather for the cutlass MoE path: per token, accumulate the weighted
    # expert outputs in fp32, apply the routed scaling factor, and store.
    # Grid: axis 0 strides over tokens, axis 1 tiles the hidden dimension.
    OutDtype = output_ptr.dtype.element_ty

    offset = BLOCK_SIZE * tl.program_id(1) + tl.arange(0, BLOCK_SIZE)
    mask = offset < hidden_size

    down_output_ptr_offs = down_output_ptr + offset
    output_ptr_offs = output_ptr + offset

    start_src_idx = tl.program_id(0)
    step = tl.num_programs(0)

    for src_idx_int32 in tl.range(
        start_src_idx, num_tokens, step, num_stages=NUM_STAGES
    ):
        # 64-bit indices keep pointer arithmetic safe for large tensors.
        src_idx = src_idx_int32.to(tl.int64)
        token_src2dst_ptr = src2dst_ptr + src_idx * topk
        token_topk_ids_ptr = topk_ids_ptr + src_idx * topk
        token_topk_weights_ptr = topk_weights_ptr + src_idx * topk

        sum_vec = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
        for idx in range(topk):
            expert_id = tl.load(token_topk_ids_ptr + idx)
            # num_local_experts acts as the sentinel for "not a local expert".
            if expert_id != num_local_experts:
                dst_idx_int32 = tl.load(token_src2dst_ptr + idx)
                dst_idx = dst_idx_int32.to(tl.int64)
                weight_scale = tl.load(token_topk_weights_ptr + idx).to(tl.float32)
                load_ptr_offs = down_output_ptr_offs + dst_idx * hidden_size
                in_data = tl.load(load_ptr_offs, mask=mask).to(tl.float32)
                sum_vec += in_data * weight_scale
        sum_vec *= routed_scaling_factor
        store_ptr_offs = output_ptr_offs + src_idx * hidden_size
        tl.store(store_ptr_offs, sum_vec.to(OutDtype), mask=mask)
+
+
def post_reorder_for_cutlass_moe(
    down_output,
    output,
    src2dst,
    topk_ids,
    topk_weights,
    num_local_experts,
    topk,
    num_tokens,
    hidden_size,
    routed_scaling_factor: float,
):
    """Launch the weighted gather kernel for the cutlass MoE path.

    Sizes the launch with the shared 2-D heuristic and forwards all routing
    tensors to post_reorder_triton_kernel_for_cutlass_moe.
    """
    launch_grid, block_dim = _get_launch_config_2d(
        down_output.device, num_tokens, hidden_size
    )

    # Arguments are passed positionally in the kernel's parameter order.
    post_reorder_triton_kernel_for_cutlass_moe[launch_grid](
        down_output,
        output,
        src2dst,
        topk_ids,
        topk_weights,
        num_local_experts,
        topk,
        num_tokens,
        hidden_size,
        routed_scaling_factor,
        BLOCK_SIZE=block_dim,
        NUM_STAGES=3,
    )
+
+
@triton.jit
def post_reorder_triton_kernel(
    down_output_ptr,
    output_ptr,
    src2dst_ptr,
    topk_ids_ptr,
    topk_weights_ptr,
    topk,
    hidden_size,
    BLOCK_SIZE: tl.constexpr,
):
    # Gather: per source token (one program), accumulate the weighted
    # expert outputs into the final hidden states. Accumulation happens in
    # the down_output dtype.
    InDtype = down_output_ptr.dtype.element_ty

    src_idx_int32 = tl.program_id(0)
    # 64-bit token index keeps pointer arithmetic safe for large tensors.
    src_idx = src_idx_int32.to(tl.int64)
    src2dst_ptr = src2dst_ptr + src_idx * topk
    topk_ids_ptr = topk_ids_ptr + src_idx * topk
    topk_weights_ptr = topk_weights_ptr + src_idx * topk

    store_ptr = output_ptr + src_idx * hidden_size

    vec = tl.arange(0, BLOCK_SIZE)

    for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
        offset = start_offset + vec
        mask = offset < hidden_size

        sum_vec = tl.zeros([BLOCK_SIZE], dtype=InDtype)
        for idx in range(topk):
            expert_id = tl.load(topk_ids_ptr + idx)
            # NOTE(review): sibling kernels treat id >= 0 as valid, but this
            # one also skips expert id 0 — confirm callers reserve 0 (or
            # encode invalid slots as 0) before changing this condition.
            if expert_id > 0:
                dst_idx_int32 = tl.load(src2dst_ptr + idx)
                dst_idx = dst_idx_int32.to(tl.int64)
                weigh_scale = tl.load(topk_weights_ptr + idx).to(InDtype)
                load_ptr = down_output_ptr + dst_idx * hidden_size
                in_data = tl.load(load_ptr + offset, mask=mask)
                sum_vec += in_data * weigh_scale
        tl.store(store_ptr + offset, sum_vec, mask=mask)
+
+
@triton.jit
def _fwd_kernel_ep_scatter_1(
    num_recv_tokens_per_expert,
    expert_start_loc,
    m_indices,
    num_experts: tl.constexpr,
    BLOCK_E: tl.constexpr,
    BLOCK_EXPERT_NUM: tl.constexpr,
):
    # Phase 1 of EP scatter: compute per-expert start offsets (exclusive
    # cumsum of token counts) into expert_start_loc, and fill m_indices with
    # the owning expert id for every slot in that expert's segment.
    cur_expert = tl.program_id(0)

    # Every program recomputes and stores the full cumsum; each program
    # writes before reading its own element, and all programs write the
    # same values, so the redundancy is harmless.
    offset_cumsum = tl.arange(0, BLOCK_EXPERT_NUM)
    tokens_per_expert = tl.load(
        num_recv_tokens_per_expert + offset_cumsum,
        mask=offset_cumsum < num_experts,
        other=0,
    )
    cumsum = tl.cumsum(tokens_per_expert) - tokens_per_expert
    tl.store(expert_start_loc + offset_cumsum, cumsum, mask=offset_cumsum < num_experts)

    cur_expert_start = tl.load(expert_start_loc + cur_expert)
    cur_expert_token_num = tl.load(num_recv_tokens_per_expert + cur_expert)

    m_indices_start_ptr = m_indices + cur_expert_start
    off_expert = tl.arange(0, BLOCK_E)

    # Unmasked stores: the host asserts m_indices is padded to a BLOCK_E
    # multiple, so a full block past cur_expert_token_num is writable.
    for start_m in tl.range(0, cur_expert_token_num, BLOCK_E, num_stages=4):
        tl.store(
            m_indices_start_ptr + start_m + off_expert,
            cur_expert,
        )
+
+
@triton.jit
def _fwd_kernel_ep_scatter_2(
    total_token_num,
    expert_start_loc,
    recv_x,
    recv_x_stride0,
    recv_x_stride1,
    recv_x_scale,
    recv_x_scale_stride0,
    recv_x_scale_stride1,
    recv_topk,
    recv_topk_stride0,
    recv_topk_stride1,
    output_tensor,
    output_tensor_stride0,
    output_tensor_stride1,
    output_tensor_scale,
    output_tensor_scale_stride0,
    output_tensor_scale_stride1,
    output_index,
    output_index_stride0,
    output_index_stride1,
    topk_num: tl.constexpr,
    HIDDEN_SIZE: tl.constexpr,
    HIDDEN_SIZE_PAD: tl.constexpr,
    SCALE_HIDDEN_SIZE: tl.constexpr,
    SCALE_HIDDEN_SIZE_PAD: tl.constexpr,
):
    # Phase 2 of EP scatter: copy each received token (and its quant scales)
    # into every destination expert segment it routes to. A destination slot
    # is claimed with an atomic add on expert_start_loc, and the claimed row
    # is recorded in output_index for the later gather.
    start_token_id = tl.program_id(0)
    grid_num = tl.num_programs(0)

    offset_in = tl.arange(0, HIDDEN_SIZE_PAD)
    mask = offset_in < HIDDEN_SIZE

    index_in_s = tl.arange(0, SCALE_HIDDEN_SIZE_PAD)
    mask_s = index_in_s < SCALE_HIDDEN_SIZE

    for token_id_int32 in range(start_token_id, total_token_num, grid_num):
        # 64-bit indices keep pointer arithmetic safe for large tensors.
        token_id = token_id_int32.to(tl.int64)
        to_copy = tl.load(recv_x + token_id * recv_x_stride0 + offset_in, mask=mask)
        to_copy_s = tl.load(
            recv_x_scale
            + token_id * recv_x_scale_stride0
            + index_in_s * recv_x_scale_stride1,
            mask=mask_s,
        )

        for topk_idx_int32 in tl.range(0, topk_num, 1, num_stages=4):
            topk_index = topk_idx_int32.to(tl.int64)
            expert_id = tl.load(recv_topk + token_id * recv_topk_stride0 + topk_index)
            # Negative expert ids mark slots not routed to this rank.
            if expert_id >= 0:
                # Claim the next free row in this expert's segment.
                dest_token_index_int32 = tl.atomic_add(expert_start_loc + expert_id, 1)
                dest_token_index = dest_token_index_int32.to(tl.int64)

                tl.store(
                    output_index + token_id * output_index_stride0 + topk_index,
                    dest_token_index_int32,
                )
                output_tensor_ptr = (
                    output_tensor + dest_token_index * output_tensor_stride0
                )
                output_tensor_scale_ptr = (
                    output_tensor_scale + dest_token_index * output_tensor_scale_stride0
                )
                tl.store(output_tensor_ptr + offset_in, to_copy, mask=mask)
                tl.store(
                    output_tensor_scale_ptr + index_in_s * output_tensor_scale_stride1,
                    to_copy_s,
                    mask=mask_s,
                )
+
+
# copy from https://github.com/ModelTC/lightllm/blob/main/lightllm/common/fused_moe/deepep_scatter_gather.py
@torch.no_grad()
def ep_scatter(
    recv_x: torch.Tensor,
    recv_x_scale: torch.Tensor,
    recv_topk: torch.Tensor,
    num_recv_tokens_per_expert: torch.Tensor,
    expert_start_loc: torch.Tensor,
    output_tensor: torch.Tensor,
    output_tensor_scale: torch.Tensor,
    m_indices: torch.Tensor,
    output_index: torch.Tensor,
    scale_ue8m0: bool = False,
):
    """Scatter received tokens into per-expert contiguous segments.

    Phase 1 (_fwd_kernel_ep_scatter_1) fills expert_start_loc (exclusive
    cumsum of per-expert counts) and m_indices (owning expert per output
    row); phase 2 (_fwd_kernel_ep_scatter_2) copies each token and its
    quantization scales into every expert segment it routes to, recording
    the destination row in output_index.
    """
    BLOCK_E = 128  # token num of per expert is aligned to 128
    BLOCK_D = 128  # block size of quantization
    num_warps = 8
    num_experts = num_recv_tokens_per_expert.shape[0]
    hidden_size = recv_x.shape[1]
    # grid = (triton.cdiv(hidden_size, BLOCK_D), num_experts)
    grid = num_experts

    scale_hidden_size = hidden_size // BLOCK_D
    if scale_ue8m0:
        # ue8m0 scales are packed here (4 scales per int32),
        # hence the effective size of this dimension is divided by 4.
        scale_hidden_size = ceil_div(scale_hidden_size, 4)

    assert m_indices.shape[0] % BLOCK_E == 0
    assert (
        recv_x_scale.dtype == output_tensor_scale.dtype
    ), f"recv_x_scale.dtype: {recv_x_scale.dtype}, output_tensor_scale.dtype: {output_tensor_scale.dtype}"
    assert recv_x_scale.shape[1] == output_tensor_scale.shape[1] == scale_hidden_size

    # Phase 1: one program per expert.
    _fwd_kernel_ep_scatter_1[(grid,)](
        num_recv_tokens_per_expert,
        expert_start_loc,
        m_indices,
        num_experts=num_experts,
        num_warps=num_warps,
        BLOCK_E=BLOCK_E,
        BLOCK_EXPERT_NUM=triton.next_power_of_2(num_experts),
    )

    # Phase 2: programs stride over tokens; cap the grid at 8K programs.
    grid = min(recv_topk.shape[0], 1024 * 8)

    _fwd_kernel_ep_scatter_2[(grid,)](
        recv_topk.shape[0],
        expert_start_loc,
        recv_x,
        recv_x.stride(0),
        recv_x.stride(1),
        recv_x_scale,
        recv_x_scale.stride(0),
        recv_x_scale.stride(1),
        recv_topk,
        recv_topk.stride(0),
        recv_topk.stride(1),
        output_tensor,
        output_tensor.stride(0),
        output_tensor.stride(1),
        output_tensor_scale,
        output_tensor_scale.stride(0),
        output_tensor_scale.stride(1),
        output_index,
        output_index.stride(0),
        output_index.stride(1),
        topk_num=recv_topk.shape[1],
        num_warps=num_warps,
        HIDDEN_SIZE=hidden_size,
        HIDDEN_SIZE_PAD=triton.next_power_of_2(hidden_size),
        SCALE_HIDDEN_SIZE=scale_hidden_size,
        SCALE_HIDDEN_SIZE_PAD=triton.next_power_of_2(scale_hidden_size),
    )
    return
+
+
@triton.jit
def _fwd_kernel_ep_gather(
    total_token_num,
    input_tensor,
    input_tensor_stride0,
    input_tensor_stride1,
    recv_topk_ids,
    recv_topk_ids_stride0,
    recv_topk_ids_stride1,
    recv_topk_weight,
    recv_topk_weight_stride0,
    recv_topk_weight_stride1,
    input_index,
    input_index_stride0,
    input_index_stride1,
    output_tensor,
    output_tensor_stride0,
    output_tensor_stride1,
    topk_num: tl.constexpr,
    BLOCK_D: tl.constexpr,
):
    # Weighted gather back to token order: grid axis 0 tiles the hidden dim
    # in BLOCK_D chunks, axis 1 strides over tokens. Accumulation is in fp32.
    cur_block_int32 = tl.program_id(0)
    # 64-bit indices keep pointer arithmetic safe for large tensors.
    cur_block = cur_block_int32.to(tl.int64)

    start_cur_token_int32 = tl.program_id(1)

    grid_num = tl.num_programs(1)

    for cur_token_int32 in range(start_cur_token_int32, total_token_num, grid_num):
        cur_token = cur_token_int32.to(tl.int64)

        off_d = tl.arange(0, BLOCK_D)
        accumulator = tl.zeros([BLOCK_D], dtype=tl.float32)

        for topk_index_int32 in range(0, topk_num):
            topk_index = topk_index_int32.to(tl.int64)

            expert_id = tl.load(
                recv_topk_ids + cur_token * recv_topk_ids_stride0 + topk_index
            )
            # Negative expert ids mark slots with no local expert output.
            if expert_id >= 0:
                # Row claimed for this (token, slot) during ep_scatter.
                source_token_index_int32 = tl.load(
                    input_index + cur_token * input_index_stride0 + topk_index
                )
                source_token_index = source_token_index_int32.to(tl.int64)

                acc_weight = tl.load(
                    recv_topk_weight + cur_token * recv_topk_weight_stride0 + topk_index
                )
                tmp = tl.load(
                    input_tensor
                    + source_token_index * input_tensor_stride0
                    + cur_block * BLOCK_D
                    + off_d
                )
                accumulator += tmp.to(tl.float32) * acc_weight

        tl.store(
            output_tensor
            + cur_token * output_tensor_stride0
            + cur_block * BLOCK_D
            + off_d,
            accumulator.to(output_tensor.dtype.element_ty),
        )
+
+
@torch.no_grad()
def ep_gather(
    input_tensor: torch.Tensor,
    recv_topk_ids: torch.Tensor,
    recv_topk_weight: torch.Tensor,
    input_index: torch.Tensor,
    output_tensor: torch.Tensor,
):
    """Weighted gather from per-expert segments back to token order.

    For each token, sums the input_tensor rows claimed during ep_scatter
    (looked up via input_index), weighted by recv_topk_weight, into
    output_tensor.
    """
    num_warps = 2
    num_tokens = output_tensor.shape[0]
    hidden_size = input_tensor.shape[1]
    # Prefer a wide block when the hidden size allows; must divide evenly.
    BLOCK_D = 128 if hidden_size % 1024 != 0 else 1024  # gather block width
    assert hidden_size % BLOCK_D == 0
    # Axis 0 tiles the hidden dim; axis 1 strides over tokens (<= 1024 programs).
    grid = (triton.cdiv(hidden_size, BLOCK_D), min(num_tokens, 1024))
    _fwd_kernel_ep_gather[grid](
        num_tokens,
        input_tensor,
        input_tensor.stride(0),
        input_tensor.stride(1),
        recv_topk_ids,
        recv_topk_ids.stride(0),
        recv_topk_ids.stride(1),
        recv_topk_weight,
        recv_topk_weight.stride(0),
        recv_topk_weight.stride(1),
        input_index,
        input_index.stride(0),
        input_index.stride(1),
        output_tensor,
        output_tensor.stride(0),
        output_tensor.stride(1),
        topk_num=recv_topk_ids.shape[1],
        num_warps=num_warps,
        BLOCK_D=BLOCK_D,
    )
    return
+
+
+# copy from
+# https://github.com/deepseek-ai/DeepGEMM/blob/bd2a77552886b98c205af12f8d7d2d61247c4b27/deep_gemm/jit_kernels/utils.py#L58
def get_tma_aligned_size(x: int, element_size: int) -> int:
    """
    Global memory address of TMA must be 16-byte aligned.
    Since we use column-major layout for the LHS scaling tensor,
    the M-axis of the LHS scaling tensor needs to be padded to a multiple of 16 bytes.

    Arguments:
        x: original M-axis shape of the LHS scaling tensor.
        element_size: element size of the LHS scaling tensor.

    Returns:
        M-axis shape of the LHS scaling tensor after padding.
    """
    TMA_ALIGNMENT_BYTES = 16
    # Alignment must be expressible as a whole number of elements.
    assert TMA_ALIGNMENT_BYTES % element_size == 0
    elems_per_alignment = TMA_ALIGNMENT_BYTES // element_size
    padded = ceil_div(x, elems_per_alignment) * elems_per_alignment
    return padded
+
+
@triton.jit
def _tma_align_input_scale_kernel(
    input_scale_ptr,
    output_ptr,
    m,
    k_div_block_size,
    input_scale_stride_m,
    input_scale_stride_k,
    output_stride_m,
    output_stride_k,
    BLOCK_SIZE_K: tl.constexpr,
):
    # Copy a (m, k/bs) scale tensor into a buffer whose strides the host
    # supplies swapped, producing a column-major layout with a TMA-padded
    # m axis. Programs stride over rows; each row is one BLOCK_SIZE_K load.
    pid_m = tl.program_id(axis=0)
    grid_m = tl.num_programs(0)
    k_offsets = tl.arange(0, BLOCK_SIZE_K)

    for m_base in range(pid_m, m, grid_m):
        input_offset = (
            input_scale_ptr
            + m_base * input_scale_stride_m
            + k_offsets * input_scale_stride_k
        )
        input_data = tl.load(input_offset, mask=k_offsets < k_div_block_size)

        output_offset = (
            output_ptr + k_offsets * output_stride_k + m_base * output_stride_m
        )
        tl.store(output_offset, input_data, mask=k_offsets < k_div_block_size)
+
+
+# copy from https://github.com/ModelTC/lightllm/blob/main/lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py
def tma_align_input_scale(input_scale: torch.Tensor):
    """Repack a (m, k/bs) scale tensor column-major with a TMA-padded m axis.

    Returns a (m, k/bs) view whose underlying storage is column-major and
    whose m extent is padded per get_tma_aligned_size.
    """
    assert input_scale.dim() == 2
    num_rows, num_scale_cols = input_scale.shape
    padded_rows = get_tma_aligned_size(num_rows, input_scale.element_size())

    # Allocate transposed; the final .t() yields the column-major view.
    packed = torch.empty(
        (num_scale_cols, padded_rows),
        dtype=input_scale.dtype,
        device=input_scale.device,
    )

    grid_rows = min(num_rows, 8192)
    BLOCK_SIZE_K = triton.next_power_of_2(num_scale_cols)

    _tma_align_input_scale_kernel[(grid_rows,)](
        input_scale_ptr=input_scale,
        output_ptr=packed,
        m=num_rows,
        k_div_block_size=num_scale_cols,
        input_scale_stride_m=input_scale.stride(0),
        input_scale_stride_k=input_scale.stride(1),
        output_stride_m=packed.stride(1),  # Note: these are swapped
        output_stride_k=packed.stride(0),  # for column-major
        BLOCK_SIZE_K=BLOCK_SIZE_K,
    )
    return packed.t()[:num_rows]
+
+
@triton.jit
def compute_masked_m_triton_kernel(seg_indptr, masked_m):
    # One program per expert: masked_m[e] = seg_indptr[e + 1] - seg_indptr[e],
    # i.e. the number of assignments that landed in expert e's segment.
    expert_id = tl.program_id(0)
    start = tl.load(seg_indptr + expert_id)
    end = tl.load(seg_indptr + expert_id + 1)
    tl.store(masked_m + expert_id, (end - start))
+
+
@triton.jit
def deepgemm_compute_src2dst_triton_kernel(
    topk_ids,
    reorder_ids,
    seg_indptr,
    src2dst,
    m_max,
    num_toks,
    BLOCK_SIZE: tl.constexpr,
):
    # Map each assignment to its slot in the per-expert padded layout:
    # dst = expert_id * m_max + rank within the expert's segment, where
    # reorder_ids maps sorted position -> original position.
    pid = tl.program_id(axis=0)
    dst_id = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = dst_id < num_toks
    src_id = tl.load(reorder_ids + dst_id, mask=mask)
    expert_id = tl.load(topk_ids + src_id, mask=(src_id < num_toks))
    expert_dst_start = tl.load(seg_indptr + expert_id, mask=(expert_id >= 0))
    expert_dst_offset = dst_id - expert_dst_start
    dst_id = expert_id * m_max + expert_dst_offset
    tl.store(src2dst + src_id, dst_id, mask=mask)
+
+
@triton.jit
def fill_gateup_input_triton_kernel(
    input_ptr,
    scale_ptr,
    gateup_input_ptr,
    gateup_input_scale_ptr,
    src2dst_ptr,
    topk_ids_ptr,
    topk,
    hidden_size,
    scale_size,
    BLOCK_SIZE: tl.constexpr,
):
    # Copy one token's quantized activations and per-group scales into every
    # destination slot of the per-expert padded layout. One program per token.

    src_idx_int32 = tl.program_id(0)
    # 64-bit token index keeps pointer arithmetic safe for large tensors.
    src_idx = src_idx_int32.to(tl.int64)
    src2dst_ptr = src2dst_ptr + src_idx * topk
    topk_ids_ptr = topk_ids_ptr + src_idx * topk
    src_ptr = input_ptr + src_idx * hidden_size
    scale_src_ptr = scale_ptr + src_idx * scale_size

    vec = tl.arange(0, BLOCK_SIZE)
    for idx in range(topk):
        expert_id = tl.load(topk_ids_ptr + idx)
        # Negative ids mark assignments with no local expert.
        if expert_id >= 0:
            dst_idx_int32 = tl.load(src2dst_ptr + idx)
            dst_idx = dst_idx_int32.to(tl.int64)
            dst_ptr = gateup_input_ptr + dst_idx * hidden_size
            for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
                offset = start_offset + vec
                mask = offset < hidden_size
                in_data = tl.load(src_ptr + offset, mask=mask)
                tl.store(dst_ptr + offset, in_data, mask=mask)
            scale_dst_ptr = gateup_input_scale_ptr + dst_idx * scale_size
            for start_offset in tl.range(0, scale_size, BLOCK_SIZE):
                offset = start_offset + vec
                mask = offset < scale_size
                in_scale = tl.load(scale_src_ptr + offset, mask=mask)
                tl.store(scale_dst_ptr + offset, in_scale, mask=mask)
+
+
def moe_ep_deepgemm_preprocess(
    topk_ids: torch.Tensor,
    num_local_experts: int,
    hidden_states: torch.Tensor,
    top_k: int,
    block_shape,
    output_dtype: torch.dtype = torch.float8_e4m3fn,
):
    """Build masked grouped-GEMM inputs for the DeepGEMM EP MoE path.

    Sorts the token->expert assignments, quantizes the activations
    per-token-group to FP8, and scatters activations and scales into a
    per-expert padded layout of shape (num_local_experts, m_max, hidden).

    Returns:
        (masked_m, expected_m, src2dst, gateup_input, gateup_input_scale).
    """
    reorder_topk_ids, reorder_ids = torch.sort(topk_ids.view(-1), stable=True)
    seg_indptr = torch.zeros(
        num_local_experts + 1, device=topk_ids.device, dtype=torch.int64
    )
    src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int32)
    masked_m = torch.empty(num_local_experts, device=topk_ids.device, dtype=torch.int32)

    compute_seg_indptr_triton_kernel[(num_local_experts + 1,)](
        reorder_topk_ids, seg_indptr, topk_ids.numel()
    )

    grid = lambda meta: (triton.cdiv(topk_ids.numel(), meta["BLOCK_SIZE"]),)
    compute_masked_m_triton_kernel[(num_local_experts,)](seg_indptr, masked_m)

    # For masked grouped GEMM, shape M should be multiple of the block M (current block M: {block_m}) https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/jit_kernels/m_grouped_gemm.py#L165
    m_max = (hidden_states.size(0) // 256 + 1) * 256
    expected_m = (topk_ids.numel() - 1) // num_local_experts + 1
    gateup_input = torch.empty(
        (num_local_experts, m_max, hidden_states.size(1)),
        device=hidden_states.device,
        dtype=output_dtype,
    )

    deepgemm_compute_src2dst_triton_kernel[grid](
        topk_ids,
        reorder_ids,
        seg_indptr,
        src2dst,
        m_max,
        topk_ids.numel(),
        BLOCK_SIZE=256,
    )

    if block_shape is None:
        block_shape = [128, 128]
    assert len(block_shape) == 2
    # Only the K block size matters for activation quantization here.
    block_k = block_shape[1]

    # TODO: fuse this with the preprocess
    hidden_states, scale = per_token_group_quant_fp8(hidden_states, block_k)

    gateup_input_scale = torch.empty(
        (gateup_input.size(0), gateup_input.size(1), scale.size(1)),
        device=hidden_states.device,
        dtype=scale.dtype,
    )

    fill_gateup_input_triton_kernel[(hidden_states.shape[0],)](
        hidden_states,
        scale,
        gateup_input,
        gateup_input_scale,
        src2dst,
        topk_ids,
        top_k,
        hidden_states.size(1),
        scale.size(1),
        BLOCK_SIZE=1024,
    )

    return (
        masked_m,
        expected_m,
        src2dst,
        gateup_input,
        gateup_input_scale,
    )
+
+
@triton.jit
def compute_identity_kernel(
    top_k,
    hidden_states_ptr,
    expert_scales_ptr,
    num_tokens,
    output_ptr,
    hidden_dim,
    scales_stride,
    BLOCK_SIZE: tl.constexpr,
):
    # "Identity expert": each of the token's top_k scales multiplies the
    # same input vector, so output = hidden_states * sum(scales).
    # One program per (token, hidden-dim block) pair.
    pid = tl.program_id(0)

    batch_id = pid // (hidden_dim // BLOCK_SIZE)
    dim_offset = pid % (hidden_dim // BLOCK_SIZE) * BLOCK_SIZE

    if batch_id >= num_tokens or dim_offset >= hidden_dim:
        return

    h = tl.load(
        hidden_states_ptr
        + batch_id * hidden_dim
        + dim_offset
        + tl.arange(0, BLOCK_SIZE),
        mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim,
    )

    result = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
    for i in range(top_k):
        scale = tl.load(expert_scales_ptr + batch_id * scales_stride + i)
        result += h * scale

    tl.store(
        output_ptr + batch_id * hidden_dim + dim_offset + tl.arange(0, BLOCK_SIZE),
        result,
        mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim,
    )
+
+
def zero_experts_compute_triton(
    expert_indices, expert_scales, num_experts, zero_expert_type, hidden_states
):
    """Handle "zero experts" (indices >= num_experts) routed by the gate.

    For ``zero_expert_type == "identity"``, a zero expert contributes
    ``scale * hidden_states`` (identity mapping scaled by the routing weight).
    This function:
      - extracts the zero-expert scales,
      - masks them out of the normal expert routing *in place*
        (``expert_indices`` / ``expert_scales`` are mutated: zero-expert slots
        get index -1 and weight 0.0 so downstream MoE kernels skip them),
      - returns the identity contribution computed by ``compute_identity_kernel``.

    Raises:
        ValueError: for any unsupported ``zero_expert_type``. (The original
        code fell through and crashed on undefined locals in that case.)
    """
    if zero_expert_type != "identity":
        raise ValueError(f"Unsupported zero_expert_type: {zero_expert_type}")

    top_k = expert_indices.size(-1)

    # Scales for zero experts only; normal experts zeroed out.
    zero_expert_mask = expert_indices < num_experts
    zero_expert_scales = expert_scales.clone()
    zero_expert_scales[zero_expert_mask] = 0.0

    # In-place: hide zero-expert slots from the normal MoE path.
    normal_expert_mask = expert_indices >= num_experts
    expert_indices[normal_expert_mask] = -1
    expert_scales[normal_expert_mask] = 0.0

    # zeros_like already allocates on hidden_states' device; the original
    # appended a redundant .to(hidden_states.device).
    output = torch.zeros_like(hidden_states)
    hidden_dim = hidden_states.size(-1)
    num_tokens = hidden_states.size(0)

    grid = lambda meta: (num_tokens * (hidden_dim // meta["BLOCK_SIZE"]),)
    compute_identity_kernel[grid](
        top_k,
        hidden_states,
        zero_expert_scales,
        num_tokens,
        output,
        hidden_dim,
        zero_expert_scales.stride(0),
        BLOCK_SIZE=256,
    )

    return output
+
+
@triton.jit
def compute_problem_sizes_w4a8_kernel(
    masked_m_ptr,
    problem_sizes1_ptr,
    problem_sizes2_ptr,
    n,
    k,
    num_experts,
    BLOCK_SIZE: tl.constexpr,
):
    # Fill per-expert grouped-GEMM problem-size triples for the w4a8 path:
    #   problem_sizes1[e] = (2*n, m_e, k)   (gate+up projection)
    #   problem_sizes2[e] = (k,   m_e, n)   (down projection)
    # where m_e = masked_m[e] is the valid token count for expert e.
    pid = tl.program_id(axis=0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = pid < num_experts
    final_occurrences = tl.load(masked_m_ptr + pid, mask=mask, other=0)

    # Both output buffers are indexed identically; the original computed the
    # same indices/masks twice under ps1_*/ps2_* names.
    idx_0 = pid * 3
    idx_1 = idx_0 + 1
    idx_2 = idx_0 + 2

    mask_0 = idx_0 < num_experts * 3
    mask_1 = idx_1 < num_experts * 3
    mask_2 = idx_2 < num_experts * 3

    tl.store(problem_sizes1_ptr + idx_0, 2 * n, mask=mask_0)
    tl.store(problem_sizes1_ptr + idx_1, final_occurrences, mask=mask_1)
    tl.store(problem_sizes1_ptr + idx_2, k, mask=mask_2)

    tl.store(problem_sizes2_ptr + idx_0, k, mask=mask_0)
    tl.store(problem_sizes2_ptr + idx_1, final_occurrences, mask=mask_1)
    tl.store(problem_sizes2_ptr + idx_2, n, mask=mask_2)
+
+
def compute_problem_sizes_w4a8(
    masked_m, problem_sizes1, problem_sizes2, n, k, num_experts
):
    """Populate the two grouped-GEMM problem-size buffers in place.

    One program covers BLOCK_SIZE experts; each expert e receives the triples
    (2n, masked_m[e], k) and (k, masked_m[e], n). Returns the (mutated)
    problem-size tensors for convenience.
    """
    BLOCK_SIZE = 256
    num_programs = triton.cdiv(num_experts, BLOCK_SIZE)
    compute_problem_sizes_w4a8_kernel[(num_programs,)](
        masked_m,
        problem_sizes1,
        problem_sizes2,
        n,
        k,
        num_experts,
        BLOCK_SIZE=BLOCK_SIZE,
    )
    return problem_sizes1, problem_sizes2
+
+
def deepep_ll_get_cutlass_w4a8_moe_mm_data(
    masked_m,
    problem_sizes1,
    problem_sizes2,
    num_experts,
    n,
    k,
):
    """Compute cutlass w4a8 grouped-GEMM problem sizes for the DeepEP LL path.

    Thin wrapper over compute_problem_sizes_w4a8 that casts both problem-size
    tensors to int32, as expected by the cutlass grouped-GEMM interface.
    """
    ps1, ps2 = compute_problem_sizes_w4a8(
        masked_m, problem_sizes1, problem_sizes2, n, k, num_experts
    )
    return ps1.to(torch.int32), ps2.to(torch.int32)
+
+
@triton.jit
def _silu_and_mul_post_per_tensor_quant_kernel(
    input_ptr,
    stride_input_expert,
    stride_input_token,
    stride_input_dim,
    output_ptr,
    stride_output_expert,
    stride_output_token,
    stride_output_dim,
    scale_ptr,
    masked_m_ptr,
    inner_dim,
    fp8_max,
    fp8_min,
    BLOCK_N: tl.constexpr,
    NUM_STAGE: tl.constexpr,
):
    """
    Triton kernel: fused SiLU(gate) * up + per-tensor FP8 quantization.

    Shape:
        input: [E, T_padded, 2*D] -> gate: [:,:,D], up: [:,:,D]
        output: [E, T_padded, D], dtype=float8_e4m3fn

    Grid: (cdiv(D, BLOCK_N), num_token_blocks, E); each program strides over
    the expert's valid tokens with step num_token_blocks.
    """
    expert_id = tl.program_id(2)
    block_id_token = tl.program_id(1)
    block_id_dim = tl.program_id(0)

    num_token_blocks = tl.num_programs(1)

    # Only the first token_num_cur_expert rows of this expert are valid;
    # the rest of T_padded is padding and is skipped by the loop bound.
    token_num_cur_expert = tl.load(masked_m_ptr + expert_id)

    # NOTE(review): only scale_ptr[0] is read (single per-tensor scale). The
    # Python wrapper also accepts a per-expert scale of shape [expert_num],
    # but that case is not indexed by expert_id here — confirm before use.
    scale = 1.0 / tl.load(scale_ptr).to(tl.float32)

    # Cast strides to int32 for cheaper address arithmetic.
    stride_input_expert = tl.cast(stride_input_expert, tl.int32)
    stride_output_expert = tl.cast(stride_output_expert, tl.int32)
    stride_input_token = tl.cast(stride_input_token, tl.int32)
    stride_output_token = tl.cast(stride_output_token, tl.int32)

    offset_d = block_id_dim * BLOCK_N + tl.arange(0, BLOCK_N)
    mask_d = offset_d < inner_dim

    # base pointers for current expert and dim block
    input_base_offs = input_ptr + expert_id * stride_input_expert + offset_d
    output_base_offs = output_ptr + expert_id * stride_output_expert + offset_d

    for token_idx in tl.range(
        block_id_token, token_num_cur_expert, num_token_blocks, num_stages=NUM_STAGE
    ):
        # gate occupies [:, :, 0:D], up occupies [:, :, D:2D].
        gate_ptr = input_base_offs + token_idx * stride_input_token
        up_ptr = gate_ptr + inner_dim
        gate = tl.load(gate_ptr, mask=mask_d, other=0.0).to(tl.float32)
        up = tl.load(up_ptr, mask=mask_d, other=0.0).to(tl.float32)

        # SiLU: x * sigmoid(x)
        gate = gate / (1 + tl.exp(-gate))
        # Round the activation back to the input dtype before the multiply
        # (matches reference numerics of a non-fused silu-then-mul).
        gate = gate.to(input_ptr.dtype.element_ty)
        gate_up = up * gate

        # Quantize: divide by the tensor scale, clamp to the fp8 range.
        scaled = gate_up * scale
        output_q = tl.clamp(scaled, fp8_min, fp8_max).to(output_ptr.dtype.element_ty)
        out_ptr = output_base_offs + token_idx * stride_output_token
        tl.store(out_ptr, output_q, mask=mask_d)
+
+
def silu_and_mul_masked_post_per_tensor_quant_fwd(
    input: torch.Tensor,
    output: torch.Tensor,
    masked_m: torch.Tensor,
    scale: torch.Tensor,
) -> torch.Tensor:
    """
    Fused SiLU + Mul + Per-Tensor Quantization to FP8.

    Args:
        input: [expert_num, token_num_padded, 2 * inner_dim]
        output: [expert_num, token_num_padded, inner_dim], dtype=torch.float8_e4m3fn
        masked_m: [expert_num], actual token count for each expert
        scale: [1] or [expert_num], quantization scale (per-tensor or per-expert)

    Returns:
        output tensor

    NOTE(review): the kernel loads only scale[0]; a per-expert scale passes the
    assert below but is not actually applied per expert — confirm intent.
    """
    assert input.is_contiguous()
    assert output.is_contiguous()
    assert output.dtype == torch.float8_e4m3fn
    assert input.ndim == 3
    assert input.shape[0] == masked_m.shape[0]
    assert input.shape[-1] % 2 == 0
    assert scale.numel() == 1 or scale.shape[0] == input.shape[0]

    expert_num = input.shape[0]
    # 3584
    inner_dim = input.shape[-1] // 2

    # BLOCK_N: hidden-dim tile width. BLOCK_M is the number of token-blocks in
    # the grid (the kernel strides over tokens with this step), not a tile size;
    # fewer experts -> more token parallelism per expert.
    BLOCK_N = 256
    BLOCK_M = 64 if expert_num < 4 else 32
    NUM_STAGES = 3
    hidden_dim_split_block_num = triton.cdiv(inner_dim, BLOCK_N)

    grid = (hidden_dim_split_block_num, BLOCK_M, expert_num)
    finfo = torch.finfo(torch.float8_e4m3fn)
    fp8_max = finfo.max
    # Symmetric clamp range (fp8_e4m3 min is -max for finite values here).
    fp8_min = -fp8_max

    _silu_and_mul_post_per_tensor_quant_kernel[grid](
        input,
        *input.stride(),
        output,
        *output.stride(),
        scale,
        masked_m,
        inner_dim,
        fp8_max,
        fp8_min,
        BLOCK_N=BLOCK_N,
        NUM_STAGE=NUM_STAGES,
    )
    return output
diff --git a/sglang/python/sglang/srt/layers/moe/ep_moe/layer.py b/sglang/python/sglang/srt/layers/moe/ep_moe/layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..54908a8f279c6679a539e2076f43e4dff49d83ef
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -0,0 +1,746 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+
+import torch
+
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
+from sglang.srt.environ import envs
+from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.moe import (
+ get_deepep_mode,
+ get_moe_a2a_backend,
+ get_moe_runner_backend,
+)
+from sglang.srt.layers.moe.fused_moe_triton.layer import (
+ FusedMoE,
+ moe_forward_piecewise_cuda_graph_impl,
+)
+from sglang.srt.layers.moe.rocm_moe_utils import upscale
+from sglang.srt.layers.moe.token_dispatcher.deepep import (
+ DeepEPLLCombineInput,
+ DeepEPNormalCombineInput,
+)
+from sglang.srt.layers.moe.token_dispatcher.moriep import (
+ MoriEPLLCombineInput,
+ MoriEPNormalCombineInput,
+)
+from sglang.srt.layers.moe.topk import TopKOutput, TopKOutputChecker
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
+ CompressedTensorsFusedMoEMethod,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ NPUCompressedTensorsW4A16Int4DynamicMoE,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.quark.schemes import QuarkW4A4MXFp4MoE
+from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
+from sglang.srt.utils import get_bool_env_var, is_hip, is_npu
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ DeepEPLLDispatchOutput,
+ DeepEPNormalDispatchOutput,
+ DispatchOutput,
+ )
+
# Platform feature flags, resolved once at import time.
_is_hip = is_hip()
_is_npu = is_npu()
_is_fp8_fnuz = is_fp8_fnuz()
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

if _use_aiter:
    from aiter import ActivationType, QuantType
    from aiter.fused_moe import fused_moe

# The original imported torch_npu twice (an `elif _is_npu` branch above and a
# second `if _is_npu` block after the logger); a single unconditional-on-NPU
# import is equivalent since Python imports are idempotent.
if _is_npu:
    import torch_npu


logger = logging.getLogger(__name__)
+
+
class DeepEPMoE(FusedMoE):
    """
    MoE Expert Parallel Impl based on DeepEP (https://github.com/deepseek-ai/DeepEP/tree/main)
    Mooncake EP shares the same class, as they expose the same interface.
    """

    # One-shot logging guard shared across instances.
    _has_printed = False

    def __init__(
        self,
        num_experts: int,
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        layer_id: int,
        num_fused_shared_experts: int = 0,
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        activation: str = "silu",
        routed_scaling_factor: Optional[float] = None,
        **kwargs,
    ):
        super().__init__(
            num_experts=num_experts,
            top_k=top_k,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            layer_id=layer_id,
            num_fused_shared_experts=num_fused_shared_experts,
            params_dtype=params_dtype,
            quant_config=quant_config,
            prefix=prefix,
            activation=activation,
            routed_scaling_factor=routed_scaling_factor,
            **kwargs,
        )
        # deprecate_flag == True means the DeepGEMM-backed FP8 path is fully
        # handled by the parent FusedMoE; all DeepEP-specific setup below is
        # skipped and forward/run_moe_core delegate to super().
        if _use_aiter or _is_npu:
            self.deprecate_flag = False
        elif deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and isinstance(
            quant_config, Fp8Config
        ):
            self.deprecate_flag = True
        else:
            self.deprecate_flag = False

        if self.deprecate_flag:
            return

        # Resolve which quantized compute path this layer uses.
        if isinstance(quant_config, Fp8Config):
            self.use_block_quant = getattr(self.quant_method, "block_quant", False)
            self.use_fp8_w8a8 = True
            self.fp8_dtype = torch.float8_e4m3fn
            self.use_w4afp8 = False
        elif isinstance(quant_config, W4AFp8Config):
            self.use_w4afp8 = True
            self.use_fp8_w8a8 = False
            self.use_block_quant = False
        else:
            self.use_w4afp8 = False
            self.use_fp8_w8a8 = False
            self.use_block_quant = False

        self.deepep_mode = get_deepep_mode()

        # NOTE(review): if the cutedsl runner backend is active while
        # quant_config is None, `self.quant_config.get_name()` below raises
        # AttributeError — presumably that combination cannot occur; confirm.
        if (
            self.deepep_mode.enable_low_latency()
            and not _is_npu
            and not _is_hip
            and not (
                get_moe_runner_backend().is_flashinfer_cutedsl()
                and self.quant_config.get_name() == "modelopt_fp4"
            )
        ):
            # AMD HIP, NPU supports low_latency deepep without deepgemm
            # NV FP4 quantization with flashinfer_cutedsl also supports low_latency deepep without deepgemm
            assert (
                deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
            ), f"DeepEP {self.deepep_mode} mode requires deep_gemm"
        if _use_aiter:
            # expert_mask is of size (self.num_local_experts + 1),
            # the extra 1 is for invalid rank_id (in original deepep, the invalid rank_id is -1, but aiter does not allow -1, we use a mask to make those ids invalid)
            # for instance, if we have 4 experts on this rank, we would have a expert_mask like:
            #     self.expert_mask = [1, 1, 1, 1, 0]
            # idx from 0-3 is valid and will be processed, while idx == 4 will be masked out
            self.expert_mask = torch.zeros(
                (self.num_local_experts + 1),
                device=torch.cuda.current_device(),
                dtype=torch.int,
            )
            # the last one is invalid rank_id
            self.expert_mask[:-1] = 1

    def forward(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
    ):
        """Entry point; routes to the piecewise-CUDA-graph impl when capturing."""
        if is_in_piecewise_cuda_graph():
            assert TopKOutputChecker.format_is_standard(
                topk_output
            ), "Only standard topk output is supported for piecewise cuda graph"
            return moe_forward_piecewise_cuda_graph_impl(
                hidden_states,
                topk_output.topk_weights,
                topk_output.topk_ids,
                topk_output.router_logits,
                self.layer_id,
            )
        else:
            return self.forward_impl(hidden_states, topk_output)

    def forward_impl(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
    ):
        """dispatch -> expert compute -> combine; or parent impl when deprecated."""
        if self.deprecate_flag:
            return super().forward_impl(
                hidden_states,
                topk_output,
            )

        # TODO: can we call super().forward here?
        dispatch_output = self.dispatcher.dispatch(
            hidden_states=hidden_states, topk_output=topk_output
        )
        combine_input = self.run_moe_core(dispatch_output)
        hidden_states = self.dispatcher.combine(
            combine_input=combine_input,
        )

        return hidden_states

    def dispatch(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
    ):
        """Thin passthrough to the dispatcher (used by overlap schedulers)."""
        return self.dispatcher.dispatch(
            hidden_states=hidden_states,
            topk_output=topk_output,
        )

    def run_moe_core(
        self,
        dispatch_output: DispatchOutput,
    ):
        """Run the expert computation for a dispatch output and wrap the result
        in the matching combine-input type (normal vs low-latency)."""
        if self.deprecate_flag:
            return super().run_moe_core(
                dispatch_output,
            )

        from sglang.srt.layers.moe.token_dispatcher import DispatchOutputChecker

        if _use_aiter:
            assert DispatchOutputChecker.format_is_deepep(dispatch_output)
            # in forward_aiter, we skip token permutation and unpermutation, which have been fused inside aiter kernel
            output = self.forward_aiter(dispatch_output)
        elif _is_npu:
            assert DispatchOutputChecker.format_is_deepep(dispatch_output)
            output = self.forward_npu(dispatch_output)
        elif DispatchOutputChecker.format_is_deepep_normal(dispatch_output):
            if self.use_w4afp8:
                output = self.forward_cutlass_w4afp8(dispatch_output)
            else:
                assert False, "forward_deepgemm_contiguous is deprecated"
        elif DispatchOutputChecker.format_is_deepep_ll(dispatch_output):
            if (
                get_moe_runner_backend().is_flashinfer_cutedsl()
                and self.quant_config.get_name() == "modelopt_fp4"
            ):
                output = self.forward_flashinfer_cutedsl(dispatch_output)
            elif self.use_w4afp8:
                output = self.forward_cutlass_w4afp8_masked(dispatch_output)
            else:
                assert False, "forward_deepgemm_masked is deprecated"

        combine_input_wrapper = (
            DeepEPNormalCombineInput
            if DispatchOutputChecker.format_is_deepep_normal(dispatch_output)
            else DeepEPLLCombineInput
        )

        return combine_input_wrapper(
            hidden_states=output,
            topk_ids=dispatch_output.topk_ids,
            topk_weights=dispatch_output.topk_weights,
        )

    def combine(
        self,
        hidden_states: torch.Tensor,
        topk_ids: torch.Tensor,
        topk_weights: torch.Tensor,
        overlap_args: Optional[Dict[str, Any]] = None,
    ):
        """Thin passthrough to the dispatcher's combine (overlap-scheduler path)."""
        return self.dispatcher.combine(
            hidden_states=hidden_states,
            topk_ids=topk_ids,
            topk_weights=topk_weights,
            overlap_args=overlap_args,
        )

    def forward_aiter(
        self,
        dispatch_output: Union[DeepEPNormalDispatchOutput, DeepEPLLDispatchOutput],
    ):
        """AMD aiter fused-MoE path (permutation fused inside the kernel)."""
        hidden_states, topk_ids, topk_weights = (
            dispatch_output.hidden_states,
            dispatch_output.topk_ids,
            dispatch_output.topk_weights,
        )

        if hidden_states.shape[0] == 0:
            return hidden_states

        # in original deepep, idx == -1 meaning invalid and will not be processed.
        # aiter does not accept -1, we use a expert mask to make these idx invalid
        # (idx == num_local_experts) meaning not used in aiter fused_moe
        topk_ids_copy = topk_ids.to(torch.int32)
        topk_ids_copy[topk_ids_copy == -1] = self.num_local_experts

        return fused_moe(
            hidden_states,
            self.w13_weight,
            self.w2_weight,
            topk_weights,
            topk_ids_copy,
            w1_scale=self.w13_weight_scale_inv,
            w2_scale=self.w2_weight_scale_inv,
            quant_type=QuantType.per_128x128,
            activation=(
                ActivationType.Silu
                if self.moe_runner_config.activation == "silu"
                else ActivationType.Gelu
            ),
            expert_mask=self.expert_mask,
        )

    def forward_flashinfer_cutedsl(
        self,
        dispatch_output: DeepEPLLDispatchOutput,
    ):
        """FlashInfer CuteDSL path for modelopt_fp4 on the low-latency format."""
        hidden_states, hidden_states_scale, _, _, masked_m, _ = dispatch_output
        assert self.quant_method is not None
        assert self.moe_runner_config.activation == "silu"

        output = self.quant_method.apply_without_routing_weights(
            layer=self,
            x=(hidden_states, hidden_states_scale),
            masked_m=masked_m,
            moe_runner_config=self.moe_runner_config,
        )
        return output

    def forward_cutlass_w4afp8(
        self,
        dispatch_output: DeepEPNormalDispatchOutput,
    ):
        """Cutlass W4A-FP8 path for the DeepEP normal (contiguous) format."""
        assert self.moe_runner_config.activation == "silu"
        assert isinstance(self.quant_method, W4AFp8MoEMethod)
        return self.quant_method.apply_deepep_normal(
            layer=self,
            dispatch_output=dispatch_output,
        )

    def forward_cutlass_w4afp8_masked(
        self,
        dispatch_output: DeepEPLLDispatchOutput,
    ):
        """Cutlass W4A-FP8 path for the DeepEP low-latency (masked) format."""
        assert self.moe_runner_config.activation == "silu"
        assert isinstance(self.quant_method, W4AFp8MoEMethod)
        # W4AFP8 consumes bf16 activations; FP8 dispatch would double-quantize.
        assert (
            envs.SGLANG_DEEPEP_BF16_DISPATCH.get()
        ), "W4AFP8 does not support FP8 dispatch; please set SGLANG_DEEPEP_BF16_DISPATCH=1."
        return self.quant_method.apply_deepep_ll(
            layer=self,
            dispatch_output=dispatch_output,
        )

    def forward_npu(
        self,
        dispatch_output: Union[DeepEPNormalDispatchOutput, DeepEPLLDispatchOutput],
    ):
        """Ascend NPU path for both normal and low-latency DeepEP formats."""
        assert self.quant_method is not None
        assert self.moe_runner_config.activation == "silu"

        from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
            npu_fused_moe_without_routing_weights_bf16,
        )
        from sglang.srt.layers.moe.token_dispatcher import DispatchOutputChecker

        # NOTE: Ascend's Dispatch & Combine does not support FP16
        output_dtype = torch.bfloat16
        group_list_type = 1

        if DispatchOutputChecker.format_is_deepep_normal(dispatch_output):
            if TYPE_CHECKING:
                assert isinstance(dispatch_output, DeepEPNormalDispatchOutput)
            hidden_states, hidden_states_scale, _, _, num_recv_tokens_per_expert = (
                dispatch_output
            )

            # Per-expert token counts drive the grouped matmul (group_list).
            group_list = torch.tensor(
                num_recv_tokens_per_expert,
                dtype=torch.int64,
                device=hidden_states.device,
            )

            if self.w13_weight.dtype == torch.bfloat16:
                hidden_states = npu_fused_moe_without_routing_weights_bf16(
                    self, hidden_states, group_list_type, group_list, output_dtype
                )
            else:
                # Quantized weights: dynamically quantize activations unless
                # the quant method brings its own activation handling.
                input_quant = get_bool_env_var("DEEP_NORMAL_MODE_USE_INT8_QUANT")
                if not input_quant and not isinstance(
                    self.quant_method,
                    (
                        NPUCompressedTensorsW4A16Int4DynamicMoE,
                        CompressedTensorsFusedMoEMethod,
                    ),
                ):
                    hidden_states, hidden_states_scale = torch_npu.npu_dynamic_quant(
                        hidden_states
                    )
                hidden_states = self.quant_method.apply_without_routing_weights(
                    self,
                    hidden_states,
                    hidden_states_scale,
                    group_list_type,
                    group_list,
                    output_dtype,
                )
        elif DispatchOutputChecker.format_is_deepep_ll(dispatch_output):
            if TYPE_CHECKING:
                assert isinstance(dispatch_output, DeepEPLLDispatchOutput)
            (
                hidden_states,
                hidden_states_scale,
                topk_ids,
                topk_weights,
                group_list,
                _,
            ) = dispatch_output

            group_list = group_list.to(torch.int64)

            if self.w13_weight.dtype == torch.bfloat16:
                hidden_states = npu_fused_moe_without_routing_weights_bf16(
                    self, hidden_states, group_list_type, group_list, output_dtype
                )
            else:
                hidden_states = self.quant_method.apply_without_routing_weights(
                    self,
                    hidden_states,
                    hidden_states_scale,
                    group_list_type,
                    group_list,
                    output_dtype,
                )
        else:
            raise ValueError(f"Not Supported DeepEP format {dispatch_output.format}")

        return hidden_states
+
+
class NpuFuseEPMoE(DeepEPMoE):
    """Ascend NPU fused-EP MoE: the dispatcher performs dispatch, grouped
    GEMMs and combine in a single fused call, with NPU-specific weight
    layout transformations applied after loading."""

    def __init__(
        self,
        num_experts: int,
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        layer_id: int,
        num_fused_shared_experts: int = 0,
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        activation: str = "silu",
        routed_scaling_factor: Optional[float] = None,
        **kwargs,
    ):
        super().__init__(
            num_experts=num_experts,
            top_k=top_k,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            layer_id=layer_id,
            num_fused_shared_experts=num_fused_shared_experts,
            params_dtype=params_dtype,
            quant_config=quant_config,
            prefix=prefix,
            activation=activation,
            routed_scaling_factor=routed_scaling_factor,
            **kwargs,
        )

        # Replace the quant method's weight post-processing with this class's
        # NPU-specific reshaping/permutation (applied after checkpoint load).
        self.quant_method.process_weights_after_loading = (
            self._process_weights_after_loading
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
        forward_shared_experts=None,
        alt_stream=None,
        disable_sbo=False,
    ):
        """Single fused dispatcher call; weights are passed to the dispatcher
        which runs the full MoE internally.

        NOTE(review): the result attribute is `.hidden_state` (singular) —
        assumed to match the fused dispatcher's output struct; confirm.
        """
        return self.dispatcher.dispatch(
            hidden_states=hidden_states,
            topk_output=topk_output,
            gmm1_permuted_weight=self.w13_weight,
            gmm1_permuted_weight_scale=self.w13_weight_scale,
            gmm2_weight=self.w2_weight,
            gmm2_weight_scale=self.w2_weight_scale,
        ).hidden_state

    def permute_w13_weight_scale(self, w: torch.Tensor, tile_n: int):
        """Interleave the last dim of the w13 scale into the tile layout the
        fused NPU GMM expects (swaps the 2-way gate/up split with the
        n//tile_n tiling); shape is preserved.

        Raises:
            ValueError: if tile_n is odd or the last dim is not divisible by it.
        """
        if tile_n % 2 != 0:
            raise ValueError(f"tile_n must be even, got {tile_n}")

        *dims, n = w.shape
        if n % tile_n != 0:
            raise ValueError(f"Last dimension {n} must be divisible by tile_n {tile_n}")

        w_reshaped = w.reshape(*dims, 2, n // tile_n, tile_n // 2)

        # Permute the last two dimensions.
        perm_order = list(range(len(dims))) + [-2, -3, -1]
        w_permuted = w_reshaped.permute(perm_order)

        return w_permuted.reshape(*dims, n)

    def reshape_w13_weight(self, weight: torch.Tensor, dim: int, chunk_size: int = 64):
        # Achieving greater computing power through reshape on Ascend.
        # Splits dim into (2, dim // (2*chunk_size), chunk_size) and swaps the
        # first two, interleaving gate/up chunks; overall shape is preserved.
        original_shape = weight.shape
        if dim < 0:
            dim += len(original_shape)

        if original_shape[dim] % (2 * chunk_size) != 0:
            raise ValueError(
                f"Dimension {dim} size {original_shape[dim]} must be divisible by {2 * chunk_size}"
            )

        new_shape = (
            *original_shape[:dim],
            2,
            original_shape[dim] // (2 * chunk_size),
            chunk_size,
            *original_shape[dim + 1 :],
        )

        weight = weight.view(new_shape)
        weight = weight.transpose(dim, dim + 1).contiguous()

        return weight.view(*original_shape[:dim], -1, *original_shape[dim + 1 :])

    def _process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Rearrange weights/scales into the fused-EP NPU layout after load.

        w13 is transposed and chunk-interleaved on CPU, then cast to the NPU
        internal format; scales are squeezed, permuted and stored as fp32.
        """
        cpu_w13 = layer.w13_weight.data.transpose(1, 2).cpu()
        layer.w13_weight.data = self.reshape_w13_weight(cpu_w13, -1).npu()
        layer.w13_weight.data = npu_format_cast(layer.w13_weight.data)

        layer.w2_weight.data = npu_format_cast(layer.w2_weight.data)

        w13_scale = layer.w13_weight_scale.data.squeeze(-1).contiguous()
        w13_scale = self.permute_w13_weight_scale(w13_scale, 128)
        layer.w13_weight_scale = torch.nn.Parameter(
            w13_scale.to(torch.float32), requires_grad=False
        )

        w2_scale = layer.w2_weight_scale.data.squeeze(-1).contiguous()
        layer.w2_weight_scale = torch.nn.Parameter(
            w2_scale.to(torch.float32), requires_grad=False
        )

        # Offsets only exist for some quant schemes; squeeze them when present.
        if hasattr(layer, "w13_weight_offset"):
            layer.w13_weight_offset = torch.nn.Parameter(
                layer.w13_weight_offset.data.squeeze(-1).contiguous(),
                requires_grad=False,
            )
        if hasattr(layer, "w2_weight_offset"):
            layer.w2_weight_offset = torch.nn.Parameter(
                layer.w2_weight_offset.data.squeeze(-1).contiguous(),
                requires_grad=False,
            )
+
+
class MoriEPMoE(DeepEPMoE):
    """Mori all-to-all EP MoE (AMD); requires the aiter fused-MoE kernels."""

    def __init__(
        self,
        num_experts: int,
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        layer_id: int,
        num_fused_shared_experts: int = 0,
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        activation: str = "silu",
        routed_scaling_factor: Optional[float] = None,
        **kwargs,
    ):
        super().__init__(
            num_experts=num_experts,
            top_k=top_k,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            layer_id=layer_id,
            num_fused_shared_experts=num_fused_shared_experts,
            params_dtype=params_dtype,
            quant_config=quant_config,
            prefix=prefix,
            activation=activation,
            routed_scaling_factor=routed_scaling_factor,
            **kwargs,
        )

        assert _use_aiter, "Mori need to be used together with aiter as of now"
        # Unlike DeepEPMoE's local mask, this mask is indexed by GLOBAL expert
        # id: 1 for experts owned by this EP rank, 0 elsewhere.
        self.expert_mask = torch.zeros(
            (self.num_experts),
            device=torch.cuda.current_device(),
            dtype=torch.int32,
        )
        expert_start_idx = self.moe_ep_rank * self.num_local_experts
        expert_end_idx = expert_start_idx + self.num_local_experts
        self.expert_mask[expert_start_idx:expert_end_idx] = 1

    def forward(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
    ):
        """dispatch -> aiter fused MoE -> combine; the combine result may be
        padded, so trim back to the caller's token count."""
        num_token = hidden_states.shape[0]
        dispatch_output = self.dispatcher.dispatch(
            hidden_states=hidden_states, topk_output=topk_output
        )
        combine_input = self.run_moe_core(dispatch_output)
        hidden_states = self.dispatcher.combine(
            combine_input=combine_input,
        )

        return hidden_states[:num_token]

    def run_moe_core(
        self,
        dispatch_output: DispatchOutput,
    ):
        """Run aiter fused_moe on the dispatched tokens and wrap the result in
        the matching Mori combine-input type."""
        # NOTE(review): `scale` is assigned but never used below.
        scale = None
        is_fp8_quant = isinstance(self.quant_method, Fp8MoEMethod)
        is_quark_w4a4 = hasattr(self, "scheme") and isinstance(
            self.scheme, QuarkW4A4MXFp4MoE
        )

        # NOTE(review): origin_topk_ids / origin_topk_weights are unpacked here
        # but the attributes are re-read from dispatch_output at the return.
        (
            dispatch_a1,
            dispatch_scale,
            dispatch_ids,
            dispatch_weights,
            dispatch_recv_token_num,
            origin_topk_ids,
            origin_topk_weights,
            output_dtype,
        ) = (
            dispatch_output.hidden_states,
            dispatch_output.hidden_states_scale,
            dispatch_output.topk_ids,
            dispatch_output.topk_weights,
            dispatch_output.num_recv_tokens_per_expert,
            dispatch_output.origin_topk_ids,
            dispatch_output.origin_topk_weights,
            dispatch_output.out_dtype,
        )

        w13_weight = self.w13_weight
        w2_weight = self.w2_weight

        w13_scale = None
        w2_scale = None

        quant_type = QuantType.No

        # Non-FP8 paths cannot consume quantized activations directly;
        # upscale them back to the output dtype first.
        if not is_fp8_quant and dispatch_scale is not None:
            dispatch_a1 = upscale(
                dispatch_a1, dispatch_scale, dispatch_recv_token_num, output_dtype
            )
            dispatch_scale = None

        if is_quark_w4a4:
            # MXFP4 weights: reinterpret packed storage when torch supports it.
            if hasattr(torch, "float4_e2m1fn_x2"):
                w13_weight = self.w13_weight.view(torch.float4_e2m1fn_x2)
                w2_weight = self.w2_weight.view(torch.float4_e2m1fn_x2)

            w13_scale = self.w13_weight_scale
            w2_scale = self.w2_weight_scale
            quant_type = QuantType.per_1x32

            # Propagate the pre-shuffled layout marker to the viewed tensors.
            if hasattr(self.w13_weight, "is_shuffled"):
                w13_weight.is_shuffled = True
                w2_weight.is_shuffled = True
        elif is_fp8_quant:
            if hasattr(self, "w13_weight_scale_inv"):
                w13_scale = self.w13_weight_scale_inv
            if hasattr(self, "w2_weight_scale_inv"):
                w2_scale = self.w2_weight_scale_inv

            quant_type = QuantType.per_128x128

        # [KK TODO] should to call the apply of quant method to handle fused moe
        hidden_states = fused_moe(
            hidden_states=dispatch_a1,
            w1=w13_weight,
            w2=w2_weight,
            w1_scale=w13_scale,
            w2_scale=w2_scale,
            a1_scale=dispatch_scale,
            topk_weight=dispatch_weights,
            topk_ids=dispatch_ids,
            quant_type=quant_type,
            activation=(
                ActivationType.Silu
                if self.moe_runner_config.activation == "silu"
                else ActivationType.Gelu
            ),
            expert_mask=self.expert_mask,
            num_local_tokens=dispatch_recv_token_num,
            dtype=output_dtype,
        )

        from sglang.srt.layers.moe.token_dispatcher import DispatchOutputChecker

        combine_input_wrapper = (
            MoriEPNormalCombineInput
            if DispatchOutputChecker.format_is_deepep_normal(dispatch_output)
            else MoriEPLLCombineInput
        )

        return combine_input_wrapper(
            hidden_states=hidden_states,
            topk_ids=dispatch_output.origin_topk_ids,
            topk_weights=dispatch_output.origin_topk_weights,
        )
+
+
def get_moe_impl_class(quant_config: Optional[QuantizationConfig]):
    """Select the MoE layer implementation for the configured backends.

    A2A (all-to-all) backends take priority; otherwise the runner backend and
    quantization scheme decide, defaulting to the Triton FusedMoE.
    """
    # [TODO] kk, temporary solution
    a2a_backend = get_moe_a2a_backend()
    if a2a_backend.is_mori():
        return MoriEPMoE
    if a2a_backend.is_deepep() or a2a_backend.is_mooncake():
        return DeepEPMoE
    if a2a_backend.is_ascend_fuseep():
        return NpuFuseEPMoE

    runner_backend = get_moe_runner_backend()
    if runner_backend.is_flashinfer_trtllm():
        quant_name = quant_config.get_name() if quant_config is not None else None
        # NEW: Direct FP4 detection (bypasses EP requirements)
        # Check for FP4 quantization with TRTLLM flag, regardless of EP
        # FlashInferFP4MoE must be paired with ModelOptNvFp4FusedMoEMethod.
        if quant_name == "modelopt_fp4":
            from sglang.srt.layers.moe.fused_moe_triton.layer import FlashInferFP4MoE

            return FlashInferFP4MoE
        if quant_name in (None, "fp8", "modelopt_fp8", "compressed_tensors"):
            # FlashInferFusedMoE support bf16, fp8 and compressed_tensors
            return FusedMoE

    if runner_backend.is_flashinfer_cutlass():
        return FusedMoE
    return FusedMoE
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__init__.py b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..be3ed3af41218dda60ac39b9cd9b1882e7a62712
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__init__.py
@@ -0,0 +1,42 @@
+from contextlib import contextmanager
+from typing import Any, Dict, Optional
+
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_config import (
+ get_config_file_name,
+ try_get_optimal_moe_config,
+)
+from sglang.srt.layers.moe.fused_moe_triton.layer import (
+ FusedMoE,
+ FusedMoeWeightScaleSupported,
+)
+from sglang.srt.layers.moe.fused_moe_triton.moe_align_block_size import (
+ moe_align_block_size,
+)
+
# Module-level override for the fused-MoE kernel configuration; None means
# "use the tuned/default config".
_config: Optional[Dict[str, Any]] = None


@contextmanager
def override_config(config):
    """Temporarily override the global fused-MoE kernel config.

    The previous config is restored on exit even if the body raises (the
    original implementation skipped restoration on exceptions because the
    reset ran after ``yield`` without a try/finally).
    """
    global _config
    old_config = _config
    _config = config
    try:
        yield
    finally:
        _config = old_config


def get_config() -> Optional[Dict[str, Any]]:
    """Return the active override config, or None if no override is set."""
    return _config
+
+
# Public API of the fused_moe_triton package.
__all__ = [
    "FusedMoE",
    "FusedMoeWeightScaleSupported",
    "override_config",
    "get_config",
    "fused_experts",
    "get_config_file_name",
    "moe_align_block_size",
    "try_get_optimal_moe_config",
]
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc841f134d6b3b8e23c0b881d70bd722e3b9a913
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/fused_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/fused_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7dab2d908ebb5850b94cf385d972bc5d6c922944
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/fused_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/fused_moe_triton_config.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/fused_moe_triton_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55633ffed3eaa40ea45726ea3a59130a2d867e38
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/fused_moe_triton_config.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/fused_moe_triton_kernels.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/fused_moe_triton_kernels.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae6b4551ad9a479053c02a6c9bc5a72f7ad3781a
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/fused_moe_triton_kernels.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/layer.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/layer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58297e8bcf942accbcbb3f382f344e1cae41dc5e
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/layer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/moe_align_block_size.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/moe_align_block_size.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8dd04c8c0a21569344f057dcc2c22108813fe9ee
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/__pycache__/moe_align_block_size.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/README.md b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2cec1f5cd980712eb7b1e3892c0f30e86f0c6d56
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/README.md
@@ -0,0 +1,40 @@
+# Fused MoE Triton Kernel Configurations
+
+This directory contains tuned configurations for different settings of the fused_moe kernel.
+
+## Configuration Parameters
+
+Each configuration file is generated based on the following parameters:
+
+- **E** (number of experts): Total number of experts in the MoE layer
+- **N** (intermediate size): The intermediate/hidden dimension size
+ - For Tensor Parallelism (TP): `N = original_intermediate_size / tp_size`
+ - Example: Mixtral has N = 14336. For TP=2, N = 7168; for TP=4, N = 3584
+- **device_name**: GPU device name from `torch.cuda.get_device_name()`
+ - Examples: `NVIDIA_H100_80GB_HBM3`, `NVIDIA_A100-SXM4-80GB`, `NVIDIA_GeForce_RTX_4090`
+- **dtype**: Data type for computation
+ - Supported types: `fp8_w8a8`, `int8_w8a8`, `int8_w8a16`, `int4_w4a16`, etc.
+ - Determines precision and quantization scheme for weights and activations
+- **block_shape**: Block quantization shape (for DeepSeek V3/R1 models)
+ - Defines granularity for block-wise quantization, specified as `[block_n, block_k]`
+ - Example: DeepSeek V3 commonly uses `[128, 128]` for efficient block-wise FP8 quantization
+- **tp_size**: Tensor Parallelism size (affects N parameter)
+- **ep_size**: Expert Parallelism size (affects E parameter when EP is enabled)
+- **per_channel_quant**: Whether per-channel quantization is used
+
+## Configuration File Format
+
+Each JSON file contains a mapping from **M** (batch size) to the optimal kernel configuration for that batch size. The configuration includes parameters like `BLOCK_SIZE_M`, `BLOCK_SIZE_N`, `BLOCK_SIZE_K`, `GROUP_SIZE_M`, number of warps (`num_warps`), and pipeline stages (`num_stages`).
+
+**Filename Format**:
+```
+E={E},N={N},device_name={device_name}[,dtype={dtype}][,block_shape={block_shape}][,per_channel_quant={bool}].json
+```
+The `dtype` component is omitted for the default (unquantized) data type, as in `E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json`.
+
+## Generating Configuration Files
+
+To generate new configuration files for your specific hardware and model settings, use the tuning tools:
+
+**📖 Full Documentation**: [Tuning Triton MoE Kernels](https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton)
+
+After tuning, move the generated JSON files to this directory to use them in SGLang.
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..56c1a4e3af0b4a93fff71028d8e04bf73f0abb29
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..21924c7e7f0b6e60990cec415207c6812fa4e59b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..598993c61187122602028708f1573f95da39aa4e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..a7c626e9b00b7cd7e2e589028218f65de427a599
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..14bf13935af2c07944455096d8e37aaca74b2871
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..29972c5f3606ecac86c43fcb124f375fd470ff81
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..082fba99b47da011ff858c5d7cab242c1cb1dac0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..dbed867695f564dfbbb51a99e41aa2de3f7b379e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..b89336a26884e14bec7bfa27f682db7c49bc7eda
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..39bc5b9e520c35d82833c8693648a76dca3d60a9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..35d67f83714ee20d13f6790225dc419d9d5a2b5b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..587fb2f2e3e06aead98a3171af31fd83d4bd75de
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..089894816b7e5db7865191b9b0c14ecd51ef1687
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..9814a3819d9d39f29d7cf127dacd0213b8e022f8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..9262a74a4a0e1e3789f260a3ef7f6cb9551f3f2b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..d251f9b5accaec977fc87a0999cd56ee387fc650
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ecf814a28a9441e89f892eb3d63dcf8dcb0dd97
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..6443c5cd016fe570cde97b311c55b9a80a84b615
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fe9aad9617cbf4b7882ec30a4c19d82bca3375a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b48fe26dacf21e4ba43b89d6bd85a573d5de1d9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..5ff4a1600cd3c5a0c5b3389c7b6032b4e9bf8082
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..7e09be1fff77e58d49abb39d88815c28e8d2c080
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..039a10ed127b77836a7f41c03513292613852b30
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..3793fcafee60bc7e8f5f12d601cb3192abfa9ca8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..f1f626667353e5955d38fc8e49cf328df97cacdc
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c809a096651edd68141e6cd966981b245fdb816
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a9e2ef0f1470b0b8cbad1b5695ad10a129ab877
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,130 @@
+{
+ "3328": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "768": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1792": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2560": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2816": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3584": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3840": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1280": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2304": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..1bfd02a4104654c57b95e64339e731eed22276de
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..0bb90b7e4754a4eb967328761bb528fbe234d0ca
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "5120": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "9216": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "13312": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "17408": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "25600": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "33792": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "41984": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "50176": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "58368": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..033ba506d7161272015b759dc5dc0a47a38093d6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,130 @@
+{
+ "3840": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1792": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3584": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2816": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1280": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "768": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3328": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2560": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2304": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..14723c5763b4b6dc3964fcaa1f4bf85fee287735
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e9d27b1c40734a7ddffc1c0371020a3453278f9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ac5ba746c55b1d7ee4e80f7a4ac2403949f566f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..c87ca0de8a36cd7996dde83a4d0b333ffec895c2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,130 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1792": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3328": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2560": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "768": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2816": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2304": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1280": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3840": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3584": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..09d812937487ace048db173b9d2da8c07222bbba
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..d471f98216372f387bc79fe7a74ae7ea9b11e7b7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe8dab46e10abe6c21c1b60de20dc9012e5ffac2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..1cd253d118e71d636a00c883ac452416106724ee
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..9acb73b027afa5d6286e772ff79ddc03c82fe256
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..ddd55abe086cd87ba385da0a11c1c2b8c0cf5c92
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ce62139bede3595e8a6e536ea9683925831eb0d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..2e692a1583a4ad95f020634d55f7f4e327be5da0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4e7a2a1a03c4075c824d67b4d8217c97a423826c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..857d11e488917b22dabd44f58de013bf61f754c6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..90facc4a8669292fbd3da3247cc11a3ec938ce23
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e25d0492d493f4ffee8880ad9ca02fbbe9b49011
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e25d0492d493f4ffee8880ad9ca02fbbe9b49011
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5f32e830a6b64feffa9ba7b8fbd70a698f356b9b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..2840e9f4727b0e2de816fda4d4604c379d4fc6a6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c6c98c28e83d8840521103afe69278565e453c7a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6fcf408755f5d3e28ff636472f89e4fa52c8da22
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..4dcfefa5b9f7aa7066ab4608203776970bc38d5d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac9349fef6a0472b85231364f709320e5b971fae
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..b99349fb86243e9967ca7dc1e57962e6dda7ae52
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e9c5b9251eba7abddcdd6bdd64d114ad4e502509
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cc6c643f236d2f7f9ad29354d9e469d00b20d3f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..283ffd8ff1df6c8d105cbd2b2a1a968abd716e41
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..e2f8164cc6b56db5b287dd71ec77ca6bf0dcef52
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef6a0479cbd77c90db902b0571f8143caf3afc92
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..67bf2b720fecde3738451208036a004f04df63f9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..da71451a78ba4b378cabab3d11567b5d33c68a1d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..14c6f8c1a35af4dd6352cba24594042ad924822c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..2b974a78d39149cc51d5775cfe8be215129d2256
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..054873f38fc370b0eea9b257aff55cc2d5ecb1de
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..869f7512771eb6ad6b551ae52ad24c36492d4c14
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..40d85e43e8735ee37e152c5fa665e90a7938dc93
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ebd7a7816e26fd9e840618fd4f557d85559df64
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..90416662e49ec12670668887b88bd1f6777bcea2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..3483f05a2f7b89639a83a332629aef8caed301a9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..b2799ed3a866e25b78d60d92910c000ebb21ff71
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..8a18afe7d6d2c14c7db032a29f02d42b7ff64273
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1ec0b3cee722ec3ad13f54bca36d2c95ca0d767
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..96437b5ba2c221292869567ebc62974e8cbb36d6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e60506948d828443d10e1a38c7b38eef35acfcc
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..824009a59698df783476b86e189dbc49a329eee8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf2aa0d5f4dcf3b3886d066b6ba343edfb448076
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d4b752fa5d64a68a057ec39baede59fc86362dd
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 32,
+ "kpack": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d4b752fa5d64a68a057ec39baede59fc86362dd
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 32,
+ "kpack": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d4b752fa5d64a68a057ec39baede59fc86362dd
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 32,
+ "kpack": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..3f3ccdafa88f3452a695efad4cb9622d6ae79e6a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,138 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..62fd21136d068a74c01707b657aa4e3df40340ea
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..da40db132805757929a1d92f5aa3feacf3c43eae
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 0000000000000000000000000000000000000000..a218fc40642c1a8e018027e9d7856e71e197309a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json
new file mode 100644
index 0000000000000000000000000000000000000000..a218fc40642c1a8e018027e9d7856e71e197309a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json
new file mode 100644
index 0000000000000000000000000000000000000000..a218fc40642c1a8e018027e9d7856e71e197309a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..f4c0f8417b384870050a95e0cf57edbdf6352b23
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..5c8185cfdeec167ec4b88de51b4b395e28769cc5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..97c9f4445b166657ad29f1db9fc8281f9c463ec4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..9d0db1cdc20e13eddf8a9d651540134eb786fa98
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c906494acedfb35f6bd1751ec44bedabd7ae3a1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..0bb423b28f5ab3825929a4870b96393262a9dd9f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..55571873395464a3b58f549523905f439a8f1716
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..26bcbf26970c7a77c99e2c8eacd83eefa86967bf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..430f50090f4aef940a40f2bdf1fe94c3a8c358e3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..c51565f67b4e94c28e9af0072ae5b8864144fb4b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 0000000000000000000000000000000000000000..3682cc548f3525b6095931994359bcbdd7aa9f10
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 32,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json
new file mode 100644
index 0000000000000000000000000000000000000000..3682cc548f3525b6095931994359bcbdd7aa9f10
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 32,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json
new file mode 100644
index 0000000000000000000000000000000000000000..3682cc548f3525b6095931994359bcbdd7aa9f10
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 32,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..b41f9d443e50678334f906b44fce6d018d69500e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..edf2a38d12ad3f420f232d2cd61ab149ad138725
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..7532826a6509cdb096f8bd8d0f856ef2103c138d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..673bae2ba8ef80ed4d4930739ca7daf0e8f28ee1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..b2100cebb7f589747430be9ca8c8db368c152d78
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..449a1338428457e82eab654b3507c48f9503c7ae
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..d317994cb7140e3defd7cd10b78f49dda6548c4f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json
new file mode 100644
index 0000000000000000000000000000000000000000..6499c8586336f7876324a9f6ae68bdb753daf129
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json
@@ -0,0 +1,173 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 2,
+ "num_warps": 4,
+ "num_ctas": 1,
+ "num_stages": 7
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 128,
+ "num_warps": 2,
+ "num_ctas": 1,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_ctas": 1,
+ "num_stages": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_ctas": 1,
+ "num_stages": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 2,
+ "num_warps": 4,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 2,
+ "num_warps": 4,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 2,
+ "num_warps": 4,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 8,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 8,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "192": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 8,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 16,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 128,
+ "num_warps": 2,
+ "num_ctas": 1,
+ "num_stages": 8
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_ctas": 1,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 16,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 16,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "6144": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_ctas": 1,
+ "num_stages": 2
+ },
+ "8192": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 16,
+ "num_ctas": 1,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7f14d6656eb539aa41444be0436a149fbdee546
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,178 @@
+{
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8192": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16384": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32768": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "65536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "131072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7f14d6656eb539aa41444be0436a149fbdee546
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
@@ -0,0 +1,178 @@
+{
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8192": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16384": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32768": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "65536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "131072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7f14d6656eb539aa41444be0436a149fbdee546
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
@@ -0,0 +1,178 @@
+{
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8192": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16384": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32768": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "65536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "131072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..dbc624731f5cb9afcdc9213183d00d1e5edd4a00
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc614e635ea57327c610ce79e99ae5339614f22e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..32c0c9da471cbe479044095e0ed14a0f54b73620
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..72c3f560be967d5690ceb3862989d8339f6d0c8f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..dd07b3f6ee0240479b53655cc8b1dae0f978f6e6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 0000000000000000000000000000000000000000..21742854c613f0d0141c7dc42ed1cb1e1a283255
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 32,
+ "kpack": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json
new file mode 100644
index 0000000000000000000000000000000000000000..21742854c613f0d0141c7dc42ed1cb1e1a283255
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 32,
+ "kpack": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json
new file mode 100644
index 0000000000000000000000000000000000000000..21742854c613f0d0141c7dc42ed1cb1e1a283255
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json
@@ -0,0 +1,200 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 32,
+ "kpack": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..f578c8d0160ac3ef85b53c8539d3675455a97173
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..918f6839620cbab1f30b0f9383a9129c2cf2cf3d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..e341a67917d5177bacb3f6767e7b6d92539826ad
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..a841518ca67103986415d304368512c98b1dd4fd
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..13cc2cee1d2503d2e0e5f7330d69948add8457a1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..d9d2f5eac52ff84771e272625b607c67ea037191
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,175 @@
+{
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8192": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16384": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32768": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "65536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "131072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..d9d2f5eac52ff84771e272625b607c67ea037191
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
@@ -0,0 +1,175 @@
+{
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8192": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16384": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32768": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "65536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "131072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..d9d2f5eac52ff84771e272625b607c67ea037191
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
@@ -0,0 +1,175 @@
+{
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 4,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ },
+ "8192": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "16384": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "32768": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 0,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "65536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2,
+ "waves_per_eu": 1,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 1
+ },
+ "131072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 2,
+ "matrix_instr_nonkdim": 16,
+ "kpack": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..34b916e574f88c65db1dac5889d74a990dc25e9b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..b50cfc13da0d7119627b7b0fdb63ef65ae80d284
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..952b750aa7884d4e7f76956a88cb9ae3e959b1e9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..5de5605d401c2e84b42134e4a3ed7e5811a8ffe3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json
new file mode 100644
index 0000000000000000000000000000000000000000..2221e99cd1adccee247d3bc3f221e47210c9cbaf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..74374c573f3fcb5d407b92fcb64de2a9d640f079
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..663f32539974cafbc8baadda05c118d12641437d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b34b6e4e8a8e7985384acc0b88975a9cb30384b1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab169a0183ddc11ace79bc480aefd7db154bea67
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b11acb1eec24ccef85c92401e499a7a8306067f0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab6e15552909b795ad63eff23c3161fd29c7b824
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..249359fb93d77432712a11f83e4cde87d8a8005f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d3fcf75cee1273024e620599138f56f0e5b46a4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..9259aa780e741b055a691d685d3eea8ceae3fda4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b4efc9b7e44ceca6da12658441d1303c71ae925b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json
new file mode 100644
index 0000000000000000000000000000000000000000..03dfc73b6c0a1157baeba25098b00e7a87cd3559
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e7f9143f58f9fcc76b8fd648df2574915b320fca
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..beaac7f641e442734102dfadb36dce4083dec392
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json
new file mode 100644
index 0000000000000000000000000000000000000000..ebff99e26dc7fac0a3e4007593bd3821dbd65a6b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff3fc2d0191b0e18ec1847eeca037b1ecb3a5b84
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..550d314f0652a9ddc0106721b827c3d207f1a353
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..763a1657fab52ca11deadde2620c137a352a4db6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..63ba315b0b0fe0781e7d3184d65ec45031851986
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..42d00fe60b8a30ce0e570d7e48d808c9411936c5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..739979fad796e5d1888df1b69296fa5d7d2dc2c4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..0762b9084c7e2eb8ef22c1c191818406542fa001
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..29b5aab37262c9ea9c4f4d3ccf5dff58b0211f72
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb5b9b741e1aaee491d4f663aa6ca828ea5de8ab
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..741060e3227692a09886be2dee6944134ff846c3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..f3f8f7198bd26f35f9014cd6f5633504065dcd06
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..52b459d2e8d0243c58335fb5d6e3db6421e2fa68
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..da6cf2a048b389facdd388918d195f350ff5b956
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e078fd08349032bc6b2fa7e6057cb297b90d344e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5ed7ad20fc6bfc4d65b0d37839eb23de80c557e9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..b00455054f1561cd08a2cf90cd1c5ca72be6c239
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8f336e26425502a5ce174612c51b3338fe566c2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..e341a67917d5177bacb3f6767e7b6d92539826ad
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fa444bca150ad482f34a017f0b24059a5c713f6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8fd97b5e416b794e76c341bf42e9087b25049c4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c4751c00fe3cb3930a295e97f46cbfefd0028181
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6d6af0808476982f61f6da3f742fb5d6c1b339ee
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..379708af4e295da53b80fffed653e0c20fdf83b1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f3648ca8ff527b3ec1bbe58701b720de18965b3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..adeaacb0efc6bb52d2cd0f136cd31e3c1b67c4d3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 0000000000000000000000000000000000000000..a3022a0545894ad124588c4955a9936d12bd715e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..c3c6e0ac051ec5ad1709b3c369241e36210ad5a3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..1eba1e4b8c063a3d63ab96be709cf7fd58f66473
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..8e966888979911b0b7c2aa04f1921179552b2237
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..3bac45f89aab8e4b463fba069a6be597750519d0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c9c8bb66e0e97b3555db9c95cb45bc8c91cb6ae3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..68f6fb5aca9b97952d0b9d3288f0bff6349fb70d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..8ddf8beae2cdafdb19c989cf73d4a6848a5eaf18
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..532c16e899269ebf6c02340149737b7ba8bffb4b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5ee3d4d35d5586de2064b8c912dc44356587592b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..358873315860462f055617b34d97bce652e6f574
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..02a1195089cb64cb8ad2c6ae95ef2a6f3080cc4d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..faf1aa4d4ce0135a9a5f04fb6051b4cfba318d5c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b5c45dd7231e7ee723ebffc4546c76947187480f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..6f586d1facee67cec12dd726f066dd38386cc1a1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea50fa9ec5e70e738594e8ec4817fc29260742a5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=1856,device_name=NVIDIA_L40S.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=1856,device_name=NVIDIA_L40S.json
new file mode 100644
index 0000000000000000000000000000000000000000..fbf1adb131df6f9c74ff3d299954c23e67df29cb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=1856,device_name=NVIDIA_L40S.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..3862570f4a5f035eef7c13fb1db406c057fec7f6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..d6d4a49044c492acce4b26950d650b817b22ef13
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c848ea5776e71a942a89595f338ce4e5e913e575
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=928,device_name=NVIDIA_L40S.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=928,device_name=NVIDIA_L40S.json
new file mode 100644
index 0000000000000000000000000000000000000000..b3ccd09bb437719e1edae8c9a581ded04e17a6fb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=928,device_name=NVIDIA_L40S.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..41d97b17b56a37a06755e5c30afcdb09f0ac5dbf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8fd97b5e416b794e76c341bf42e9087b25049c4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..b962d19506ce5896ef71d0d260c57575c796f1e3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=160,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=160,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f0bb3e9a994f9dbe05332401a8c5f38b1a46475
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=160,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..4dd4e49ffc3c37fe0855ca5bf66c52c81acb6a1a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8fd97b5e416b794e76c341bf42e9087b25049c4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4e36c1544df769137b9dd67843f0bfa3593903ec
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..d0c57c185aa8d3d68c86c6470defee273bc223e8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..8e49def8da61597c60eeea081d8db7d8cf841726
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..500e5f0e28c9343d04797d7ed361f49917eed2a6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json
new file mode 100644
index 0000000000000000000000000000000000000000..786f367898ef67fd4b67df4a674afeefd6616bc8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a6c635be47ecb2a56a711bb1452216cc0759e5b9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..f05e32900c50d34f247da43854456b9c671a9642
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d674e9ebb97ace9dcc3b182bb02cad0ca233955
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": false
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5,
+ "USE_TMA": false
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": false
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": false
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4,
+ "USE_TMA": false
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": false
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5,
+ "USE_TMA": false
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4,
+ "USE_TMA": false
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": false
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": false
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": false
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc8d6d68b6601fcc3636f2600d86ca31c16d5683
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=128,device_name=,dtype=int4_w4a16.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=128,device_name=,dtype=int4_w4a16.json
new file mode 100644
index 0000000000000000000000000000000000000000..66313c12af2325f8565d10ab0eda3a8175ba4373
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=128,device_name=,dtype=int4_w4a16.json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=128,device_name=,dtype=int4_w4a16_down.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=128,device_name=,dtype=int4_w4a16_down.json
new file mode 100644
index 0000000000000000000000000000000000000000..66313c12af2325f8565d10ab0eda3a8175ba4373
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=128,device_name=,dtype=int4_w4a16_down.json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 1,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 32,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 2,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..486b94720a4c45a4d0d46adcab50dbbaa6e2e197
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..b8f35b62e2d04c77cc65c0ae47df9b082959eacf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 0000000000000000000000000000000000000000..039d5ade739909a1ce8e4be9629437021436af0b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..2cfedb390d4dc22d429882b15cb6c3eba45379d4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..01689145a445c28f9245ae8d3df10d8aa423e0e9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..b785658b30a3f05597a11a7df10b50e5afac524c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 0000000000000000000000000000000000000000..991b315f704591e240d655864891ae7af3b40f6a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..548688425ad84e48feb65bcd5e1b5c64edfe8762
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..64861b390c9907c612a759485ba5566bf925c46c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..04e8a5477798cd961aed65cd47817d87ca253d04
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1344,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1344,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..ce586e771f37dd0cfde0c213d792b46892417e37
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1344,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..de8f161020fa80c0ce204d6bb85a9eb0f3655eaf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1856,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1856,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b1a5ef987bdf4fc1d32be02730567eeb4ac9455
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1856,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..10dfbc9eadd079075c265f4ec756d9c92261f310
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=232,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=232,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a8ef8cb807addcb8629bf83ad473492099c8b3c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=232,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=232,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=232,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..11cbe842db85564f9d49a1fdd3ae006b7d1b550f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=232,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=2688,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=2688,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..548a2a7a08def1dc43d00948e5474ba4ba5b0752
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=2688,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..24a47bc596c7c58a769135d4a73f33098bbd62e3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=464,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=464,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..c039c01f3c93bbb1c1dad99f9a9998e732ea37dc
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=464,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=464,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..227961f871e2e2192f9085e2ec01b866fd4fa5ef
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=928,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=928,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..09cbbb3709ec707b6bdf1a16ef77d9218b0a8e33
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=928,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..bda8762a55931ecbda30b9e7a5c84a2320756af1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=1856,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=1856,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..091744d7f6193f189e1777d326a1eba56321b46f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=1856,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ac2651fc86cdb1f6013b425fa33b5bf5ae2cb3b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=2048,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=2048,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..f6da9701fc891cf3e973ca86ef72f63d9152b100
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=2048,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..66dd6874d97d155660f691703ee69cf81392e200
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H20,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H20,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..def2ae6c18569215e395d96d6df0d6808a8482b3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H20,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe2a7e76b9bfa055fa2bd533ca54744bb43685c2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..105a84ab3208cea1a8d3d3c74642f398d5e3e39c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ddf143032c98fbe5fdbf405057a52ad2a918cf5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..85f86a96eb1ccbdb82aefc789f5b4c0c710394eb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bc5c086840b3554c5367995861dbe00a3330fb0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..40cbdf70b5f03704a669c9f23baf0a2b0e5c3a4e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..c5c161ebd9f1e4c94d5705f34a8cd1e7108a429e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=20,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=20,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..262d0de8c8bfe083ed644633d09623d8f8109a7c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=20,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=20,N=1536,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=20,N=1536,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..43ab81c5ba615a967ce162aec6fb0da3530e6f76
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=20,N=1536,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=1344,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=1344,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..22b8427ea2331e56f7ddc6e53b687812a7faccd2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=1344,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..a7ca8326f153901b6d3ca7ca3d607b3a1217e1ca
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=2688,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=2688,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..54fff622a6bf0b6a6399483aa31b7efaa3d80b4f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=2688,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e8ca2d6f2d27f1cef473eb61139d28d19b17f09
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..09bb6b05201a2f137e5a35cc9f2e8f15adefe601
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..481f39f6c24bce8c8993451c8bba32a620f8ff0b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=672,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=672,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d9e06655cfb8d56d05557c2d57256d624f5130f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=672,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=672,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..709d423464de4693944a4d128e4fda0ba0ffce30
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..317aa18cd9b566630c90e2346290586fc33ec769
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cfd871fec26693651dd0fcfddaaa4b41eb04644
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a240dd111c62b983fb531ddc136658ad58fb89de
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128]_down.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
new file mode 100644
index 0000000000000000000000000000000000000000..a2f6db88bb8b3bf05a1a36846ba9f9a0529d9ed8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b9abbb58d9549c59e1e3faa5091642a13655df62
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,114 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
new file mode 100644
index 0000000000000000000000000000000000000000..f85600f64b35baf78e7e89679b6584f691ae2732
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
@@ -0,0 +1,128 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=1856,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=1856,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..be559b7af82ce0d37238f9d2ba8d74d31eb62112
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=1856,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf38baf80ce3ed65ba98ebdfdd649f1cf66eb822
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=928,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=928,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..cdd31e733923e6c757c8b58e7aba4806ecdf9119
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=928,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=928,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbb5c55efaeaaf7b11dfa6ca12bf080c83174aec
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=40,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=40,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 0000000000000000000000000000000000000000..7bd6db49ea3fddd1cdaaa4eb4e130377ff042059
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=40,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=128,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=128,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..f59f7b4e9ff148c42b07b4881b43a5c02d3c8f6d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=128,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=128,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=128,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b32d763e533cbfe834bcedf6d23701b65dba585
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=128,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=1344,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=1344,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..36d1234a767f376fca661d4d48727a649c3e8e81
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=1344,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..36042ea9731ae51ef7e85619bb8721d9db4c7442
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=256,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=256,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..649c5b6a4f245c96f005b4b350c90c5fb6f820df
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=256,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..60ea104ec6d568873fba67abbbb8b1cf9dab40a8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=256,device_name=NVIDIA_H200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=256,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a085b9a496661dc9a8440b9f2fe37b92fca5492
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=256,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=2688,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=2688,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..e00c8ea98d26ce1c5c9cd60a102bbb4eb6a81173
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=2688,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..1a3f5e2a7fc7fe668a6d0dbd03fa5a65eecd76c4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=336,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=336,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..bf262f0e8c7fc74be7cc4c3c5cae61c2ce173493
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=336,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=336,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=336,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..579a8c3916c5822691c96f1cdb8a3a227337a986
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=336,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=672,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=672,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..395ac8b0c1486f62b2de96f426569ac539963521
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=672,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=672,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..42c7055c35f563a2e27c32d953a31425efec3b9a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=1856,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=1856,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d9efb1b26634d420f27021abbd5f301ea1767fb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=1856,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..773b378e3ab0671959bb8d5d4d0241fa505b699c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=2688,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=2688,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..cd2d8b7bcd5dbd95519221fdaee11fea57f4d130
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=2688,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc1624db18a0756af46d52d7c864f122485f7102
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=464,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=464,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..e12fec12db53013bf1d84ab40dea724420ae8775
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=464,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=464,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e45a13b835523bb9931e9a068ba935fdee2e9af
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=928,device_name=NVIDIA_B200.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=928,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..0294bebcea57435b2d604bb9c159e4272f2784b1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=928,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 256,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=928,device_name=NVIDIA_H100_80GB_HBM3.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..44549e37b619493c71188071783220dd83eb6194
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..58f42d41ba2fcb8d64b19017f940c29a16ddec2b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
new file mode 100644
index 0000000000000000000000000000000000000000..a43498dafa15b1d3fa2c6e75227a737ad0ea3f11
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4,
+ "USE_TMA": true
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "USE_TMA": true
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3,
+ "USE_TMA": true
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_marlin_moe.py b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_marlin_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..4410f07f327e0464b8b0ccc4ee8ae55171e5737e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_marlin_moe.py
@@ -0,0 +1,220 @@
+from typing import Optional
+
+import torch
+
+from sglang.srt.utils import is_cuda
+from sglang.srt.utils.custom_op import register_custom_op
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+ from sgl_kernel import moe_sum_reduce, silu_and_mul
+
+ from sglang.jit_kernel.moe_wna16_marlin import moe_wna16_marlin_gemm
+
+
def get_scalar_type(num_bits: int, has_zp: bool):
    """Map a Marlin weight quantization (bit width, zero-point flag) to the
    sgl_kernel scalar type consumed by ``moe_wna16_marlin_gemm``.

    With zero points only 4-bit is supported (plain ``uint4``); without zero
    points the bias-encoded types are used (``uint4b8`` for 4-bit,
    ``uint8b128`` for 8-bit).
    """
    from sgl_kernel.scalar_type import scalar_types

    if has_zp:
        # Zero-point path: only the 4-bit variant exists.
        assert num_bits == 4
        return scalar_types.uint4
    if num_bits == 4:
        return scalar_types.uint4b8
    return scalar_types.uint8b128
+
+
@register_custom_op(out_shape="hidden_states")
def fused_marlin_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    gating_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    global_num_experts: int = -1,
    expert_map: Optional[torch.Tensor] = None,
    g_idx1: Optional[torch.Tensor] = None,
    g_idx2: Optional[torch.Tensor] = None,
    sort_indices1: Optional[torch.Tensor] = None,
    sort_indices2: Optional[torch.Tensor] = None,
    w1_zeros: Optional[torch.Tensor] = None,
    w2_zeros: Optional[torch.Tensor] = None,
    workspace: Optional[torch.Tensor] = None,
    num_bits: int = 8,
    is_k_full: bool = True,
    inplace: bool = False,
    routed_scaling_factor: Optional[float] = None,
) -> torch.Tensor:
    """
    This function computes a Mixture of Experts (MoE) layer using two sets of
    weights, w1 and w2, and top-k gating mechanism.

    Parameters:
    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
    - w1 (torch.Tensor): The first set of expert weights.
    - w2 (torch.Tensor): The second set of expert weights.
    - w1_scale (torch.Tensor): Scale to be used for w1.
    - w2_scale (torch.Tensor): Scale to be used for w2.
    - gating_output (torch.Tensor): The output of the gating operation
        (before softmax).
    - g_idx1 (Optional[torch.Tensor]): The first set of act_order indices.
    - g_idx2 (Optional[torch.Tensor]): The second set of act_order indices.
    - sort_indices1 (Optional[torch.Tensor]): The first act_order input
        permutation.
    - sort_indices2 (Optional[torch.Tensor]): The second act_order input
        permutation.
    - topk_weights (torch.Tensor): Top-k weights.
    - topk_ids (torch.Tensor): Indices of topk-k elements.
    - w1_zeros (Optional[torch.Tensor]): Optional zero points to be used for w1.
    - w2_zeros (Optional[torch.Tensor]): Optional zero points to be used for w2.
    - global_num_experts (int): Total number of experts globally; -1 means
        "use the local expert count" (w1.shape[0]).
    - expert_map (Optional[torch.Tensor]): When set, expert-parallel mode is
        assumed (only used here as an is_ep flag and to zero cache3).
    - workspace (Optional[torch.Tensor]): Optional preallocated int32 scratch
        buffer for the Marlin kernel; allocated here when None.
    - num_bits (int): The number of bits in expert weights quantization.
    - is_k_full (bool): Forwarded to the Marlin GEMM (act-order related).
    - inplace (bool): When True, the combined result is written back into
        hidden_states instead of a fresh tensor.
    - routed_scaling_factor (Optional[float]): Scale applied during the final
        top-k sum reduction; None is treated as 1.0.

    Returns:
    - torch.Tensor: The output tensor after applying the MoE layer.
    """
    from sglang.srt.layers.moe.fused_moe_triton import moe_align_block_size

    # NOTE: gating_output is only shape-checked here; routing is taken from
    # the precomputed topk_weights/topk_ids.
    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
    # w1 dim 1 packs 16 K-elements per unit (Marlin packing — presumably
    # 16 elements per packed row block).
    assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1"
    assert hidden_states.shape[1] == w2.shape[2] // (
        num_bits // 2
    ), "Hidden size mismatch w2"
    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
    assert hidden_states.dtype in [torch.float16, torch.bfloat16]
    assert (
        hidden_states.dtype == w1_scale.dtype
    ), f"moe_wna16_marlin_gemm assumes hidden_states.dtype ({hidden_states.dtype}) == w1_scale.dtype ({w1_scale.dtype})"
    assert (
        hidden_states.dtype == w2_scale.dtype
    ), f"moe_wna16_marlin_gemm assumes hidden_states.dtype ({hidden_states.dtype}) == w2_scale.dtype ({w2_scale.dtype})"
    assert num_bits in [4, 8]

    M, K = hidden_states.shape
    E = w1.shape[0]
    # w2 dim 1 also packs 16 elements per unit, giving the intermediate size N.
    N = w2.shape[1] * 16
    topk = topk_ids.shape[1]

    # M block size selection logic
    # TODO: tune this further for specific models
    # Picks the smallest block size whose blocks would be <90% full on
    # average (M * topk / E tokens per expert); falls through to 64.
    for block_size_m in [8, 16, 32, 48, 64]:
        if M * topk / E / block_size_m < 0.9:
            break

    if global_num_experts == -1:
        global_num_experts = E
    sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
        topk_ids, block_size_m, global_num_experts
    )

    if workspace is None:
        # Scratch sizing: one slot per 64-wide N tile per M block, capped at
        # 4x the SM count — NOTE(review): cap heuristic, confirm against the
        # Marlin kernel's expectations.
        max_workspace_size = (max(2 * N, K) // 64) * (
            sorted_token_ids.size(0) // block_size_m
        )
        device = hidden_states.device
        sms = torch.cuda.get_device_properties(device).multi_processor_count
        max_workspace_size = min(max_workspace_size, sms * 4)
        workspace = torch.zeros(
            max_workspace_size, dtype=torch.int, device=device, requires_grad=False
        )

    scalar_type1 = get_scalar_type(num_bits, w1_zeros is not None)
    scalar_type2 = get_scalar_type(num_bits, w2_zeros is not None)

    # cache2 holds the activation output (N wide per token-expert pair).
    intermediate_cache2 = torch.empty(
        (M * topk_ids.shape[1], N),
        device=hidden_states.device,
        dtype=hidden_states.dtype,
    )
    # cache1 (2N wide, gate+up GEMM output) and cache3 (K wide, down GEMM
    # output) alias the same flat buffer — they are never live simultaneously.
    intermediate_cache13 = torch.empty(
        (M * topk_ids.shape[1] * max(2 * N, K),),
        device=hidden_states.device,
        dtype=hidden_states.dtype,
    )
    intermediate_cache1 = intermediate_cache13[: M * topk_ids.shape[1] * 2 * N]
    intermediate_cache1 = intermediate_cache1.view(-1, 2 * N)
    intermediate_cache3 = intermediate_cache13[: M * topk_ids.shape[1] * K]
    intermediate_cache3 = intermediate_cache3.view(-1, K)

    # Atomic adds: always for fp16; for bf16 only on compute capability >= 9.
    use_atomic_add = (
        hidden_states.dtype == torch.half
        or torch.cuda.get_device_capability(hidden_states.device)[0] >= 9
    )

    # First GEMM: tokens x w1 -> gate/up projection (size 2N per token-expert).
    intermediate_cache1 = moe_wna16_marlin_gemm(
        hidden_states,
        intermediate_cache1,
        w1,
        None,  # b_bias_or_none
        w1_scale,
        None,  # global_scale_or_none
        w1_zeros,
        g_idx1,
        sort_indices1,
        workspace,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
        topk_weights,
        moe_block_size=block_size_m,
        top_k=topk,
        mul_topk_weights=False,
        is_ep=expert_map is not None,
        b_q_type=scalar_type1,
        size_m=M,
        size_n=2 * N,
        size_k=K,
        is_k_full=is_k_full,
        use_atomic_add=use_atomic_add,
        use_fp32_reduce=True,
        is_zp_float=False,
    )

    # SwiGLU activation: 2N -> N.
    silu_and_mul(intermediate_cache1.view(-1, 2 * N), intermediate_cache2)

    if expert_map is not None:
        # EP mode: zero cache3 so rows of experts not owned locally stay 0.
        intermediate_cache3.zero_()

    # Second GEMM: activation x w2 -> back to hidden size K; topk weights are
    # multiplied in here (unless already applied), hence top_k=1.
    intermediate_cache3 = moe_wna16_marlin_gemm(
        intermediate_cache2,
        intermediate_cache3,
        w2,
        None,  # b_bias_or_none
        w2_scale,
        None,  # global_scale_or_none
        w2_zeros,
        g_idx2,
        sort_indices2,
        workspace,
        sorted_token_ids,
        expert_ids,
        num_tokens_post_padded,
        topk_weights,
        moe_block_size=block_size_m,
        top_k=1,
        mul_topk_weights=True,
        is_ep=expert_map is not None,
        b_q_type=scalar_type2,
        size_m=M * topk,
        size_n=K,
        size_k=N,
        is_k_full=is_k_full,
        use_atomic_add=use_atomic_add,
        use_fp32_reduce=True,
        is_zp_float=False,
    ).view(-1, topk, K)

    output = hidden_states if inplace else torch.empty_like(hidden_states)

    if routed_scaling_factor is None:
        routed_scaling_factor = 1.0

    # Combine: sum over the topk dimension, scaled by routed_scaling_factor.
    moe_sum_reduce(
        intermediate_cache3,
        output,
        routed_scaling_factor,
    )
    return output
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..75f62c955629eda83de531bb2d0327a87dfd24e1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -0,0 +1,805 @@
+# NOTE: this file will be separated into sglang/srt/layers/moe/moe_runner/triton_utils.py
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/fused_moe.py
+
+"""Fused MoE kernel."""
+
+from __future__ import annotations
+
+import functools
+import os
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import torch.nn.functional as F
+import triton.language as tl
+
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.utils import (
+ cpu_has_amx_support,
+ get_bool_env_var,
+ is_cpu,
+ is_cuda,
+ is_hip,
+ is_xpu,
+ use_intel_xpu_backend,
+)
+from sglang.srt.utils.custom_op import register_custom_op
+
+from .fused_moe_triton_config import get_config_dtype_str, try_get_optimal_moe_config
+from .fused_moe_triton_kernels import (
+ act_and_mul_triton,
+ invoke_fused_moe_kernel,
+ moe_sum_reduce_triton,
+ support_tensor_descriptor,
+)
+from .moe_align_block_size import moe_align_block_size
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.topk import StandardTopKOutput
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_xpu = is_xpu()
+_use_sgl_xpu = use_intel_xpu_backend()
+
+from sglang.srt.server_args import get_global_server_args
+
+if _is_cuda:
+ from sgl_kernel import gelu_and_mul, moe_sum_reduce, silu_and_mul
+elif _is_cpu and _is_cpu_amx_available:
+ pass
+elif _is_hip:
+ from sgl_kernel import gelu_and_mul, silu_and_mul
+
+ if _use_aiter:
+ try:
+ from aiter import moe_sum
+ except ImportError:
+ raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
+ # Note: vllm_ops is not needed for HIP when _use_aiter=False
+ # because the code uses moe_sum_reduce_triton as fallback (line 619)
+elif _is_xpu:
+ from sgl_kernel import moe_sum_reduce, silu_and_mul
+
+# Try to import vllm_ops for non-CUDA/HIP/XPU platforms
+_has_vllm_ops = False
+if not _is_cuda and not _is_hip and not _is_xpu:
+ try:
+ from vllm import _custom_ops as vllm_ops
+
+ _has_vllm_ops = True
+ except ImportError:
+ # Fallback: vllm not available, will use native PyTorch implementations
+ _has_vllm_ops = False
+
+padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
+
+
@register_custom_op(mutates_args=["hidden_states"])
def inplace_fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    b1: Optional[torch.Tensor] = None,
    b2: Optional[torch.Tensor] = None,
    activation: str = "silu",
    is_gated: bool = True,
    apply_router_weight_on_input: bool = False,
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    per_channel_quant: bool = False,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    w1_zp: Optional[torch.Tensor] = None,
    w2_zp: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[List[int]] = None,
    routed_scaling_factor: Optional[float] = None,
    gemm1_alpha: Optional[float] = None,
    gemm1_limit: Optional[float] = None,
    filter_expert: bool = True,
) -> None:
    """In-place variant of the fused-experts Triton path.

    Thin wrapper over ``fused_experts_impl`` that fixes ``inplace=True`` and
    ``no_combine=False``; the combined result is written back into
    ``hidden_states``, so nothing is returned.
    """
    fused_experts_impl(
        hidden_states,
        w1,
        w2,
        topk_weights,
        topk_ids,
        b1=b1,
        b2=b2,
        inplace=True,
        activation=activation,
        is_gated=is_gated,
        apply_router_weight_on_input=apply_router_weight_on_input,
        use_fp8_w8a8=use_fp8_w8a8,
        use_int8_w8a8=use_int8_w8a8,
        use_int8_w8a16=use_int8_w8a16,
        use_int4_w4a16=use_int4_w4a16,
        per_channel_quant=per_channel_quant,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        w1_zp=w1_zp,
        w2_zp=w2_zp,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        block_shape=block_shape,
        no_combine=False,
        routed_scaling_factor=routed_scaling_factor,
        gemm1_alpha=gemm1_alpha,
        gemm1_limit=gemm1_limit,
        filter_expert=filter_expert,
    )
+
+
@register_custom_op(out_shape="hidden_states")
def outplace_fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    b1: Optional[torch.Tensor] = None,
    b2: Optional[torch.Tensor] = None,
    activation: str = "silu",
    is_gated: bool = True,
    apply_router_weight_on_input: bool = False,
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    per_channel_quant: bool = False,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    w1_zp: Optional[torch.Tensor] = None,
    w2_zp: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[List[int]] = None,
    no_combine: bool = False,
    routed_scaling_factor: Optional[float] = None,
    gemm1_alpha: Optional[float] = None,
    gemm1_limit: Optional[float] = None,
    filter_expert: bool = True,
) -> torch.Tensor:
    """Out-of-place variant of the fused-experts Triton path.

    Thin wrapper over ``fused_experts_impl`` that fixes ``inplace=False`` and
    returns a freshly allocated output tensor (``hidden_states`` is left
    untouched).
    """
    return fused_experts_impl(
        hidden_states,
        w1,
        w2,
        topk_weights,
        topk_ids,
        b1=b1,
        b2=b2,
        inplace=False,
        activation=activation,
        is_gated=is_gated,
        apply_router_weight_on_input=apply_router_weight_on_input,
        use_fp8_w8a8=use_fp8_w8a8,
        use_int8_w8a8=use_int8_w8a8,
        use_int8_w8a16=use_int8_w8a16,
        use_int4_w4a16=use_int4_w4a16,
        per_channel_quant=per_channel_quant,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        w1_zp=w1_zp,
        w2_zp=w2_zp,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        block_shape=block_shape,
        no_combine=no_combine,
        routed_scaling_factor=routed_scaling_factor,
        gemm1_alpha=gemm1_alpha,
        gemm1_limit=gemm1_limit,
        filter_expert=filter_expert,
    )
+
+
def fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_output: StandardTopKOutput,
    moe_runner_config: MoeRunnerConfig,
    b1: Optional[torch.Tensor] = None,
    b2: Optional[torch.Tensor] = None,
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    per_channel_quant: bool = False,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    w1_zp: Optional[torch.Tensor] = None,
    w2_zp: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[List[int]] = None,
):
    """Dispatch a standard top-k MoE computation to the Triton fused-experts
    kernels, choosing the in-place or out-of-place entry point based on
    ``moe_runner_config.inplace``. Returns the combined hidden states.
    """
    topk_weights, topk_ids, _ = topk_output
    # Expert filtering is needed whenever this rank does not hold every
    # expert (num_experts != num_local_experts) or the global count is unset.
    needs_filter = (
        moe_runner_config.num_experts is None
        or moe_runner_config.num_experts != moe_runner_config.num_local_experts
    )

    if not moe_runner_config.inplace:
        return outplace_fused_experts(
            hidden_states,
            w1,
            w2,
            topk_weights,
            topk_ids,
            b1,
            b2,
            moe_runner_config.activation,
            moe_runner_config.is_gated,
            moe_runner_config.apply_router_weight_on_input,
            use_fp8_w8a8,
            use_int8_w8a8,
            use_int8_w8a16,
            use_int4_w4a16,
            per_channel_quant,
            w1_scale,
            w2_scale,
            w1_zp,
            w2_zp,
            a1_scale,
            a2_scale,
            block_shape,
            no_combine=moe_runner_config.no_combine,
            routed_scaling_factor=moe_runner_config.routed_scaling_factor,
            gemm1_alpha=moe_runner_config.gemm1_alpha,
            gemm1_limit=moe_runner_config.gemm1_clamp_limit,
            filter_expert=needs_filter,
        )

    # In-place path: the result is written directly into hidden_states.
    assert not moe_runner_config.no_combine, "no combine + inplace makes no sense"
    inplace_fused_experts(
        hidden_states,
        w1,
        w2,
        topk_weights,
        topk_ids,
        b1,
        b2,
        moe_runner_config.activation,
        moe_runner_config.is_gated,
        moe_runner_config.apply_router_weight_on_input,
        use_fp8_w8a8,
        use_int8_w8a8,
        use_int8_w8a16,
        use_int4_w4a16,
        per_channel_quant,
        w1_scale,
        w2_scale,
        w1_zp,
        w2_zp,
        a1_scale,
        a2_scale,
        block_shape,
        moe_runner_config.routed_scaling_factor,
        moe_runner_config.gemm1_alpha,
        moe_runner_config.gemm1_clamp_limit,
        needs_filter,
    )
    return hidden_states
+
+
+@torch.compile
+def moe_sum_reduce_torch_compile(x, out, routed_scaling_factor):
+ torch.sum(x, dim=1, out=out)
+ out.mul_(routed_scaling_factor)
+
+
+@torch.compile
+def _swiglu_silu_clamp_mul(x, gemm1_limit):
+ gate, up = x.chunk(2, dim=-1)
+ gate = F.silu(gate)
+ gate = gate.clamp(min=None, max=gemm1_limit)
+ up = up.clamp(min=-gemm1_limit, max=gemm1_limit)
+ return gate * up
+
+
+@torch.compile
+def _swiglu_gpt_oss_sigmoid_alpha(x, gemm1_alpha, gemm1_limit):
+ # NOTE: This variant uses gemm1_alpha, unlike _swiglu_silu_clamp_mul.
+ # At present, only GPT-OSS uses this variant.
+ gate, up = x[..., ::2], x[..., 1::2]
+ gate = gate.clamp(min=None, max=gemm1_limit)
+ up = up.clamp(min=-gemm1_limit, max=gemm1_limit)
+ return gate * torch.sigmoid(gate * gemm1_alpha) * (up + 1)
+
+
@functools.lru_cache()
def _down_moe_use_tma():
    """Cached probe for whether Triton tensor descriptors (TMA path) are
    available for the down-projection MoE GEMM; evaluated once per process.
    """
    supported = support_tensor_descriptor()
    return supported
+
+
def fused_experts_impl(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    b1: Optional[torch.Tensor] = None,
    b2: Optional[torch.Tensor] = None,
    inplace: bool = False,
    activation: str = "silu",
    is_gated: bool = True,
    apply_router_weight_on_input: bool = False,
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    per_channel_quant: bool = False,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    w1_zp: Optional[torch.Tensor] = None,
    w2_zp: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[List[int]] = None,
    no_combine: bool = False,
    routed_scaling_factor: Optional[float] = None,
    gemm1_alpha: Optional[float] = None,
    gemm1_limit: Optional[float] = None,
    filter_expert: bool = True,
):
    """Core Triton fused-MoE execution.

    Pipeline per token chunk: align tokens to expert blocks, run the up/gate
    GEMM (w1, + optional b1), apply the activation (optionally gated and/or
    clamped via gemm1_alpha/gemm1_limit), run the down GEMM (w2, + optional
    b2), then combine the top-k expert outputs (sum scaled by
    routed_scaling_factor) unless ``no_combine`` is set.

    Tokens are processed in CHUNK_SIZE slices to bound intermediate-buffer
    size (see the linked vllm issue). Returns the combined hidden states
    (``hidden_states`` itself when ``inplace`` is True).
    """
    # Weight padding (SGLANG_MOE_PADDING) only applies to the fp8/int8 w8a8
    # paths without block quantization and without aiter.
    padded_size = padding_size
    if not (use_fp8_w8a8 or use_int8_w8a8) or block_shape is not None or _use_aiter:
        padded_size = 0

    # Check constraints.
    if use_int4_w4a16:
        # int4 weights pack two values per byte along the K dim.
        assert hidden_states.shape[1] // 2 == w1.shape[2], "Hidden size mismatch"
    else:
        assert (
            hidden_states.shape[1] == w1.shape[2] - padded_size
        ), f"Hidden size mismatch"
    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
    assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16]

    num_tokens, _ = hidden_states.shape
    E, N, _ = w1.shape
    # We execute the fused_moe kernel in chunks to circumvent this issue:
    # https://github.com/vllm-project/vllm/issues/5938
    CHUNK_SIZE = 64 * 1024
    M = min(num_tokens, CHUNK_SIZE)
    config_dtype = get_config_dtype_str(
        use_fp8_w8a8=use_fp8_w8a8,
        use_int8_w8a8=use_int8_w8a8,
        use_int8_w8a16=use_int8_w8a16,
        use_int4_w4a16=use_int4_w4a16,
        dtype=hidden_states.dtype,
    )

    # Partial so the last (smaller) chunk can re-query a config for its size.
    get_config_func = functools.partial(
        try_get_optimal_moe_config,
        w1.shape,
        (w2.shape[0], w2.shape[1], w2.shape[2] - padded_size),
        topk_ids.shape[1],
        config_dtype,
        block_shape=block_shape,
        per_channel_quant=per_channel_quant,
        return_down_config=True,
    )

    config, (down_config, max_block_m) = get_config_func(M)
    # TMA for the down GEMM requires hardware support, a tuned down config,
    # and that config opting in via its USE_TMA flag (popped so the flag is
    # not passed to the kernel as a tuning parameter).
    down_moe_use_tma = (
        _down_moe_use_tma()
        and down_config is not None
        and down_config.pop("USE_TMA", False)
    )
    topk = topk_ids.shape[1]
    # TMA needs the intermediate laid out in sorted (block-padded) order, so
    # reserve extra rows for per-expert padding (at most block_m-1 per expert).
    max_padded_tokens = (
        min(M * topk, E + 1) * (max_block_m - 1) if down_moe_use_tma else 0
    )
    total_tokens = M * topk + max_padded_tokens
    # Flat buffer shared by intermediate_cache1 (N wide) and
    # intermediate_cache3 (w2-output wide); they alias the same storage.
    cache = torch.empty(
        total_tokens * max(N, w2.shape[1]),
        device=hidden_states.device,
        dtype=hidden_states.dtype,
    )
    intermediate_cache3 = cache[: M * topk * w2.shape[1]].view(
        (M, topk, w2.shape[1]),
    )

    # NOTE: float32 inputs are asserted above but also mapped to fp16 here —
    # NOTE(review): confirm fp32 hidden_states are intended to compute in fp16.
    compute_type = tl.bfloat16 if hidden_states.dtype == torch.bfloat16 else tl.float16

    if no_combine:
        assert not inplace
        # Per-expert outputs are returned unreduced: (tokens, topk, out_dim).
        out_hidden_states = torch.empty(
            (num_tokens, topk, w2.shape[1]),
            device=hidden_states.device,
            dtype=hidden_states.dtype,
        )
    elif inplace:
        out_hidden_states = hidden_states
    else:
        out_hidden_states = torch.empty_like(hidden_states)

    for chunk in range((num_tokens // CHUNK_SIZE) + 1):
        begin_chunk_idx, end_chunk_idx = (
            chunk * CHUNK_SIZE,
            min((chunk + 1) * CHUNK_SIZE, num_tokens),
        )
        curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
        tokens_in_chunk, _ = curr_hidden_states.shape

        if tokens_in_chunk == 0:
            break

        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
            # Adjust the intermediate cache size and config for the last
            # chunk. Note that in most cases we only have one chunk
            # so the cache size and config are already set correctly and
            # do not need to be adjusted.
            config, (down_config, _) = get_config_func(tokens_in_chunk)
            down_moe_use_tma = (
                _down_moe_use_tma()
                and down_config is not None
                and down_config.pop("USE_TMA", False)
            )
            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]

        # Recompute TMA padding with this chunk's actual block size.
        padded_tokens = (
            min(tokens_in_chunk * topk, E + 1) * (config["BLOCK_SIZE_M"] - 1)
            if down_moe_use_tma
            else 0
        )
        total_tokens = tokens_in_chunk * topk + padded_tokens
        # cache1 (up/gate GEMM output) reuses the front of the shared buffer.
        intermediate_cache1 = cache[: total_tokens * N].view(
            (total_tokens, N),
        )
        # cache2 (activation output, N/2 wide for gated activations) is
        # allocated fresh per chunk.
        intermediate_cache2 = torch.empty(
            (total_tokens, N // 2),
            device=hidden_states.device,
            dtype=hidden_states.dtype,
        )

        curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
        curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]

        # Optional server-level optimization: fuse the top-k sum (and
        # all-reduce) into the down GEMM; excluded for the w8a16/w4a16 paths.
        use_fused_moe_sum_all_reduce = (
            get_global_server_args().enable_fused_moe_sum_all_reduce
            and (not no_combine)
            and (curr_topk_ids.shape[1] > 2)
            and (not use_int8_w8a16)
            and (not use_int4_w4a16)
        )

        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
            curr_topk_ids, config["BLOCK_SIZE_M"], E
        )

        # Up/gate GEMM: hidden_states x w1 (+ b1) -> intermediate_cache1.
        invoke_fused_moe_kernel(
            curr_hidden_states,
            w1,
            b1,
            intermediate_cache1,
            a1_scale,
            w1_scale,
            w1_zp,
            curr_topk_weights,
            curr_topk_ids,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            apply_router_weight_on_input,
            topk_ids.shape[1],
            config,
            compute_type=compute_type,
            use_fp8_w8a8=use_fp8_w8a8,
            use_int8_w8a8=use_int8_w8a8,
            use_int8_w8a16=use_int8_w8a16,
            use_int4_w4a16=use_int4_w4a16,
            per_channel_quant=per_channel_quant,
            block_shape=block_shape,
            c_sorted=down_moe_use_tma,
            filter_expert=filter_expert,
        )

        # Activation function with multiplication
        if activation == "silu" and is_gated:
            # - gemm1_alpha != None: GPT-OSS-style swiglu(alpha, limit)
            # - gemm1_alpha == None and gemm1_limit != None: silu+clamp+mul(limit-only)
            if gemm1_alpha is not None:
                assert gemm1_limit is not None
                intermediate_cache2 = _swiglu_gpt_oss_sigmoid_alpha(
                    intermediate_cache1.view(-1, N), gemm1_alpha, gemm1_limit
                )
            elif gemm1_limit is not None:
                intermediate_cache2 = _swiglu_silu_clamp_mul(
                    intermediate_cache1.view(-1, N), gemm1_limit
                )
            elif _is_cuda or _is_hip or _is_xpu:
                if not filter_expert:
                    silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
                else:
                    # Triton variant skips rows of filtered-out experts.
                    act_and_mul_triton(
                        intermediate_cache1.view(-1, N),
                        intermediate_cache2,
                        config,
                        topk_ids,
                        expert_ids,
                        down_moe_use_tma,
                        activation,
                    )
            else:
                if _has_vllm_ops:
                    vllm_ops.silu_and_mul(
                        intermediate_cache2, intermediate_cache1.view(-1, N)
                    )
                else:
                    # Fallback: native PyTorch silu_and_mul
                    x = intermediate_cache1.view(-1, N)
                    d = x.shape[-1] // 2
                    intermediate_cache2.copy_(F.silu(x[..., :d]) * x[..., d:])
        elif activation == "gelu" and is_gated:
            assert gemm1_alpha is None, "gemm1_alpha is not supported for gelu"
            assert gemm1_limit is None, "gemm1_limit is not supported for gelu"
            if _is_cuda or _is_hip:
                if not filter_expert:
                    gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
                else:
                    act_and_mul_triton(
                        intermediate_cache1.view(-1, N),
                        intermediate_cache2,
                        config,
                        topk_ids,
                        expert_ids,
                        down_moe_use_tma,
                        activation,
                    )
            else:
                if _has_vllm_ops:
                    vllm_ops.gelu_and_mul(
                        intermediate_cache2, intermediate_cache1.view(-1, N)
                    )
                else:
                    # Fallback: native PyTorch gelu_and_mul
                    x = intermediate_cache1.view(-1, N)
                    d = x.shape[-1] // 2
                    intermediate_cache2.copy_(F.gelu(x[..., :d]) * x[..., d:])
        # Activation function without multiplication
        elif activation == "silu" and not is_gated:
            intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N))
        elif activation == "gelu" and not is_gated:
            intermediate_cache2 = F.gelu(intermediate_cache1.view(-1, N))
        elif activation == "relu2" and not is_gated:
            intermediate_cache2 = torch.square(F.relu(intermediate_cache1.view(-1, N)))
        else:
            raise ValueError(f"Unsupported activation: {activation=}, with {is_gated=}")

        out_slice = None
        if use_fused_moe_sum_all_reduce:
            # The kernel accumulates the top-k sum directly into the output
            # slice, so it must start zeroed.
            out_slice = out_hidden_states[begin_chunk_idx:end_chunk_idx]
            out_slice.zero_()

        # Down GEMM: activation x w2 (+ b2). Output target depends on mode:
        # fused-sum -> out_slice; normal -> cache3 (reduced below); topk==1 or
        # no_combine -> directly into the final output.
        invoke_fused_moe_kernel(
            intermediate_cache2,
            w2,
            b2,
            (
                out_slice
                if use_fused_moe_sum_all_reduce
                else (
                    intermediate_cache3
                    if not no_combine and topk_ids.shape[1] != 1
                    else out_hidden_states[begin_chunk_idx:end_chunk_idx].unsqueeze(0)
                )
            ),
            a2_scale,
            w2_scale,
            w2_zp,
            curr_topk_weights,
            curr_topk_ids,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            not apply_router_weight_on_input,
            1,
            down_config or config,
            compute_type=compute_type,
            use_fp8_w8a8=use_fp8_w8a8,
            use_int8_w8a8=use_int8_w8a8,
            use_int8_w8a16=use_int8_w8a16,
            use_int4_w4a16=use_int4_w4a16,
            per_channel_quant=per_channel_quant,
            block_shape=block_shape,
            a_use_tma=down_moe_use_tma,
            b_use_tma=down_moe_use_tma,
            filter_expert=filter_expert,
            fuse_sum_all_reduce=use_fused_moe_sum_all_reduce,
            router_topk=curr_topk_ids.shape[1],
        )

        if routed_scaling_factor is None:
            routed_scaling_factor = 1.0

        # Combine step: reduce over the top-k dimension (platform-specific).
        if no_combine:
            pass
        elif _is_cuda:
            if use_fused_moe_sum_all_reduce:
                # NOTE(review): this None check is dead — routed_scaling_factor
                # was already defaulted to 1.0 just above.
                if routed_scaling_factor is None:
                    routed_scaling_factor = 1.0
                if routed_scaling_factor != 1.0:
                    assert out_slice is not None
                    out_slice.mul_(routed_scaling_factor)
            elif topk_ids.shape[1] == 1 and routed_scaling_factor == 1.0:
                pass  # we write directly into out_hidden_states
            elif topk_ids.shape[1] == 2 and routed_scaling_factor == 1.0:
                # NOTE(review): the .squeeze(dim=1) result is discarded; the
                # add already wrote into out= (squeeze looks like a no-op).
                torch.add(
                    intermediate_cache3[:, 0],
                    intermediate_cache3[:, 1],
                    out=out_hidden_states[begin_chunk_idx:end_chunk_idx],
                ).squeeze(dim=1)
            else:
                # According to micro benchmark results, torch.compile can get better performance for small token.
                if tokens_in_chunk <= 32:
                    moe_sum_reduce_torch_compile(
                        intermediate_cache3.view(*intermediate_cache3.shape),
                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
                        routed_scaling_factor,
                    )
                else:
                    moe_sum_reduce(
                        intermediate_cache3.view(*intermediate_cache3.shape),
                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
                        routed_scaling_factor,
                    )

        elif _is_hip:
            if _use_aiter:
                # NOTE(review): aiter's moe_sum takes no scaling factor —
                # confirm routed_scaling_factor is applied elsewhere on this path.
                moe_sum(
                    intermediate_cache3.view(*intermediate_cache3.shape),
                    out_hidden_states[begin_chunk_idx:end_chunk_idx],
                )
            else:
                # According to micro benchmark results, torch.compile can get better performance for small token.
                if tokens_in_chunk <= 32:
                    moe_sum_reduce_torch_compile(
                        intermediate_cache3.view(*intermediate_cache3.shape),
                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
                        routed_scaling_factor,
                    )
                else:
                    moe_sum_reduce_triton(
                        intermediate_cache3.view(*intermediate_cache3.shape),
                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
                        routed_scaling_factor,
                    )
        elif _is_xpu:
            moe_sum_reduce(
                intermediate_cache3.view(*intermediate_cache3.shape),
                out_hidden_states[begin_chunk_idx:end_chunk_idx],
                routed_scaling_factor,
            )
        else:
            if _has_vllm_ops:
                # NOTE(review): vllm moe_sum takes no scaling factor — the
                # triton fallback below does apply it; confirm intended.
                vllm_ops.moe_sum(
                    intermediate_cache3.view(*intermediate_cache3.shape),
                    out_hidden_states[begin_chunk_idx:end_chunk_idx],
                )
            else:
                # Fallback: use triton moe_sum_reduce when vllm is not available
                moe_sum_reduce_triton(
                    intermediate_cache3.view(*intermediate_cache3.shape),
                    out_hidden_states[begin_chunk_idx:end_chunk_idx],
                    routed_scaling_factor,
                )

    return out_hidden_states
+
+
def fused_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_output: StandardTopKOutput,
    moe_runner_config: MoeRunnerConfig = MoeRunnerConfig(),
    b1: Optional[torch.Tensor] = None,
    b2: Optional[torch.Tensor] = None,
    use_fp8_w8a8: bool = False,
    use_int8_w8a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    per_channel_quant: bool = False,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    w1_zp: Optional[torch.Tensor] = None,
    w2_zp: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[List[int]] = None,
) -> torch.Tensor:
    """Compute a Mixture-of-Experts layer from two expert weight sets (w1,
    w2) and a precomputed top-k routing.

    Parameters:
    - hidden_states: Input tokens, shape (num_tokens, hidden_size).
    - w1 / w2: First (up/gate) and second (down) expert weight tensors.
    - topk_output: The top-k routing result (weights, ids, ...).
    - moe_runner_config: Runner options (inplace, activation, scaling, ...).
    - b1 / b2: Optional biases for w1 / w2.
    - use_fp8_w8a8 / use_int8_w8a8: Use fp8 / int8 arithmetic for both GEMMs.
    - use_int8_w8a16: int8 weights with 16-bit activations.
    - use_int4_w4a16: int4 weights with bf16/fp16 activations.
    - per_channel_quant: Per-channel (rather than per-tensor) quantization.
    - w1_scale / w2_scale, w1_zp / w2_zp: Weight scales and zero points.
    - a1_scale / a2_scale: Activation scales.
    - block_shape: Optional block size for block-wise quantization.

    Returns the MoE layer output, shape like ``hidden_states``.
    """
    if not _use_sgl_xpu:
        # Default path: the Triton fused-experts implementation.
        return fused_experts(
            hidden_states,
            w1,
            w2,
            topk_output,
            moe_runner_config=moe_runner_config,
            b1=b1,
            b2=b2,
            use_fp8_w8a8=use_fp8_w8a8,
            use_int8_w8a8=use_int8_w8a8,
            use_int8_w8a16=use_int8_w8a16,
            use_int4_w4a16=use_int4_w4a16,
            per_channel_quant=per_channel_quant,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            w1_zp=w1_zp,
            w2_zp=w2_zp,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            block_shape=block_shape,
        )

    # Intel XPU backend: delegate to the sgl_kernel fused_experts op, which
    # takes the unpacked routing tensors directly.
    from sgl_kernel import fused_experts as sgl_fused_experts

    routing_weights, routing_ids, _ = topk_output
    return sgl_fused_experts(
        hidden_states,
        w1,
        w2,
        routing_weights,
        routing_ids,
        b1=b1,
        b2=b2,
        use_fp8_w8a8=use_fp8_w8a8,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        w1_zp=w1_zp,
        w2_zp=w2_zp,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        block_shape=block_shape,
    )
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..90a6a31c70c09fb78c21b9a5d4428edb8f0d5214
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py
@@ -0,0 +1,290 @@
+from __future__ import annotations
+
+import functools
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import triton
+
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import get_device_name, is_hip
+
+logger = logging.getLogger(__name__)
+_is_hip = is_hip()
+
+
def get_config_file_name(
    E: int,
    N: int,
    dtype: Optional[str],
    block_shape: Optional[List[int]] = None,
    per_channel_quant: bool = False,
    down_moe: bool = False,
) -> str:
    """Build the JSON filename that keys a tuned fused-MoE kernel config.

    The name encodes expert count ``E``, inner dim ``N``, the device name,
    and optional dtype / block-shape / per-channel-quant selectors; when
    ``down_moe`` is True a ``_down`` suffix marks the down-projection GEMM.
    """
    device_name = get_device_name().replace(" ", "_")
    dtype_selector = "" if not dtype else f",dtype={dtype}"
    # Omit the block-shape selector when absent or when any element is 0
    # (callers pass [block_n, block_k] with 0 meaning "no block quant").
    block_shape_selector = (
        "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}"
    )
    per_channel_quant_selector = ",per_channel_quant=True" if per_channel_quant else ""
    down_moe_selector = "_down" if down_moe else ""
    return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}{per_channel_quant_selector}{down_moe_selector}.json"
+
+
@functools.lru_cache
def get_moe_configs(
    E: int,
    N: int,
    dtype: Optional[str],
    block_n: Optional[int] = 0,
    block_k: Optional[int] = 0,
    per_channel_quant: bool = False,
    down_moe: bool = False,
) -> Optional[Dict[int, Any]]:
    """
    Return optimized configurations for the fused MoE kernel.

    The return value will be a dictionary that maps an irregular grid of
    batch sizes to configurations of the fused_moe kernel. To evaluate the
    kernel on a given batch size bs, the closest batch size in the grid should
    be picked and the associated configuration chosen to invoke the kernel.

    Results are memoized per-process via ``functools.lru_cache``; callers
    receive the shared cached dict and must copy it before mutating.
    Returns None when no tuned config exists (caller falls back to defaults).
    """
    if get_global_server_args().enable_deterministic_inference:
        # Deterministic mode deliberately bypasses tuned configs.
        logger.warning(
            "Deterministic inference is enabled, using default MoE kernel config."
        )
        return None
    # Supported Triton versions, should be sorted from the newest to the oldest
    supported_triton_versions = ["3.4.0", "3.3.1", "3.2.0", "3.1.0"]

    # First look up if an optimized configuration is available in the configs
    # directory
    json_file_name = get_config_file_name(
        E,
        N,
        dtype,
        [block_n, block_k],
        per_channel_quant,
        down_moe=down_moe,
    )

    # We found that using the fused_moe_kernel config from Triton 3.1.0 with Triton 3.2.0 results in negative performance gains,
    # so we also include the Triton version as a key for finding the fused_moe_kernel config to achieve the best performance.
    config_dir = os.environ.get(
        "SGLANG_MOE_CONFIG_DIR", os.path.dirname(os.path.realpath(__file__))
    )

    triton_version = triton.__version__
    version_dir = f"triton_{triton_version.replace('.', '_')}"
    config_file_path = os.path.join(
        config_dir,
        "configs",
        version_dir,
        json_file_name,
    )
    if os.path.exists(config_file_path):
        with open(config_file_path) as f:
            # Please note that although we find the config files, performance might still be suboptimal.
            # This is because the tuning environment might differ from your current environment.
            # For example, updating the Triton version might cause all old configs to become suboptimal.
            # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
            # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
            logger.info(f"Using MoE kernel config from {config_file_path}.")
            # If a configuration has been found, return it
            # (JSON keys are batch sizes serialized as strings; convert to int).
            return {int(key): val for key, val in json.load(f).items()}

    # Searching for other triton versions that supports the same config
    # (nearest-version fallback: better than the untuned default).
    for try_triton_version in supported_triton_versions:
        if try_triton_version == triton_version:
            continue
        try_config_file_path = os.path.join(
            config_dir,
            "configs",
            f"triton_{try_triton_version.replace('.', '_')}",
            json_file_name,
        )
        if os.path.exists(try_config_file_path):
            with open(try_config_file_path) as f:
                logger.warning(
                    f"Config file not found at {config_file_path}. Fallback to triton version {try_triton_version} and use MoE kernel config from {try_config_file_path}. Performance might be sub-optimal!",
                )
                # If a configuration has been found, return it
                return {int(key): val for key, val in json.load(f).items()}

    # If no optimized configuration is available, we will use the default configuration when down_moe is False
    # When down_moe is True, we will try to use the config for down_moe=False
    if down_moe:
        logger.warning(
            (
                "Using MoE kernel config with down_moe=False. Performance might be sub-optimal! "
                "Config file not found at %s, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton"
            ),
            config_file_path,
        )
    else:
        logger.warning(
            (
                "Using default MoE kernel config. Performance might be sub-optimal! "
                "Config file not found at %s, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton"
            ),
            config_file_path,
        )
    return None
+
+
def get_default_config(
    M: int,
    E: int,
    N: int,
    K: int,
    topk: int,
    dtype: Optional[str],
    is_marlin: bool,
    block_shape: Optional[List[int]] = None,
) -> Dict[str, int]:
    """Heuristic fallback tile configuration when no tuned config exists.

    Selection depends on the quantization mode (``dtype``), the batch size
    ``M`` relative to the expert count ``E``, and (for fp8) the optional
    block-quantization shape. ``N``, ``K`` and ``topk`` are not consulted by
    the current heuristic but kept for interface stability.
    """
    # Deterministic inference pins one fixed tiling for reproducible results.
    if get_global_server_args().enable_deterministic_inference:
        return {
            "BLOCK_SIZE_M": 64,
            "BLOCK_SIZE_N": 64,
            "BLOCK_SIZE_K": 32,
            "GROUP_SIZE_M": 8,
        }

    if dtype == "fp8_w8a8":
        if block_shape is not None:
            # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
            return {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": block_shape[0],
                "BLOCK_SIZE_K": block_shape[1],
                "GROUP_SIZE_M": 32,
                "num_warps": 4,
                "num_stages": 2 if _is_hip else 3,
            }
        if M <= E:
            # Few tokens per expert: narrower tiles, no PID grouping.
            return {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 128,
                "BLOCK_SIZE_K": 128,
                "GROUP_SIZE_M": 1,
                "num_warps": 4,
                "num_stages": 2 if _is_hip else 4,
            }
        # Large-batch fp8 default.
        return {
            "BLOCK_SIZE_M": 128,
            "BLOCK_SIZE_N": 256,
            "BLOCK_SIZE_K": 128,
            "GROUP_SIZE_M": 32,
            "num_warps": 8,
            "num_stages": 2 if _is_hip else 4,
        }

    # A heuristic: fused marlin works faster with this config for small M
    if M <= E or (is_marlin and M <= 32):
        return {
            "BLOCK_SIZE_M": 16,
            "BLOCK_SIZE_N": 32,
            "BLOCK_SIZE_K": 64,
            "GROUP_SIZE_M": 1,
        }
    return {
        "BLOCK_SIZE_M": 64,
        "BLOCK_SIZE_N": 64,
        "BLOCK_SIZE_K": 32,
        "GROUP_SIZE_M": 8,
    }
+
+
def try_get_optimal_moe_config(
    w1_shape: Tuple[int, ...],
    w2_shape: Tuple[int, ...],
    top_k: int,
    dtype: Optional[str],
    M: int,
    is_marlin: bool = False,
    block_shape: Optional[List[int]] = None,
    per_channel_quant: bool = False,
    return_down_config: bool = False,
):
    """Resolve the fused-MoE kernel config for this problem size.

    Resolution order: (1) process-wide override from ``get_config()``,
    (2) tuned JSON config for the closest batch size to ``M``,
    (3) the ``get_default_config`` heuristic.

    Returns ``config`` or, when ``return_down_config`` is True,
    ``(config, (down_config, max_block_m))`` where both extras may be None.
    """
    # Function-scope import — presumably avoids a circular import at module
    # load time; confirm before hoisting to module level.
    from sglang.srt.layers.moe.fused_moe_triton import get_config

    down_config = None
    max_block_m = None
    override_config = get_config()
    if override_config:
        config = override_config
    else:
        # First try to load optimal config from the file
        E, _, N = w2_shape
        block_n = block_shape[0] if block_shape else 0
        block_k = block_shape[1] if block_shape else 0
        configs = get_moe_configs(
            E,
            N,
            dtype,
            block_n,
            block_k,
            per_channel_quant=per_channel_quant,
            down_moe=False,
        )

        if configs:
            # If an optimal configuration map has been found, look up the
            # optimal config
            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
        else:
            # Else use the default config
            config = get_default_config(
                M, E, N, w1_shape[2], top_k, dtype, is_marlin, block_shape
            )
        if return_down_config:
            down_configs = get_moe_configs(
                E,
                N,
                dtype,
                block_n,
                block_k,
                per_channel_quant=per_channel_quant,
                down_moe=True,
            )
            if down_configs:
                down_config = down_configs[
                    min(down_configs.keys(), key=lambda x: abs(x - M))
                ]
                # Copy: get_moe_configs is lru_cache'd, so the looked-up dict
                # is shared state; copy before any caller-side mutation.
                down_config = dict(**down_config)
                max_block_m = max(
                    [cfg["BLOCK_SIZE_M"] for cfg in down_configs.values()]
                )
    if return_down_config:
        # NOTE(review): the up- and down-GEMM configs are required to share
        # BLOCK_SIZE_M — presumably because the sorted-token layout produced
        # for the first GEMM is reused by the second; confirm against callers.
        assert (
            down_config is None or config["BLOCK_SIZE_M"] == down_config["BLOCK_SIZE_M"]
        )
        return config, (down_config, max_block_m)
    return config
+
+
+def get_config_dtype_str(
+ dtype: torch.dtype,
+ use_int8_w8a16: Optional[bool] = False,
+ use_int4_w4a16: Optional[bool] = False,
+ use_fp8_w8a8: Optional[bool] = False,
+ use_int8_w8a8: Optional[bool] = False,
+):
+ if use_fp8_w8a8:
+ return "fp8_w8a8"
+ elif use_int8_w8a8:
+ return "int8_w8a8"
+ elif use_int4_w4a16:
+ return "int4_w4a16"
+ elif use_int8_w8a16:
+ return "int8_w8a16"
+ elif dtype == torch.float:
+ # avoiding cases where kernel fails when float32 MoE
+ # use fp16/bfloat16 configs
+ return "float32"
+ return None
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..b80e1827baead0c30705c8bdcb2bdf89b0237ba9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py
@@ -0,0 +1,1173 @@
+from __future__ import annotations
+
+import functools
+import os
+from collections import OrderedDict
+from typing import Any, Dict, List, Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.batch_invariant_ops import is_batch_invariant_mode_enabled
+from sglang.srt.layers.quantization.fp8_kernel import (
+ per_token_group_quant_fp8,
+ scaled_fp8_quant,
+ sglang_per_token_group_quant_fp8,
+)
+from sglang.srt.layers.quantization.int8_kernel import (
+ per_token_group_quant_int8,
+ per_token_quant_int8,
+ sglang_per_token_group_quant_int8,
+)
+from sglang.srt.utils import (
+ cpu_has_amx_support,
+ get_bool_env_var,
+ is_cpu,
+ is_cuda,
+ is_hip,
+ is_sm90_supported,
+)
+
# Optional dependency: the TMA TensorDescriptor API only exists in newer
# Triton releases; record availability for support_tensor_descriptor().
try:
    from triton.tools.tensor_descriptor import TensorDescriptor

    _support_tensor_descriptor = True
except ImportError:
    # Catch only ImportError: a bare `except` would also swallow
    # KeyboardInterrupt/SystemExit raised while importing.
    _support_tensor_descriptor = False
+
# Platform probes evaluated once at import time; cheap to reuse everywhere.
_is_hip = is_hip()
_is_cuda = is_cuda()
_is_cpu_amx_available = cpu_has_amx_support()
_is_cpu = is_cpu()
# AITER path is opt-in via env var and only meaningful on ROCm/HIP.
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

# NOTE(review): all branches below are no-ops — presumably a leftover of
# platform-specific imports removed during a refactor; confirm before deleting.
if _is_cuda:
    pass
elif _is_cpu and _is_cpu_amx_available:
    pass
elif _is_hip:
    pass

# Hidden-dim padding (in elements) enabled by SGLANG_MOE_PADDING=1; consumed
# by the tensor-wise fp8 path in invoke_fused_moe_kernel. 0 disables padding.
padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
+
+
def support_tensor_descriptor():
    # True iff triton.tools.tensor_descriptor.TensorDescriptor imported
    # successfully at module load time (see the try/except at the top).
    return _support_tensor_descriptor
+
+
# swap_ab benefits SM90 GPUs (H20, H100, H200, etc.) for certain block shapes.
@functools.lru_cache(maxsize=8)
def should_enable_swap_ab(
    BLOCK_SIZE_M: int,
    BLOCK_SIZE_N: int,
) -> bool:
    """Decide whether the fp8 kernel should swap the A/B operands."""
    # Never swap off-CUDA, and never when batch-invariant mode requires a
    # fixed execution schedule.
    if not _is_cuda:
        return False
    if is_batch_invariant_mode_enabled():
        return False
    # Profitable only on SM90-class devices with short-M / wide-N tiles.
    if not is_sm90_supported():
        return False
    return BLOCK_SIZE_M < 64 and BLOCK_SIZE_N >= 64
+
+
@triton.jit
def write_zeros_to_output(
    c_ptr,
    stride_cm,
    stride_cn,
    pid_n,
    N,
    offs_token,
    token_mask,
    BLOCK_SIZE_M,
    BLOCK_SIZE_N,
    compute_type,
):
    """Store an all-zero [BLOCK_SIZE_M, BLOCK_SIZE_N] tile into C.

    Called by the MoE kernels when a block's expert id is -1 (expert not
    hosted on this expert-parallel rank) so those output rows are
    well-defined zeros instead of garbage.
    """
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=compute_type)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
    # Mask out padding tokens (offs_token >= num_valid_tokens) and the N tail.
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
@triton.jit
def fused_moe_kernel_gptq_awq(
    # Pointers to matrices
    a_ptr,
    b_ptr,
    c_ptr,
    b_scale_ptr,
    b_zp_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N: tl.constexpr,
    K: tl.constexpr,
    EM,
    num_valid_tokens,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_bse,
    stride_bsk,
    stride_bsn,
    stride_bze,
    stride_bzk,
    stride_bzn,
    group_size: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    has_zp: tl.constexpr,
    use_int4_w4a16: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
    even_Ks: tl.constexpr,
    filter_expert: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices, for GPTQ/AWQ weight-only quantization
    (int4 or int8 weights with group-wise scales and optional zero-points).
    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.
    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.
    """
    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # ----------------------------------------------------------
    # Create pointers for the first blocks of A and B.
    # We will advance this pointer as we move in the K direction
    # and accumulate
    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
        return
    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
    token_mask = offs_token < num_valid_tokens

    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
    if filter_expert and off_experts == -1:
        # -----------------------------------------------------------
        # Write back zeros to the output when the expert is not
        # in the current expert parallel rank.
        write_zeros_to_output(
            c_ptr,
            stride_cm,
            stride_cn,
            pid_n,
            N,
            offs_token,
            token_mask,
            BLOCK_SIZE_M,
            BLOCK_SIZE_N,
            compute_type,
        )
        return

    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # `offs_token // top_k` maps a (token, topk-slot) entry back to its source
    # row of A, since each token is repeated top_k times.
    a_ptrs = a_ptr + (
        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
    )

    if use_int4_w4a16:
        # Two int4 values are packed per byte along K: divide the K offset by
        # 2 and select the nibble via the K-parity shift below.
        b_ptrs = (
            b_ptr
            + off_experts * stride_be
            + (offs_k[:, None] // 2) * stride_bk
            + offs_bn[None, :] * stride_bn
        )
        b_shifter = (offs_k[:, None] % 2) * 4
    elif use_int8_w8a16:
        b_ptrs = (
            b_ptr
            + off_experts * stride_be
            + offs_k[:, None] * stride_bk
            + offs_bn[None, :] * stride_bn
        )

    # When no explicit zero-points are provided, subtract a fixed implicit
    # zero-point: 8 for int4, 128 for int8.
    if not has_zp and use_int4_w4a16:
        b_zp_num = 8
    if not has_zp and use_int8_w8a16:
        b_zp_num = 128
    elif has_zp and use_int4_w4a16:
        # Zero-points are also nibble-packed, along N: parity of the N offset
        # selects which nibble to extract.
        b_zp_shifter = (offs_bn[None, :] % 2) * 4

    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    # `accumulator` will be converted back to fp16 after the loop.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Load the next block of A and B, generate a mask by checking the
        # K dimension.

        if not even_Ks:
            k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K
            k_other = 0.0
        else:
            k_mask = None
            k_other = None

        a = tl.load(
            a_ptrs,
            mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
            other=0.0,
        )
        b = tl.load(b_ptrs)
        if use_int4_w4a16:
            # Unpack the int4 nibble selected by b_shifter.
            b = (b >> b_shifter) & 0xF

        # Group-wise scales: one scale per `group_size` consecutive K values.
        b_scale_ptrs = (
            b_scale_ptr
            + off_experts * stride_bse
            + offs_bn[None, :] * stride_bsn
            + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsk
        )
        b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other)
        b_scale = b_scale.to(tl.float32)

        if has_zp and use_int4_w4a16:
            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
            b_zp_ptrs = (
                b_zp_ptr
                + off_experts * stride_bze
                + (offs_bn[None, :] // 2) * stride_bzn
                + offs_k_true * stride_bzk
            )
            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
            b_zp = (b_zp >> b_zp_shifter) & 0xF
            b_zp = b_zp.to(tl.float32)
        elif has_zp and use_int8_w8a16:
            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
            b_zp_ptrs = (
                b_zp_ptr
                + off_experts * stride_bze
                + offs_bn[None, :] * stride_bzn
                + offs_k_true * stride_bzk
            )
            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
            b_zp = b_zp.to(tl.float32)

        # We accumulate along the K dimension.
        # Dequantize: (quantized - zero_point) * scale, then matmul.
        if has_zp:
            b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type)
        else:
            b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type)
        accumulator = tl.dot(a, b, acc=accumulator)

        # Advance the ptrs to the next K block.
        a_ptrs += BLOCK_SIZE_K * stride_ak
        if use_int4_w4a16:
            # Packed int4: K advances by half as many bytes.
            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
        else:
            b_ptrs += BLOCK_SIZE_K * stride_bk

    if MUL_ROUTED_WEIGHT:
        # Apply the router's per-(token, expert) weight to this partial output.
        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
        accumulator = accumulator * moe_weight[:, None]

    accumulator = accumulator.to(compute_type)
    # -----------------------------------------------------------
    # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
@triton.jit
def fused_moe_kernel(
    # Pointers to matrices
    a_ptr,
    a_desc,
    b_ptr,
    b_desc,
    bias_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N,
    K,
    EM,
    num_valid_tokens,
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
    # (A has M rows).
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_bias_e,
    stride_bias_n,
    stride_cm,
    stride_cn,
    stride_asm,
    stride_ask,
    stride_bse,
    stride_bsk,
    stride_bsn,
    # Block size for block-wise quantization
    group_n: tl.constexpr,
    group_k: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type: tl.constexpr,
    use_fp8_w8a8: tl.constexpr,
    use_int8_w8a8: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
    per_channel_quant: tl.constexpr,
    even_Ks: tl.constexpr,
    c_sorted: tl.constexpr,
    filter_expert: tl.constexpr,
    swap_ab: tl.constexpr,
    FUSE_SUM_ALL_REDUCE: tl.constexpr,
    ROUTER_TOPK: tl.constexpr,
):
    """
    Implements the fused computation for a Mixture of Experts (MOE) using
    token and expert matrices.

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.

    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.

    Optional paths: TMA descriptor loads for A/B (a_desc/b_desc), operand
    swapping for SM90 (swap_ab), per-expert bias, and fused topk-sum output
    via atomic adds (FUSE_SUM_ALL_REDUCE).
    """
    # -----------------------------------------------------------
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # ----------------------------------------------------------
    # Create pointers for the first blocks of A and B.
    # We will advance this pointer as we move in the K direction
    # and accumulate
    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
        return
    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
    offs_token = offs_token.to(tl.int64)
    token_mask = offs_token < num_valid_tokens

    # Keep the i32 copy for the TMA descriptor load below; use i64 for
    # pointer arithmetic.
    off_experts_i32 = tl.load(expert_ids_ptr + pid_m)
    off_experts = off_experts_i32.to(tl.int64)

    if filter_expert and off_experts == -1:
        # -----------------------------------------------------------
        # Write back zeros to the output when the expert is not
        # in the current expert parallel rank.
        write_zeros_to_output(
            c_ptr,
            stride_cm,
            stride_cn,
            pid_n,
            N,
            offs_token,
            token_mask,
            BLOCK_SIZE_M,
            BLOCK_SIZE_N,
            compute_type,
        )
        return

    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    if a_desc is not None:
        # TMA path for A: only wired up for block-quantized fp8.
        assert use_fp8_w8a8 and group_n > 0 and group_k > 0
        start_offs_m = pid_m * BLOCK_SIZE_M
    else:
        # `offs_token // top_k` maps the repeated (token, topk-slot) entry
        # back to its source row of A.
        a_ptrs = a_ptr + (
            offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
        )

    if b_desc is not None:
        start_offs_n = pid_n * BLOCK_SIZE_N
    else:
        b_ptrs = (
            b_ptr
            + off_experts * stride_be
            + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
        )

    if bias_ptr is not None:
        # Per-expert, per-output-channel bias row.
        bias = tl.load(
            bias_ptr + off_experts * stride_bias_e + offs_bn[None, :] * stride_bias_n
        )
    if use_int8_w8a16:
        b_scale_ptrs = (
            b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
        )
        b_scale = tl.load(b_scale_ptrs)

    if use_fp8_w8a8 or use_int8_w8a8:
        # block-wise
        if group_k > 0 and group_n > 0:
            if a_desc is not None:
                # TMA loads A in sorted order, so index scales by slot id.
                a_scale_ptrs = a_scale_ptr + offs_token_id * stride_asm
            else:
                a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
            if BLOCK_SIZE_N > group_n:
                offs_bsn = offs_bn // group_n
            else:
                offs_bsn = pid_n * BLOCK_SIZE_N // group_n
            b_scale_ptrs = (
                b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
            )
        # channel-wise
        elif per_channel_quant:
            b_scale_ptrs = (
                b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
            )
            b_scale = tl.load(b_scale_ptrs)
            # Load per-token scale for activations
            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
            a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:, None]
        # tensor-wise
        else:
            a_scale = tl.load(a_scale_ptr)
            b_scale = tl.load(b_scale_ptr + off_experts)

    # -----------------------------------------------------------
    # Iterate to compute a block of the C matrix.
    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
    # of fp32 values for higher accuracy.
    # `accumulator` will be converted back to fp16 after the loop.
    # With swap_ab the accumulator is transposed (N, M) and flipped back
    # after the K loop.
    if swap_ab:
        accumulator = tl.zeros((BLOCK_SIZE_N, BLOCK_SIZE_M), dtype=tl.float32)
    else:
        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    for k_start in range(0, K, BLOCK_SIZE_K):
        # Load the next block of A and B, generate a mask by checking the
        # K dimension.
        if a_desc is not None:
            a = a_desc.load([start_offs_m, k_start])
        elif even_Ks:
            # K divides BLOCK_SIZE_K evenly: skip the K-tail mask.
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None],
                other=0.0,
            )
        else:
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None] & (offs_k[None, :] < K - k_start),
                other=0.0,
            )

        if b_desc is not None:
            # TMA loads B as (1, N, K); reshape and transpose to (K, N).
            b = (
                b_desc.load([off_experts_i32, start_offs_n, k_start])
                .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K)
                .T
            )
        elif even_Ks:
            b = tl.load(b_ptrs)
        else:
            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k_start, other=0.0)

        # We accumulate along the K dimension.
        if use_int8_w8a16:
            accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
        elif use_fp8_w8a8 or use_int8_w8a8:
            if group_k > 0 and group_n > 0:
                # Block-wise scales advance with the K group index.
                offs_ks = k_start // group_k
                a_scale = tl.load(
                    a_scale_ptrs + offs_ks * stride_ask, mask=token_mask, other=0.0
                )
                b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
                if swap_ab:
                    # Swap and transpose operands; the scales swap roles too.
                    a, b = tl.trans(b, (1, 0)), tl.trans(a, (1, 0))
                    a_scale, b_scale = b_scale, a_scale
                if BLOCK_SIZE_N > group_n:
                    accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
                else:
                    accumulator += tl.dot(a, b) * (a_scale[:, None] * b_scale)
            else:
                if use_fp8_w8a8:
                    if swap_ab:
                        a, b = tl.trans(b, (1, 0)), tl.trans(a, (1, 0))
                        accumulator = tl.dot(a, b, acc=accumulator)
                    else:
                        accumulator += tl.dot(a, b)
                else:
                    accumulator += tl.dot(a, b)
        else:
            accumulator += tl.dot(a, b)
        # Advance the ptrs to the next K block.
        if a_desc is None:
            a_ptrs += BLOCK_SIZE_K * stride_ak
        if b_desc is None:
            b_ptrs += BLOCK_SIZE_K * stride_bk

    if swap_ab:
        # Restore the conventional (M, N) orientation.
        accumulator = tl.trans(accumulator, (1, 0))

    if use_int8_w8a16:
        accumulator *= b_scale
    elif use_fp8_w8a8 or use_int8_w8a8:
        # Tensor-/channel-wise paths defer the scale multiply to after the
        # K loop; block-wise already applied scales per K group.
        if group_k == 0 or group_n == 0:
            accumulator *= a_scale * b_scale

    if bias_ptr is not None:
        accumulator += bias

    if MUL_ROUTED_WEIGHT:
        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
        accumulator *= moe_weight[:, None]

    accumulator = accumulator.to(compute_type)
    # -----------------------------------------------------------
    # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)

    if FUSE_SUM_ALL_REDUCE:
        # Fused combine: collapse the topk slots of each token into a single
        # output row via atomic adds (C is indexed per token, not per slot).
        offs_token_out = offs_token // ROUTER_TOPK
        c_ptrs = (
            c_ptr + stride_cm * offs_token_out[:, None] + stride_cn * offs_cn[None, :]
        )
        c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
        tl.atomic_add(c_ptrs, accumulator, mask=c_mask)
    else:
        if c_sorted:
            # Store in sorted (padded-slot) order rather than token order.
            c_ptrs = (
                c_ptr
                + stride_cm * offs_token_id[:, None]
                + stride_cn * offs_cn[None, :]
            )
        else:
            c_ptrs = (
                c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
            )
        c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
        tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
# -----------------------------------------------------------------------------
# TMA allocator: set once per process (avoid per-call triton.set_allocator)
# -----------------------------------------------------------------------------
_TMA_ALLOCATOR_SET = False


def _set_triton_tma_allocator():
    """TMA descriptors require a global allocator; set it once to avoid per-call overhead."""
    global _TMA_ALLOCATOR_SET
    # NOTE(review): check-then-set is not lock-protected; assumes the first
    # call happens before any concurrent use — confirm.
    if _TMA_ALLOCATOR_SET:
        return

    # TMA descriptors require a global memory allocation
    def alloc_fn(size: int, alignment: int, stream: Optional[int]):
        # NOTE: keep this allocation on CUDA device
        return torch.empty(size, device="cuda", dtype=torch.int8)

    triton.set_allocator(alloc_fn)
    _TMA_ALLOCATOR_SET = True
+
+
# --- B TensorDescriptor cache (LRU) ---
_B_DESC_CACHE_MAX = 64
_B_DESC_CACHE: "OrderedDict[tuple, TensorDescriptor]" = OrderedDict()


def _get_b_tma_desc_cached(B: torch.Tensor, block_n: int, block_k: int):
    """
    Cache TensorDescriptor for constant weight B.
    Keyed by storage ptr + shape/stride/dtype + tile shape.

    NOTE(review): keying on data_ptr assumes B outlives its cache entry; a
    freed tensor reallocated at the same address would alias — confirm the
    weights are long-lived.
    """
    key = (
        int(B.data_ptr()),
        tuple(B.shape),
        tuple(B.stride()),
        str(B.dtype),
        int(block_n),
        int(block_k),
    )

    desc = _B_DESC_CACHE.get(key, None)
    if desc is not None:
        # LRU touch: mark this entry most-recently used.
        _B_DESC_CACHE.move_to_end(key)
        return desc

    # No lock is taken here; a rare duplicate descriptor build is acceptable.
    desc = TensorDescriptor(
        B,
        B.shape,
        B.stride(),
        [1, block_n, block_k],
    )

    _B_DESC_CACHE[key] = desc
    _B_DESC_CACHE.move_to_end(key)
    if len(_B_DESC_CACHE) > _B_DESC_CACHE_MAX:
        # Evict the least-recently-used descriptor.
        _B_DESC_CACHE.popitem(last=False)

    return desc
+
+
+def invoke_fused_moe_kernel(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ bias: Optional[torch.Tensor],
+ C: torch.Tensor,
+ A_scale: Optional[torch.Tensor],
+ B_scale: Optional[torch.Tensor],
+ B_zp: Optional[torch.Tensor],
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ sorted_token_ids: torch.Tensor,
+ expert_ids: torch.Tensor,
+ num_tokens_post_padded: torch.Tensor,
+ mul_routed_weight: bool,
+ top_k: int,
+ config: Dict[str, Any],
+ compute_type: tl.dtype,
+ use_fp8_w8a8: bool,
+ use_int8_w8a8: bool,
+ use_int8_w8a16: bool,
+ use_int4_w4a16: bool,
+ per_channel_quant: bool,
+ block_shape: Optional[List[int]] = None,
+ no_combine: bool = False,
+ a_use_tma: bool = False,
+ b_use_tma: bool = False,
+ c_sorted: bool = False,
+ filter_expert: bool = True,
+ fuse_sum_all_reduce: bool = False,
+ router_topk: int = 1,
+) -> None:
+ assert topk_weights.stride(1) == 1
+ assert sorted_token_ids.stride(0) == 1
+
+ if use_fp8_w8a8:
+ swap_ab = should_enable_swap_ab(config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"])
+ else:
+ swap_ab = False
+
+ padded_size = 0
+ if use_fp8_w8a8:
+ assert B_scale is not None
+ if block_shape is None:
+ # activation tensor-wise fp8 quantization, dynamic or static
+ padded_size = padding_size
+ # activations apply per-token quantization when weights apply per-channel quantization by default
+ A, A_scale = scaled_fp8_quant(
+ A, A_scale, use_per_token_if_dynamic=per_channel_quant
+ )
+ else:
+ # activation block-wise fp8 quantization
+ assert len(block_shape) == 2
+ block_n, block_k = block_shape[0], block_shape[1]
+ if _is_cuda:
+ A, A_scale = sglang_per_token_group_quant_fp8(A, block_k)
+ else:
+ A, A_scale = per_token_group_quant_fp8(A, block_k)
+ assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+ assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
+ assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
+ elif use_int8_w8a8:
+ assert B_scale is not None
+ if block_shape is None:
+ # activation channel-wise int8 quantization
+ assert (
+ per_channel_quant
+ ), "int8 quantization only supports channel-wise quantization except for block-wise quantization"
+ A, A_scale = per_token_quant_int8(A)
+ else:
+ # activation block-wise int8 quantization
+ assert len(block_shape) == 2
+ block_n, block_k = block_shape[0], block_shape[1]
+ if _is_cuda:
+ A, A_scale = sglang_per_token_group_quant_int8(A, block_k)
+ else:
+ A, A_scale = per_token_group_quant_int8(A, block_k)
+ assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+ assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
+ assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
+ elif use_int8_w8a16 or use_int4_w4a16:
+ assert B_scale is not None
+ assert block_shape is None or block_shape[0] == 0
+ else:
+ assert A_scale is None
+ assert B_scale is None
+
+ grid = lambda META: (
+ triton.cdiv(sorted_token_ids.shape[0], META["BLOCK_SIZE_M"])
+ * triton.cdiv(B.shape[1], META["BLOCK_SIZE_N"]),
+ )
+
+ K = B.shape[2] - padded_size
+ if K % config["BLOCK_SIZE_K"] == 0:
+ even_Ks = True
+ else:
+ even_Ks = False
+
+ if fuse_sum_all_reduce:
+ assert not c_sorted, "fuse_sum_all_reduce only supports c_sorted=False"
+
+ if (
+ (use_int8_w8a16 or use_int4_w4a16)
+ and block_shape is not None
+ and block_shape[1] > 0
+ ):
+ assert (
+ not fuse_sum_all_reduce
+ ), "fuse_sum_all_reduce is not supported for GPTQ/AWQ kernels"
+ assert B_scale is not None and B_scale.ndim == 3
+ assert B_zp is None or B_zp.ndim == 3
+ assert bias is None
+ fused_moe_kernel_gptq_awq[grid](
+ A,
+ B,
+ C,
+ B_scale,
+ B_zp,
+ topk_weights,
+ sorted_token_ids,
+ expert_ids,
+ num_tokens_post_padded,
+ B.shape[1],
+ A.shape[1],
+ sorted_token_ids.shape[0],
+ topk_ids.numel(),
+ A.stride(0),
+ A.stride(1),
+ B.stride(0),
+ B.stride(2),
+ B.stride(1),
+ C.stride(-2),
+ C.stride(-1),
+ B_scale.stride(0),
+ B_scale.stride(2),
+ B_scale.stride(1),
+ B_zp.stride(0) if B_zp is not None else 0,
+ B_zp.stride(2) if B_zp is not None else 0,
+ B_zp.stride(1) if B_zp is not None else 0,
+ group_size=block_shape[1],
+ MUL_ROUTED_WEIGHT=mul_routed_weight,
+ top_k=top_k,
+ compute_type=compute_type,
+ has_zp=B_zp is not None,
+ use_int4_w4a16=use_int4_w4a16,
+ use_int8_w8a16=use_int8_w8a16,
+ even_Ks=even_Ks,
+ filter_expert=filter_expert,
+ **config,
+ )
+
+ else:
+ if a_use_tma or b_use_tma:
+ _set_triton_tma_allocator()
+
+ if a_use_tma:
+ a_desc = TensorDescriptor(
+ A, A.shape, A.stride(), [config["BLOCK_SIZE_M"], config["BLOCK_SIZE_K"]]
+ )
+ else:
+ a_desc = None
+ if b_use_tma:
+ # B is constant weights -> cache descriptor
+ b_desc = _get_b_tma_desc_cached(
+ B,
+ config["BLOCK_SIZE_N"],
+ config["BLOCK_SIZE_K"],
+ )
+ else:
+ b_desc = None
+
+ fused_moe_kernel[grid](
+ A,
+ a_desc,
+ B,
+ b_desc,
+ bias,
+ C,
+ A_scale,
+ B_scale,
+ topk_weights,
+ sorted_token_ids,
+ expert_ids,
+ num_tokens_post_padded,
+ B.shape[1],
+ B.shape[2] - padded_size,
+ sorted_token_ids.shape[0],
+ topk_ids.numel(),
+ A.stride(0),
+ A.stride(1),
+ B.stride(0),
+ B.stride(2),
+ B.stride(1),
+ bias.stride(0) if bias is not None else 0,
+ bias.stride(1) if bias is not None else 0,
+ C.stride(-2),
+ C.stride(-1),
+ A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
+ A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0,
+ B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0,
+ B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0,
+ B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0,
+ 0 if block_shape is None else block_shape[0],
+ 0 if block_shape is None else block_shape[1],
+ MUL_ROUTED_WEIGHT=mul_routed_weight,
+ top_k=top_k,
+ compute_type=compute_type,
+ use_fp8_w8a8=use_fp8_w8a8,
+ use_int8_w8a8=use_int8_w8a8,
+ use_int8_w8a16=use_int8_w8a16,
+ per_channel_quant=per_channel_quant,
+ even_Ks=even_Ks,
+ c_sorted=c_sorted,
+ filter_expert=filter_expert,
+ swap_ab=swap_ab,
+ FUSE_SUM_ALL_REDUCE=fuse_sum_all_reduce,
+ ROUTER_TOPK=router_topk,
+ **config,
+ )
+
+
@triton.jit
def tanh(x):
    # Triton provides sigmoid but no tanh primitive here; use the
    # identity tanh(x) = 2 * sigmoid(2x) - 1.
    return 2 * tl.sigmoid(2 * x) - 1
+
+
@triton.jit
def _apply_activation(x, ACTIVATION_TYPE: tl.constexpr):
    """
    Apply an activation function selected by a compile-time constant.

    Args:
        x: Input tensor (converted to float32 inside)
        ACTIVATION_TYPE: Compile-time constant string ("silu" or "gelu")

    Returns:
        Activated output in float32 (callers cast back to their storage
        dtype; see act_and_mul_kernel).
    """
    x = x.to(tl.float32)
    if ACTIVATION_TYPE == "silu":
        return x * tl.sigmoid(x)
    elif ACTIVATION_TYPE == "gelu":
        # tanh approximation of GELU; kAlpha = sqrt(2 / pi).
        kAlpha = 0.7978845608028654
        return 0.5 * x * (1 + tanh(kAlpha * (x + 0.044715 * x * x * x)))
    else:
        # Raised at trace/compile time for an unknown constexpr value.
        raise ValueError(f"Unsupported activation: {ACTIVATION_TYPE}")
+
+
@triton.jit
def act_and_mul_kernel(
    gateup_output,
    down_input,
    hidden_size,
    expert_ids_ptr,
    expert_step: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    ACTIVATION_TYPE: tl.constexpr,
):
    """
    Unified activation and multiply kernel that handles both sorted and unsorted routing,
    and both SiLU and GELU activations using compile-time constants.

    Computes down_input[row] = act(gate_half) * up_half per row, where the
    row of gateup_output holds [gate | up] halves of size hidden_size // 2.
    """
    InDtype = gateup_output.dtype.element_ty
    OutDtype = down_input.dtype.element_ty

    half_hidden_size = hidden_size // 2
    # One program per row of gateup_output / down_input.
    pid = tl.program_id(0)

    # expert_step == 1: one expert id per row (unsorted routing);
    # expert_step == BLOCK_SIZE_M: one expert id per block of rows (sorted).
    expert_id = tl.load(expert_ids_ptr + pid // expert_step)

    # -1 marks a row with no assigned expert (e.g. padding); skip it.
    if expert_id == -1:
        return

    gateup_output_ptr = gateup_output + pid * hidden_size
    down_input_ptr = down_input + pid * half_hidden_size
    # Row layout: first half is the gate projection, second half is up.
    gate_output_ptr = gateup_output_ptr
    up_output_ptr = gateup_output_ptr + half_hidden_size

    for start_offset in tl.range(0, half_hidden_size, BLOCK_SIZE):
        offset = start_offset + tl.arange(0, BLOCK_SIZE)
        mask = offset < half_hidden_size

        gate_output = tl.load(gate_output_ptr + offset, mask=mask)
        up_output = tl.load(up_output_ptr + offset, mask=mask)

        # Activation runs in float32 inside _apply_activation; cast back to
        # the input dtype before multiplying to match storage precision.
        gate_output_activated = _apply_activation(gate_output, ACTIVATION_TYPE)
        gate_output_activated = gate_output_activated.to(InDtype)

        act_mul_output = gate_output_activated * up_output
        act_mul_output = act_mul_output.to(OutDtype)
        tl.store(down_input_ptr + offset, act_mul_output, mask=mask)
+
+
def act_and_mul_triton(
    gateup_output: torch.Tensor,
    down_input: torch.Tensor,
    config: Dict[str, Any],
    topk_ids: Optional[torch.Tensor] = None,
    expert_ids: Optional[torch.Tensor] = None,
    down_moe_use_tma: bool = False,
    activation: str = "silu",
) -> None:
    """
    Launch act_and_mul_kernel over every row of down_input.

    Args:
        gateup_output: (num_rows, hidden_size) tensor with gate and up
            outputs concatenated along the last dimension
        down_input: (num_rows, hidden_size // 2) output tensor
        config: Kernel configuration; only "BLOCK_SIZE_M" is read, and only
            when down_moe_use_tma is True
        topk_ids: Per-token expert IDs, required when down_moe_use_tma=False
        expert_ids: Per-block expert IDs (sorted layout), required when
            down_moe_use_tma=True
        down_moe_use_tma: Whether rows are in the sorted (TMA) routing layout
        activation: Activation type ("silu" or "gelu")
    """
    # Validate the routing tensor up front instead of failing later with an
    # opaque AttributeError on `None.view(...)`.
    if down_moe_use_tma:
        assert expert_ids is not None, "expert_ids is required when down_moe_use_tma=True"
        expert_ids_row = expert_ids
        # One expert id covers BLOCK_SIZE_M consecutive (sorted) rows.
        expert_step = config["BLOCK_SIZE_M"]
    else:
        assert topk_ids is not None, "topk_ids is required when down_moe_use_tma=False"
        expert_ids_row = topk_ids.view(-1)
        expert_step = 1

    grid = (down_input.shape[0],)
    hidden_size = gateup_output.shape[1]
    act_and_mul_kernel[grid](
        gateup_output,
        down_input,
        hidden_size,
        expert_ids_row,
        expert_step,
        BLOCK_SIZE=512,
        ACTIVATION_TYPE=activation,
    )
+
+
# _moe_sum_reduce_kernel kernel modified from https://github.com/ModelTC/lightllm/blob/main/lightllm/common/fused_moe/moe_sum_reduce.py
@triton.jit
def _moe_sum_reduce_kernel(
    input_ptr,
    input_stride_0,
    input_stride_1,
    input_stride_2,
    output_ptr,
    output_stride_0,
    output_stride_1,
    token_num: int,
    topk_num: int,
    hidden_dim: int,
    routed_scaling_factor: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_DIM: tl.constexpr,
    NUM_STAGE: tl.constexpr,
):
    """
    Sum a (token_num, topk_num, hidden_dim) tensor over the topk axis,
    scale by routed_scaling_factor, and write a (token_num, hidden_dim)
    result to output_ptr.
    """
    # Promote strides to int64 so pointer arithmetic cannot overflow int32
    # on large tensors.
    input_stride_0 = tl.cast(input_stride_0, dtype=tl.int64)
    input_stride_1 = tl.cast(input_stride_1, dtype=tl.int64)
    output_stride_0 = tl.cast(output_stride_0, dtype=tl.int64)

    # 2-D launch grid: axis 0 tiles tokens, axis 1 tiles the hidden dim.
    token_block_id = tl.program_id(0)
    dim_block_id = tl.program_id(1)

    offs_token = token_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_dim = dim_block_id * BLOCK_DIM + tl.arange(0, BLOCK_DIM)

    mask_token = offs_token < token_num
    mask_dim = offs_dim < hidden_dim

    # The hidden dim is addressed with unit stride: the Python wrapper
    # asserts the input is contiguous.
    base_ptrs = input_ptr + offs_token[:, None] * input_stride_0 + offs_dim[None, :]

    # Accumulate in float32 regardless of the storage dtype.
    accumulator = tl.zeros((BLOCK_M, BLOCK_DIM), dtype=tl.float32)

    for i in tl.range(0, topk_num, num_stages=NUM_STAGE):
        tile = tl.load(
            base_ptrs + i * input_stride_1,
            mask=mask_token[:, None] & mask_dim[None, :],
            other=0.0,
        )
        accumulator += tile.to(tl.float32)
    # Scale once after the reduction.
    accumulator *= routed_scaling_factor

    # -------- Write back --------
    store_ptrs = output_ptr + offs_token[:, None] * output_stride_0 + offs_dim[None, :]
    tl.store(
        store_ptrs,
        accumulator.to(input_ptr.dtype.element_ty),
        mask=mask_token[:, None] & mask_dim[None, :],
    )
+
+
def moe_sum_reduce_triton(
    input: torch.Tensor, output: torch.Tensor, routed_scaling_factor: float
):
    """Sum `input` over its top-k axis, scale, and write into `output`.

    Args:
        input: (token_num, topk_num, hidden_dim) contiguous tensor.
        output: (token_num, hidden_dim) contiguous tensor, written in place.
        routed_scaling_factor: Multiplier applied after the reduction.
    """
    assert input.is_contiguous()
    assert output.is_contiguous()

    token_num, topk_num, hidden_dim = input.shape
    assert output.shape[0] == token_num and output.shape[1] == hidden_dim

    # Launch parameters: one token row per program on axis 0 and
    # 2048-wide tiles of the hidden dimension on axis 1.
    block_m = 1
    block_dim = 2048
    num_stage = 1
    warps = 16

    launch_grid = (
        triton.cdiv(token_num, block_m),
        triton.cdiv(hidden_dim, block_dim),
    )

    _moe_sum_reduce_kernel[launch_grid](
        input,
        *input.stride(),
        output,
        *output.stride(),
        token_num=token_num,
        topk_num=topk_num,
        hidden_dim=hidden_dim,
        routed_scaling_factor=routed_scaling_factor,
        BLOCK_M=block_m,
        BLOCK_DIM=block_dim,
        NUM_STAGE=num_stage,
        num_warps=warps,
    )
+
+
@triton.jit
def _fused_append_shared_experts_kernel(
    topk_ids_ptr,
    topk_weights_ptr,
    out_ids_ptr,
    out_weights_ptr,
    N_BASE,  # runtime scalar
    scale_factor,  # runtime scalar
    K: tl.constexpr,
    S: tl.constexpr,
):
    """
    Copy each row's top-K routing and append S shared-expert slots.

    Equivalent Python:
        for m in range(M):
            for n in range(K):
                fused_ids[m, n] = topk_ids[m, n]
                fused_weights[m, n] = topk_weights[m, n]
            for s in range(S):
                fused_ids[m, K + s] = N + s
                fused_weights[m, K + s] = scale_factor
    """
    # One program per row m.
    pid = tl.program_id(0)

    # Row base offsets: inputs have K columns, outputs have K + S.
    ids_row_ptr = pid * K
    w_row_ptr = pid * K
    out_ids_row_ptr = pid * (K + S)
    out_w_row_ptr = pid * (K + S)

    # Copy the original top-K ids and weights into the widened outputs.
    offs_k = tl.arange(0, K)
    ids = tl.load(topk_ids_ptr + ids_row_ptr + offs_k)
    ws = tl.load(topk_weights_ptr + w_row_ptr + offs_k)

    tl.store(out_ids_ptr + out_ids_row_ptr + offs_k, ids)
    tl.store(out_weights_ptr + out_w_row_ptr + offs_k, ws)

    offs_s = tl.arange(0, S)

    # Shared experts take ids N_BASE..N_BASE+S-1, all with the same weight.
    shared_ids = tl.cast(N_BASE + offs_s, ids.dtype)
    shared_ws = tl.full([S], scale_factor, dtype=ws.dtype)

    tl.store(out_ids_ptr + out_ids_row_ptr + K + offs_s, shared_ids)
    tl.store(out_weights_ptr + out_w_row_ptr + K + offs_s, shared_ws)
+
+
def fused_append_shared_experts(
    topk_ids, topk_weights, num_fused_shared_experts, scale_factor, N=None
):
    """
    Append shared-expert slots to per-token routing tensors.

    Args:
        topk_ids: (m, k) tensor of routed expert ids.
        topk_weights: (m, k) tensor of routing weights.
        num_fused_shared_experts: Number S of shared experts to append.
        scale_factor: Weight assigned to every appended shared expert.
        N: Base expert id for the shared experts (ids N .. N+S-1).
            Required whenever S > 0.

    Returns:
        (out_ids, out_weights) of shape (m, k + S); when S <= 0 the inputs
        are returned unchanged (not copied).
    """
    m, k = topk_ids.shape
    s = int(num_fused_shared_experts)
    if s <= 0:
        # Fast path: nothing to append, and N is not needed. (Previously
        # the N assertion ran before this check and rejected valid no-op
        # calls that omit N.)
        return topk_ids, topk_weights

    assert N is not None, "N (shared expert base id) must be provided"

    out_ids = torch.empty((m, k + s), dtype=topk_ids.dtype, device=topk_ids.device)
    out_weights = torch.empty(
        (m, k + s), dtype=topk_weights.dtype, device=topk_weights.device
    )

    # NOTE(review): the kernel uses tl.arange(0, K) / tl.arange(0, S), which
    # requires K and S to be powers of two — TODO confirm callers guarantee this.
    _fused_append_shared_experts_kernel[(m,)](
        topk_ids,
        topk_weights,
        out_ids,
        out_weights,
        N_BASE=N,
        scale_factor=scale_factor,
        K=k,
        S=s,
        num_warps=1,
    )
    return out_ids, out_weights
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8da7d8eef330944c11f596c6cc5c01f8908891dc
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -0,0 +1,1314 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py
+
+import logging
+from enum import Enum
+from typing import List, Optional, Tuple
+
+import torch
+
+from sglang.srt.batch_overlap.single_batch_overlap import DownGemmOverlapArgs
+from sglang.srt.batch_overlap.two_batch_overlap import MaybeTboDeepEPDispatcher
+from sglang.srt.compilation.piecewise_context_manager import (
+ get_forward_context,
+ is_in_piecewise_cuda_graph,
+)
+from sglang.srt.distributed import (
+ get_moe_expert_parallel_rank,
+ get_moe_expert_parallel_world_size,
+ get_moe_tensor_parallel_rank,
+ get_moe_tensor_parallel_world_size,
+ get_tp_group,
+ tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+ use_symmetric_memory,
+)
+from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
+from sglang.srt.layers.dp_attention import is_allocation_symmetric
+from sglang.srt.layers.moe import (
+ MoeRunnerConfig,
+ get_deepep_mode,
+ get_moe_a2a_backend,
+ get_moe_runner_backend,
+)
+from sglang.srt.layers.moe.kt_ep_wrapper import (
+ KTEPWrapperMethod,
+ create_kt_config_from_server_args,
+)
+from sglang.srt.layers.moe.token_dispatcher import CombineInput, DispatchOutput
+from sglang.srt.layers.moe.token_dispatcher.base import BaseDispatcher
+from sglang.srt.layers.moe.token_dispatcher.flashinfer import FlashinferDispatcher
+from sglang.srt.layers.moe.token_dispatcher.standard import (
+ StandardDispatcher,
+)
+from sglang.srt.layers.moe.topk import (
+ BypassedTopKOutput,
+ StandardTopKOutput,
+ TopKConfig,
+ TopKOutput,
+ TopKOutputChecker,
+)
+from sglang.srt.layers.moe.utils import RoutingMethodType
+from sglang.srt.layers.quantization.base_config import (
+ FusedMoEMethodBase,
+ QuantizationConfig,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsMxInt4MoE,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8MoEMethod
+from sglang.srt.layers.quantization.modelopt_quant import ModelOptNvFp4FusedMoEMethod
+from sglang.srt.layers.quantization.unquant import UnquantizedFusedMoEMethod
+from sglang.srt.model_loader.weight_utils import narrow_padded_param_and_loaded_weight
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+ cpu_has_amx_support,
+ get_bool_env_var,
+ is_cpu,
+ is_flashinfer_available,
+ is_hip,
+ next_power_of_2,
+ round_up,
+)
+from sglang.srt.utils.custom_op import register_custom_op
+
# Optional flashinfer entry points: resolved once at import time so the hot
# path can test plain module-level names instead of re-probing.
if is_flashinfer_available():
    from flashinfer import fp4_quantize

# Try to import FP4 TRTLLM function if flashinfer is available
trtllm_fp4_block_scale_moe = None
if get_moe_runner_backend().is_flashinfer_trtllm():
    try:
        from flashinfer.fused_moe import trtllm_fp4_block_scale_moe
    except ImportError:
        # Older flashinfer builds lack this kernel; fall back to None.
        trtllm_fp4_block_scale_moe = None

# Platform feature flags, evaluated once at import time.
_is_hip = is_hip()
_is_cpu_amx_available = cpu_has_amx_support()
_is_cpu = is_cpu()
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

logger = logging.getLogger(__name__)
+
+
def create_moe_dispatcher(moe_runner_config: MoeRunnerConfig) -> BaseDispatcher:
    """Construct the token dispatcher for the configured all-to-all backend.

    Args:
        moe_runner_config: Layer-level MoE configuration (top_k, expert
            counts, hidden size, params dtype).

    Returns:
        A dispatcher instance matching get_moe_a2a_backend().

    Raises:
        NotImplementedError: if the configured backend is not recognized.
    """
    backend = get_moe_a2a_backend()

    if backend.is_none():
        return StandardDispatcher(moe_runner_config)

    if backend.is_deepep() or backend.is_mooncake() or backend.is_mori():
        # mori takes the full TP group object; the others use its device group.
        tp_group = get_tp_group()
        return MaybeTboDeepEPDispatcher(
            group=tp_group if backend.is_mori() else tp_group.device_group,
            router_topk=moe_runner_config.top_k,
            permute_fusion=True,
            num_experts=moe_runner_config.num_experts,
            num_local_experts=moe_runner_config.num_local_experts,
            hidden_size=moe_runner_config.hidden_size,
            params_dtype=moe_runner_config.params_dtype,
            deepep_mode=get_deepep_mode(),
            async_finish=True,
            return_recv_hook=True,
        )

    if backend.is_ascend_fuseep():
        # Imported lazily: only present in Ascend NPU builds.
        from sglang.srt.layers.moe.token_dispatcher import NpuFuseEPDispatcher

        return NpuFuseEPDispatcher(
            group=get_tp_group().device_group,
            router_topk=moe_runner_config.top_k,
            permute_fusion=True,
            num_experts=moe_runner_config.num_experts,
            num_local_experts=moe_runner_config.num_local_experts,
            hidden_size=moe_runner_config.hidden_size,
            params_dtype=moe_runner_config.params_dtype,
        )

    if backend.is_flashinfer():
        return FlashinferDispatcher(
            group=get_tp_group().device_group,
            router_topk=moe_runner_config.top_k,
            num_experts=moe_runner_config.num_experts,
            num_local_experts=moe_runner_config.num_local_experts,
            hidden_size=moe_runner_config.hidden_size,
        )

    raise NotImplementedError(f"Unsupported a2a backend: {backend}")
+
+
class FusedMoeWeightScaleSupported(Enum):
    """Weight-scale quantization granularities supported by FusedMoE loaders."""

    TENSOR = "tensor"
    CHANNEL = "channel"
    GROUP = "group"
    BLOCK = "block"
+
+
+class FusedMoE(torch.nn.Module):
+ """FusedMoE layer for MoE models.
+
+ This layer contains both MergedColumnParallel weights (gate_up_proj /
+ w13) and RowParallelLinear weights (down_proj/ w2).
+
+ Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
+ copy that naming convention here and handle any remapping in the
+ load_weights function in each model implementation.
+
+ Args:
+ num_experts: Number of experts in the model
+ top_k: Number of experts selected for each token
+ hidden_size: Input hidden state size of the transformer
+ intermediate_size: Intermediate size of the experts
+ params_dtype: Data type for the parameters.
+ reduce_results: Whether to apply all_reduce on the output of the layer
+ quant_config: Quantization configuration.
+ inplace: suggestion to compute inplace (modify input activation).
+ """
+
    def __init__(
        self,
        num_experts: int,
        hidden_size: int,
        intermediate_size: int,
        layer_id: int,
        top_k: Optional[int] = None,
        num_fused_shared_experts: int = 0,
        params_dtype: Optional[torch.dtype] = None,
        reduce_results: bool = False,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        activation: str = "silu",
        apply_router_weight_on_input: bool = False,
        use_presharded_weights: bool = False,
        inplace: bool = True,
        no_combine: bool = False,
        routed_scaling_factor: Optional[float] = None,
        gemm1_alpha: Optional[float] = None,
        gemm1_clamp_limit: Optional[float] = None,
        use_weight_loader_fused: bool = False,
        with_bias=False,
        routing_method_type: Optional[RoutingMethodType] = None,
        is_gated: bool = True,
    ):
        """Set up parallel topology, quantization method, weights, runner,
        and token dispatcher for a fused MoE layer. See the class docstring
        for argument semantics."""
        super().__init__()
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()

        self.layer_id = layer_id
        self.top_k = top_k
        self.hidden_size = hidden_size
        self.num_experts = num_experts
        self.num_fused_shared_experts = num_fused_shared_experts

        self.enable_flashinfer_cutlass_moe = (
            get_moe_runner_backend().is_flashinfer_cutlass()
        )
        # Expert-parallel / tensor-parallel topology for this layer.
        self.moe_ep_size = get_moe_expert_parallel_world_size()
        self.moe_ep_rank = get_moe_expert_parallel_rank()
        self.moe_tp_size = get_moe_tensor_parallel_world_size()
        self.moe_tp_rank = get_moe_tensor_parallel_rank()
        # Routed experts are split evenly across EP ranks; fused shared
        # experts are replicated on every rank.
        assert (num_experts - num_fused_shared_experts) % self.moe_ep_size == 0
        self.num_local_experts = (
            num_experts - num_fused_shared_experts
        ) // self.moe_ep_size + num_fused_shared_experts

        self.expert_mask_gpu = None

        assert intermediate_size % self.moe_tp_size == 0
        self.intermediate_size_per_partition = intermediate_size // self.moe_tp_size
        self.reduce_results = reduce_results
        self.use_presharded_weights = use_presharded_weights

        self.use_triton_kernels = get_moe_runner_backend().is_triton_kernels()
        self.use_flashinfer_trtllm_moe = get_moe_runner_backend().is_flashinfer_trtllm()

        # flashinfer_trtllm kernel requires intermediate_size to be a multiple of 128
        # Pad the intermediate_size_per_partition if necessary
        if (
            self.use_flashinfer_trtllm_moe
            and self.intermediate_size_per_partition % 128 != 0
        ):
            self.intermediate_size_per_partition = round_up(
                self.intermediate_size_per_partition, 128
            )

        self.quant_config = quant_config
        self.use_flashinfer_mxfp4_moe = get_moe_runner_backend().is_flashinfer_mxfp4()
        # TODO maybe we should remove this `if`, since `Mxfp4MoEMethod` does another round-up logic
        if (
            self.quant_config is not None
            and self.quant_config.get_name() == "mxfp4"
            and self.use_flashinfer_mxfp4_moe
        ):
            hidden_size = round_up(hidden_size, 256)
            self.hidden_size = hidden_size

        # Aggregate the per-layer MoE configuration passed to the runner,
        # dispatcher, and quant method.
        self.moe_runner_config = MoeRunnerConfig(
            num_experts=num_experts,
            num_local_experts=self.num_local_experts,
            hidden_size=hidden_size,
            intermediate_size_per_partition=self.intermediate_size_per_partition,
            layer_id=layer_id,
            top_k=top_k,
            num_fused_shared_experts=num_fused_shared_experts,
            params_dtype=params_dtype,
            activation=activation,
            apply_router_weight_on_input=apply_router_weight_on_input,
            inplace=inplace,
            no_combine=no_combine,
            routed_scaling_factor=routed_scaling_factor,
            gemm1_alpha=gemm1_alpha,
            gemm1_clamp_limit=gemm1_clamp_limit,
            is_gated=is_gated,
            routing_method_type=routing_method_type,
        )

        self.quant_method: Optional[FusedMoEMethodBase] = None
        server_args = get_global_server_args()
        kt_config = create_kt_config_from_server_args(server_args, layer_id)
        if kt_config is not None:
            # KT-EP mode: wrap the GPU quant method so only part of the
            # experts run on GPU.
            if quant_config is not None:
                gpu_method = quant_config.get_quant_method(self, prefix)
            else:
                gpu_method = UnquantizedFusedMoEMethod(self.use_triton_kernels)
            self.quant_method = KTEPWrapperMethod(gpu_method, kt_config)
        else:
            if quant_config is not None:
                self.quant_method = quant_config.get_quant_method(self, prefix)
            # Fall back to the unquantized path when the quant config does
            # not provide a MoE method (or there is no quant config).
            if self.quant_method is None:
                self.quant_method = UnquantizedFusedMoEMethod(
                    self.use_triton_kernels, self.use_flashinfer_trtllm_moe
                )

        self.quant_method.create_weights(
            layer=self,
            num_experts=self.num_local_experts,
            hidden_size=hidden_size,
            intermediate_size_per_partition=self.intermediate_size_per_partition,
            params_dtype=params_dtype,
            weight_loader=(
                self.weight_loader
                if not use_weight_loader_fused
                else self.weight_loader_fused
            ),
            with_bias=with_bias,
            moe_intermediate_size=intermediate_size,
        )

        self.quant_method.create_moe_runner(self, self.moe_runner_config)
        self.dispatcher = create_moe_dispatcher(self.moe_runner_config)

        # These backends apply routed_scaling_factor during top-k selection
        # rather than in the combine step.
        self.should_fuse_routed_scaling_factor_in_topk = isinstance(
            self.quant_method, ModelOptNvFp4FusedMoEMethod
        ) or (
            isinstance(self.quant_method, Fp8MoEMethod)
            and get_moe_runner_backend().is_cutlass()
        )

        self.routing_method_type = routing_method_type

        # overlap args (filled in by batch-overlap machinery when enabled)
        self.down_gemm_overlap_args: Optional[DownGemmOverlapArgs] = None
        self.meta_overlap_args: Optional[dict] = None

        # Convenience alias so callers can reach the runner directly.
        if self.quant_method is not None and hasattr(self.quant_method, "runner"):
            self.runner = self.quant_method.runner
+
+ def _load_per_tensor_weight_scale(
+ self,
+ shard_id: str,
+ param: torch.nn.Parameter,
+ loaded_weight: torch.Tensor,
+ expert_id: int,
+ ):
+ param_data = param.data
+ # for per tensor weight quantization
+ if shard_id in ("w1", "w3"):
+ # We have to keep the weight scales of w1 and w3 because
+ # we need to re-quantize w1/w3 weights after weight loading.
+ idx = 0 if shard_id == "w1" else 1
+ if self.moe_runner_config.is_gated:
+ param_data[expert_id][idx] = loaded_weight
+ else:
+ param_data[expert_id] = loaded_weight
+ # If we are in the row parallel case (down_proj)
+ elif shard_id == "w2":
+ param_data[expert_id] = loaded_weight
+
+ def _load_model_weight_or_group_weight_scale(
+ self,
+ shard_dim: int,
+ expert_data: torch.Tensor,
+ shard_id: str,
+ loaded_weight: torch.Tensor,
+ tp_rank: int,
+ is_bias: bool = False,
+ ):
+ # Load grouped weight scales for group quantization
+ # or model weights
+ if shard_id == "w2":
+ self._load_w2(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ is_bias=is_bias,
+ )
+ elif shard_id in ("w1", "w3", "w13"):
+ self._load_w13(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ is_bias=is_bias,
+ )
+
+ def _load_per_channel_weight_scale(
+ self,
+ expert_data: torch.Tensor,
+ shard_dim: int,
+ shard_id: str,
+ loaded_weight: torch.Tensor,
+ tp_rank: int,
+ ):
+ # for per channel weight quantization
+ if shard_id == "w2":
+ expert_data.copy_(loaded_weight)
+ elif shard_id in ("w1", "w3"):
+ self._load_w13(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ )
+
    def _load_w13(
        self,
        expert_data: torch.Tensor,
        shard_dim: int,
        shard_id: str,
        loaded_weight: torch.Tensor,
        tp_rank: int,
        is_bias: bool = False,
    ):
        """Load gate/up (w1/w3, or fused w13) weights for one expert.

        gate_up_proj is "MergedColumnParallel": TP shards the output dim.
        For gated MoE, w1 and w3 each occupy one half of the merged
        parameter; for fused (w13) or ungated loads the whole dim is used.
        """
        # Index the loaded weight for tp sharding.
        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
        assert shard_id in {"w1", "w3", "w13"}

        if is_bias:
            # if this weight is a bias, the last dimension must be the sharded dimension
            shard_dim = -1

        if shard_id in {"w1", "w3"} and self.moe_runner_config.is_gated:
            # non-fused version: w1 and w3 each fill half of the merged dim
            shard_size = expert_data.shape[shard_dim] // 2
        elif shard_id in {"w13"} or (
            shard_id in {"w1", "w3"} and not self.moe_runner_config.is_gated
        ):
            # fused version (or ungated): the whole dim belongs to this shard
            shard_size = expert_data.shape[shard_dim]
        else:
            raise NotImplementedError

        # Narrow parameter and load.
        # w1, gate_proj: Load into first logical weight of w13.
        # w3, up_proj: Load into second logical weight of w13.
        # trtllm cutlass kernel assumes differently
        switch_w13 = getattr(self.quant_method, "load_up_proj_weight_first", False)
        if (
            (switch_w13 and shard_id == "w1") or (not switch_w13 and shard_id == "w3")
        ) and self.moe_runner_config.is_gated:
            start = shard_size
        else:
            start = 0

        # Use narrow_padded_param_and_loaded_weight for:
        # 1. CPU (always)
        # 2. GPU with flashinfer_trtllm padding (when intermediate_size is padded to 128)
        # This handles the case where the loaded weights are smaller than the padded expert_data
        use_padded_loading = _is_cpu or self.use_flashinfer_trtllm_moe
        if use_padded_loading:
            expert_data, loaded_weight = narrow_padded_param_and_loaded_weight(
                expert_data,
                loaded_weight,
                start,
                shard_size * tp_rank,
                shard_dim,
                shard_size,
                not self.use_presharded_weights,
            )
        else:
            if not self.use_presharded_weights:
                if not is_bias and self.use_triton_kernels:
                    # do not transpose for bias
                    loaded_weight = loaded_weight.transpose(-2, -1)
                loaded_weight = loaded_weight.narrow(
                    shard_dim, shard_size * tp_rank, shard_size
                )

            # Select this shard's half (or whole) of the merged parameter.
            expert_data = expert_data.narrow(shard_dim, start, shard_size)
        expert_data.copy_(loaded_weight)
+
    def _load_w2(
        self,
        expert_data: torch.Tensor,
        shard_dim: int,
        shard_id: str,
        loaded_weight: torch.Tensor,
        tp_rank: int,
        is_bias: bool = False,
    ):
        """Load w2 weights for down projection.

        Args:
            expert_data: The expert data tensor to load into
            shard_dim: The dimension to shard along
            shard_id: The shard ID (must be "w2")
            loaded_weight: The weight tensor to load from
            tp_rank: The tensor parallel rank
            is_bias: Whether this parameter is a bias (biases are not
                TP-sharded for w2)
        """
        if not isinstance(expert_data, torch.Tensor) or not isinstance(
            loaded_weight, torch.Tensor
        ):
            raise ValueError("expert_data and loaded_weight must be torch.Tensor")

        # modelopt checkpoints are expected to store 2-D per-expert weights.
        if (
            self.quant_config is not None
            and "modelopt" in self.quant_config.get_name()
            and (expert_data.dim() != 2 or loaded_weight.dim() != 2)
        ):
            raise ValueError(
                f"Expected 2D tensors, got expert_data shape {expert_data.shape} and loaded_weight shape {loaded_weight.shape}"
            )

        if shard_id != "w2":
            raise ValueError(f"shard_id must be 'w2', got {shard_id}")

        # Index the loaded weight for tp sharding.
        # down_proj: "RowParallel" so tp sharding on input_dim
        # Narrow parameter and load.
        if is_bias:
            # this expert_data is a bias, not weight,
            # for w2_weight_bias in TP, it does not need to be sharded
            shard_size = expert_data.shape[-1]
        else:
            # this parameter is a weight matrix
            # for w2 in TP, it shards the input_features, i.e., shard_dim=2
            shard_size = expert_data.shape[shard_dim]

        # Use narrow_padded_param_and_loaded_weight for:
        # 1. CPU (always)
        # 2. GPU with flashinfer_trtllm padding (when intermediate_size is padded to 128)
        # This handles the case where the loaded weights are smaller than the padded expert_data
        use_padded_loading = _is_cpu or self.use_flashinfer_trtllm_moe
        if use_padded_loading:
            expert_data, loaded_weight = narrow_padded_param_and_loaded_weight(
                expert_data,
                loaded_weight,
                0,  # param_data_start
                shard_size * tp_rank,
                shard_dim,
                shard_size,
                not self.use_presharded_weights,
            )
        else:
            if not is_bias and not self.use_presharded_weights:
                if self.use_triton_kernels:
                    loaded_weight = loaded_weight.transpose(-2, -1)
                loaded_weight = loaded_weight.narrow(
                    shard_dim, shard_size * tp_rank, shard_size
                )

        # w2, down_proj: Load into only logical weight of w2.
        expert_data.copy_(loaded_weight)
+
+ def _load_single_value(
+ self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
+ ):
+ param_data = param.data
+
+ # Input scales can be loaded directly and should be equal.
+ param_data[expert_id] = loaded_weight
+
+ def _load_g_idx(
+ self,
+ shard_id: str,
+ expert_data: torch.Tensor,
+ shard_dim: int,
+ loaded_weight: torch.Tensor,
+ tp_rank: int,
+ ):
+ if shard_id == "w2":
+ self._load_w2(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ )
+ else:
+ assert shard_id in ("w1", "w3")
+ expert_data.copy_(loaded_weight)
+
+ def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
+ num_global_routed_experts = self.num_experts - self.num_fused_shared_experts
+ num_local_routed_experts = (
+ self.num_local_experts - self.num_fused_shared_experts
+ )
+ start_idx = self.moe_ep_rank * num_local_routed_experts
+ end_idx = (self.moe_ep_rank + 1) * num_local_routed_experts
+ if start_idx <= expert_id < end_idx:
+ return expert_id - start_idx
+ elif (
+ self.num_fused_shared_experts > 0 and expert_id >= num_global_routed_experts
+ ):
+ return expert_id - num_global_routed_experts + num_local_routed_experts
+ else:
+ return -1
+
+ def weight_loader(
+ self,
+ param: torch.nn.Parameter,
+ loaded_weight: torch.Tensor,
+ weight_name: str,
+ shard_id: str,
+ expert_id: Optional[int],
+ ) -> None:
+ # if expert_id is None, then
+ # all the experts are loaded at the same time
+ if (
+ not expert_id
+ and self.quant_config is not None
+ and self.quant_config.get_name() == "mxfp4"
+ and self.quant_config.is_static_cfg()
+ ):
+ if "bias" in weight_name:
+ dim1 = loaded_weight.shape[1]
+ param.data[:, :dim1].copy_(loaded_weight)
+ else:
+ dim1 = loaded_weight.shape[1]
+ dim2 = loaded_weight.shape[2]
+ param.data[:, :dim1, :dim2].copy_(loaded_weight)
+ return
+
+ global_expert_location_metadata = get_global_expert_location_metadata()
+ if global_expert_location_metadata is None:
+ if not getattr(param, "_sglang_require_global_experts", False):
+ expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
+ if expert_id == -1:
+ return
+
+ self._weight_loader_impl(
+ param=param,
+ loaded_weight=loaded_weight,
+ weight_name=weight_name,
+ shard_id=shard_id,
+ expert_id=expert_id,
+ )
+ return
+
+ if expert_id >= self.num_experts - self.num_fused_shared_experts:
+ # This is a shared expert.
+ physical_expert_ids = [expert_id]
+ else:
+ require_global_experts = getattr(
+ param, "_sglang_require_global_experts", False
+ )
+ physical_expert_ids = (
+ global_expert_location_metadata.logical_to_all_physical(
+ self.layer_id, expert_id, require_global_experts
+ )
+ )
+
+ for physical_expert_id in physical_expert_ids:
+ self._weight_loader_physical(
+ param=param,
+ loaded_weight=loaded_weight,
+ weight_name=weight_name,
+ shard_id=shard_id,
+ expert_id=physical_expert_id,
+ )
+
+ def _weight_loader_physical(
+ self,
+ param: torch.nn.Parameter,
+ loaded_weight: torch.Tensor,
+ weight_name: str,
+ shard_id: str,
+ expert_id: int,
+ ) -> None:
+ # WARN: This makes the `expert_id` mean "local" and "global" in different cases
+ if not getattr(param, "_sglang_require_global_experts", False):
+ expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
+ if expert_id < 0 or expert_id >= self.num_local_experts:
+ return
+
+ if isinstance(
+ self.quant_method,
+ KTEPWrapperMethod,
+ ):
+ if self.quant_method.num_gpu_experts != -1:
+ if expert_id >= self.quant_method.num_gpu_experts:
+ return
+
+ self._weight_loader_impl(
+ param=param,
+ loaded_weight=loaded_weight,
+ weight_name=weight_name,
+ shard_id=shard_id,
+ expert_id=expert_id,
+ )
+
+ def _weight_loader_impl(
+ self,
+ param: torch.nn.Parameter,
+ loaded_weight: torch.Tensor,
+ weight_name: str,
+ shard_id: str,
+ expert_id: int,
+ ) -> None:
+ tp_rank = self.moe_tp_rank
+
+ # compressed-tensors checkpoints with packed weights are stored flipped
+ # TODO (mgoin): check self.quant_method.quant_config.quant_format
+ # against known CompressionFormat enum values that have this quality
+ method = self.quant_method
+ if hasattr(self, "scheme"):
+ method = self.scheme
+ if method.__class__.__name__ == "KTEPWrapperMethod":
+ method = method.gpu_method
+
+ loaded_weight = (
+ loaded_weight.t().contiguous()
+ if (
+ method.__class__.__name__
+ in [
+ "CompressedTensorsWNA16MarlinMoE",
+ "CompressedTensorsWNA16MoE",
+ "CompressedTensorsWNA16TritonMoE",
+ ]
+ )
+ else loaded_weight
+ )
+
+ if shard_id not in ("w1", "w2", "w3"):
+ raise ValueError(f"shard_id must be ['w1','w2','w3'] but got {shard_id}.")
+
+ # Flashinfer assumes w31 format for w13_weight. Same for the scales.
+ if self.use_flashinfer_trtllm_moe and (
+ isinstance(method, ModelOptNvFp4FusedMoEMethod)
+ or isinstance(method, Fp8MoEMethod)
+ or isinstance(method, UnquantizedFusedMoEMethod)
+ or isinstance(method, CompressedTensorsMxInt4MoE)
+ ):
+ shard_id = {"w1": "w3", "w3": "w1", "w2": "w2"}[shard_id]
+
+ WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported]
+ # Fetch the dim to shard the parameter/loaded weight
+ # based on the shard id. This will be whatever
+ # dimension intermediate_size is used.
+ SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
+
+ expert_data = param.data[expert_id]
+
+ # is_transposed: if the dim to shard the weight
+ # should be flipped. Required by GPTQ, compressed-tensors
+ # should be whatever dimension intermediate_size is
+ is_transposed = getattr(param, "is_transposed", False)
+ shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
+ if self.use_triton_kernels:
+ is_transposed = True
+ if is_transposed:
+ shard_dim = int(not shard_dim)
+
+ # Case input scale: input_scale loading is only supported for fp8
+ if "input_scale" in weight_name:
+ # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust input_scale for e4m3fnuz (AMD)
+ if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
+ loaded_weight = loaded_weight * 2.0
+
+ # this is needed for compressed-tensors only
+ loaded_weight = loaded_weight.to(param.data.device)
+
+ if (
+ (
+ "compressed" in method.__class__.__name__.lower()
+ or "w4afp8" in self.quant_config.get_name()
+ )
+ and (param.data[expert_id] != 1).any()
+ and ((param.data[expert_id] - loaded_weight).abs() > 1e-5).any()
+ ):
+ raise ValueError(
+ "input_scales of w1 and w3 of a layer "
+ f"must be equal. But got {param.data[expert_id]} "
+ f"vs. {loaded_weight}"
+ )
+
+ self._load_single_value(
+ param=param, loaded_weight=loaded_weight, expert_id=expert_id
+ )
+ return
+
+ # Case g_idx
+ if "g_idx" in weight_name:
+ self._load_g_idx(
+ shard_dim=0,
+ shard_id=shard_id,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ )
+ return
+
+ if "ModelOpt" in method.__class__.__name__:
+ # Determine per-tensor weight scale patterns based on variant
+ is_fp4_variant = isinstance(method, ModelOptNvFp4FusedMoEMethod)
+
+ # FP4 uses "weight_scale_2" for per-tensor, FP8 uses "weight_scale" for per-tensor
+ per_tensor_conditions = (
+ "weight_scale_2" in weight_name
+ if is_fp4_variant
+ else "weight_scale" in weight_name
+ ) or "input_scale" in weight_name
+
+ if per_tensor_conditions:
+ self._load_per_tensor_weight_scale(
+ shard_id=shard_id,
+ param=param,
+ loaded_weight=loaded_weight,
+ expert_id=expert_id,
+ )
+ elif "weight" in weight_name:
+ self._load_model_weight_or_group_weight_scale(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ )
+ return
+
+ # Case weight scales and zero_points
+ if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name:
+ # load the weight scales and zp based on the quantization scheme
+ # supported weight scales/zp can be found in
+ # FusedMoeWeightScaleSupported
+ # TODO @dsikka: once hardened, refactor to use vLLM Parameters
+ # specific to each case
+ quant_method = getattr(param, "quant_method", None)
+ if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
+ # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust INT4 column-wise scaling number to e4m3fnuz (AMD)
+ if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
+ loaded_weight = loaded_weight * 0.5
+
+ self._load_per_channel_weight_scale(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ )
+ elif quant_method in [
+ FusedMoeWeightScaleSupported.GROUP.value,
+ FusedMoeWeightScaleSupported.BLOCK.value,
+ ]:
+ self._load_model_weight_or_group_weight_scale(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ )
+ elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
+ # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust FP8 per-tensor scaling number for e4m3fnuz (AMD)
+ if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
+ loaded_weight = loaded_weight * 2.0
+
+ self._load_per_tensor_weight_scale(
+ shard_id=shard_id,
+ param=param,
+ loaded_weight=loaded_weight,
+ expert_id=expert_id,
+ )
+ else:
+ raise ValueError(
+ f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}"
+ )
+ return
+
+ # Case weight_shape
+ if "weight_shape" in weight_name:
+ # only required by compressed-tensors
+ self._load_single_value(
+ param=param, loaded_weight=loaded_weight, expert_id=expert_id
+ )
+ return
+
+ # Case model weights
+ if "weight" in weight_name:
+ self._load_model_weight_or_group_weight_scale(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ )
+ return
+
+ if (
+ "bias" in weight_name
+ and self.quant_config.quant_description["quant_method"] == "modelslim"
+ ):
+ self._load_per_channel_weight_scale(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ )
+
+ def weight_loader_fused(
+ self,
+ param: torch.nn.Parameter,
+ loaded_weight: torch.Tensor,
+ weight_name: str,
+ shard_id: str,
+ ) -> None:
+ tp_rank = self.moe_tp_rank
+
+ if (
+ self.quant_config is not None
+ and self.quant_config.get_name() == "mxfp4"
+ and self.quant_config.is_static_cfg()
+ ):
+ if "bias" in weight_name:
+ dim1 = loaded_weight.shape[1]
+ param.data[:, :dim1].copy_(loaded_weight)
+ elif "scale" in weight_name:
+ param.data.copy_(loaded_weight)
+ else:
+ dim1 = loaded_weight.shape[1]
+ dim2 = loaded_weight.shape[2]
+ param.data[:, :dim1, :dim2].copy_(loaded_weight)
+ return
+
+ # compressed-tensors checkpoints with packed weights are stored flipped
+ # TODO: check self.quant_method.quant_config.quant_format
+ # against known CompressionFormat enum values that have this quality
+ method = self.quant_method
+ if hasattr(self, "scheme"):
+ method = self.scheme
+ loaded_weight = (
+ loaded_weight.t().contiguous()
+ if (
+ method.__class__.__name__
+ in [
+ "CompressedTensorsWNA16MoE",
+ "CompressedTensorsWNA16TritonMoE",
+ ]
+ )
+ else loaded_weight
+ )
+
+ if shard_id not in ("w13", "w2"):
+ raise ValueError(f"shard_id must be ['w13','w2'] but got {shard_id}.")
+
+ # Fetch the dim to shard the parameter/loaded weight
+ # based on the shard id. This will be whatever
+ # dimension intermediate_size is used.
+ SHARD_ID_TO_SHARDED_DIM = {"w13": 1, "w2": 2}
+ SHARD_ID_TO_SHARDED_DIM_TRANSPOSE = {"w13": 2, "w2": 1}
+
+ expert_data = param.data
+ is_bias = expert_data.dim() == 2
+
+ # is_transposed: if the dim to shard the weight
+ # should be flipped. Required by GPTQ, compressed-tensors
+ # should be whatever dimension intermediate_size is
+ is_transposed = getattr(param, "is_transposed", False)
+
+ if self.use_triton_kernels:
+ is_transposed = True
+ shard_dim = (
+ SHARD_ID_TO_SHARDED_DIM[shard_id]
+ if not is_transposed
+ else SHARD_ID_TO_SHARDED_DIM_TRANSPOSE[shard_id]
+ )
+
+ # Case model weights
+ if "weight" in weight_name:
+ self._load_model_weight_or_group_weight_scale(
+ shard_id=shard_id,
+ shard_dim=shard_dim,
+ loaded_weight=loaded_weight,
+ expert_data=expert_data,
+ tp_rank=tp_rank,
+ is_bias=is_bias,
+ )
+ return
+ else:
+ logging.warning(
+ f"Unsupported weight_name {weight_name} for FusedMoE weight_loader_fused. Nothing is loaded."
+ )
+
+ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+ if is_in_piecewise_cuda_graph():
+ if not TopKOutputChecker.format_is_standard(topk_output):
+ # Make sure there is torch lib op registration for the whole moe layer
+ return self.forward_impl(hidden_states, topk_output)
+ else:
+ return moe_forward_piecewise_cuda_graph_impl(
+ hidden_states,
+ topk_output.topk_weights,
+ topk_output.topk_ids,
+ topk_output.router_logits,
+ self.layer_id,
+ )
+ else:
+ return self.forward_impl(hidden_states, topk_output)
+
+ def forward_impl(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+ origin_hidden_states_dim = hidden_states.shape[-1]
+ assert self.quant_method is not None
+
+ dispatch_output = self.dispatcher.dispatch(
+ hidden_states=hidden_states, topk_output=topk_output
+ )
+ if _use_aiter and self.dispatcher.local_expert_mapping is not None:
+ self.expert_mask_gpu = (
+ (
+ (self.dispatcher.local_expert_mapping >= 0)
+ & (self.dispatcher.local_expert_mapping < self.num_local_experts)
+ )
+ .to(torch.int32)
+ .to(device="cuda")
+ )
+
+ combine_input = self.run_moe_core(
+ dispatch_output=dispatch_output,
+ )
+
+ with use_symmetric_memory(
+ get_tp_group(), disabled=not is_allocation_symmetric()
+ ):
+ final_hidden_states = self.dispatcher.combine(combine_input=combine_input)
+
+ # TODO: should we add some conditions here?
+ final_hidden_states = final_hidden_states[
+ ..., :origin_hidden_states_dim
+ ].contiguous()
+
+ if self.reduce_results and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
+ final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+ return final_hidden_states
+
+ def run_moe_core(self, dispatch_output: DispatchOutput) -> CombineInput:
+ # TODO: consider using symmetric memory
+ return self.quant_method.apply(
+ layer=self,
+ dispatch_output=dispatch_output,
+ )
+
+ @classmethod
+ def make_expert_params_mapping(
+ cls,
+ ckpt_gate_proj_name: str,
+ ckpt_down_proj_name: str,
+ ckpt_up_proj_name: str,
+ num_experts: int,
+ ) -> List[Tuple[str, str, int, str]]:
+ return [
+ # (param_name, weight_name, expert_id, shard_id)
+ (
+ (
+ "experts.w13_"
+ if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
+ else "experts.w2_"
+ ),
+ f"experts.{expert_id}.{weight_name}.",
+ expert_id,
+ shard_id,
+ )
+ for expert_id in range(num_experts)
+ for shard_id, weight_name in [
+ ("w1", ckpt_gate_proj_name),
+ ("w2", ckpt_down_proj_name),
+ ("w3", ckpt_up_proj_name),
+ ]
+ ]
+
+ @classmethod
+ def make_expert_params_mapping_fused(
+ cls,
+ ckpt_gate_up_proj_name: str,
+ ckpt_down_proj_name: str,
+ ckpt_gate_up_proj_bias_name: str,
+ ckpt_down_proj_bias_name: str,
+ ):
+ return [
+ ("experts.w13_weight", f"experts.{ckpt_gate_up_proj_name}", "w13"),
+ (
+ "experts.w13_weight_bias",
+ f"experts.{ckpt_gate_up_proj_bias_name}",
+ "w13",
+ ),
+ ("experts.w2_weight", f"experts.{ckpt_down_proj_name}", "w2"),
+ ("experts.w2_weight_bias", f"experts.{ckpt_down_proj_bias_name}", "w2"),
+ ]
+
+ @classmethod
+ def make_expert_params_mapping_fused_mxfp4(
+ cls,
+ ckpt_gate_up_proj_name: str,
+ ckpt_down_proj_name: str,
+ ckpt_gate_up_proj_bias_name: str,
+ ckpt_down_proj_bias_name: str,
+ ckpt_gate_up_proj_scale_name: str,
+ ckpt_down_proj_scale_name: str,
+ ):
+ return [
+ ("experts.w13_weight", f"experts.{ckpt_gate_up_proj_name}", "w13"),
+ (
+ "experts.w13_weight_bias",
+ f"experts.{ckpt_gate_up_proj_bias_name}",
+ "w13",
+ ),
+ ("experts.w2_weight", f"experts.{ckpt_down_proj_name}", "w2"),
+ ("experts.w2_weight_bias", f"experts.{ckpt_down_proj_bias_name}", "w2"),
+ (
+ "experts.w13_weight_scale",
+ f"experts.{ckpt_gate_up_proj_scale_name}",
+ "w13",
+ ),
+ ("experts.w2_weight_scale", f"experts.{ckpt_down_proj_scale_name}", "w2"),
+ ]
+
+ @classmethod
+ def make_expert_input_scale_params_mapping(
+ cls,
+ num_experts: int,
+ ) -> List[Tuple[str, str, int, str]]:
+ # (param_name, weight_name, expert_id, shard_id)
+ return [
+ (
+ "experts.w13_" if shard_id in ["w1", "w3"] else "experts.w2_",
+ f"experts.{expert_id}.{shard_id}.",
+ expert_id,
+ shard_id,
+ )
+ for expert_id in range(num_experts)
+ for shard_id in ["w1", "w2", "w3"]
+ ]
+
+ def set_overlap_args(
+ self, down_gemm_overlap_args: DownGemmOverlapArgs, meta_overlap_args: dict
+ ):
+ if hasattr(self, "runner"):
+ self.runner.set_overlap_args(down_gemm_overlap_args, meta_overlap_args)
+ else:
+ # TODO: remove this branch after MoE refactor
+ self.down_gemm_overlap_args = down_gemm_overlap_args
+ self.meta_overlap_args = meta_overlap_args
+
+ def clear_overlap_args(self) -> None:
+ if hasattr(self, "runner"):
+ self.runner.clear_overlap_args()
+ else:
+ # TODO: remove this branch after MoE refactor
+ self.down_gemm_overlap_args = None
+ self.meta_overlap_args = None
+
+
+class FlashInferFP4MoE(FusedMoE):
+ """FP4 TRTLLM MoE implementation using FlashInfer."""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # ---------------------------------------------------------------------
+ # Helper: quantize hidden states to FP4 each forward pass
+ # ---------------------------------------------------------------------
+ def _quantize_hidden_states_fp4(self, hidden_states: torch.Tensor):
+ """
+ Quantize hidden states using global scale factor from quantization method.
+
+ Global scale factor is set by ModelOptNvFp4FusedMoEMethod during weight loading.
+ Only block scales are computed at runtime for efficiency.
+
+ Returns (packed_fp4_uint8, scale_float8_e4m3fn_runtime, global_scale_float32)
+ """
+
+ # flashinfer.fp4_quantize returns (packed_uint8, scale_fp8)
+ # Only the block scales are computed at runtime
+ hs_fp4_bytes, hs_sf_bytes = fp4_quantize(
+ hidden_states,
+ self.w13_input_scale_quant,
+ 16, # sf_vec_size
+ False, # use_ue8m0
+ False, # is_sf_swizzled_layout
+ )
+
+ seq_len, hidden_size = hidden_states.shape
+ hs_fp4 = hs_fp4_bytes.reshape(seq_len, hidden_size // 2)
+ # TRT-LLM expects hidden state scales shaped as [seq_len, hidden_size // 16]
+ hs_sf = hs_sf_bytes.view(torch.float8_e4m3fn).reshape(
+ seq_len, hidden_size // 16
+ )
+
+ return hs_fp4, hs_sf
+
+ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+ assert TopKOutputChecker.format_is_bypassed(
+ topk_output
+ ), "Only bypassed topk output is supported for flashinfer fp4 moe"
+
+ if is_in_piecewise_cuda_graph():
+ return flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl(
+ hidden_states,
+ topk_output.router_logits,
+ topk_output.topk_config.top_k,
+ topk_output.topk_config.topk_group,
+ topk_output.topk_config.num_expert_group,
+ topk_output.topk_config.correction_bias,
+ self.layer_id,
+ )
+ else:
+ return self.forward_impl(hidden_states, topk_output)
+
+ def forward_impl(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+ """Forward pass using FP4 TRTLLM kernel.
+
+ Args:
+ hidden_states: Input tensor
+ topk_output: TopKOutput object with Bypassed format
+ """
+ assert isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod)
+
+ assert (
+ self.moe_runner_config.is_gated
+ ), "Only gated MoEs are supported for flashinfer fp4 moe"
+
+ assert TopKOutputChecker.format_is_bypassed(topk_output)
+
+ router_logits = topk_output.router_logits
+ topk_config = topk_output.topk_config
+
+ hs_fp4, hs_scale_linear = self._quantize_hidden_states_fp4(hidden_states)
+ routing_method_type = self.routing_method_type
+ assert (
+ routing_method_type is not None
+ ), "flashinfer trtllm moe nvfp4 backend has not been adapted for the current moe layer, you can set routing_method_type (See definition of RoutingMethodType please) for the moe layer explicitly for a quick adaptation."
+
+ # DeepSeekV3 style routing requires float32 router logits,
+ # see this PR for details: https://github.com/flashinfer-ai/flashinfer/commit/d84e1d560da0a27961c19ca788d96c19cb9dcfb6
+ if routing_method_type == RoutingMethodType.DeepSeekV3:
+ router_logits = router_logits.to(torch.float32)
+
+ correction_bias = (
+ None
+ if topk_config.correction_bias is None
+ else topk_config.correction_bias.to(hidden_states.dtype)
+ )
+
+ with use_symmetric_memory(
+ get_tp_group(), disabled=not is_allocation_symmetric()
+ ):
+ num_tokens = hs_fp4.shape[0]
+ hidden_size = (
+ hs_fp4.shape[-1] * 2
+ if hs_fp4.dtype == torch.uint8
+ else hs_fp4.shape[-1]
+ )
+ symm_output = torch.empty(
+ num_tokens, hidden_size, dtype=torch.bfloat16, device=hs_fp4.device
+ )
+ result = trtllm_fp4_block_scale_moe(
+ routing_logits=router_logits,
+ routing_bias=correction_bias,
+ hidden_states=hs_fp4,
+ hidden_states_scale=hs_scale_linear.view(torch.float8_e4m3fn).reshape(
+ *hs_scale_linear.shape[:-1], -1
+ ),
+ gemm1_weights=self.gemm1_weights_fp4_shuffled.data,
+ gemm1_weights_scale=self.gemm1_scales_fp4_shuffled.data.view(
+ torch.float8_e4m3fn
+ ),
+ gemm1_bias=None,
+ gemm1_alpha=None,
+ gemm1_beta=None,
+ gemm1_clamp_limit=None,
+ gemm2_weights=self.gemm2_weights_fp4_shuffled.data,
+ gemm2_weights_scale=self.gemm2_scales_fp4_shuffled.data.view(
+ torch.float8_e4m3fn
+ ),
+ gemm2_bias=None,
+ output1_scale_scalar=self.g1_scale_c.data,
+ output1_scale_gate_scalar=self.g1_alphas.data,
+ output2_scale_scalar=self.g2_alphas.data,
+ num_experts=self.num_experts,
+ top_k=topk_config.top_k,
+ n_group=topk_config.num_expert_group,
+ topk_group=topk_config.topk_group,
+ intermediate_size=self.intermediate_size_per_partition,
+ local_expert_offset=self.moe_ep_rank * self.num_local_experts,
+ local_num_experts=self.num_local_experts,
+ routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
+ # Respect the routing method configured for this layer (e.g., Renormalize for Qwen3),
+ # instead of always assuming DeepSeekV3.
+ routing_method_type=(
+ self.routing_method_type
+ if self.routing_method_type is not None
+ else RoutingMethodType.Default
+ ),
+ do_finalize=True,
+ tune_max_num_tokens=next_power_of_2(hs_fp4.shape[0]),
+ output=symm_output,
+ )[0]
+
+ return result
+
+
+@register_custom_op(out_shape="hidden_states")
+def moe_forward_piecewise_cuda_graph_impl(
+ hidden_states: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ router_logits: torch.Tensor,
+ layer_id: int,
+) -> torch.Tensor:
+ # only standard topk output is supported for piecewise cuda graph
+ topk_output = StandardTopKOutput(
+ topk_weights=topk_weights, topk_ids=topk_ids, router_logits=router_logits
+ )
+ forward_context = get_forward_context()
+ moe_layer = forward_context.moe_layers[layer_id]
+ return moe_layer.forward_impl(hidden_states, topk_output)
+
+
+@register_custom_op(out_shape="hidden_states")
+def flashinfer_fp4_moe_forward_piecewise_cuda_graph_impl(
+ hidden_states: torch.Tensor,
+ router_logits: torch.Tensor,
+ top_k: int,
+ topk_group: Optional[int],
+ num_expert_group: Optional[int],
+ correction_bias: Optional[torch.Tensor],
+ layer_id: int,
+) -> torch.Tensor:
+ topk_output = BypassedTopKOutput(
+ hidden_states=hidden_states,
+ router_logits=router_logits,
+ topk_config=TopKConfig(
+ top_k=top_k,
+ topk_group=topk_group,
+ num_expert_group=num_expert_group,
+ correction_bias=correction_bias,
+ ),
+ )
+ forward_context = get_forward_context()
+ moe_layer = forward_context.moe_layers[layer_id]
+ return moe_layer.forward_impl(hidden_states, topk_output)
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d1d5b1d4ff299ae49c819a29a3e8b49ec08272
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from typing import Tuple
+
+import torch
+import triton
+
+from sglang.srt.utils import is_cuda, is_hip, is_xpu
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_xpu = is_xpu()
+
+if _is_cuda or _is_hip or _is_xpu:
+ from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+
+def moe_align_block_size(
+ topk_ids: torch.Tensor, block_size: int, num_experts: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Aligns the token distribution across experts to be compatible with block
+ size for matrix multiplication.
+
+ Parameters:
+ - topk_ids: A tensor of shape [total_tokens, top_k] representing the
+ top-k expert indices for each token.
+ - block_size: The block size used in block matrix multiplication.
+ - num_experts: The total number of experts.
+
+ Returns:
+ - sorted_token_ids: A tensor containing the sorted token indices according
+ to their allocated expert.
+ - expert_ids: A tensor indicating the assigned expert index for each block.
+ - num_tokens_post_padded: The total number of tokens after padding,
+ ensuring divisibility by block_size.
+
+ This function pads the number of tokens that each expert needs to process
+ so that it is divisible by block_size.
+ Padding ensures that during block matrix multiplication, the dimensions
+ align correctly.
+
+ Example:
+ Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]],
+ block_size = 4, and num_experts = 4:
+ - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts,
+ with each expert needing to process 3 tokens.
+ - As block_size is 4, we pad 1 token for each expert.
+ - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3].
+ - Then append padding tokens [12, 12, 12, 12] for each block.
+ - After sorting by expert index, we obtain token_ids
+ [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12].
+ Tokens 12 are non-existent (padding) and are ignored in
+ the subsequent matrix multiplication.
+ - The padding ensures that the total number of tokens is now divisible
+ by block_size for proper block matrix operations.
+ """
+ if topk_ids.numel() < num_experts + 1:
+ max_num_tokens_padded = topk_ids.numel() * block_size
+ else:
+ max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+ sorted_ids = torch.empty(
+ (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+ )
+ max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+ expert_ids = torch.empty(
+ (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+ )
+ num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+
+ # In EP, expert_ids for filtered experts are -1. We have num_experts + 1 ids in total.
+ cumsum_buffer = torch.empty(
+ (num_experts + 2,), dtype=torch.int32, device=topk_ids.device
+ )
+
+ sgl_moe_align_block_size(
+ topk_ids,
+ num_experts + 1,
+ block_size,
+ sorted_ids,
+ expert_ids,
+ num_tokens_post_pad,
+ cumsum_buffer,
+ True,
+ )
+ return sorted_ids, expert_ids, num_tokens_post_pad
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7e6cd7b5e7baaee0dc8dfa406a94013c5123a4d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py
@@ -0,0 +1,328 @@
+# Adapted from https://github.com/vllm-project/vllm/pull/18595/files#diff-f426a6de78c82ffec568eff6811bfbf0043dab5f87f1a8c0cffdbdcb8a81e035
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+import torch
+from sgl_kernel import gelu_and_mul, silu_and_mul
+from triton_kernels.matmul_ogs import (
+ FlexCtx,
+ FnSpecs,
+ FusedActivation,
+ PrecisionConfig,
+ matmul_ogs,
+)
+from triton_kernels.numerics import InFlexData
+from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx
+from triton_kernels.swiglu import swiglu_fn
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+ from sglang.srt.layers.moe.topk import TopKOutput
+
+
+def quantize(w, dtype, dev, **opt):
+ if dtype == "bf16":
+ return w.to(torch.bfloat16), InFlexData()
+
+
+def triton_kernel_moe_forward(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_output: TopKOutput,
+ moe_runner_config: MoeRunnerConfig,
+ apply_router_weight_on_input: bool = False,
+ use_fp8_w8a8: bool = False,
+ per_channel_quant: bool = False,
+ global_num_experts: int = -1,
+ expert_map: Optional[torch.Tensor] = None,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+
+ from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+ assert TopKOutputChecker.format_is_triton_kernels(topk_output)
+
+ routing_data, gather_idx, scatter_idx = topk_output
+
+ return triton_kernel_fused_experts(
+ hidden_states,
+ w1,
+ w2,
+ routing_data,
+ gather_idx,
+ scatter_idx,
+ inplace=False, # triton kernel doesn't support inplace
+ activation=moe_runner_config.activation,
+ apply_router_weight_on_input=apply_router_weight_on_input,
+ use_fp8_w8a8=use_fp8_w8a8,
+ per_channel_quant=per_channel_quant,
+ global_num_experts=global_num_experts,
+ expert_map=expert_map,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ block_shape=block_shape,
+ )
+
+
+# This is a triton implementation of the fused_experts function
+def triton_kernel_fused_experts(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ routing_data: RoutingData,
+ gather_indx: GatherIndx,
+ scatter_indx: ScatterIndx,
+ inplace: bool = False,
+ activation: str = "silu",
+ apply_router_weight_on_input: bool = False,
+ use_fp8_w8a8: bool = False,
+ per_channel_quant: bool = False,
+ global_num_experts: int = -1,
+ expert_map: Optional[torch.Tensor] = None,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+
+ assert use_fp8_w8a8 is False, "use_fp8_w8a8 is not supported"
+ assert per_channel_quant is False, "per_channel_quant is not supported"
+ assert expert_map is None, "expert_map is not supported"
+ assert w1_scale is None, "w1_scale is not supported"
+ assert w2_scale is None, "w2_scale is not supported"
+ assert a1_scale is None, "a1_scale is not supported"
+ assert a2_scale is None, "a2_scale is not supported"
+ assert block_shape is None, "block_shape is not supported"
+
+ # type check
+ assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"
+ assert w1.dtype == torch.bfloat16, "w1 must be bfloat16"
+ assert w2.dtype == torch.bfloat16, "w2 must be bfloat16"
+
+ # Shape check
+ assert hidden_states.ndim == 2, "hidden_states must be 2D"
+ assert (
+ hidden_states.shape[-1] == w1.shape[-2]
+ ), f"hidden_states shape[-1] {hidden_states.shape} must be equal to w1 shape[-2] {w1.shape}"
+ assert (
+ w2.shape[-1] == w1.shape[1]
+ ), f"w2 shape[-1] {w2.shape[-1]} must be equal to w1 shape[1] {w1.shape[1]}"
+
+ # feature check
+ assert inplace is False, "Inplace is not supported in new triton MoE kernel"
+
+ M, K = hidden_states.shape
+ E, _, N = w1.shape
+ n_expts_act = routing_data.n_expts_act
+ dtype = hidden_states.dtype
+
+ if global_num_experts == -1:
+ global_num_experts = E
+
+ # consistent with default implementation
+ intermediate_cache2 = torch.empty(
+ (M * n_expts_act, N // 2), device="cuda", dtype=dtype
+ )
+
+ intermediate_cache1 = matmul_ogs(
+ hidden_states,
+ w1,
+ None,
+ routing_data,
+ gather_indx=gather_indx,
+ gammas=routing_data.gate_scal if apply_router_weight_on_input else None,
+ )
+
+ if activation == "silu":
+ silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+ elif activation == "gelu":
+ gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+ else:
+ raise ValueError(f"Unsupported FusedMoe activation: {activation}")
+
+ intermediate_cache3 = matmul_ogs(
+ intermediate_cache2,
+ w2,
+ None,
+ routing_data,
+ scatter_indx=scatter_indx,
+ gammas=None if apply_router_weight_on_input else routing_data.gate_scal,
+ )
+
+ return intermediate_cache3
+
+
+def triton_kernel_moe_with_bias_forward(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w1_pcg,
+ b1: torch.Tensor,
+ w2: torch.Tensor,
+ w2_pcg,
+ b2: torch.Tensor,
+ topk_output: TopKOutput,
+ moe_runner_config: MoeRunnerConfig,
+ apply_router_weight_on_input: bool = False,
+ use_fp8_w8a8: bool = False,
+ per_channel_quant: bool = False,
+ global_num_experts: int = -1,
+ expert_map: Optional[torch.Tensor] = None,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+ from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+ assert TopKOutputChecker.format_is_triton_kernels(topk_output)
+
+ routing_data, gather_idx, scatter_idx = topk_output
+
+ return triton_kernel_fused_experts_with_bias(
+ hidden_states,
+ w1=w1,
+ w1_pcg=w1_pcg,
+ b1=b1,
+ w2=w2,
+ w2_pcg=w2_pcg,
+ b2=b2,
+ routing_data=routing_data,
+ gather_indx=gather_idx,
+ scatter_indx=scatter_idx,
+ inplace=False, # triton kernel doesn't support inplace
+ activation=moe_runner_config.activation,
+ apply_router_weight_on_input=apply_router_weight_on_input,
+ use_fp8_w8a8=use_fp8_w8a8,
+ per_channel_quant=per_channel_quant,
+ global_num_experts=global_num_experts,
+ expert_map=expert_map,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ block_shape=block_shape,
+ gemm1_alpha=moe_runner_config.gemm1_alpha,
+ gemm1_clamp_limit=moe_runner_config.gemm1_clamp_limit,
+ )
+
+
+def triton_kernel_fused_experts_with_bias(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w1_pcg,
+ b1: torch.Tensor,
+ w2: torch.Tensor,
+ w2_pcg,
+ b2: torch.Tensor,
+ routing_data: RoutingData,
+ gather_indx: GatherIndx,
+ scatter_indx: ScatterIndx,
+ inplace: bool = False,
+ activation: str = "silu",
+ apply_router_weight_on_input: bool = False,
+ use_fp8_w8a8: bool = False,
+ per_channel_quant: bool = False,
+ global_num_experts: int = -1,
+ expert_map: Optional[torch.Tensor] = None,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ block_shape: Optional[list[int]] = None,
+ gemm1_alpha: Optional[float] = None,
+ gemm1_clamp_limit: Optional[float] = None,
+) -> torch.Tensor:
+ assert use_fp8_w8a8 is False, "use_fp8_w8a8 is not supported"
+ assert per_channel_quant is False, "per_channel_quant is not supported"
+ assert expert_map is None, "expert_map is not supported"
+ assert w1_scale is None, "w1_scale is not supported"
+ assert w2_scale is None, "w2_scale is not supported"
+ assert a1_scale is None, "a1_scale is not supported"
+ assert a2_scale is None, "a2_scale is not supported"
+ assert block_shape is None, "block_shape is not supported"
+
+ # type check
+ assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"
+ for w in (w1, w2):
+ # TODO assert bf16 or mxfp4
+ # assert (w.dtype == torch.bfloat16) or check-is-mxfp4, f"w must be bfloat16 or mxfp4 {w1.dtype=}"
+ pass
+
+ # Shape check
+ assert hidden_states.ndim == 2, "hidden_states must be 2D"
+ assert (
+ hidden_states.shape[-1] == w1.shape[-2]
+ ), f"hidden_states shape[-1] {hidden_states.shape} must be equal to w1 shape[-2] {w1.shape}"
+ assert (
+ w2.shape[-1] == w1.shape[1]
+ ), f"w2 shape[-1] {w2.shape[-1]} must be equal to w1 shape[1] {w1.shape[1]}"
+
+ # feature check
+ assert inplace is False, "Inplace is not supported in new triton MoE kernel"
+
+ M, K = hidden_states.shape
+ E, _, N = w1.shape
+ n_expts_act = routing_data.n_expts_act
+
+ if global_num_experts == -1:
+ global_num_experts = E
+
+ # TODO maybe completely remove this branch
+ if w1.dtype == torch.bfloat16:
+ device = "cuda"
+ optg = dict()
+ w1, w1_flex = quantize(w1, "bf16", device, **optg)
+ w1_pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w1_flex))
+
+ w2, w2_flex = quantize(w2, "bf16", device, **optg)
+ w2_pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w2_flex))
+
+ act = FusedActivation(
+ FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")),
+ (gemm1_alpha, gemm1_clamp_limit),
+ 2,
+ )
+
+ intermediate_cache = torch.empty(
+ (1, M * n_expts_act, N // 2),
+ device=hidden_states.device,
+ dtype=hidden_states.dtype,
+ )
+ output = torch.empty(
+ (1, M, K), device=hidden_states.device, dtype=hidden_states.dtype
+ )
+
+ matmul_ogs(
+ hidden_states,
+ w1,
+ b1,
+ routing_data,
+ gather_indx=gather_indx,
+ precision_config=w1_pcg,
+ gammas=routing_data.gate_scal if apply_router_weight_on_input else None,
+ fused_activation=act,
+ y=intermediate_cache,
+ )
+
+ matmul_ogs(
+ intermediate_cache.view(M * n_expts_act, N // 2),
+ w2,
+ b2,
+ routing_data,
+ scatter_indx=scatter_indx,
+ precision_config=w2_pcg,
+ gammas=None if apply_router_weight_on_input else routing_data.gate_scal,
+ y=output,
+ )
+ return output.view(M, K)
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/__init__.py b/sglang/python/sglang/srt/layers/moe/moe_runner/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3320a78751e193ccea76e00b0be30f0193fa2c85
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/moe_runner/__init__.py
@@ -0,0 +1,4 @@
+from sglang.srt.layers.moe.moe_runner.base import MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.runner import MoeRunner
+
+__all__ = ["MoeRunnerConfig", "MoeRunner"]
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7362633e665a458b139598b30aff942428b858e1
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/base.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52e820deb7485df6a35f507eb91303169bcaf44e
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/base.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/deep_gemm.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/deep_gemm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4013a1fb5f025b37f9b4cd63218aca52f843290d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/deep_gemm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/flashinfer_trtllm.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/flashinfer_trtllm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12f0b432fdfcf6900c3fee043e54969f350a30c7
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/flashinfer_trtllm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/marlin.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/marlin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b816a19bf7a5e23ef657309c527519c814ee4ed
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/marlin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/runner.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/runner.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de5e0f5e08442e42264bad20a17c61d902af8b36
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/runner.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/triton.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/triton.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c6267d76f8c891c4bc02b2b408a50608899a649
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/triton.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/triton_kernels.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/triton_kernels.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12e22cf5536a7a0d4bfb5550a771c705678202aa
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/moe_runner/__pycache__/triton_kernels.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/base.py b/sglang/python/sglang/srt/layers/moe/moe_runner/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..12dd2ba6a2379e6174cb6f620858a0e41823d9c9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/moe_runner/base.py
@@ -0,0 +1,285 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Callable, Optional, Tuple, TypeGuard
+
+import torch
+
+from sglang.srt.layers.moe.utils import (
+ MoeA2ABackend,
+ MoeRunnerBackend,
+ RoutingMethodType,
+)
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.moe_runner.triton import (
+ TritonRunnerCore,
+ TritonRunnerInput,
+ TritonRunnerOutput,
+ )
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ CombineInputFormat,
+ DispatchOutput,
+ DispatchOutputFormat,
+ )
+
+
+@dataclass
+class MoeRunnerConfig:
+    """Static per-layer configuration shared by all MoE runner backends.
+
+    Every field defaults to ``None``/a neutral value so each quantization
+    method can populate only what its backend needs.
+    """
+
+    # MoE parameters
+    num_experts: Optional[int] = None  # total number of experts (across EP ranks)
+    num_local_experts: Optional[int] = None  # experts hosted on this rank
+    hidden_size: Optional[int] = None
+    intermediate_size_per_partition: Optional[int] = None
+    layer_id: Optional[int] = None
+    top_k: Optional[int] = None  # experts selected per token
+    num_fused_shared_experts: Optional[int] = None
+    params_dtype: Optional[torch.dtype] = None
+    routing_method_type: Optional[RoutingMethodType] = None
+
+    # Runner configuration
+    activation: str = "silu"
+    is_gated: bool = True  # gated MLP (e.g. SwiGLU) vs. plain activation
+    apply_router_weight_on_input: bool = False
+    inplace: bool = True
+    no_combine: bool = False
+    routed_scaling_factor: Optional[float] = None
+    # Passed as the swiglu ("alpha", "limit") activation args in the
+    # triton-kernels runner path.
+    gemm1_alpha: Optional[float] = None
+    gemm1_clamp_limit: Optional[float] = None
+
+
+@dataclass
+class RunnerInput(ABC):
+    """Backend-specific permuted input consumed by a :class:`MoeRunnerCore`."""
+
+    @property
+    @abstractmethod
+    def runner_backend(self) -> MoeRunnerBackend: ...
+
+    def runner_backend_is_triton(self) -> TypeGuard[TritonRunnerInput]:
+        # TypeGuard lets callers narrow to the triton subclass after this check.
+        return self.runner_backend == MoeRunnerBackend.TRITON
+
+
+class RunnerOutput(ABC):
+    """Backend-specific output produced by a :class:`MoeRunnerCore`."""
+
+    @property
+    @abstractmethod
+    def runner_backend(self) -> MoeRunnerBackend: ...
+
+    def runner_backend_is_triton(self) -> TypeGuard[TritonRunnerOutput]:
+        # TypeGuard lets callers narrow to the triton subclass after this check.
+        return self.runner_backend == MoeRunnerBackend.TRITON
+
+
+@dataclass
+class MoeQuantInfo(ABC):
+    """Marker base class for backend-specific MoE quantization data
+    (weights, scales, block shapes); subclasses add concrete fields."""
+
+    pass
+
+
+class MoeRunnerCore(ABC):
+    """Abstract expert-computation core: consumes a RunnerInput and quant
+    info, produces a RunnerOutput. Permutation in/out of the backend layout
+    is handled separately by the registered pre/post permute functions."""
+
+    def __init__(self, config: MoeRunnerConfig):
+        self.config = config
+
+    @abstractmethod
+    def run(
+        self, runner_input: RunnerInput, quant_info: MoeQuantInfo, running_state: dict
+    ) -> RunnerOutput:
+        pass
+
+    @property
+    @abstractmethod
+    def runner_backend(self) -> MoeRunnerBackend: ...
+
+    def runner_backend_is_triton(self) -> TypeGuard[TritonRunnerCore]:
+        # TypeGuard lets callers narrow to the triton subclass after this check.
+        return self.runner_backend == MoeRunnerBackend.TRITON
+
+
+class FusedOpPool:
+ _fused_funcs: dict[str, Callable] = {}
+
+ @classmethod
+ def register_fused_func(
+ cls, a2a_backend_name: str, runner_backend_name: str, fused_func: Callable
+ ):
+ key = (a2a_backend_name, runner_backend_name)
+ if key in cls._fused_funcs:
+ raise ValueError(
+ f"Fused function for {a2a_backend_name} to {runner_backend_name} is already registered."
+ )
+ assert MoeA2ABackend(
+ a2a_backend_name
+ ), f"Invalid dispatch name: {a2a_backend_name}"
+ assert MoeRunnerBackend(
+ runner_backend_name
+ ), f"Invalid runner name: {runner_backend_name}"
+ cls._fused_funcs[key] = fused_func
+
+ @classmethod
+ def get_fused_func(cls, dispatch_name: str, runner_name: str) -> Optional[Callable]:
+ key = (dispatch_name, runner_name)
+ fused_func = cls._fused_funcs.get(key)
+ return fused_func
+
+
+class PermuteMethodPool:
+    """Registries for pre-permute (dispatch output -> runner input) and
+    post-permute (runner output -> combine input) conversion functions.
+
+    NOTE(review): entries are registered under plain string names but looked
+    up with DispatchOutputFormat / MoeRunnerBackend / CombineInputFormat
+    objects; this only matches if those are str-valued enums that hash/compare
+    equal to their names — TODO confirm against `moe.utils`.
+    """
+
+    _pre_permute_methods: dict[
+        Tuple[DispatchOutputFormat, MoeRunnerBackend], Callable
+    ] = {}
+    _post_permute_methods: dict[
+        Tuple[MoeRunnerBackend, CombineInputFormat], Callable
+    ] = {}
+
+    @classmethod
+    def register_pre_permute(
+        cls,
+        dispatch_output_name: str,
+        runner_backend_name: str,
+        permute_func: Callable,
+    ):
+        """
+        Register a customized pre-permute function for the given DispatchOutputFormat and MoeRunnerBackend.
+
+        :param dispatch_output_name: The DispatchOutputFormat name.
+        :param runner_backend_name: The MoeRunnerBackend name.
+        :param permute_func: The permute function to register.
+        :raises ValueError: if the pair is already registered.
+        """
+        # TODO: check if registration is valid
+        key = (dispatch_output_name, runner_backend_name)
+        if key in cls._pre_permute_methods:
+            raise ValueError(
+                f"Pre-permute method for {dispatch_output_name} to {runner_backend_name} is already registered."
+            )
+        cls._pre_permute_methods[key] = permute_func
+
+    @classmethod
+    def register_post_permute(
+        cls,
+        runner_backend_name: str,
+        combine_input_name: str,
+        permute_func: Callable,
+    ):
+        """
+        Register a customized post-permute function for the given MoeRunnerBackend and CombineInputFormat.
+
+        :param runner_backend_name: The MoeRunnerBackend name.
+        :param combine_input_name: The CombineInputFormat name.
+        :param permute_func: The permute function to register.
+        :raises ValueError: if the pair is already registered.
+        """
+        # TODO: check if registration is valid
+        key = (runner_backend_name, combine_input_name)
+        if key in cls._post_permute_methods:
+            raise ValueError(
+                f"Post-permute method for {runner_backend_name} to {combine_input_name} is already registered."
+            )
+        cls._post_permute_methods[key] = permute_func
+
+    @classmethod
+    def get_pre_permute(
+        cls,
+        dispatch_output_format: DispatchOutputFormat,
+        runner_input_format: MoeRunnerBackend,
+    ) -> Callable:
+        """
+        Retrieve the pre-permute function for the given DispatchOutputFormat and MoeRunnerBackend.
+
+        :param dispatch_output_format: The DispatchOutputFormat type.
+        :param runner_input_format: The MoeRunnerBackend type.
+        :return: The registered permute function; asserts if not found.
+        """
+        key = (dispatch_output_format, runner_input_format)
+        pre_permute_func = cls._pre_permute_methods.get(key)
+        assert (
+            pre_permute_func is not None
+        ), f"Pre-permute function for {dispatch_output_format} to {runner_input_format} is not registered"
+        return pre_permute_func
+
+    @classmethod
+    def get_post_permute(
+        cls,
+        runner_output_format: MoeRunnerBackend,
+        combine_input_format: CombineInputFormat,
+    ) -> Callable:
+        """
+        Retrieve the post-permute function for the given MoeRunnerBackend and CombineInputFormat.
+
+        :param runner_output_format: The MoeRunnerBackend type.
+        :param combine_input_format: The CombineInputFormat type.
+        :return: The registered permute function; asserts if not found.
+        """
+        key = (runner_output_format, combine_input_format)
+        post_permute_func = cls._post_permute_methods.get(key)
+        assert (
+            post_permute_func is not None
+        ), f"Post-permute function for {runner_output_format} to {combine_input_format} is not registered"
+        return post_permute_func
+
+
+def register_fused_func(
+ a2a_backend_name: str,
+ runner_backend_name: str,
+) -> Callable:
+ """
+ Decorator to register a fused function for the given DispatchOutputFormat and MoeRunnerBackend.
+
+ :param a2a_backend_name: The A2A backend name.
+ :param runner_backend_name: The MoeRunnerBackend name.
+ :return: The decorator function.
+ """
+
+ def decorator(fused_func: Callable):
+ FusedOpPool.register_fused_func(
+ a2a_backend_name, runner_backend_name, fused_func
+ )
+ return fused_func
+
+ return decorator
+
+
+def register_pre_permute(
+ dispatch_output_name: str,
+ runner_backend_name: str,
+) -> Callable:
+ """
+ Decorator to register a pre-permute function for the given DispatchOutputFormat and MoeRunnerBackend.
+
+ :param dispatch_output_name: The DispatchOutputFormat name.
+ :param runner_backend_name: The MoeRunnerBackend name.
+ :return: The decorator function.
+ """
+
+ def decorator(
+ permute_func: Callable[
+ [DispatchOutput, MoeQuantInfo, MoeRunnerConfig, dict], RunnerInput
+ ],
+ ) -> Callable:
+ PermuteMethodPool.register_pre_permute(
+ dispatch_output_name, runner_backend_name, permute_func
+ )
+ return permute_func
+
+ return decorator
+
+
+def register_post_permute(
+ runner_backend_name: str,
+ combine_input_name: str,
+) -> Callable:
+ """
+ Decorator to register a post-permute function for the given MoeRunnerBackend and CombineInputFormat.
+
+ :param runner_backend_name: The MoeRunnerBackend name.
+ :param combine_input_name: The CombineInputFormat name.
+ :return: The decorator function.
+ """
+
+ def decorator(
+ permute_func: Callable[
+ [RunnerOutput, MoeQuantInfo, MoeRunnerConfig, dict], CombineInput
+ ],
+ ) -> Callable:
+ PermuteMethodPool.register_post_permute(
+ runner_backend_name, combine_input_name, permute_func
+ )
+ return permute_func
+
+ return decorator
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/deep_gemm.py b/sglang/python/sglang/srt/layers/moe/moe_runner/deep_gemm.py
new file mode 100644
index 0000000000000000000000000000000000000000..93bb9cbcb39acdeaa2fc2bf6e5097591951319a5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/moe_runner/deep_gemm.py
@@ -0,0 +1,614 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.moe.moe_runner.base import (
+ MoeQuantInfo,
+ MoeRunnerConfig,
+ MoeRunnerCore,
+ RunnerInput,
+ RunnerOutput,
+ register_post_permute,
+ register_pre_permute,
+)
+from sglang.srt.layers.moe.utils import MoeRunnerBackend
+from sglang.srt.utils import (
+ ceil_div,
+ dispose_tensor,
+ get_bool_env_var,
+ is_cuda,
+ is_hip,
+ is_npu,
+)
+from sglang.srt.utils.offloader import get_offloader
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher.deepep import (
+ DeepEPLLCombineInput,
+ DeepEPLLDispatchOutput,
+ DeepEPNormalCombineInput,
+ DeepEPNormalDispatchOutput,
+ )
+ from sglang.srt.layers.moe.token_dispatcher.standard import (
+ StandardCombineInput,
+ StandardDispatchOutput,
+ )
+
+_is_hip = is_hip()
+_is_npu = is_npu()
+_is_cuda = is_cuda()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if not (_is_npu or _is_hip) and _is_cuda:
+ from sgl_kernel import silu_and_mul
+
+
+_MASKED_GEMM_FAST_ACT = get_bool_env_var("SGLANG_MASKED_GEMM_FAST_ACT")
+_DEEPGEMM_ON_H20 = get_bool_env_var("SGLANG_DEEPGEMM_ON_H20")
+
+
+# TODO(kaixih@nvidia): ideally we should merge this logic into
+# `fill_gateup_input_triton_kernel` to directly generate e8m0 scale.
+@torch.compile(disable=_is_hip or _is_npu)
+def _cast_to_e8m0_with_rounding_up(x: torch.Tensor) -> torch.Tensor:
+    """Extract e8m0 exponents from fp32 scales, rounding up to the next
+    power of two whenever the mantissa is nonzero.
+
+    Returns the uint8 exponents bit-reinterpreted as int32 (packing 4
+    exponents per element), with the last two dims made contiguous in
+    transposed order — presumably the layout DeepGEMM expects; TODO confirm.
+    """
+    # Reinterpret fp32 bits: exponent (and sign) in the high bits, mantissa low.
+    temp = x.to(torch.float32).view(torch.int32)
+    exp = torch.bitwise_right_shift(temp, 23)
+    mant = torch.bitwise_and(temp, 0x7FFFFF)
+    # Round up unless the exponent is saturated (0xFE) or the value is in the
+    # small-subnormal range (exp == 0 with mantissa <= 0x400000).
+    is_ru = torch.logical_and(
+        torch.logical_and((mant > 0), (exp != 0xFE)),
+        ~torch.logical_and((exp == 0), (mant <= 0x400000)),
+    )
+    exp = torch.where(is_ru, exp + 1, exp)
+    new_x = exp.to(torch.uint8).view(torch.int)
+    return new_x.transpose(1, 2).contiguous().transpose(1, 2)
+
+
+def copy_list_to_gpu_no_ce(arr: List[int]):
+ from sgl_kernel.elementwise import copy_to_gpu_no_ce
+
+ tensor_cpu = torch.tensor(arr, dtype=torch.int32, device="cpu")
+ tensor_gpu = torch.empty_like(tensor_cpu, device="cuda")
+ copy_to_gpu_no_ce(tensor_cpu, tensor_gpu)
+ return tensor_gpu
+
+
+@dataclass
+class DeepGemmRunnerInput(RunnerInput):
+    """Permuted fp8 input for the DeepGEMM runner core."""
+
+    hidden_states: torch.Tensor
+    hidden_states_scale: torch.Tensor
+    # True -> masked grouped GEMM (needs masked_m/expected_m);
+    # False -> contiguous grouped GEMM (needs m_indices).
+    use_masked_gemm: bool
+    masked_m: Optional[torch.Tensor] = None
+    expected_m: Optional[int] = None
+    m_indices: Optional[torch.Tensor] = None
+
+    @property
+    def runner_backend(self) -> MoeRunnerBackend:
+        return MoeRunnerBackend.DEEP_GEMM
+
+
+@dataclass
+class DeepGemmRunnerOutput(RunnerOutput):
+    """Expert-computation result produced by the DeepGEMM runner core."""
+
+    hidden_states: torch.Tensor
+
+    @property
+    def runner_backend(self) -> MoeRunnerBackend:
+        return MoeRunnerBackend.DEEP_GEMM
+
+
+@dataclass
+class DeepGemmMoeQuantInfo(MoeQuantInfo):
+    """Weights and quantization scales consumed by the DeepGEMM runner."""
+
+    w13_weight: torch.Tensor  # fused gate+up projection weights
+    w2_weight: torch.Tensor  # down projection weights
+    use_fp8: bool
+    w13_scale: Optional[torch.Tensor] = None
+    w2_scale: Optional[torch.Tensor] = None
+    block_shape: Optional[List[int]] = None
+
+
+class DeepGemmRunnerCore(MoeRunnerCore):
+    """MoE expert computation backed by DeepGEMM grouped fp8 GEMM kernels.
+
+    Runs the gated-silu MLP: grouped GEMM (w13) -> silu-and-mul ->
+    re-quantize to fp8 -> grouped GEMM (w2), in either contiguous or
+    masked layout depending on the runner input.
+    """
+
+    def __init__(self, config: MoeRunnerConfig):
+        super().__init__(config)
+        # Only the gated-silu path is implemented below.
+        assert self.config.activation == "silu"
+        assert self.config.is_gated
+
+    def run(
+        self,
+        runner_input: DeepGemmRunnerInput,
+        quant_info: DeepGemmMoeQuantInfo,
+        running_state: dict,
+    ) -> DeepGemmRunnerOutput:
+        """Dispatch to the masked or contiguous grouped-GEMM path."""
+        if not runner_input.use_masked_gemm:
+            hidden_states = self._run_contiguous_gemm(
+                runner_input, quant_info, running_state
+            )
+        else:
+            hidden_states = self._run_masked_gemm(
+                runner_input, quant_info, running_state
+            )
+        return DeepGemmRunnerOutput(hidden_states=hidden_states)
+
+    def _run_contiguous_gemm(
+        self,
+        runner_input: DeepGemmRunnerInput,
+        quant_info: DeepGemmMoeQuantInfo,
+        running_state: dict,
+    ) -> torch.Tensor:
+        """Contiguous-layout path: tokens are pre-scattered into one flat
+        (all_tokens, K) buffer with per-row expert ids in m_indices."""
+        from sglang.srt.layers.moe.ep_moe.kernels import tma_align_input_scale
+        from sglang.srt.layers.quantization.fp8_kernel import (
+            sglang_per_token_group_quant_fp8,
+        )
+
+        hidden_states = runner_input.hidden_states
+        hidden_states_scale = runner_input.hidden_states_scale
+        all_tokens = running_state["all_tokens"]
+        hidden_states_device = running_state["hidden_states_device"]
+        # NOTE(review): hidden_states_dtype is read but unused in this path.
+        hidden_states_dtype = running_state["hidden_states_dtype"]
+        hidden_states_shape = running_state["hidden_states_shape"]
+        m_indices = runner_input.m_indices
+
+        N = quant_info.w13_weight.size(1)
+        K = hidden_states_shape[1]
+        scale_block_size = 128
+
+        w13_weight_fp8 = (
+            quant_info.w13_weight,
+            quant_info.w13_scale,
+        )
+        w2_weight_fp8 = (quant_info.w2_weight, quant_info.w2_scale)
+
+        # GroupGemm-0: fused gate+up projection.
+        gateup_output = torch.empty(
+            (all_tokens, N),
+            device=hidden_states_device,
+            dtype=torch.bfloat16,
+        )
+        if not deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+            hidden_states_scale = tma_align_input_scale(hidden_states_scale)
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_contig(
+            (hidden_states, hidden_states_scale),
+            w13_weight_fp8,
+            gateup_output,
+            m_indices,
+        )
+
+        # Free the fp8 inputs as soon as the first GEMM no longer needs them.
+        dispose_tensor(hidden_states)
+        dispose_tensor(hidden_states_scale)
+
+        # Activation: silu(gate) * up halves the feature dim (N -> N // 2).
+        down_input = torch.empty(
+            (
+                all_tokens,
+                N // 2,
+            ),
+            device=gateup_output.device,
+            dtype=torch.bfloat16,
+        )
+        silu_and_mul(gateup_output.view(-1, N), down_input)
+        del gateup_output
+
+        # Re-quantize the activation to fp8 for the second GEMM.
+        down_input_fp8, down_input_scale = sglang_per_token_group_quant_fp8(
+            down_input,
+            scale_block_size,
+            column_major_scales=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+            scale_tma_aligned=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+        )
+        del down_input
+
+        # GroupGemm-1: down projection back to hidden size K.
+        down_output = torch.empty(
+            (all_tokens, K),
+            device=hidden_states_device,
+            dtype=torch.bfloat16,
+        )
+        if not deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+            down_input_scale = tma_align_input_scale(down_input_scale)
+
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_contig(
+            (down_input_fp8, down_input_scale),
+            w2_weight_fp8,
+            down_output,
+            m_indices,
+        )
+
+        return down_output
+
+    def _run_masked_gemm(
+        self,
+        runner_input: DeepGemmRunnerInput,
+        quant_info: DeepGemmMoeQuantInfo,
+        running_state: dict,
+    ) -> torch.Tensor:
+        """Masked-layout path: inputs are (num_groups, m, k) per-expert
+        buffers with valid row counts in masked_m."""
+        from sglang.srt.layers import deep_gemm_wrapper
+        from sglang.srt.layers.moe.ep_moe.kernels import (
+            silu_and_mul_masked_post_quant_fwd,
+        )
+        from sglang.srt.layers.quantization.fp8_kernel import (
+            sglang_per_token_group_quant_8bit,
+        )
+
+        hidden_states = runner_input.hidden_states
+        hidden_states_scale = runner_input.hidden_states_scale
+        masked_m = runner_input.masked_m
+        expected_m = runner_input.expected_m
+
+        w13_weight = quant_info.w13_weight
+        w2_weight = quant_info.w2_weight
+        w13_scale = quant_info.w13_scale
+        w2_scale = quant_info.w2_scale
+
+        hidden_states_device = running_state["hidden_states_device"]
+
+        # GroupGemm-0
+        if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+            # Scales not yet packed as int e8m0 must be converted first.
+            if hidden_states_scale.dtype != torch.int:
+                b, s_mn, s_k = hidden_states_scale.shape
+                assert (
+                    s_mn % 4 == 0 and s_k % 4 == 0
+                ), f"scales must be aligned to 4, but got ({b}, {s_mn}, {s_k})"
+                hidden_states_scale = _cast_to_e8m0_with_rounding_up(
+                    hidden_states_scale
+                )
+        else:
+            hidden_states_scale = deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(
+                hidden_states_scale
+            )
+
+        num_groups, m, k = hidden_states.shape
+        n = w13_weight.size(1)
+        gateup_output = torch.empty(
+            (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
+        )
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+            (hidden_states, hidden_states_scale),
+            (w13_weight, w13_scale),
+            gateup_output,
+            masked_m,
+            expected_m,
+        )
+        dispose_tensor(hidden_states)
+        dispose_tensor(hidden_states_scale)
+
+        # Act
+        scale_block_size = 128
+        if _MASKED_GEMM_FAST_ACT:
+            # Fused path: silu_and_mul + fp8 quantization in one kernel.
+            down_input, down_input_scale = sglang_per_token_group_quant_8bit(
+                x=gateup_output,
+                dst_dtype=torch.float8_e4m3fn,
+                group_size=scale_block_size,
+                masked_m=masked_m,
+                column_major_scales=True,
+                scale_tma_aligned=True,
+                scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+                fuse_silu_and_mul=True,
+                enable_v2=True,
+            )
+        else:
+            # Separate activation + quantization into preallocated buffers.
+            down_input = torch.empty(
+                (
+                    gateup_output.shape[0],
+                    gateup_output.shape[1],
+                    gateup_output.shape[2] // 2,
+                ),
+                device=hidden_states_device,
+                dtype=torch.float8_e4m3fn,
+            )
+            down_input_scale = torch.empty(
+                (
+                    gateup_output.shape[0],
+                    gateup_output.shape[1],
+                    gateup_output.shape[2] // 2 // scale_block_size,
+                ),
+                device=hidden_states_device,
+                dtype=torch.float32,
+            )
+            silu_and_mul_masked_post_quant_fwd(
+                gateup_output,
+                down_input,
+                down_input_scale,
+                scale_block_size,
+                masked_m,
+                scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+            )
+        del gateup_output
+
+        # GroupGemm-1
+        n = w2_weight.shape[1]
+
+        if not deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+            down_input_scale = deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(
+                down_input_scale
+            )
+
+        down_output = torch.empty(
+            (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
+        )
+
+        # Optional overlap with a downstream combine (caller-provided args).
+        down_gemm_overlap_args = running_state.get("down_gemm_overlap_args", None)
+        if down_gemm_overlap_args is None:
+            gemm_overlap_args_dict = {}
+        else:
+            down_gemm_overlap_args.start_event.record()
+            # Smaller block_n on H20 for small expected_m (tuning heuristic).
+            max_block_n = (
+                160 if (_DEEPGEMM_ON_H20 and runner_input.expected_m <= 64) else 256
+            )
+            gemm_overlap_args_dict = {
+                "overlap_args": down_gemm_overlap_args,
+                "max_block_n": max_block_n,
+            }
+
+        deep_gemm_return_value = deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+            (down_input, down_input_scale),
+            (w2_weight, w2_scale),
+            down_output,
+            masked_m,
+            expected_m,
+            **gemm_overlap_args_dict,
+        )
+        # Propagate overlap metadata back to the caller when requested.
+        meta_overlap_args = running_state.get("meta_overlap_args", None)
+        if meta_overlap_args is not None:
+            block_m, threshold = deep_gemm_return_value
+            meta_overlap_args["block_m"] = block_m
+            meta_overlap_args["threshold"] = threshold
+
+        return down_output
+
+    @property
+    def runner_backend(self) -> MoeRunnerBackend:
+        return MoeRunnerBackend.DEEP_GEMM
+
+
+@register_pre_permute("standard", "deep_gemm")
+def pre_permute_standard_to_deep_gemm(
+ dispatch_output: StandardDispatchOutput,
+ quant_info: DeepGemmMoeQuantInfo,
+ runner_config: MoeRunnerConfig,
+ running_state: dict,
+) -> DeepGemmRunnerInput:
+ from sglang.srt.layers.moe.ep_moe.kernels import moe_ep_deepgemm_preprocess
+
+ hidden_states, topk_output = (
+ dispatch_output.hidden_states,
+ dispatch_output.topk_output,
+ )
+ topk_weights, topk_ids, _ = topk_output
+
+ hidden_states_shape = hidden_states.shape
+ hidden_states_dtype = hidden_states.dtype
+ hidden_states_device = hidden_states.device
+ hidden_states_ref = hidden_states
+
+ topk_weights, topk_ids = topk_weights, topk_ids
+
+ # PreReorder
+ masked_m, expected_m, src2dst, hidden_states, hidden_states_scale = (
+ moe_ep_deepgemm_preprocess(
+ topk_ids,
+ runner_config.num_local_experts,
+ hidden_states,
+ runner_config.top_k,
+ quant_info.block_shape,
+ )
+ )
+
+ dispose_tensor(hidden_states_ref)
+
+ running_state["topk_ids"] = topk_ids
+ running_state["topk_weights"] = topk_weights
+ running_state["hidden_states_shape"] = hidden_states_shape
+ running_state["hidden_states_dtype"] = hidden_states_dtype
+ running_state["hidden_states_device"] = hidden_states_device
+ running_state["src2dst"] = src2dst
+
+ return DeepGemmRunnerInput(
+ hidden_states=hidden_states,
+ hidden_states_scale=hidden_states_scale,
+ use_masked_gemm=True,
+ masked_m=masked_m,
+ expected_m=expected_m,
+ )
+
+
+@register_post_permute("deep_gemm", "standard")
+def post_permute_deep_gemm_to_standard(
+    runner_output: DeepGemmRunnerOutput,
+    quant_info: DeepGemmMoeQuantInfo,
+    runner_config: MoeRunnerConfig,
+    running_state: dict,
+) -> StandardCombineInput:
+    """Reorder the per-expert DeepGEMM output back to the original token
+    order and apply topk weighting, using metadata stashed by
+    ``pre_permute_standard_to_deep_gemm``."""
+    from sglang.srt.layers.moe.ep_moe.kernels import post_reorder_triton_kernel
+    from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput
+
+    hidden_states_shape = running_state["hidden_states_shape"]
+    hidden_states_dtype = running_state["hidden_states_dtype"]
+    hidden_states_device = running_state["hidden_states_device"]
+    src2dst = running_state["src2dst"]
+    topk_ids = running_state["topk_ids"]
+    topk_weights = running_state["topk_weights"]
+
+    # One triton program per original token; each gathers and weight-sums
+    # its top_k expert outputs via src2dst.
+    output = torch.empty(
+        hidden_states_shape, dtype=hidden_states_dtype, device=hidden_states_device
+    )
+    post_reorder_triton_kernel[(hidden_states_shape[0],)](
+        runner_output.hidden_states,
+        output,
+        src2dst,
+        topk_ids,
+        topk_weights,
+        runner_config.top_k,
+        hidden_states_shape[1],
+        BLOCK_SIZE=512,
+    )
+
+    dispose_tensor(runner_output.hidden_states)
+
+    if runner_config.routed_scaling_factor is not None:
+        output *= runner_config.routed_scaling_factor
+
+    return StandardCombineInput(
+        hidden_states=output,
+    )
+
+
+@register_pre_permute("deepep_ll", "deep_gemm")
+def pre_permute_deepep_ll_to_deep_gemm(
+ dispatch_output: DeepEPLLDispatchOutput,
+ quant_info: DeepGemmMoeQuantInfo,
+ runner_config: MoeRunnerConfig,
+ running_state: dict,
+) -> DeepGemmRunnerInput:
+ hidden_states, hidden_states_scale, topk_ids, topk_weights, masked_m, expected_m = (
+ dispatch_output
+ )
+
+ running_state["topk_ids"] = topk_ids
+ running_state["topk_weights"] = topk_weights
+ running_state["hidden_states_shape"] = hidden_states.shape
+ running_state["hidden_states_dtype"] = hidden_states.dtype
+ running_state["hidden_states_device"] = hidden_states.device
+
+ return DeepGemmRunnerInput(
+ hidden_states=hidden_states,
+ hidden_states_scale=hidden_states_scale,
+ use_masked_gemm=True,
+ masked_m=masked_m,
+ expected_m=expected_m,
+ )
+
+
+@register_post_permute("deep_gemm", "deepep_ll")
+def post_permute_deep_gemm_to_deepep_ll(
+ runner_output: DeepGemmRunnerOutput,
+ quant_info: DeepGemmMoeQuantInfo,
+ runner_config: MoeRunnerConfig,
+ running_state: dict,
+) -> DeepEPLLCombineInput:
+ from sglang.srt.layers.moe.token_dispatcher.deepep import DeepEPLLCombineInput
+
+ return DeepEPLLCombineInput(
+ hidden_states=runner_output.hidden_states,
+ topk_ids=running_state["topk_ids"],
+ topk_weights=running_state["topk_weights"],
+ )
+
+
+@register_pre_permute("deepep_normal", "deep_gemm")
+def pre_permute_deepep_normal_to_deep_gemm(
+    dispatch_output: DeepEPNormalDispatchOutput,
+    quant_info: DeepGemmMoeQuantInfo,
+    runner_config: MoeRunnerConfig,
+    running_state: dict,
+) -> DeepGemmRunnerInput:
+    """Scatter a DeepEP normal dispatch output into the flat contiguous
+    layout used by the contiguous DeepGEMM path, stashing the gather index
+    for the matching post-permute hook."""
+    from sglang.srt.layers.moe.ep_moe.kernels import ep_scatter
+
+    (
+        hidden_states,
+        hidden_states_scale,
+        topk_ids,
+        topk_weights,
+        num_recv_tokens_per_expert,
+    ) = dispatch_output
+    assert runner_config.activation == "silu"
+
+    # Total rows of the flat per-expert buffer (sum over local experts).
+    all_tokens = sum(num_recv_tokens_per_expert)
+    running_state["all_tokens"] = all_tokens
+
+    K = hidden_states.shape[1]
+
+    hidden_states_shape = hidden_states.shape
+    hidden_states_device = hidden_states.device
+    hidden_states_dtype = hidden_states.dtype
+
+    running_state["hidden_states_shape"] = hidden_states_shape
+    running_state["hidden_states_device"] = hidden_states_device
+    running_state["hidden_states_dtype"] = hidden_states_dtype
+    running_state["topk_ids"] = topk_ids
+    running_state["topk_weights"] = topk_weights
+
+    input_tensor = torch.empty(
+        (all_tokens, K),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+        # UE8M0: packed int scales, allocated transposed so rows are k-major.
+        # TODO check whether need `zeros`
+        input_tensor_scale = torch.zeros(
+            (ceil_div(K // 128, 4), all_tokens),
+            device=hidden_states.device,
+            dtype=torch.int,
+        ).transpose(0, 1)
+    else:
+        # One fp32 scale per 128-wide group along K.
+        input_tensor_scale = torch.empty(
+            (all_tokens, K // 128),
+            device=hidden_states.device,
+            dtype=torch.float32,
+        )
+    # Per-row expert id for the contiguous grouped GEMM.
+    m_indices = torch.empty(all_tokens, device=hidden_states.device, dtype=torch.int32)
+    # Maps each (token, topk) slot to its row in the scattered buffer.
+    output_index = torch.empty_like(topk_ids)
+
+    if get_offloader().forbid_copy_engine_usage:
+        num_recv_tokens_per_expert_gpu = copy_list_to_gpu_no_ce(
+            num_recv_tokens_per_expert
+        )
+    else:
+        num_recv_tokens_per_expert_gpu = torch.tensor(
+            num_recv_tokens_per_expert,
+            dtype=torch.int32,
+            pin_memory=True,
+            device="cpu",
+        ).cuda(non_blocking=True)
+    expert_start_loc = torch.empty_like(num_recv_tokens_per_expert_gpu)
+
+    ep_scatter(
+        hidden_states,
+        hidden_states_scale,
+        topk_ids,
+        num_recv_tokens_per_expert_gpu,
+        expert_start_loc,
+        input_tensor,
+        input_tensor_scale,
+        m_indices,
+        output_index,
+        scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+    )
+    # Original dispatch buffers are no longer needed after the scatter.
+    dispose_tensor(hidden_states)
+    dispose_tensor(hidden_states_scale)
+
+    running_state["output_index"] = output_index
+
+    return DeepGemmRunnerInput(
+        hidden_states=input_tensor,
+        hidden_states_scale=input_tensor_scale,
+        use_masked_gemm=False,
+        m_indices=m_indices,
+    )
+
+
+@register_post_permute("deep_gemm", "deepep_normal")
+def post_permute_deep_gemm_to_deepep_normal(
+ runner_output: DeepGemmRunnerOutput,
+ quant_info: DeepGemmMoeQuantInfo,
+ runner_config: MoeRunnerConfig,
+ running_state: dict,
+) -> DeepEPNormalCombineInput:
+ from sglang.srt.layers.moe.ep_moe.kernels import ep_gather
+ from sglang.srt.layers.moe.token_dispatcher.deepep import DeepEPNormalCombineInput
+
+ hidden_states = runner_output.hidden_states
+ topk_ids = running_state["topk_ids"]
+ topk_weights = running_state["topk_weights"]
+ output_index = running_state["output_index"]
+
+ gather_out = torch.empty(
+ running_state["hidden_states_shape"],
+ device=running_state["hidden_states_device"],
+ dtype=torch.bfloat16,
+ )
+ ep_gather(hidden_states, topk_ids, topk_weights, output_index, gather_out)
+
+ return DeepEPNormalCombineInput(
+ hidden_states=gather_out,
+ topk_ids=running_state["topk_ids"],
+ topk_weights=running_state["topk_weights"],
+ )
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/flashinfer_trtllm.py b/sglang/python/sglang/srt/layers/moe/moe_runner/flashinfer_trtllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d6fb78c4fe86d568ecef6c1d86e581e1bf293a1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/moe_runner/flashinfer_trtllm.py
@@ -0,0 +1,579 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, cast
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import get_tp_group
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+ use_symmetric_memory,
+)
+from sglang.srt.layers.dp_attention import is_allocation_symmetric
+from sglang.srt.layers.moe.moe_runner.base import (
+ MoeQuantInfo,
+ MoeRunnerConfig,
+ register_fused_func,
+)
+from sglang.srt.layers.quantization.fp8_kernel import (
+ per_token_group_quant_fp8,
+ scaled_fp8_quant,
+)
+from sglang.srt.layers.utils import copy_or_rebind_param
+from sglang.srt.utils.common import (
+ is_cuda_alike,
+ is_flashinfer_available,
+ is_sm120_supported,
+ next_power_of_2,
+)
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ StandardCombineInput,
+ StandardDispatchOutput,
+ )
+
# Pick an FP4 quantization kernel at import time: prefer FlashInfer's
# implementation when it is available and the GPU supports SM120; otherwise
# fall back to the in-tree JIT kernel on CUDA-alike devices; leave it as None
# where neither path applies (callers must not hit the FP4 path then).
if is_flashinfer_available() and is_sm120_supported():
    from flashinfer import fp4_quantize
elif is_cuda_alike():
    from sglang.jit_kernel.nvfp4 import scaled_fp4_quant as fp4_quantize
else:
    fp4_quantize = None
+
+
def align_fp8_moe_weights_for_flashinfer_trtllm(
    layer: Module, swap_w13_halves: bool = False
) -> None:
    """Prepare FP8 MoE weights/scales for FlashInfer TRT-LLM kernels.

    Args:
        layer: The MoE layer to process.
        swap_w13_halves: If True, swap W13 halves from [Up, Gate] to [Gate, Up].
            This is needed for ModelOpt FP8 checkpoints which store weights in
            [Up, Gate] order, while regular FP8 checkpoints store them in
            [Gate, Up].
    """
    from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_a

    gemm1 = cast(torch.Tensor, layer.w13_weight)
    gemm2 = cast(torch.Tensor, layer.w2_weight)
    n_experts, double_inter, hidden_dim = gemm1.shape

    if swap_w13_halves:
        # View as [experts, 2, inter, hidden], reverse the pair axis, and
        # flatten back: [Up, Gate] -> [Gate, Up].
        half = double_inter // 2
        gemm1 = (
            gemm1.reshape(n_experts, 2, half, hidden_dim)
            .flip(dims=[1])
            .reshape(n_experts, double_inter, hidden_dim)
        )

    # Interleave gate/up rows per expert for the gated-activation GEMM.
    interleaved = torch.stack(
        [reorder_rows_for_gated_act_gemm(gemm1[e]) for e in range(n_experts)]
    ).reshape(n_experts, double_inter, hidden_dim)

    # Shuffle both weight tensors for the transposed MMA output layout.
    epilogue_tile_m = 128
    shuffled_w13 = torch.stack(
        [
            shuffle_matrix_a(interleaved[e].view(torch.uint8), epilogue_tile_m)
            for e in range(n_experts)
        ]
    )
    shuffled_w2 = torch.stack(
        [
            shuffle_matrix_a(gemm2[e].view(torch.uint8), epilogue_tile_m)
            for e in range(n_experts)
        ]
    )

    layer.w13_weight = Parameter(
        shuffled_w13.view(torch.float8_e4m3fn),
        requires_grad=False,
    )
    layer.w2_weight = Parameter(
        shuffled_w2.view(torch.float8_e4m3fn),
        requires_grad=False,
    )

    # Precompute and register per-expert output scaling factors for FI MoE.
    # Note: w13_input_scale and w2_input_scale are scalar Parameters
    # post-reduction.
    assert hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None
    assert hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None
    assert hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None
    assert hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None

    in_scale = cast(torch.Tensor, layer.w13_input_scale).to(torch.float32)
    act_scale = cast(torch.Tensor, layer.w2_input_scale).to(torch.float32)
    w13_scale = cast(torch.Tensor, layer.w13_weight_scale).to(torch.float32)
    w2_scale = cast(torch.Tensor, layer.w2_weight_scale).to(torch.float32)

    layer.output1_scales_scalar = Parameter(
        w13_scale * in_scale * (1.0 / act_scale), requires_grad=False
    )
    layer.output1_scales_gate_scalar = Parameter(
        w13_scale * in_scale, requires_grad=False
    )
    layer.output2_scales_scalar = Parameter(
        act_scale * w2_scale, requires_grad=False
    )
+
+
def align_fp4_moe_weights_for_flashinfer_trtllm(layer: Module) -> None:
    """Prepare FP4 MoE weights/scales for FlashInfer TRT-LLM kernels.

    This function handles the weight transformation needed for FP4 TRTLLM MoE:
    - Reorders weights for gated activation GEMM
    - Shuffles weights and scales for transposed MMA output
    - Computes the output scale factors
    """
    from sglang.srt.layers.quantization.utils import (
        prepare_static_weights_for_trtllm_fp4_moe,
    )

    gemm1_w = cast(torch.Tensor, layer.w13_weight)
    gemm2_w = cast(torch.Tensor, layer.w2_weight)
    gemm1_s = cast(torch.Tensor, layer.w13_weight_scale)
    gemm2_s = cast(torch.Tensor, layer.w2_weight_scale)

    shuffled = prepare_static_weights_for_trtllm_fp4_moe(
        gemm1_w,
        gemm2_w,
        gemm1_s,
        gemm2_s,
        gemm2_w.size(-2),  # hidden_size
        gemm1_w.size(-2) // 2,  # intermediate_size
        gemm1_w.size(0),  # num_experts
    )
    w13_shuffled, w13_scales_shuffled, w2_shuffled, w2_scales_shuffled = shuffled

    # Register the shuffled tensors under the names the TRT-LLM path reads.
    copy_or_rebind_param(layer, "gemm1_weights_fp4_shuffled", w13_shuffled)
    copy_or_rebind_param(layer, "gemm2_weights_fp4_shuffled", w2_shuffled)
    copy_or_rebind_param(layer, "gemm1_scales_fp4_shuffled", w13_scales_shuffled)
    copy_or_rebind_param(layer, "gemm2_scales_fp4_shuffled", w2_scales_shuffled)

    # Additional scaling factor needed by TRT-LLM: product of the w2 input
    # quantization scale and the g1 alphas, stored in float32.
    w2_in_scale = cast(torch.Tensor, layer.w2_input_scale_quant)
    alphas1 = cast(torch.Tensor, layer.g1_alphas)
    copy_or_rebind_param(
        layer,
        "g1_scale_c",
        (w2_in_scale * alphas1).to(torch.float32),
    )

    # The raw (unshuffled) weights/scales are no longer needed by TRT-LLM.
    del (
        layer.w2_weight,
        layer.w2_weight_scale,
        layer.w13_weight,
        layer.w13_weight_scale,
    )
+
+
@dataclass
class FlashInferTrtllmFp8MoeQuantInfo(MoeQuantInfo):
    """Quantization payload consumed by FlashInfer TRT-LLM FP8 MoE kernels.

    Exactly one of the two scale groups is expected to be populated, selected
    by ``block_quant``: the block-quant fields feed
    ``trtllm_fp8_block_scale_moe`` and the per-tensor fields feed
    ``trtllm_fp8_per_tensor_scale_moe`` (see
    ``fused_experts_none_to_flashinfer_trtllm_fp8``).
    """

    # Weights
    w13_weight: torch.Tensor  # fused gate/up projection weights
    w2_weight: torch.Tensor  # down projection weights

    # Expert-parallel metadata
    global_num_experts: int  # total experts across all ranks
    local_expert_offset: int  # first expert index owned by this rank
    local_num_experts: int  # number of experts on this rank
    intermediate_size: int

    routing_method_type: int  # RoutingMethodType value passed to the kernel

    # Block-quant path
    block_quant: bool  # True -> block-scale kernel; False -> per-tensor kernel
    weight_block_k: int | None = None  # group size for per-token-group quant
    w13_weight_scale_inv: torch.Tensor | None = None
    w2_weight_scale_inv: torch.Tensor | None = None

    # Per-tensor path
    w13_input_scale: torch.Tensor | None = None
    output1_scales_scalar: torch.Tensor | None = None
    output1_scales_gate_scalar: torch.Tensor | None = None
    output2_scales_scalar: torch.Tensor | None = None
    use_routing_scales_on_input: bool = False
+
+
def fused_experts_none_to_flashinfer_trtllm_fp8(
    dispatch_output: StandardDispatchOutput,
    quant_info: FlashInferTrtllmFp8MoeQuantInfo,
    runner_config: MoeRunnerConfig,
) -> StandardCombineInput:
    """Run the FlashInfer TRT-LLM FP8 fused MoE kernel on a standard dispatch.

    Selects between the block-scale and per-tensor-scale FP8 kernels based on
    ``quant_info.block_quant``, quantizing the activations accordingly before
    the call. Requires silu activation and a bypassed (raw router logits)
    top-k output; ``no_combine`` is not supported.
    """
    from flashinfer.fused_moe import (
        trtllm_fp8_block_scale_moe,
        trtllm_fp8_per_tensor_scale_moe,
    )

    from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput
    from sglang.srt.layers.moe.topk import TopKOutputChecker
    from sglang.srt.layers.moe.utils import RoutingMethodType

    assert runner_config.activation == "silu", "Only silu is supported."
    assert not runner_config.no_combine, "no_combine is not supported for flashinfer."

    hidden_states = dispatch_output.hidden_states
    topk_output = dispatch_output.topk_output
    # The kernel performs routing itself, so the top-k output must carry raw
    # router logits rather than precomputed (ids, weights).
    assert TopKOutputChecker.format_is_bypassed(topk_output)

    router_logits = topk_output.router_logits
    topk_config = topk_output.topk_config
    # Match the bias dtype to the activations before handing it to the kernel.
    correction_bias = (
        None
        if topk_config.correction_bias is None
        else topk_config.correction_bias.to(hidden_states.dtype)
    )

    routing_method_type = quant_info.routing_method_type

    if quant_info.block_quant:
        assert quant_info.weight_block_k is not None
        assert quant_info.w13_weight_scale_inv is not None
        assert quant_info.w2_weight_scale_inv is not None

        # Per-token-group FP8 quantization of the activations; the kernel
        # expects the scales transposed and contiguous.
        a_q, a_sf = per_token_group_quant_fp8(hidden_states, quant_info.weight_block_k)
        a_sf_t = a_sf.t().contiguous()

        with use_symmetric_memory(
            get_tp_group(), disabled=not is_allocation_symmetric()
        ):
            # FIXME: there is a bug in the trtllm_fp8_block_scale_moe.
            # It ignored the `output` argument. https://github.com/flashinfer-ai/flashinfer/blob/da01b1bd8f9f22aec8c0eea189ad54860b034947/flashinfer/fused_moe/core.py#L1323-L1325
            # so we put the whole function under the ``use_symmetric_memory`` context manager.
            # If the bug is fixed, we can only put the output tensor allocation under the context manager.
            output = trtllm_fp8_block_scale_moe(
                routing_logits=(
                    # DeepSeekV3-style routing requires float32 logits.
                    router_logits.to(torch.float32)
                    if routing_method_type == RoutingMethodType.DeepSeekV3
                    else router_logits
                ),
                routing_bias=correction_bias,
                hidden_states=a_q,
                hidden_states_scale=a_sf_t,
                gemm1_weights=quant_info.w13_weight,
                gemm1_weights_scale=quant_info.w13_weight_scale_inv,
                gemm2_weights=quant_info.w2_weight,
                gemm2_weights_scale=quant_info.w2_weight_scale_inv,
                num_experts=quant_info.global_num_experts,
                top_k=topk_config.top_k,
                n_group=(
                    topk_config.num_expert_group if topk_config.num_expert_group else 0
                ),
                topk_group=topk_config.topk_group if topk_config.topk_group else 0,
                intermediate_size=quant_info.intermediate_size,
                local_expert_offset=quant_info.local_expert_offset,
                local_num_experts=quant_info.local_num_experts,
                routed_scaling_factor=(
                    runner_config.routed_scaling_factor
                    if runner_config.routed_scaling_factor is not None
                    else 1.0
                ),
                routing_method_type=routing_method_type,
                use_shuffled_weight=False,
                tune_max_num_tokens=next_power_of_2(a_q.shape[0]),
            )
    else:
        assert quant_info.w13_input_scale is not None
        assert quant_info.output1_scales_scalar is not None
        assert quant_info.output1_scales_gate_scalar is not None
        assert quant_info.output2_scales_scalar is not None

        # Per-tensor FP8 quantization with the precomputed static input scale.
        a_q, _ = scaled_fp8_quant(hidden_states, quant_info.w13_input_scale)
        # This kernel variant takes bf16 routing bias/logits.
        routing_bias_cast = (
            None if correction_bias is None else correction_bias.to(torch.bfloat16)
        )

        with use_symmetric_memory(
            get_tp_group(), disabled=not is_allocation_symmetric()
        ):
            output = trtllm_fp8_per_tensor_scale_moe(
                routing_logits=router_logits.to(torch.bfloat16),
                routing_bias=routing_bias_cast,
                hidden_states=a_q,
                gemm1_weights=quant_info.w13_weight,
                output1_scales_scalar=quant_info.output1_scales_scalar,
                output1_scales_gate_scalar=quant_info.output1_scales_gate_scalar,
                gemm2_weights=quant_info.w2_weight,
                output2_scales_scalar=quant_info.output2_scales_scalar,
                num_experts=quant_info.global_num_experts,
                top_k=topk_config.top_k,
                n_group=(
                    topk_config.num_expert_group if topk_config.num_expert_group else 0
                ),
                topk_group=topk_config.topk_group if topk_config.topk_group else 0,
                intermediate_size=quant_info.intermediate_size,
                local_expert_offset=quant_info.local_expert_offset,
                local_num_experts=quant_info.local_num_experts,
                routed_scaling_factor=(
                    runner_config.routed_scaling_factor
                    if runner_config.routed_scaling_factor is not None
                    else 1.0
                ),
                use_routing_scales_on_input=quant_info.use_routing_scales_on_input,
                routing_method_type=routing_method_type,
                tune_max_num_tokens=next_power_of_2(a_q.shape[0]),
            )

    return StandardCombineInput(hidden_states=output)
+
+
@dataclass
class FlashInferTrtllmFp4MoeQuantInfo(MoeQuantInfo):
    """Quantization payload consumed by FlashInfer TRT-LLM FP4 MoE kernels.

    All tensors are expected to have been prepared by
    ``align_fp4_moe_weights_for_flashinfer_trtllm`` before construction.
    """

    # Shuffled FP4 weights (processed by align_fp4_moe_weights_for_flashinfer_trtllm)
    gemm1_weights_fp4_shuffled: torch.Tensor
    gemm2_weights_fp4_shuffled: torch.Tensor
    gemm1_scales_fp4_shuffled: torch.Tensor
    gemm2_scales_fp4_shuffled: torch.Tensor

    # Scaling factors
    g1_scale_c: torch.Tensor  # w2_input_scale_quant * g1_alphas (float32)
    g1_alphas: torch.Tensor
    g2_alphas: torch.Tensor
    w13_input_scale_quant: torch.Tensor  # global scale for activation FP4 quant

    # Expert-parallel metadata
    global_num_experts: int  # total experts across all ranks
    local_expert_offset: int  # first expert index owned by this rank
    local_num_experts: int  # number of experts on this rank
    intermediate_size_per_partition: int

    routing_method_type: int  # RoutingMethodType value passed to the kernel
+
+
def quantize_hidden_states_fp4(
    hidden_states: torch.Tensor,
    input_scale_quant: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Quantize hidden states to FP4 for TRTLLM MoE.

    Global scale factor is set by ModelOptNvFp4FusedMoEMethod during weight
    loading. Only block scales are computed at runtime for efficiency.

    Returns (packed_fp4_uint8, scale_float8_e4m3fn_runtime)
    """

    # fp4_quantize returns (packed_uint8, scale_fp8); only the block scales
    # are computed at runtime.
    packed_bytes, scale_bytes = fp4_quantize(
        hidden_states,
        input_scale_quant,
        16,  # sf_vec_size
        False,  # use_ue8m0
        False,  # is_sf_swizzled_layout
    )

    tokens, dim = hidden_states.shape
    # Two FP4 values are packed per byte, hence dim // 2 columns.
    packed = packed_bytes.reshape(tokens, dim // 2)
    # TRT-LLM expects hidden state scales shaped as [seq_len, hidden_size // 16]
    scales = scale_bytes.view(torch.float8_e4m3fn).reshape(tokens, dim // 16)

    return packed, scales
+
+
def fused_experts_none_to_flashinfer_trtllm_fp4(
    dispatch_output: StandardDispatchOutput,
    quant_info: FlashInferTrtllmFp4MoeQuantInfo,
    runner_config: MoeRunnerConfig,
) -> StandardCombineInput:
    """FlashInfer TRTLLM FP4 MoE forward pass.

    This function handles the FP4 TRTLLM MoE path that was previously in
    FlashInferFP4MoE.forward_impl and ModelOptNvFp4FusedMoEMethod.apply.

    Requires silu activation, a gated MoE, and a bypassed (raw router logits)
    top-k output; the kernel performs routing internally.
    """
    from flashinfer.fused_moe import trtllm_fp4_block_scale_moe

    from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput
    from sglang.srt.layers.moe.topk import TopKOutputChecker
    from sglang.srt.layers.moe.utils import RoutingMethodType

    assert runner_config.activation == "silu", "Only silu is supported for FP4 MoE."
    assert runner_config.is_gated, "Only gated MoEs are supported for FP4 MoE."

    hidden_states = dispatch_output.hidden_states
    topk_output = dispatch_output.topk_output
    assert TopKOutputChecker.format_is_bypassed(topk_output)

    router_logits = topk_output.router_logits
    topk_config = topk_output.topk_config
    routing_method_type = quant_info.routing_method_type

    # Quantize hidden states to FP4 (packed uint8 + fp8 block scales).
    hs_fp4, hs_scale_linear = quantize_hidden_states_fp4(
        hidden_states, quant_info.w13_input_scale_quant
    )

    # DeepSeekV3 style routing requires float32 router logits
    if routing_method_type == RoutingMethodType.DeepSeekV3:
        router_logits = router_logits.to(torch.float32)

    # Match the bias dtype to the activations before handing it to the kernel.
    correction_bias = (
        None
        if topk_config.correction_bias is None
        else topk_config.correction_bias.to(hidden_states.dtype)
    )

    with use_symmetric_memory(get_tp_group(), disabled=not is_allocation_symmetric()):
        num_tokens = hs_fp4.shape[0]
        # Packed uint8 holds two FP4 values per byte, so the logical hidden
        # size is twice the packed width.
        hidden_size = (
            hs_fp4.shape[-1] * 2 if hs_fp4.dtype == torch.uint8 else hs_fp4.shape[-1]
        )
        # Output buffer allocated inside the symmetric-memory context; the
        # kernel writes the finalized bf16 result into it via `output=`.
        symm_output = torch.empty(
            num_tokens, hidden_size, dtype=torch.bfloat16, device=hs_fp4.device
        )

        # NOTE(review): the kernel returns a tuple; element [0] is used as the
        # combined result — presumably aliasing `symm_output`; confirm against
        # the flashinfer API if this buffer is relied on elsewhere.
        result = trtllm_fp4_block_scale_moe(
            routing_logits=router_logits,
            routing_bias=correction_bias,
            hidden_states=hs_fp4,
            hidden_states_scale=hs_scale_linear.view(torch.float8_e4m3fn).reshape(
                *hs_scale_linear.shape[:-1], -1
            ),
            gemm1_weights=quant_info.gemm1_weights_fp4_shuffled,
            gemm1_weights_scale=quant_info.gemm1_scales_fp4_shuffled.view(
                torch.float8_e4m3fn
            ),
            gemm1_bias=None,
            gemm1_alpha=None,
            gemm1_beta=None,
            gemm1_clamp_limit=None,
            gemm2_weights=quant_info.gemm2_weights_fp4_shuffled,
            gemm2_weights_scale=quant_info.gemm2_scales_fp4_shuffled.view(
                torch.float8_e4m3fn
            ),
            gemm2_bias=None,
            output1_scale_scalar=quant_info.g1_scale_c,
            output1_scale_gate_scalar=quant_info.g1_alphas,
            output2_scale_scalar=quant_info.g2_alphas,
            num_experts=quant_info.global_num_experts,
            top_k=topk_config.top_k,
            n_group=topk_config.num_expert_group,
            topk_group=topk_config.topk_group,
            intermediate_size=quant_info.intermediate_size_per_partition,
            local_expert_offset=quant_info.local_expert_offset,
            local_num_experts=quant_info.local_num_experts,
            routed_scaling_factor=runner_config.routed_scaling_factor,
            tile_tokens_dim=None,
            routing_method_type=(
                routing_method_type
                if routing_method_type is not None
                else RoutingMethodType.Default
            ),
            do_finalize=True,
            tune_max_num_tokens=next_power_of_2(hs_fp4.shape[0]),
            output=symm_output,
        )[0]

    return StandardCombineInput(hidden_states=result)
+
+
@dataclass
class FlashInferTrtllmBf16MoeQuantInfo(MoeQuantInfo):
    """Quantization payload consumed by FlashInfer TRT-LLM BF16 MoE kernels.

    Unlike the FP8/FP4 variants, no scale tensors are needed — the kernel
    consumes the weights directly.
    """

    gemm1_weights: torch.Tensor  # fused gate/up projection weights
    gemm2_weights: torch.Tensor  # down projection weights

    # Expert-parallel metadata
    global_num_experts: int  # total experts across all ranks
    local_expert_offset: int  # first expert index owned by this rank
+
+
def fused_experts_none_to_flashinfer_trtllm_bf16(
    dispatch_output: StandardDispatchOutput,
    quant_info: FlashInferTrtllmBf16MoeQuantInfo,
    runner_config: MoeRunnerConfig,
) -> StandardCombineInput:
    """Run the FlashInfer TRT-LLM BF16 fused MoE kernel on a standard dispatch."""
    # lazy import
    from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput

    try:
        from flashinfer.fused_moe import trtllm_bf16_moe
    except ImportError as e:
        raise ImportError(
            "Can't import trtllm_bf16_moe from flashinfer. "
            "Please check flashinfer version to use bf16 with flashinfer_trtllm backend."
        ) from e

    # Preconditions of the fused kernel, checked in a fixed order.
    assert (
        runner_config.activation == "silu"
    ), "Only silu is supported for flashinfer trtllm moe"
    assert (
        dispatch_output.topk_output.topk_config.renormalize
    ), "Renormalize is required for flashinfer trtllm moe"
    assert (
        runner_config.num_fused_shared_experts == 0
    ), "Fused shared experts are not supported for flashinfer trtllm moe"
    assert (
        runner_config.is_gated
    ), "Only gated MoEs are supported for flashinfer trtllm moe"
    from sglang.srt.layers.moe.topk import TopKOutputChecker

    # Routing is done inside the kernel, so raw router logits are required.
    assert TopKOutputChecker.format_is_bypassed(dispatch_output.topk_output)

    activations = dispatch_output.hidden_states
    routing = dispatch_output.topk_output
    routing_cfg = routing.topk_config

    with use_symmetric_memory(get_tp_group(), disabled=not is_allocation_symmetric()):

        # Single fused call: routing + expert GEMMs + combine.
        combined = trtllm_bf16_moe(
            routing_logits=routing.router_logits,
            routing_bias=routing_cfg.correction_bias,
            hidden_states=activations,
            gemm1_weights=quant_info.gemm1_weights,
            gemm2_weights=quant_info.gemm2_weights,
            num_experts=quant_info.global_num_experts,
            top_k=routing_cfg.top_k,
            n_group=routing_cfg.num_expert_group,
            topk_group=routing_cfg.topk_group,
            intermediate_size=runner_config.intermediate_size_per_partition,
            local_expert_offset=quant_info.local_expert_offset,
            local_num_experts=runner_config.num_local_experts,
            routing_method_type=runner_config.routing_method_type,
            routed_scaling_factor=runner_config.routed_scaling_factor,
            tune_max_num_tokens=next_power_of_2(activations.shape[0]),
        )

    return StandardCombineInput(hidden_states=combined)
+
+
@register_fused_func("none", "flashinfer_trtllm")
def fused_experts_none_to_flashinfer_trtllm(
    dispatch_output: StandardDispatchOutput,
    quant_info: MoeQuantInfo,
    runner_config: MoeRunnerConfig,
) -> StandardCombineInput:
    """Dispatch to FP8 or FP4 FlashInfer TRT-LLM MoE based on quant_info type."""
    # Checked in the same order as the original if-chain (FP4, FP8, BF16).
    handlers = (
        (FlashInferTrtllmFp4MoeQuantInfo, fused_experts_none_to_flashinfer_trtllm_fp4),
        (FlashInferTrtllmFp8MoeQuantInfo, fused_experts_none_to_flashinfer_trtllm_fp8),
        (
            FlashInferTrtllmBf16MoeQuantInfo,
            fused_experts_none_to_flashinfer_trtllm_bf16,
        ),
    )
    for quant_cls, handler in handlers:
        if isinstance(quant_info, quant_cls):
            return handler(dispatch_output, quant_info, runner_config)
    raise TypeError(
        f"Unexpected quant_info type for flashinfer_trtllm: {type(quant_info)}"
    )
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/marlin.py b/sglang/python/sglang/srt/layers/moe/moe_runner/marlin.py
new file mode 100644
index 0000000000000000000000000000000000000000..45104dd27805acadc1113094167052d538345230
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/moe_runner/marlin.py
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.layers.moe.moe_runner.base import (
+ MoeQuantInfo,
+ MoeRunnerConfig,
+ RunnerInput,
+ RunnerOutput,
+ register_fused_func,
+)
+from sglang.srt.layers.moe.utils import MoeRunnerBackend
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ StandardCombineInput,
+ StandardDispatchOutput,
+ )
+
# Process-wide scratch buffer for the Marlin MoE kernel, created lazily on
# first use and rebuilt whenever the activations live on a different device.
MARLIN_MOE_WORKSPACE: Optional[torch.Tensor] = None
+
+
@dataclass
class MarlinRunnerInput(RunnerInput):
    """Input bundle passed to the Marlin runner core."""

    hidden_states: torch.Tensor  # token activations
    topk_weights: torch.Tensor  # per-token routing weights
    topk_ids: torch.Tensor  # per-token selected expert ids
    router_logits: torch.Tensor  # raw gating output

    @property
    def runner_backend(self) -> MoeRunnerBackend:
        # Identifies this payload as belonging to the Marlin backend.
        return MoeRunnerBackend.MARLIN
+
+
@dataclass
class MarlinRunnerOutput(RunnerOutput):
    """Output bundle returned from the Marlin runner core."""

    hidden_states: torch.Tensor  # combined expert outputs

    @property
    def runner_backend(self) -> MoeRunnerBackend:
        # Identifies this payload as belonging to the Marlin backend.
        return MoeRunnerBackend.MARLIN
+
+
@dataclass
class MarlinMoeQuantInfo(MoeQuantInfo):
    """Quantization payload consumed by the Marlin backend.

    Holds the packed quantized weights plus the scheme-specific metadata
    (GPTQ group indices or AWQ zero points) forwarded to ``fused_marlin_moe``.
    """

    w13_qweight: torch.Tensor  # packed gate/up projection weights
    w2_qweight: torch.Tensor  # packed down projection weights
    w13_scales: torch.Tensor
    w2_scales: torch.Tensor
    w13_g_idx_sort_indices: Optional[torch.Tensor]
    w2_g_idx_sort_indices: Optional[torch.Tensor]
    weight_bits: int  # quantization bit-width passed as num_bits

    # GPTQ specific (Optional)
    w13_g_idx: Optional[torch.Tensor] = None
    w2_g_idx: Optional[torch.Tensor] = None
    is_k_full: bool = True

    # AWQ specific (Optional)
    w13_qzeros: Optional[torch.Tensor] = None
    w2_qzeros: Optional[torch.Tensor] = None

    # Optional
    expert_map: Optional[torch.Tensor] = None
+
+
@register_fused_func("none", "marlin")
def fused_experts_none_to_marlin(
    dispatch_output: StandardDispatchOutput,
    quant_info: MarlinMoeQuantInfo,
    runner_config: MoeRunnerConfig,
) -> StandardCombineInput:
    """Run the fused Marlin MoE kernel on a standard (non-a2a) dispatch."""
    global MARLIN_MOE_WORKSPACE
    from sglang.srt.layers.moe.fused_moe_triton.fused_marlin_moe import fused_marlin_moe
    from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput
    from sglang.srt.layers.quantization.marlin_utils import marlin_make_workspace

    activations = dispatch_output.hidden_states
    routing = dispatch_output.topk_output

    assert runner_config.activation == "silu", "Only SiLU activation is supported."

    # The workspace is cached across calls; (re)create it on first use or when
    # the activations moved to a different device.
    needs_workspace = (
        MARLIN_MOE_WORKSPACE is None
        or MARLIN_MOE_WORKSPACE.device != activations.device
    )
    if needs_workspace:
        MARLIN_MOE_WORKSPACE = marlin_make_workspace(
            activations.device, max_blocks_per_sm=4
        )

    combined = fused_marlin_moe(
        hidden_states=activations,
        w1=quant_info.w13_qweight,
        w2=quant_info.w2_qweight,
        w1_scale=quant_info.w13_scales,
        w2_scale=quant_info.w2_scales,
        gating_output=routing.router_logits,
        topk_weights=routing.topk_weights,
        topk_ids=routing.topk_ids,
        expert_map=quant_info.expert_map,
        g_idx1=quant_info.w13_g_idx,
        g_idx2=quant_info.w2_g_idx,
        sort_indices1=quant_info.w13_g_idx_sort_indices,
        sort_indices2=quant_info.w2_g_idx_sort_indices,
        w1_zeros=quant_info.w13_qzeros,
        w2_zeros=quant_info.w2_qzeros,
        workspace=MARLIN_MOE_WORKSPACE,
        num_bits=quant_info.weight_bits,
        is_k_full=quant_info.is_k_full,
        inplace=runner_config.inplace,
        routed_scaling_factor=runner_config.routed_scaling_factor,
    ).to(activations.dtype)

    return StandardCombineInput(
        hidden_states=combined,
    )
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/runner.py b/sglang/python/sglang/srt/layers/moe/moe_runner/runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b58cd3115bda20fec9570908414493c5bd62e4f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/moe_runner/runner.py
@@ -0,0 +1,119 @@
+from __future__ import annotations
+
+import logging
+import os
+from typing import TYPE_CHECKING, Optional
+
+from sglang.srt.layers.moe.moe_runner.base import (
+ FusedOpPool,
+ MoeRunnerConfig,
+ PermuteMethodPool,
+)
+from sglang.srt.layers.moe.moe_runner.deep_gemm import DeepGemmRunnerCore
+from sglang.srt.layers.moe.moe_runner.triton import TritonRunnerCore
+from sglang.srt.layers.moe.moe_runner.triton_kernels import TritonKernelsRunnerCore
+from sglang.srt.layers.moe.utils import get_moe_a2a_backend
+
+if TYPE_CHECKING:
+ from sglang.srt.batch_overlap.single_batch_overlap import DownGemmOverlapArgs
+ from sglang.srt.layers.moe.moe_runner.base import MoeQuantInfo
+ from sglang.srt.layers.moe.token_dispatcher.base import CombineInput, DispatchOutput
+ from sglang.srt.layers.moe.utils import MoeRunnerBackend
+
+logger = logging.getLogger(__name__)
+
+
class MoeRunner:
    """Orchestrates one MoE forward step for a given runner backend.

    Two execution paths exist:
    - a *fused* path (``fused_func``) that goes straight from dispatch output
      to combine input in one registered function, and
    - a *permute* path that runs pre-permute -> runner core -> post-permute,
      looked up from ``PermuteMethodPool`` per dispatch/runner format pair.
    """

    def __init__(self, runner_backend: MoeRunnerBackend, config: MoeRunnerConfig):
        self.runner_backend = runner_backend
        self.config = config

        self.fused_func = None

        # Backends with a runner core support the permute path; Marlin and
        # FlashInfer TRT-LLM are fused-only.
        if runner_backend.is_triton():
            self.runner_core = TritonRunnerCore(config)
        elif runner_backend.is_triton_kernels():
            self.runner_core = TritonKernelsRunnerCore(config)
        elif runner_backend.is_deep_gemm():
            self.runner_core = DeepGemmRunnerCore(config)
        elif runner_backend.is_marlin():
            self.runner_core = None  # Marlin only supports fused path
        elif runner_backend.is_flashinfer_trtllm():
            self.runner_core = None  # FlashInfer TRT-LLM only supports fused path
        else:
            raise NotImplementedError(f"Unsupported runner backend: {runner_backend}")

        a2a_backend_name = get_moe_a2a_backend().value
        runner_backend_name = runner_backend.value

        # TODO(cwan): add a server argument to disable fused func
        self.fused_func = FusedOpPool.get_fused_func(
            a2a_backend_name, runner_backend_name
        )

        # Fused-only backends must have a registered fused func.
        if self.runner_core is None and self.fused_func is None:
            raise NotImplementedError(
                f"Runner backend {runner_backend} requires a fused func for a2a backend "
                f"{a2a_backend_name}, but none is registered."
            )

        self.down_gemm_overlap_args: Optional[DownGemmOverlapArgs] = None
        self.meta_overlap_args: Optional[dict] = None

        # CI escape hatch to force the permute path.
        # NOTE(review): this runs AFTER the fused-only check above, so a
        # fused-only backend with this flag set would fail later in run() —
        # presumably intentional for CI; confirm.
        SGLANG_CI_DISABLE_MOE_FUSED_FUNC = os.environ.get(
            "SGLANG_CI_DISABLE_MOE_FUSED_FUNC", "0"
        )
        if SGLANG_CI_DISABLE_MOE_FUSED_FUNC == "1":
            logger.info(
                "SGLANG_CI_DISABLE_MOE_FUSED_FUNC is set to 1, disabling fused func"
            )
            self.fused_func = None

    def run(
        self, dispatch_output: DispatchOutput, quant_info: MoeQuantInfo
    ) -> CombineInput:
        """Execute the MoE step: fused path if available, else permute path."""

        if self.fused_func is not None:
            return self.fused_func(dispatch_output, quant_info, self.config)

        assert self.runner_core is not None
        dispatch_format = dispatch_output.format.value
        runner_format = self.runner_core.runner_backend.value
        self.pre_permute_func = PermuteMethodPool.get_pre_permute(
            dispatch_format, runner_format
        )

        # running_state carries cross-stage data (and optional overlap args)
        # between pre-permute, core run, and post-permute.
        running_state = {}
        if self.down_gemm_overlap_args is not None:
            running_state["down_gemm_overlap_args"] = self.down_gemm_overlap_args
        if self.meta_overlap_args is not None:
            running_state["meta_overlap_args"] = self.meta_overlap_args

        runner_input = self.pre_permute_func(
            dispatch_output, quant_info, self.config, running_state
        )
        runner_output = self.runner_core.run(runner_input, quant_info, running_state)

        runner_format = self.runner_core.runner_backend.value
        combine_format = dispatch_output.format.value
        self.post_permute_func = PermuteMethodPool.get_post_permute(
            runner_format, combine_format
        )
        combine_input = self.post_permute_func(
            runner_output, quant_info, self.config, running_state
        )

        return combine_input

    def set_overlap_args(
        self, down_gemm_overlap_args: DownGemmOverlapArgs, meta_overlap_args: dict
    ):
        """Install batch-overlap args consumed by the next run() (permute path only)."""
        assert self.fused_func is None, "Fused func is not supported for overlap args"
        self.down_gemm_overlap_args = down_gemm_overlap_args
        self.meta_overlap_args = meta_overlap_args

    def clear_overlap_args(self) -> None:
        """Remove any previously installed batch-overlap args."""
        assert self.fused_func is None, "Fused func is not supported for overlap args"
        self.down_gemm_overlap_args = None
        self.meta_overlap_args = None
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/triton.py b/sglang/python/sglang/srt/layers/moe/moe_runner/triton.py
new file mode 100644
index 0000000000000000000000000000000000000000..be40253b3ef80f29efa5ba7a54672b2077d5f88d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/moe_runner/triton.py
@@ -0,0 +1,485 @@
+from __future__ import annotations
+
+import functools
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import triton.language as tl
+
+from sglang.srt.layers.moe.moe_runner.base import (
+ MoeQuantInfo,
+ MoeRunnerConfig,
+ MoeRunnerCore,
+ RunnerInput,
+ RunnerOutput,
+ register_fused_func,
+ register_post_permute,
+ register_pre_permute,
+)
+from sglang.srt.layers.moe.utils import MoeRunnerBackend
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_hip, is_xpu
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher.standard import (
+ StandardCombineInput,
+ StandardDispatchOutput,
+ )
+
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_use_aiter = bool(int(os.getenv("SGLANG_USE_AITER", "0")))
+_is_xpu = is_xpu()
+_MOE_PADDING_SIZE = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
+
+
+if _is_cuda or _is_hip:
+ from sgl_kernel import gelu_and_mul, silu_and_mul
+
+ if _is_hip:
+ _has_vllm = False
+ if _use_aiter:
+ try:
+ from aiter import moe_sum
+ except ImportError:
+ raise ImportError(
+ "aiter is required when SGLANG_USE_AITER is set to True"
+ )
+ else:
+ try:
+ from vllm import _custom_ops as vllm_ops # moe_sum
+
+ _has_vllm = True
+ except ImportError:
+ # Fallback: vllm not available, will use triton moe_sum
+ _has_vllm = False
+elif _is_cpu and _is_cpu_amx_available:
+ pass
+elif _is_xpu:
+ from sgl_kernel import moe_sum_reduce, silu_and_mul
+
+
+if _is_cuda or _is_hip or _is_xpu:
+ from sgl_kernel import ( # noqa: F401
+ moe_align_block_size as sgl_moe_align_block_size,
+ )
+
+
+@dataclass
+class TritonRunnerInput(RunnerInput):
+
+ hidden_states: torch.Tensor
+ topk_weights: torch.Tensor
+ topk_ids: torch.Tensor
+ sorted_token_ids: torch.Tensor
+ expert_ids: torch.Tensor
+ num_tokens_post_padded: torch.Tensor
+
+ @property
+ def runner_backend(self) -> MoeRunnerBackend:
+ return MoeRunnerBackend.TRITON
+
+
+@dataclass
+class TritonRunnerOutput(RunnerOutput):
+
+ hidden_states: torch.Tensor
+
+ @property
+ def runner_backend(self) -> MoeRunnerBackend:
+ return MoeRunnerBackend.TRITON
+
+
+@dataclass
+class TritonMoeQuantInfo(MoeQuantInfo):
+ w13_weight: torch.Tensor
+ w2_weight: torch.Tensor
+ b13: Optional[torch.Tensor] = None
+ b2: Optional[torch.Tensor] = None
+ use_fp8_w8a8: bool = False
+ use_int8_w8a8: bool = False
+ use_int8_w8a16: bool = False
+ use_int4_w4a16: bool = False
+ per_channel_quant: bool = False
+ w13_scale: Optional[torch.Tensor] = None
+ w2_scale: Optional[torch.Tensor] = None
+ w13_zp: Optional[torch.Tensor] = None
+ w2_zp: Optional[torch.Tensor] = None
+ a13_scale: Optional[torch.Tensor] = None
+ a2_scale: Optional[torch.Tensor] = None
+ block_shape: Optional[List[int]] = None
+
+
+class TritonRunnerCore(MoeRunnerCore):
+
+ def __init__(self, config: MoeRunnerConfig):
+ super().__init__(config)
+
+ def run(
+ self,
+ runner_input: TritonRunnerInput,
+ quant_info: TritonMoeQuantInfo,
+ running_state: dict,
+ ) -> TritonRunnerOutput:
+
+ # TODO: move these functions to the triton runner
+ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+ _swiglu_gpt_oss_sigmoid_alpha,
+ _swiglu_silu_clamp_mul,
+ invoke_fused_moe_kernel,
+ moe_sum_reduce_torch_compile,
+ moe_sum_reduce_triton,
+ )
+
+ hidden_states = runner_input.hidden_states
+ topk_weights = runner_input.topk_weights
+ topk_ids = runner_input.topk_ids
+ sorted_token_ids = runner_input.sorted_token_ids
+ expert_ids = runner_input.expert_ids
+ num_tokens_post_padded = runner_input.num_tokens_post_padded
+
+ w13 = quant_info.w13_weight
+ w2 = quant_info.w2_weight
+ b13 = quant_info.b13
+ b2 = quant_info.b2
+ a13_scale = quant_info.a13_scale
+ a2_scale = quant_info.a2_scale
+ w13_scale = quant_info.w13_scale
+ w2_scale = quant_info.w2_scale
+ w13_zp = quant_info.w13_zp
+ w2_zp = quant_info.w2_zp
+ block_shape = quant_info.block_shape
+ per_channel_quant = quant_info.per_channel_quant
+ use_fp8_w8a8 = quant_info.use_fp8_w8a8
+ use_int8_w8a8 = quant_info.use_int8_w8a8
+ use_int8_w8a16 = quant_info.use_int8_w8a16
+ use_int4_w4a16 = quant_info.use_int4_w4a16
+
+ activation = self.config.activation
+ no_combine = self.config.no_combine
+ inplace = self.config.inplace
+ gemm1_alpha = self.config.gemm1_alpha
+ gemm1_limit = self.config.gemm1_clamp_limit
+ routed_scaling_factor = self.config.routed_scaling_factor
+ apply_router_weight_on_input = self.config.apply_router_weight_on_input
+
+ assert self.config.is_gated, "Only gated MoEs are supported for Triton runner"
+
+ M = hidden_states.shape[0]
+ E, N, _ = w13.shape
+ compute_type = (
+ tl.bfloat16 if hidden_states.dtype == torch.bfloat16 else tl.float16
+ )
+
+ intermediate_cache1 = torch.empty(
+ (M, topk_ids.shape[1], N),
+ device=hidden_states.device,
+ dtype=hidden_states.dtype,
+ )
+
+ invoke_fused_moe_kernel(
+ hidden_states,
+ w13,
+ b13,
+ intermediate_cache1,
+ a13_scale,
+ w13_scale,
+ w13_zp,
+ topk_weights,
+ topk_ids,
+ sorted_token_ids,
+ expert_ids,
+ num_tokens_post_padded,
+ apply_router_weight_on_input,
+ topk_ids.shape[1],
+ running_state["config"],
+ compute_type=compute_type,
+ use_fp8_w8a8=use_fp8_w8a8,
+ use_int8_w8a8=use_int8_w8a8,
+ use_int8_w8a16=use_int8_w8a16,
+ use_int4_w4a16=use_int4_w4a16,
+ per_channel_quant=per_channel_quant,
+ block_shape=block_shape,
+ )
+
+ intermediate_cache2 = torch.empty(
+ (M * topk_ids.shape[1], N // 2),
+ device=hidden_states.device,
+ dtype=hidden_states.dtype,
+ )
+
+ if activation == "silu":
+ if gemm1_alpha is not None:
+ assert gemm1_limit is not None
+ intermediate_cache2 = _swiglu_gpt_oss_sigmoid_alpha(
+ intermediate_cache1.view(-1, N), gemm1_alpha, gemm1_limit
+ )
+ elif gemm1_limit is not None:
+ intermediate_cache2 = _swiglu_silu_clamp_mul(
+ intermediate_cache1.view(-1, N), gemm1_limit
+ )
+ elif _is_cuda or _is_hip or _is_xpu:
+ silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+ else:
+ vllm_ops.silu_and_mul(
+ intermediate_cache2, intermediate_cache1.view(-1, N)
+ )
+ elif activation == "gelu":
+ assert gemm1_alpha is None, "gemm1_alpha is not supported for gelu"
+ assert gemm1_limit is None, "gemm1_limit is not supported for gelu"
+ if _is_cuda or _is_hip:
+ gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+ else:
+ vllm_ops.gelu_and_mul(
+ intermediate_cache2, intermediate_cache1.view(-1, N)
+ )
+ else:
+ raise ValueError(f"Unsupported activation: {activation=}")
+
+ intermediate_cache3 = torch.empty(
+ (M, topk_ids.shape[1], w2.shape[1]),
+ device=hidden_states.device,
+ dtype=hidden_states.dtype,
+ )
+
+ if no_combine:
+ assert not inplace
+ out_hidden_states = torch.empty(
+ (M, topk_ids.shape[1], w2.shape[1]),
+ device=hidden_states.device,
+ dtype=hidden_states.dtype,
+ )
+ elif inplace:
+ out_hidden_states = hidden_states
+ else:
+ out_hidden_states = torch.empty_like(hidden_states)
+
+ invoke_fused_moe_kernel(
+ intermediate_cache2,
+ w2,
+ b2,
+ (
+ intermediate_cache3
+ if not no_combine and topk_ids.shape[1] != 1
+ else out_hidden_states.unsqueeze(0)
+ ),
+ a2_scale,
+ w2_scale,
+ w2_zp,
+ topk_weights,
+ topk_ids,
+ sorted_token_ids,
+ expert_ids,
+ num_tokens_post_padded,
+ not apply_router_weight_on_input,
+ 1,
+ running_state["config"],
+ compute_type=compute_type,
+ use_fp8_w8a8=use_fp8_w8a8,
+ use_int8_w8a8=use_int8_w8a8,
+ use_int8_w8a16=use_int8_w8a16,
+ use_int4_w4a16=use_int4_w4a16,
+ per_channel_quant=per_channel_quant,
+ block_shape=block_shape,
+ )
+
+ if routed_scaling_factor is None:
+ routed_scaling_factor = 1.0
+
+ if no_combine:
+ pass
+ elif _is_cuda:
+ if topk_ids.shape[1] == 1 and routed_scaling_factor == 1.0:
+ pass # we write directly into out_hidden_states
+ elif topk_ids.shape[1] == 2 and routed_scaling_factor == 1.0:
+ torch.add(
+ intermediate_cache3[:, 0],
+ intermediate_cache3[:, 1],
+ out=out_hidden_states,
+ ).squeeze(dim=1)
+ else:
+ # According to micro benchmark results, torch.compile can get better performance for small token.
+ if M <= 32:
+ moe_sum_reduce_torch_compile(
+ intermediate_cache3.view(*intermediate_cache3.shape),
+ out_hidden_states,
+ routed_scaling_factor,
+ )
+ else:
+ moe_sum_reduce_triton(
+ intermediate_cache3.view(*intermediate_cache3.shape),
+ out_hidden_states,
+ routed_scaling_factor,
+ )
+ elif _is_hip:
+ if _use_aiter:
+ moe_sum(
+ intermediate_cache3.view(*intermediate_cache3.shape),
+ out_hidden_states,
+ )
+ elif _has_vllm:
+ vllm_ops.moe_sum(
+ intermediate_cache3.view(*intermediate_cache3.shape),
+ out_hidden_states,
+ )
+ else:
+ # Fallback: use triton moe_sum when vllm is not available
+ moe_sum_reduce_triton(
+ intermediate_cache3.view(*intermediate_cache3.shape),
+ out_hidden_states,
+ routed_scaling_factor,
+ )
+ elif _is_xpu:
+ moe_sum_reduce(
+ intermediate_cache3.view(*intermediate_cache3.shape),
+ out_hidden_states,
+ routed_scaling_factor,
+ )
+ else:
+ vllm_ops.moe_sum(
+ intermediate_cache3.view(*intermediate_cache3.shape),
+ out_hidden_states,
+ )
+
+ return TritonRunnerOutput(
+ hidden_states=out_hidden_states,
+ )
+
+ @property
+ def runner_backend(self) -> MoeRunnerBackend:
+ return MoeRunnerBackend.TRITON
+
+
+@register_fused_func("none", "triton")
+def fused_experts_none_to_triton(
+ dispatch_output: StandardDispatchOutput,
+ quant_info: TritonMoeQuantInfo,
+ runner_config: MoeRunnerConfig,
+) -> StandardCombineInput:
+ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+ from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput
+
+ output = fused_experts(
+ hidden_states=dispatch_output.hidden_states,
+ w1=quant_info.w13_weight,
+ w2=quant_info.w2_weight,
+ topk_output=dispatch_output.topk_output,
+ moe_runner_config=runner_config,
+ b1=quant_info.b13,
+ b2=quant_info.b2,
+ use_fp8_w8a8=quant_info.use_fp8_w8a8,
+ use_int8_w8a8=quant_info.use_int8_w8a8,
+ use_int8_w8a16=quant_info.use_int8_w8a16,
+ use_int4_w4a16=quant_info.use_int4_w4a16,
+ per_channel_quant=quant_info.per_channel_quant,
+ w1_scale=quant_info.w13_scale,
+ w2_scale=quant_info.w2_scale,
+ w1_zp=quant_info.w13_zp,
+ w2_zp=quant_info.w2_zp,
+ a1_scale=quant_info.a13_scale,
+ a2_scale=quant_info.a2_scale,
+ block_shape=quant_info.block_shape,
+ )
+
+ return StandardCombineInput(
+ hidden_states=output,
+ )
+
+
+@register_pre_permute("standard", "triton")
+def pre_permute_standard_to_triton(
+ dispatch_output: StandardDispatchOutput,
+ quant_info: TritonMoeQuantInfo,
+ runner_config: MoeRunnerConfig,
+ running_state: dict,
+) -> TritonRunnerInput:
+
+ # NOTE: this is dead code as a fused func for standard format is registered.
+ # This is left here for testing and examples.
+
+ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+ get_config_dtype_str,
+ moe_align_block_size,
+ try_get_optimal_moe_config,
+ )
+ from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+ hidden_states, topk_output = (
+ dispatch_output.hidden_states,
+ dispatch_output.topk_output,
+ )
+
+ assert TopKOutputChecker.format_is_standard(topk_output)
+
+ num_tokens = hidden_states.shape[0]
+ num_local_experts = runner_config.num_local_experts
+
+ if (
+ not (quant_info.use_fp8_w8a8 or quant_info.use_int8_w8a8)
+ or quant_info.block_shape is not None
+ or _use_aiter
+ ):
+ padding_size = 0
+ else:
+ padding_size = _MOE_PADDING_SIZE
+
+ config_dtype = get_config_dtype_str(
+ use_fp8_w8a8=quant_info.use_fp8_w8a8,
+ use_int8_w8a8=quant_info.use_int8_w8a8,
+ use_int8_w8a16=quant_info.use_int8_w8a16,
+ use_int4_w4a16=quant_info.use_int4_w4a16,
+ dtype=hidden_states.dtype,
+ )
+
+ get_config_func = functools.partial(
+ try_get_optimal_moe_config,
+ quant_info.w13_weight.shape,
+ (
+ num_local_experts,
+ quant_info.w2_weight.shape[1],
+ quant_info.w2_weight.shape[2] - padding_size,
+ ),
+ topk_output.topk_ids.shape[1],
+ config_dtype,
+ block_shape=quant_info.block_shape,
+ per_channel_quant=quant_info.per_channel_quant,
+ )
+
+ config = get_config_func(num_tokens)
+
+ sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+ topk_output.topk_ids, config["BLOCK_SIZE_M"], num_local_experts
+ )
+
+ running_state["config"] = config
+
+ return TritonRunnerInput(
+ hidden_states=hidden_states,
+ topk_weights=topk_output.topk_weights,
+ topk_ids=topk_output.topk_ids,
+ sorted_token_ids=sorted_token_ids,
+ expert_ids=expert_ids,
+ num_tokens_post_padded=num_tokens_post_padded,
+ )
+
+
+@register_post_permute("triton", "standard")
+def post_permute_triton_to_standard(
+ runner_output: TritonRunnerOutput,
+ quant_info: TritonMoeQuantInfo,
+ runner_config: MoeRunnerConfig,
+ running_state: dict,
+) -> StandardCombineInput:
+
+ # NOTE: this is dead code as a fused func for standard format is registered.
+ # This is left here for testing and examples.
+
+ from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput
+
+ return StandardCombineInput(
+ hidden_states=runner_output.hidden_states,
+ )
diff --git a/sglang/python/sglang/srt/layers/moe/moe_runner/triton_kernels.py b/sglang/python/sglang/srt/layers/moe/moe_runner/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..b13cd2759108248667753367dee429c76e1999c2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/moe_runner/triton_kernels.py
@@ -0,0 +1,198 @@
+"""Triton kernels MoE runner backend skeleton."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.layers.moe.moe_runner.base import (
+ MoeQuantInfo,
+ MoeRunnerConfig,
+ MoeRunnerCore,
+ RunnerInput,
+ RunnerOutput,
+ register_post_permute,
+ register_pre_permute,
+)
+from sglang.srt.layers.moe.utils import MoeRunnerBackend
+
+if TYPE_CHECKING:
+ from triton_kernels.matmul_ogs import PrecisionConfig
+ from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx
+
+ from sglang.srt.layers.moe.token_dispatcher.standard import (
+ StandardCombineInput,
+ StandardDispatchOutput,
+ )
+
+
+# ---------------------------------------------------------------------------
+# Runner IO dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class TritonKernelsRunnerInput(RunnerInput):
+ """Input bundle passed to the triton-kernels runner core."""
+
+ hidden_states: torch.Tensor
+ routing_data: "RoutingData"
+ gather_indx: "GatherIndx"
+ scatter_indx: "ScatterIndx"
+
+ @property
+ def runner_backend(self) -> MoeRunnerBackend:
+ return MoeRunnerBackend.TRITON_KERNELS
+
+
+@dataclass
+class TritonKernelsRunnerOutput(RunnerOutput):
+ """Output bundle returned from the triton-kernels runner core."""
+
+ hidden_states: torch.Tensor
+
+ @property
+ def runner_backend(self) -> MoeRunnerBackend:
+ return MoeRunnerBackend.TRITON_KERNELS
+
+
+@dataclass
+class TritonKernelsQuantInfo(MoeQuantInfo):
+ """Quantization payload consumed by the triton-kernels backend."""
+
+ w13_weight: torch.Tensor
+ w2_weight: torch.Tensor
+ w13_bias: Optional[torch.Tensor] = None
+ w2_bias: Optional[torch.Tensor] = None
+ w13_precision_config: Optional[PrecisionConfig] = None
+ w2_precision_config: Optional[PrecisionConfig] = None
+ global_num_experts: int = -1
+
+
+# ---------------------------------------------------------------------------
+# Runner core
+# ---------------------------------------------------------------------------
+
+
+class TritonKernelsRunnerCore(MoeRunnerCore):
+ """Execute MoE experts via the external triton_kernels package."""
+
+ def run(
+ self,
+ runner_input: TritonKernelsRunnerInput,
+ quant_info: TritonKernelsQuantInfo,
+ running_state: dict,
+ ) -> TritonKernelsRunnerOutput:
+ from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+ triton_kernel_fused_experts,
+ triton_kernel_fused_experts_with_bias,
+ )
+
+ assert (
+ self.config.is_gated
+ ), "Only gated MoEs are supported for Triton Kernels runner"
+
+ hidden_states = runner_input.hidden_states
+
+ common_kwargs = dict(
+ routing_data=runner_input.routing_data,
+ gather_indx=runner_input.gather_indx,
+ scatter_indx=None if self.config.no_combine else runner_input.scatter_indx,
+ inplace=False,
+ activation=self.config.activation,
+ apply_router_weight_on_input=self.config.apply_router_weight_on_input,
+ global_num_experts=quant_info.global_num_experts,
+ )
+
+ has_bias = quant_info.w13_bias is not None or quant_info.w2_bias is not None
+
+ if has_bias:
+ assert (
+ quant_info.w13_bias is not None and quant_info.w2_bias is not None
+ ), "Bias execution requires both w13_bias and w2_bias"
+ output = triton_kernel_fused_experts_with_bias(
+ hidden_states=hidden_states,
+ w1=quant_info.w13_weight,
+ w1_pcg=quant_info.w13_precision_config,
+ b1=quant_info.w13_bias,
+ w2=quant_info.w2_weight,
+ w2_pcg=quant_info.w2_precision_config,
+ b2=quant_info.w2_bias,
+ gemm1_alpha=self.config.gemm1_alpha,
+ gemm1_clamp_limit=self.config.gemm1_clamp_limit,
+ **common_kwargs,
+ )
+ else:
+ output = triton_kernel_fused_experts(
+ hidden_states=hidden_states,
+ w1=quant_info.w13_weight,
+ w2=quant_info.w2_weight,
+ **common_kwargs,
+ )
+
+ if self.config.no_combine:
+ tokens = runner_input.hidden_states.shape[0]
+ hidden = runner_input.hidden_states.shape[-1]
+ total_rows = output.shape[0]
+ top_k = total_rows // tokens
+ output = output.view(tokens, top_k, hidden)
+
+ return TritonKernelsRunnerOutput(hidden_states=output)
+
+ @property
+ def runner_backend(self) -> MoeRunnerBackend:
+ return MoeRunnerBackend.TRITON_KERNELS
+
+
+# ---------------------------------------------------------------------------
+# Permute / fused hooks
+# ---------------------------------------------------------------------------
+
+
+@register_pre_permute("standard", "triton_kernel")
+def pre_permute_standard_to_triton_kernels(
+ dispatch_output: "StandardDispatchOutput",
+ quant_info: TritonKernelsQuantInfo,
+ runner_config: MoeRunnerConfig,
+ running_state: dict,
+) -> TritonKernelsRunnerInput:
+ from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+ hidden_states = dispatch_output.hidden_states
+ topk_output = dispatch_output.topk_output
+
+ assert TopKOutputChecker.format_is_triton_kernels(
+ topk_output
+ ), "Triton-kernel runner expects TritonKernelTopKOutput"
+
+ routing_data, gather_indx, scatter_indx = topk_output
+
+ return TritonKernelsRunnerInput(
+ hidden_states=hidden_states,
+ routing_data=routing_data,
+ gather_indx=gather_indx,
+ scatter_indx=scatter_indx,
+ )
+
+
+@register_post_permute("triton_kernel", "standard")
+def post_permute_triton_kernels_to_standard(
+ runner_output: TritonKernelsRunnerOutput,
+ quant_info: TritonKernelsQuantInfo,
+ runner_config: MoeRunnerConfig,
+ running_state: dict,
+) -> StandardCombineInput:
+ from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput
+
+ hidden_states = runner_output.hidden_states
+
+ if (
+ runner_config.routed_scaling_factor is not None
+ and runner_config.routed_scaling_factor != 1.0
+ and not runner_config.no_combine
+ ):
+ hidden_states.mul_(runner_config.routed_scaling_factor)
+
+ return StandardCombineInput(hidden_states=hidden_states)
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__init__.py b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd40a8d9854847f7ad5ac5ddd9cb5c67efb9e58e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__init__.py
@@ -0,0 +1,71 @@
+from sglang.srt.layers.moe.token_dispatcher.base import (
+ BaseDispatcher,
+ BaseDispatcherConfig,
+ CombineInput,
+ CombineInputChecker,
+ CombineInputFormat,
+ DispatchOutput,
+ DispatchOutputChecker,
+ DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.token_dispatcher.deepep import (
+ DeepEPConfig,
+ DeepEPDispatcher,
+ DeepEPLLCombineInput,
+ DeepEPLLDispatchOutput,
+ DeepEPNormalCombineInput,
+ DeepEPNormalDispatchOutput,
+)
+from sglang.srt.layers.moe.token_dispatcher.flashinfer import (
+ FlashinferDispatcher,
+ FlashinferDispatchOutput,
+)
+from sglang.srt.layers.moe.token_dispatcher.fuseep import NpuFuseEPDispatcher
+from sglang.srt.layers.moe.token_dispatcher.mooncake import (
+ MooncakeCombineInput,
+ MooncakeDispatchOutput,
+ MooncakeEPDispatcher,
+)
+from sglang.srt.layers.moe.token_dispatcher.moriep import (
+ MoriEPDispatcher,
+ MoriEPLLCombineInput,
+ MoriEPLLDispatchOutput,
+ MoriEPNormalCombineInput,
+ MoriEPNormalDispatchOutput,
+)
+from sglang.srt.layers.moe.token_dispatcher.standard import (
+ StandardCombineInput,
+ StandardDispatcher,
+ StandardDispatchOutput,
+)
+
+__all__ = [
+ "BaseDispatcher",
+ "BaseDispatcherConfig",
+ "CombineInput",
+ "CombineInputChecker",
+ "CombineInputFormat",
+ "DispatchOutput",
+ "DispatchOutputFormat",
+ "DispatchOutputChecker",
+ "FlashinferDispatchOutput",
+ "FlashinferDispatcher",
+ "MooncakeCombineInput",
+ "MooncakeDispatchOutput",
+ "MooncakeEPDispatcher",
+ "MoriEPNormalDispatchOutput",
+ "MoriEPNormalCombineInput",
+ "MoriEPLLDispatchOutput",
+ "MoriEPLLCombineInput",
+ "MoriEPDispatcher",
+ "StandardDispatcher",
+ "StandardDispatchOutput",
+ "StandardCombineInput",
+ "DeepEPConfig",
+ "DeepEPDispatcher",
+ "DeepEPNormalDispatchOutput",
+ "DeepEPLLDispatchOutput",
+ "DeepEPLLCombineInput",
+ "DeepEPNormalCombineInput",
+ "NpuFuseEPDispatcher",
+]
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..269280bcba65f42db669d989efa7a6cd7b3d976c
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/base.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e9440f4795bc7b23f03fb44530dd8a1b44ccdb0
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/base.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/deepep.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/deepep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d89b3d8eafe31c7900005e251c3283efe568806
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/deepep.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/flashinfer.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/flashinfer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..166a869ed3d253c8ce62f1e42d08d20ed2b22ca5
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/flashinfer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/flashinfer_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/flashinfer_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ab353a2aff73fcaa0e1f18f97f160cd1a05cc0b
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/flashinfer_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/fuseep.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/fuseep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91c38ee53cecd79ec0852ef4837b740678c29165
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/fuseep.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/mooncake.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/mooncake.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ed063b286c2288cffd48bb2897f8441bd431348
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/mooncake.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/moriep.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/moriep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22953292def7355a13f8e3cdbfb32bef74e880d7
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/moriep.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/standard.cpython-311.pyc b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/standard.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16fb5419e10b7c390855cca98431d745d5c3354e
Binary files /dev/null and b/sglang/python/sglang/srt/layers/moe/token_dispatcher/__pycache__/standard.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/base.py b/sglang/python/sglang/srt/layers/moe/token_dispatcher/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..8134a4dea7c1653c71c8bce0313c88632acf326c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/token_dispatcher/base.py
@@ -0,0 +1,372 @@
+from __future__ import annotations
+
+import weakref
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Optional,
+ OrderedDict,
+ Protocol,
+ Tuple,
+ TypeGuard,
+ Union,
+ runtime_checkable,
+)
+
+import torch
+
+if TYPE_CHECKING:
+ from sglang.srt.batch_overlap.single_batch_overlap import CombineOverlapArgs
+ from sglang.srt.layers.moe.token_dispatcher import (
+ DeepEPLLCombineInput,
+ DeepEPLLDispatchOutput,
+ DeepEPNormalCombineInput,
+ DeepEPNormalDispatchOutput,
+ FlashinferCombineInput,
+ FlashinferDispatchOutput,
+ StandardCombineInput,
+ StandardDispatchOutput,
+ )
+ from sglang.srt.layers.moe.topk import TopKOutput
+
+
+# ------------------------------ Dispatcher Hook -------------------------------------
+
+
+class _RemovableDispatcherHandle:
+
+ next_id = 0 # Global counter for unique IDs
+
+ def __init__(self, hooks_dict: OrderedDict):
+ self.id = _RemovableDispatcherHandle.next_id
+ _RemovableDispatcherHandle.next_id += 1
+ self.weak_hooks_dict = weakref.ref(hooks_dict)
+
+ def remove(self):
+ hooks_dict = self.weak_hooks_dict()
+ if hooks_dict is not None and self.id in hooks_dict:
+ del hooks_dict[self.id]
+
+
+class DispatcherBaseHooks:
+
+ def __init__(self):
+ self.hook_dict = OrderedDict[int, Callable]()
+
+ def register_hook(self, hook_fun: Callable) -> _RemovableDispatcherHandle:
+ handle = _RemovableDispatcherHandle(self.hook_dict)
+ self.hook_dict[handle.id] = hook_fun
+ return handle
+
+ def __call__(self, *args, **kwargs) -> Optional[Any]:
+ raise NotImplementedError("This method should be overridden by subclasses")
+
+
+class _PreDispatchHooks(DispatcherBaseHooks):
+
+ def __call__(
+ self,
+ dispatcher: BaseDispatcher,
+ hidden_states: torch.Tensor,
+ topk_output: TopKOutput,
+ ) -> Optional[Tuple[torch.Tensor, TopKOutput]]:
+ for hook_fun in self.hook_dict.values():
+ hook_output = hook_fun(dispatcher, hidden_states, topk_output)
+ if hook_output is not None:
+ hidden_states, topk_output = hook_output
+ return hidden_states, topk_output
+
+
+class _PostDispatchHooks(DispatcherBaseHooks):
+
+ def __call__(
+ self, dispatcher: BaseDispatcher, dispatch_output: DispatchOutput
+ ) -> Optional[DispatchOutput]:
+ for hook_fun in self.hook_dict.values():
+ hook_output = hook_fun(dispatcher, dispatch_output)
+ if hook_output is not None:
+ dispatch_output = hook_output
+ return dispatch_output
+
+
+class _PreCombineHooks(DispatcherBaseHooks):
+
+ def __call__(
+ self, dispatcher: BaseDispatcher, combine_input: CombineInput
+ ) -> Optional[CombineInput]:
+ for hook_fun in self.hook_dict.values():
+ hook_output = hook_fun(dispatcher, combine_input)
+ if hook_output is not None:
+ combine_input = hook_output
+ return combine_input
+
+
+class _PostCombineHooks(DispatcherBaseHooks):
+
+ def __call__(
+ self, dispatcher: BaseDispatcher, hidden_states: torch.Tensor
+ ) -> Optional[torch.Tensor]:
+ for hook_fun in self.hook_dict.values():
+ hook_output = hook_fun(dispatcher, hidden_states)
+ if hook_output is not None:
+ hidden_states = hook_output
+ return hidden_states
+
+
+# ------------------------------ Dispatch Output -------------------------------------
+
+
+class DispatchOutputChecker:
+
+ @staticmethod
+ def format_is_standard(
+ dispatch_output: DispatchOutput,
+ ) -> TypeGuard[StandardDispatchOutput]:
+ return dispatch_output.format.is_standard()
+
+ @staticmethod
+ def format_is_triton_kernels(
+ dispatch_output: DispatchOutput,
+ ) -> TypeGuard[StandardDispatchOutput]:
+ return dispatch_output.format.is_standard()
+
+ @staticmethod
+ def format_is_deepep_normal(
+ dispatch_output: DispatchOutput,
+ ) -> TypeGuard[DeepEPNormalDispatchOutput]:
+ return dispatch_output.format.is_deepep_normal()
+
+ @staticmethod
+ def format_is_deepep_ll(
+ dispatch_output: DispatchOutput,
+ ) -> TypeGuard[DeepEPLLDispatchOutput]:
+ return dispatch_output.format.is_deepep_ll()
+
+ @staticmethod
+ def format_is_deepep(
+ dispatch_output: DispatchOutput,
+ ) -> TypeGuard[Union[DeepEPNormalDispatchOutput, DeepEPLLDispatchOutput]]:
+ return dispatch_output.format.is_deepep()
+
+ @staticmethod
+ def format_is_flashinfer(
+ dispatch_output: DispatchOutput,
+ ) -> TypeGuard[FlashinferDispatchOutput]:
+ return dispatch_output.format.is_flashinfer()
+
+
+class DispatchOutputFormat(Enum):
+
+ STANDARD = "standard"
+ DEEPEP_NORMAL = "deepep_normal"
+ DEEPEP_LL = "deepep_ll"
+ FLASHINFER = "flashinfer"
+
+ def is_standard(self) -> bool:
+ return self == DispatchOutputFormat.STANDARD
+
+ def is_deepep_normal(self) -> bool:
+ return self == DispatchOutputFormat.DEEPEP_NORMAL
+
+ def is_deepep_ll(self) -> bool:
+ return self == DispatchOutputFormat.DEEPEP_LL
+
+ def is_deepep(self) -> bool:
+ return self in [
+ DispatchOutputFormat.DEEPEP_NORMAL,
+ DispatchOutputFormat.DEEPEP_LL,
+ ]
+
+ def is_flashinfer(self) -> bool:
+ return self == DispatchOutputFormat.FLASHINFER
+
+
+@runtime_checkable
+class DispatchOutput(Protocol):
+ """Protocol for dispatch outputs in different formats."""
+
+ hidden_states: torch.Tensor
+
+ @property
+ def format(self) -> DispatchOutputFormat: ...
+
+
+# ------------------------------ Combine Input -------------------------------------
+
+
+class CombineInputChecker:
+ @staticmethod
+ def format_is_standard(
+ combine_input: CombineInput,
+ ) -> TypeGuard[StandardCombineInput]:
+ return combine_input.format == CombineInputFormat.STANDARD
+
+ @staticmethod
+ def format_is_deepep_normal(
+ combine_input: CombineInput,
+ ) -> TypeGuard[DeepEPNormalCombineInput]:
+ return combine_input.format == CombineInputFormat.DEEPEP_NORMAL
+
+ @staticmethod
+ def format_is_deepep_ll(
+ combine_input: CombineInput,
+ ) -> TypeGuard[DeepEPLLCombineInput]:
+ return combine_input.format == CombineInputFormat.DEEPEP_LL
+
+ @staticmethod
+ def format_is_deepep(
+ combine_input: CombineInput,
+ ) -> TypeGuard[Union[DeepEPNormalCombineInput, DeepEPLLCombineInput]]:
+ return combine_input.format in [
+ CombineInputFormat.DEEPEP_NORMAL,
+ CombineInputFormat.DEEPEP_LL,
+ ]
+
+ @staticmethod
+ def format_is_flashinfer(
+ combine_input: CombineInput,
+ ) -> TypeGuard[FlashinferCombineInput]:
+ return combine_input.format == CombineInputFormat.FLASHINFER
+
+
+class CombineInputFormat(Enum):
+ STANDARD = "standard"
+ DEEPEP_NORMAL = "deepep_normal"
+ DEEPEP_LL = "deepep_ll"
+ FLASHINFER = "flashinfer"
+
+
@runtime_checkable
class CombineInput(Protocol):
    """Protocol for combine inputs in different formats.

    Concrete implementations are NamedTuples carrying the expert outputs
    (plus routing info where the backend needs it) to be gathered back.
    """

    # TODO: add hidden_states to the protocol

    @property
    def format(self) -> CombineInputFormat: ...
+
+
+# ------------------------------ Base Dispatcher -------------------------------------
+
+
class BaseDispatcherConfig(ABC):
    """Base class for dispatcher configs.

    Marker base only; concrete dispatchers (e.g. DeepEP) subclass this to
    hold backend-specific tuning parameters.
    """

    pass
+
+
class BaseDispatcher(ABC):
    """Base class for dispatchers (token scatter/gather around MoE experts).

    Subclasses implement ``dispatch`` (scatter tokens to experts) and
    ``combine`` (gather expert outputs). Optional pre/post hooks may be
    registered around either call: the first registration captures the
    bound method and shadows it with an instance attribute pointing at the
    hook-aware wrapper, so instances with no hooks pay no overhead.
    """

    def __init__(self):
        # Quantization settings consumed by concrete dispatchers; populated
        # via set_quant_config().
        self.quant_config: Optional[dict] = None

        # Overlap args (compute/communication overlap for combine).
        self.overlap_args: Optional[CombineOverlapArgs] = None
        self.meta_overlap_args: Optional[dict] = None

        # Hooks — all lazily created on first registration.
        self._pre_dispatch_hooks: Optional[_PreDispatchHooks] = None
        self._post_dispatch_hooks: Optional[_PostDispatchHooks] = None
        self._pre_combine_hooks: Optional[_PreCombineHooks] = None
        self._post_combine_hooks: Optional[_PostCombineHooks] = None
        # Original bound dispatch/combine, saved when hooks first override them.
        self._original_dispatch_func: Optional[Callable] = None
        self._original_combine_func: Optional[Callable] = None

    @abstractmethod
    def dispatch(
        self, hidden_states: torch.Tensor, topk_output: TopKOutput
    ) -> DispatchOutput:
        """Scatter tokens to experts; returns a format-tagged DispatchOutput."""
        pass

    def _dispatch_with_hook(
        self, hidden_states: torch.Tensor, topk_output: TopKOutput
    ) -> DispatchOutput:
        """Hook-aware wrapper installed over ``dispatch`` after registration."""
        if self._pre_dispatch_hooks is not None:
            # Pre-hooks may rewrite the inputs before the real dispatch runs.
            hidden_states, topk_output = self._pre_dispatch_hooks(
                self, hidden_states, topk_output
            )
        dispatch_output = self._original_dispatch_func(
            hidden_states=hidden_states, topk_output=topk_output
        )
        if self._post_dispatch_hooks is not None:
            # Post-hooks may replace the dispatch output wholesale.
            dispatch_output = self._post_dispatch_hooks(self, dispatch_output)
        return dispatch_output

    def _override_dispatch_func(self) -> None:
        """Swap ``dispatch`` for the hook wrapper (idempotent)."""
        if self._original_dispatch_func is None:
            self._original_dispatch_func = self.dispatch
            # Instance attribute shadows the class method for this object only.
            self.dispatch = self._dispatch_with_hook

    @abstractmethod
    def combine(self, combine_input: CombineInput) -> torch.Tensor:
        """Gather expert outputs back into per-token hidden states."""
        pass

    def _combine_with_hook(self, combine_input: CombineInput) -> torch.Tensor:
        """Hook-aware wrapper installed over ``combine`` after registration."""
        if self._pre_combine_hooks is not None:
            combine_input = self._pre_combine_hooks(self, combine_input)
        hidden_states = self._original_combine_func(combine_input=combine_input)
        if self._post_combine_hooks is not None:
            hidden_states = self._post_combine_hooks(self, hidden_states)
        return hidden_states

    def _override_combine_func(self) -> None:
        """Swap ``combine`` for the hook wrapper (idempotent)."""
        if self._original_combine_func is None:
            self._original_combine_func = self.combine
            self.combine = self._combine_with_hook

    def register_pre_dispatch_hook(
        self,
        hook: Callable[
            [BaseDispatcher, torch.Tensor, TopKOutput],
            Optional[Tuple[torch.Tensor, TopKOutput]],
        ],
    ) -> _RemovableDispatcherHandle:
        """Register a hook run before dispatch; returns a removable handle."""
        if self._pre_dispatch_hooks is None:
            self._pre_dispatch_hooks = _PreDispatchHooks()
            self._override_dispatch_func()
        handle = self._pre_dispatch_hooks.register_hook(hook)
        return handle

    def register_post_dispatch_hook(
        self, hook: Callable[[BaseDispatcher, DispatchOutput], Optional[DispatchOutput]]
    ) -> _RemovableDispatcherHandle:
        """Register a hook run after dispatch; returns a removable handle."""
        if self._post_dispatch_hooks is None:
            self._post_dispatch_hooks = _PostDispatchHooks()
            self._override_dispatch_func()
        handle = self._post_dispatch_hooks.register_hook(hook)
        return handle

    def register_pre_combine_hook(
        self, hook: Callable[[BaseDispatcher, CombineInput], Optional[CombineInput]]
    ) -> _RemovableDispatcherHandle:
        """Register a hook run before combine; returns a removable handle."""
        if self._pre_combine_hooks is None:
            self._pre_combine_hooks = _PreCombineHooks()
            self._override_combine_func()
        handle = self._pre_combine_hooks.register_hook(hook)
        return handle

    def register_post_combine_hook(
        self, hook: Callable[[BaseDispatcher, torch.Tensor], Optional[torch.Tensor]]
    ) -> _RemovableDispatcherHandle:
        """Register a hook run after combine; returns a removable handle."""
        if self._post_combine_hooks is None:
            self._post_combine_hooks = _PostCombineHooks()
            self._override_combine_func()
        handle = self._post_combine_hooks.register_hook(hook)
        return handle

    def set_quant_config(self, quant_config: dict) -> None:
        """Store quantization settings for the concrete dispatcher to use."""
        self.quant_config = quant_config

    def set_overlap_args(
        self, combine_overlap_args: CombineOverlapArgs, meta_overlap_args: dict
    ) -> None:
        """Enable compute/communication overlap for the next combine."""
        self.overlap_args = combine_overlap_args
        self.meta_overlap_args = meta_overlap_args

    def clear_overlap_args(self) -> None:
        """Disable combine overlap until set_overlap_args is called again."""
        self.overlap_args = None
        self.meta_overlap_args = None
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/deepep.py b/sglang/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
new file mode 100644
index 0000000000000000000000000000000000000000..8539639d5e9a08b9f8ff5c53d048d545cdbe67d6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
@@ -0,0 +1,872 @@
+from __future__ import annotations
+
+import logging
+from contextlib import nullcontext
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, NamedTuple, Optional, Tuple, Union
+
+from sglang.srt.environ import envs
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.dp_attention import get_is_extend_in_batch
+from sglang.srt.layers.moe.token_dispatcher.base import (
+ BaseDispatcher,
+ BaseDispatcherConfig,
+ CombineInput,
+ CombineInputFormat,
+ DispatcherBaseHooks,
+ DispatchOutput,
+ DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.topk import TopKOutput
+from sglang.srt.layers.moe.utils import (
+ DeepEPMode,
+ get_deepep_config,
+ get_moe_runner_backend,
+ is_tbo_enabled,
+)
+from sglang.srt.utils import (
+ get_bool_env_var,
+ is_blackwell,
+ is_hip,
+ is_npu,
+ load_json_config,
+)
+
+_is_npu = is_npu()
+
+if TYPE_CHECKING:
+ from sglang.srt.batch_overlap.single_batch_overlap import CombineOverlapArgs
+
+try:
+ from deep_ep import Buffer, Config
+
+ if not _is_npu:
+ from sglang.srt.layers.quantization.fp8_kernel import (
+ sglang_per_token_group_quant_fp8,
+ )
+
+ use_deepep = True
+except ImportError:
+ use_deepep = False
+
+from enum import Enum, IntEnum, auto
+
+import torch
+import torch.distributed as dist
+
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and is_hip()
+
+logger = logging.getLogger(__name__)
+
+
# NOTE(review): class name has a doubled "P" (DeepEPP...) — likely a typo,
# but renaming would touch every call site, so it is left as-is here.
class DeepEPPDispatchHooks(DispatcherBaseHooks):
    """Hook collection invoked between dispatch_a and dispatch_b.

    Unlike the base pre/post dispatch hooks, these hooks receive only the
    dispatcher and return nothing — they cannot rewrite inputs or outputs.
    """

    def __call__(self, dispatcher: BaseDispatcher) -> None:
        # hook_dict is presumably populated by DispatcherBaseHooks.register_hook;
        # hooks run in dict insertion (i.e. registration) order.
        for hook_fun in self.hook_dict.values():
            hook_fun(dispatcher)
+
+
class DeepEPNormalDispatchOutput(NamedTuple):
    """DeepEP normal dispatch output.

    Built in ``_DeepEPDispatcherImplNormal.dispatch_b`` from the tuple
    returned by DeepEP's ``Buffer.dispatch``.
    """

    # Received activations (fp8-quantized when dispatch quantized them).
    hidden_states: torch.Tensor
    # Quantization scales when hidden_states is quantized, else None.
    hidden_states_scale: Optional[torch.Tensor]
    topk_ids: torch.Tensor
    topk_weights: torch.Tensor
    # Host-side count of tokens received per local expert.
    num_recv_tokens_per_expert: List[int]

    @property
    def format(self) -> DispatchOutputFormat:
        return DispatchOutputFormat.DEEPEP_NORMAL
+
+
class DeepEPLLDispatchOutput(NamedTuple):
    """DeepEP low latency dispatch output.

    Built in ``_DeepEPDispatcherImplLowLatency.dispatch_b`` from the tuple
    returned by DeepEP's ``Buffer.low_latency_dispatch``.
    """

    # Received activations (fp8/nvfp4-quantized when dispatch quantized them).
    hidden_states: torch.Tensor
    # Quantization scales when hidden_states is quantized, else None.
    hidden_states_scale: Optional[torch.Tensor]
    topk_ids: torch.Tensor
    topk_weights: torch.Tensor
    # Per-expert valid-token counts for the masked (padded) layout.
    masked_m: torch.Tensor
    # Expected tokens per expert (see dispatch_a for the computation).
    expected_m: int

    @property
    def format(self) -> DispatchOutputFormat:
        return DispatchOutputFormat.DEEPEP_LL
+
+
+assert isinstance(DeepEPNormalDispatchOutput, DispatchOutput)
+assert isinstance(DeepEPLLDispatchOutput, DispatchOutput)
+
+
class DeepEPNormalCombineInput(NamedTuple):
    """DeepEP normal combine input.

    Field order matters: ``DeepEPDispatcher.combine_a`` unpacks this
    positionally as (hidden_states, topk_ids, topk_weights).
    """

    # Expert outputs to be gathered back to their source ranks.
    hidden_states: torch.Tensor
    topk_ids: torch.Tensor
    topk_weights: torch.Tensor

    @property
    def format(self) -> CombineInputFormat:
        return CombineInputFormat.DEEPEP_NORMAL
+
+
class DeepEPLLCombineInput(NamedTuple):
    """DeepEP low latency combine input.

    Field order matters: ``DeepEPDispatcher.combine_a`` unpacks this
    positionally as (hidden_states, topk_ids, topk_weights).
    """

    # Expert outputs in the masked low-latency layout.
    hidden_states: torch.Tensor
    topk_ids: torch.Tensor
    topk_weights: torch.Tensor

    @property
    def format(self) -> CombineInputFormat:
        return CombineInputFormat.DEEPEP_LL
+
+
+assert isinstance(DeepEPNormalCombineInput, CombineInput)
+assert isinstance(DeepEPLLCombineInput, CombineInput)
+
+
class DeepEPDispatchMode(IntEnum):
    """Which DeepEP kernel family last used the shared buffer.

    Tracked by ``DeepEPBuffer`` so that switching from NORMAL to
    LOW_LATENCY triggers a buffer clean. Explicit values match what
    ``auto()`` would assign (1 and 2).
    """

    NORMAL = 1
    LOW_LATENCY = 2
+
+
class DeepEPBuffer:
    """Process-wide singleton wrapper around DeepEP's communication ``Buffer``.

    The buffer is sized once (for normal and/or low-latency mode, depending
    on ``deepep_mode``) and cached in ``_buffer``; subsequent calls return
    the cached instance, so the first caller fixes the sizing.
    ``_dispatch_mode`` records which kernel family last used the buffer so
    the low-latency path can clean state left behind by normal dispatch.
    """

    _buffer = None
    _dispatch_mode: Optional[DeepEPDispatchMode] = None
    _hidden_size: Optional[int] = None
    _num_max_dispatch_tokens_per_rank: Optional[int] = None
    _num_experts: Optional[int] = None

    @classmethod
    def get_deepep_buffer(
        cls,
        group: dist.ProcessGroup,
        hidden_size: int,
        param_bytes: int,
        deepep_mode: DeepEPMode,
        num_max_dispatch_tokens_per_rank: int = -1,
        num_experts: int = -1,
    ):
        """Create (first call) or fetch the cached DeepEP buffer.

        The -1 sentinels for the last two args are only valid when
        low-latency mode is disabled (asserted below).
        """
        if cls._buffer is not None:
            # NOTE(review): arguments are ignored on cache hits — presumably
            # all callers pass consistent sizes; confirm if new callers appear.
            return cls._buffer

        cls._hidden_size = hidden_size
        cls._num_max_dispatch_tokens_per_rank = num_max_dispatch_tokens_per_rank
        cls._num_experts = num_experts

        num_nvl_bytes, num_rdma_bytes = 0, 0
        if deepep_mode.enable_normal():
            hidden_bytes = hidden_size * param_bytes
            # Size NVL/RDMA buffers to the max hint over the dispatch and
            # combine configs; a user-provided config overrides the default.
            for config in (
                DeepEPConfig.get_instance().normal_dispatch_config
                or Buffer.get_dispatch_config(group.size()),
                DeepEPConfig.get_instance().normal_combine_config
                or Buffer.get_combine_config(group.size()),
            ):
                num_nvl_bytes = max(
                    config.get_nvl_buffer_size_hint(hidden_bytes, group.size()),
                    num_nvl_bytes,
                )
                num_rdma_bytes = max(
                    config.get_rdma_buffer_size_hint(hidden_bytes, group.size()),
                    num_rdma_bytes,
                )
        if deepep_mode.enable_low_latency():
            assert num_max_dispatch_tokens_per_rank != -1
            assert num_experts != -1 and num_experts % group.size() == 0
            num_rdma_bytes = max(
                Buffer.get_low_latency_rdma_size_hint(
                    num_max_dispatch_tokens_per_rank,
                    hidden_size,
                    group.size(),
                    num_experts,
                ),
                num_rdma_bytes,
            )

        # We should calculate num_qps_per_rank consistently with DeepEP's test script logic:
        if deepep_mode == DeepEPMode.NORMAL:
            # refer: https://github.com/deepseek-ai/DeepEP/blob/main/tests/test_internode.py#L235
            num_qps_per_rank = DeepEPConfig.get_instance().num_sms
        elif deepep_mode == DeepEPMode.LOW_LATENCY:
            # refer: https://github.com/deepseek-ai/DeepEP/blob/main/tests/test_low_latency.py#L176
            num_qps_per_rank = num_experts // group.size()
        elif deepep_mode == DeepEPMode.AUTO:
            # low-latency and normal mode all need run
            # refer: https://github.com/deepseek-ai/DeepEP/blob/main/tests/test_internode.py#L235
            num_qps_per_rank = max(
                DeepEPConfig.get_instance().num_sms, num_experts // group.size()
            )
        else:
            raise NotImplementedError

        if not _is_npu:
            total_num_sms = torch.cuda.get_device_properties(
                device="cuda"
            ).multi_processor_count
            # Warn when normal-mode communication is given fewer than half the
            # device's SMs (TBO deliberately restricts SMs, so skip it then).
            if (
                (deepep_mode != DeepEPMode.LOW_LATENCY)
                and not is_tbo_enabled()
                and (DeepEPConfig.get_instance().num_sms < total_num_sms // 2)
            ):
                logger.warning(
                    f"Only use {DeepEPConfig.get_instance().num_sms} SMs for DeepEP communication. "
                    f"This may result in highly suboptimal performance. "
                    f"Consider using --deepep-config to change the behavior."
                )

        cls._buffer = Buffer(
            group,
            num_nvl_bytes,
            num_rdma_bytes,
            low_latency_mode=deepep_mode.enable_low_latency(),
            num_qps_per_rank=num_qps_per_rank,
            # TODO can be false when unneeded
            allow_mnnvl=True,
        )
        return cls._buffer

    @classmethod
    def clean_buffer(cls):
        """Reset DeepEP's low-latency buffer state (no-op for normal-only buffers).

        NOTE(review): assumes ``get_deepep_buffer`` ran first — calling this
        while ``_buffer`` is None would raise AttributeError.
        """
        if not cls._buffer.low_latency_mode:
            return
        cls._buffer.clean_low_latency_buffer(
            cls._num_max_dispatch_tokens_per_rank,
            cls._hidden_size,
            cls._num_experts,
        )

    @classmethod
    def set_dispatch_mode_as_normal(cls):
        # Normal-mode kernels need no cleanup when taking over the buffer.
        cls._dispatch_mode = DeepEPDispatchMode.NORMAL

    @classmethod
    def set_dispatch_mode_as_low_latency(cls):
        # Switching NORMAL -> LOW_LATENCY requires cleaning leftover state.
        if cls._dispatch_mode == DeepEPDispatchMode.NORMAL:
            cls.clean_buffer()
        cls._dispatch_mode = DeepEPDispatchMode.LOW_LATENCY

    @classmethod
    def set_dispatch_mode(cls, mode: DeepEPMode):
        """Route a resolved DeepEPMode to the matching mode setter."""
        if mode.is_low_latency():
            cls.set_dispatch_mode_as_low_latency()
        elif mode.is_normal():
            cls.set_dispatch_mode_as_normal()
        else:
            raise Exception("unsupported mode")
+
+
class DeepEPConfig(BaseDispatcherConfig):
    """Singleton holding DeepEP normal-mode dispatch/combine tuning configs.

    Loaded once from the server's --deepep-config JSON (if provided);
    otherwise falls back to DeepEP's built-in defaults. Access via
    ``get_instance()``; note the lazy init takes no lock, so first use
    should happen before any concurrent access.
    """

    _instance = None

    def __init__(self):
        config_str = get_deepep_config()
        if config_str:
            # config_str may be a path or inline JSON; load_json_config handles both.
            config_parsed = load_json_config(config_str)
            if torch.distributed.get_rank() == 0:
                logger.info(f"Use DeepEP Config: {config_parsed}")
            config_dispatch = config_parsed["normal_dispatch"]
            config_combine = config_parsed["normal_combine"]

            self.normal_dispatch_config = Config(**config_dispatch)
            self.normal_combine_config = Config(**config_combine)

            # Buffer sizing and num_qps_per_rank assume one SM count for both.
            assert config_dispatch["num_sms"] == config_combine["num_sms"]
            self.num_sms = config_dispatch["num_sms"]
        else:
            # None means "use DeepEP's default config" at the call sites.
            self.normal_dispatch_config = None
            self.normal_combine_config = None
            self.num_sms = Buffer.num_sms

    @classmethod
    def get_instance(cls):
        """Lazily create and return the process-wide config instance."""
        if cls._instance is None:
            cls._instance = DeepEPConfig()
        return cls._instance
+
+
class _DeepEPDispatcherImplBase:
    """Shared state and interface for the normal and low-latency DeepEP impls.

    Each implementation splits dispatch/combine into an ``_a`` (launch) and
    ``_b`` (wait/finalize) half so callers can overlap communication with
    other work between the two calls.
    """

    def __init__(
        self,
        group: torch.distributed.ProcessGroup,
        router_topk: int,
        permute_fusion: bool,
        num_experts: int,
        num_local_experts: int,
        hidden_size: int,
        params_dtype: torch.dtype,
        deepep_mode: DeepEPMode,
    ):
        if not use_deepep:
            raise ImportError(
                "DeepEP is not installed. Please install DeepEP package from "
                "https://github.com/deepseek-ai/deepep."
            )

        self.group = group
        self.router_topk = router_topk
        self.permute_fusion = permute_fusion
        self.num_experts = num_experts
        self.num_local_experts = num_local_experts
        self.hidden_size = hidden_size
        self.params_dtype = params_dtype
        self.deepep_mode = deepep_mode

        # Bytes per parameter used for buffer sizing (bf16/fp16 -> 2).
        self.params_bytes = 2
        # A large value will lead to large memory occupation, thus users should change it accordingly
        self.num_max_dispatch_tokens_per_rank = (
            envs.SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK.get()
        )
        # DeepEP internode_ll dispatch uses FINISHED_SUM_TAG=1024
        # and the logic requires num-tokens-sent-from-one-rank-to-another-rank less than it
        assert self.num_max_dispatch_tokens_per_rank <= 1024

        # DeepEP dispatch handle, kept between dispatch and combine
        # (see the FIXME in _DeepEPDispatcherImplNormal._dispatch_core).
        self.handle = None

        self.quant_config: Optional[dict] = None

        # Optional compute/communication overlap settings for combine.
        self.overlap_args: Optional[CombineOverlapArgs] = None
        self.meta_overlap_args: Optional[dict] = None

    def dispatch_a(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
    ):
        """Launch the dispatch; returns opaque state for dispatch_b."""
        raise NotImplementedError

    def dispatch_b(self, *args, **kwargs):
        """Wait for / finalize the dispatch started by dispatch_a."""
        raise NotImplementedError

    def combine_a(
        self,
        hidden_states: torch.Tensor,
        topk_ids: torch.Tensor,
        topk_weights: torch.Tensor,
    ):
        """Launch the combine; returns opaque state for combine_b."""
        raise NotImplementedError

    def combine_b(self, *args, **kwargs):
        """Wait for / finalize the combine started by combine_a."""
        raise NotImplementedError

    def _get_buffer(self):
        """Return the shared DeepEP buffer configured for this impl's mode."""
        raise NotImplementedError

    def set_quant_config(self, quant_config: dict) -> None:
        self.quant_config = quant_config

    def set_overlap_args(
        self, combine_overlap_args: CombineOverlapArgs, meta_overlap_args: dict
    ) -> None:
        self.overlap_args = combine_overlap_args
        self.meta_overlap_args = meta_overlap_args

    def clear_overlap_args(self) -> None:
        self.overlap_args = None
        self.meta_overlap_args = None
+
+
class _DeepEPDispatcherImplNormal(_DeepEPDispatcherImplBase):
    """Normal (high-throughput) DeepEP dispatch/combine implementation."""

    def __init__(self, async_finish: bool, **kwargs):
        super().__init__(**kwargs)

        # When True, DeepEP calls run asynchronously and each _b half waits
        # on the returned event before touching the results.
        self.async_finish = async_finish
        self.src2dst = None
        # Override the base's None so dict-style access is always safe.
        self.quant_config = {}

    def dispatch_a(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
    ):
        """Optionally fp8-quantize the input and capture the launch event."""
        topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
        # DeepEP expects int64 expert indices.
        topk_ids = topk_ids.to(torch.int64)
        if (
            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
            and not get_moe_runner_backend().is_cutlass()
            and not envs.SGLANG_DEEPEP_BF16_DISPATCH.get()
        ):
            # TODO hard code 128 block quant,use fp8 communication
            hidden_states = sglang_per_token_group_quant_fp8(
                hidden_states,
                128,
                column_major_scales=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
                scale_tma_aligned=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
                scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
            )
        previous_event = Buffer.capture() if self.async_finish else None
        return hidden_states, topk_ids, topk_weights, previous_event

    def dispatch_b(self, hidden_states, topk_ids, topk_weights, previous_event):
        """Run the dispatch, wait on its event, and wrap the results."""
        (
            hidden_states,
            topk_ids,
            topk_weights,
            num_recv_tokens_per_expert,
            event,
        ) = self._dispatch_core(hidden_states, topk_ids, topk_weights, previous_event)
        event.current_stream_wait() if self.async_finish else ()

        # Quantized dispatch returns (tensor, scale); bf16 returns a tensor.
        if isinstance(hidden_states, tuple):
            hidden_states, hidden_states_scale = hidden_states
        else:
            hidden_states_scale = None

        return DeepEPNormalDispatchOutput(
            hidden_states,
            hidden_states_scale,
            topk_ids,
            topk_weights,
            num_recv_tokens_per_expert,
        )

    def _dispatch_core(
        self,
        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
        topk_ids: torch.Tensor,
        topk_weights: torch.Tensor,
        previous_event,
    ):
        """Compute the dispatch layout and run DeepEP's Buffer.dispatch."""
        buffer = self._get_buffer()
        (
            num_tokens_per_rank,
            num_tokens_per_rdma_rank,
            num_tokens_per_expert,
            is_token_in_rank,
            previous_event,
        ) = buffer.get_dispatch_layout(
            topk_ids,
            self.num_experts,
            previous_event=previous_event,
            async_finish=self.async_finish,
            allocate_on_comm_stream=previous_event is not None,
        )
        # FIXME: `handle` should be transmitted with tokens from dispatch to combine.
        # However, doing this would incur an unknown synchronization error, but keeping
        # `handle` as a member variable works.

        (
            recv_x,
            recv_topk_ids,
            recv_topk_weights,
            num_recv_tokens_per_expert,
            self.handle,
            event,
        ) = buffer.dispatch(
            x,
            topk_idx=topk_ids,
            topk_weights=topk_weights,
            num_tokens_per_rank=num_tokens_per_rank,
            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
            is_token_in_rank=is_token_in_rank,
            num_tokens_per_expert=num_tokens_per_expert,
            previous_event=previous_event,
            async_finish=self.async_finish,
            allocate_on_comm_stream=(previous_event is not None) and self.async_finish,
            # DeepGEMM kernels want expert blocks aligned to 128 tokens.
            expert_alignment=128 if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM else 1,
            config=DeepEPConfig.get_instance().normal_dispatch_config,
        )
        get_global_expert_distribution_recorder().on_deepep_dispatch_normal(
            num_recv_tokens_per_expert,
            num_tokens_per_rank=num_tokens_per_rank,
            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
            num_tokens_per_expert=num_tokens_per_expert,
        )

        return (
            recv_x,
            recv_topk_ids,
            recv_topk_weights,
            num_recv_tokens_per_expert,
            event,
        )

    def combine_a(
        self,
        hidden_states: torch.Tensor,
        topk_ids: torch.Tensor,
        topk_weights: torch.Tensor,
    ):
        """Select the combine payload and capture the launch event."""

        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM or _use_aiter or _is_npu:
            output = hidden_states
        else:
            raise NotImplementedError() # triton runner was supported but it's temporarily disabled

        previous_event = Buffer.capture() if self.async_finish else None
        return output, previous_event

    def combine_b(self, output, previous_event):
        """Run the combine, wait on its event, and reset per-batch state."""
        hidden_states, event = self._combine_core(output, previous_event)
        event.current_stream_wait() if self.async_finish else ()
        # Drop dispatch-scoped state now that the round trip is complete.
        self.handle = None
        self.src2dst = None
        return hidden_states

    def _combine_core(self, x: torch.Tensor, previous_event):
        """Run DeepEP's Buffer.combine using the handle saved at dispatch."""
        buffer = self._get_buffer()
        combined_x, _, event = buffer.combine(
            x,
            self.handle,
            async_finish=self.async_finish,
            previous_event=previous_event,
            allocate_on_comm_stream=previous_event is not None,
            config=DeepEPConfig.get_instance().normal_combine_config,
        )
        return combined_x, event

    def _get_buffer(self):
        """Return the shared buffer, switching it to normal mode first."""
        DeepEPBuffer.set_dispatch_mode_as_normal()

        return DeepEPBuffer.get_deepep_buffer(
            self.group,
            self.hidden_size,
            self.params_bytes,
            self.deepep_mode,
            self.num_max_dispatch_tokens_per_rank,
            self.num_experts,
        )
+
+
class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase):
    """Low-latency DeepEP dispatch/combine implementation (decode path)."""

    def __init__(self, return_recv_hook: bool, **kwargs):
        super().__init__(**kwargs)

        # NOTE: the string below is a stray expression statement acting as a
        # comment in the original code; kept verbatim.
        """
        num_max_dispatch_tokens_per_rank: the actual batch size in the decoding engine should be less than 256
        https://github.com/deepseek-ai/DeepEP?tab=readme-ov-file#example-use-in-inference-decoding
        """
        # When True, DeepEP returns a recv hook to be invoked in the _b half
        # instead of an event to wait on.
        self.return_recv_hook = return_recv_hook
        self.device_module = torch.get_device_module()
        # Override the base's None so .get() access is always safe.
        self.quant_config = {}

    def dispatch_a(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
    ):
        """Launch the low-latency dispatch; returns state for dispatch_b."""
        buffer = self._get_buffer()
        topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
        # DeepEP expects int64 expert indices.
        topk_ids = topk_ids.to(torch.int64)
        # Ceiling-style estimate of tokens per expert across the group,
        # used by grouped GEMMs downstream.
        expected_m = (
            hidden_states.shape[0] * buffer.group_size * topk_ids.shape[1]
            + self.num_experts
        ) // self.num_experts
        hidden_states, masked_m, event, hook = self._dispatch_core(
            hidden_states,
            topk_ids,
        )
        return (
            hidden_states,
            topk_ids,
            topk_weights,
            masked_m,
            expected_m,
            event,
            hook,
        )

    def dispatch_b(
        self,
        hidden_states,
        topk_ids,
        topk_weights,
        masked_m,
        expected_m,
        event,
        hook,
    ):
        """Wait for the dispatch to land and wrap the results."""
        hook() if self.return_recv_hook else event.current_stream_wait()

        get_global_expert_distribution_recorder().on_deepep_dispatch_low_latency(
            masked_m
        )

        # Quantized dispatch returns (tensor, scale); bf16 returns a tensor.
        if isinstance(hidden_states, tuple):
            hidden_states, hidden_states_scale = hidden_states
        else:
            hidden_states_scale = None

        deepep_output = DeepEPLLDispatchOutput(
            hidden_states,
            hidden_states_scale,
            topk_ids,
            topk_weights,
            masked_m,
            expected_m,
        )
        return deepep_output

    def _dispatch_core(
        self,
        hidden_states: torch.Tensor,
        topk_ids: torch.Tensor,
    ):
        """Run Buffer.low_latency_dispatch with the right quantization flags."""
        # nvfp4 wins over fp8 when an input_global_scale was configured;
        # otherwise fp8 unless bf16 dispatch is forced via env.
        use_nvfp4 = use_fp8 = False
        input_global_scale = self.quant_config.get("input_global_scale", None)
        if input_global_scale is not None:
            use_nvfp4 = True
        elif not envs.SGLANG_DEEPEP_BF16_DISPATCH.get():
            use_fp8 = True

        buffer = self._get_buffer()
        packed_recv_hidden, self.packed_recv_count, self.handle, event, hook = (
            buffer.low_latency_dispatch(
                hidden_states,
                topk_ids,
                self.num_max_dispatch_tokens_per_rank,
                self.num_experts,
                use_fp8=use_fp8,
                # Only pass nvfp4 kwargs when needed, to stay compatible with
                # DeepEP builds that do not accept them.
                **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
                **(
                    dict(x_global_scale=input_global_scale)
                    if input_global_scale is not None
                    else dict()
                ),
                async_finish=not self.return_recv_hook,
                return_recv_hook=self.return_recv_hook,
                round_scale=deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
                and deep_gemm_wrapper.DEEPGEMM_BLACKWELL,
                use_ue8m0=deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
                and deep_gemm_wrapper.DEEPGEMM_BLACKWELL,
            )
        )
        return packed_recv_hidden, self.packed_recv_count, event, hook

    def combine_a(
        self,
        hidden_states: torch.Tensor,
        topk_ids: torch.Tensor,
        topk_weights: torch.Tensor,
    ):
        """Launch the low-latency combine; returns state for combine_b."""
        hidden_states, event, hook = self._combine_core(
            hidden_states,
            topk_ids,
            topk_weights,
        )
        return hidden_states, event, hook

    def combine_b(self, hidden_states, event, hook):
        """Wait for the combine, honoring overlap stream ordering if set."""
        overlap_args = self.overlap_args
        if overlap_args is not None:
            # The overlap stream must not run ahead of work queued here.
            overlap_args.stream.wait_stream(self.device_module.current_stream())

        hook() if self.return_recv_hook else event.current_stream_wait()

        if overlap_args is not None:
            # Re-join: current stream waits for the overlapped combine.
            self.device_module.current_stream().wait_stream(overlap_args.stream)

        return hidden_states

    def _combine_core(
        self,
        hidden_states: torch.Tensor,
        topk_ids: torch.Tensor,
        topk_weights: torch.Tensor,
    ):
        """Run Buffer.low_latency_combine, optionally on the overlap stream."""
        buffer = self._get_buffer()
        overlap_args = self.overlap_args
        meta_overlap_args = self.meta_overlap_args

        ctx = nullcontext()
        if overlap_args is not None:
            overlap_args.stream.wait_event(overlap_args.wait_event)
            ctx = torch.cuda.stream(overlap_args.stream)

            # Blackwell and pre-Blackwell DeepEP builds take different
            # overlap kwargs.
            if is_blackwell():
                overlap_args_dict = dict(
                    overlap=overlap_args.overlap,
                    src_signals=overlap_args.signal,
                    src_signal_expect_value=overlap_args.threshold,
                )
            else:
                # NOTE(review): assumes meta_overlap_args was set alongside
                # overlap_args (see set_overlap_args) — a None here would raise.
                overlap_args_dict = dict(
                    overlap=overlap_args.overlap,
                    packed_recv_count=self.packed_recv_count,
                    comp_signal=overlap_args.signal,
                    block_m=meta_overlap_args["block_m"],
                    threshold=meta_overlap_args["threshold"],
                    num_sms=overlap_args.num_sms,
                )
        else:
            overlap_args_dict = {}

        with ctx:
            combined_hidden_states, event, hook = buffer.low_latency_combine(
                x=hidden_states,
                topk_idx=topk_ids,
                topk_weights=topk_weights,
                handle=self.handle,
                async_finish=not self.return_recv_hook,
                return_recv_hook=self.return_recv_hook,
                **overlap_args_dict,
            )

        # Drop dispatch-scoped state now that the round trip is complete.
        self.packed_recv_count = self.handle = None
        return combined_hidden_states, event, hook

    def _get_buffer(self):
        """Return the shared buffer, switching it to low-latency mode first."""
        DeepEPBuffer.set_dispatch_mode_as_low_latency()
        return DeepEPBuffer.get_deepep_buffer(
            self.group,
            self.hidden_size,
            self.params_bytes,
            self.deepep_mode,
            self.num_max_dispatch_tokens_per_rank,
            self.num_experts,
        )
+
+
+@dataclass
+class _Stage(Enum):
+ INITIAL = auto()
+ AFTER_DISPATCH_A = auto()
+ AFTER_DISPATCH_B = auto()
+ AFTER_COMBINE_A = auto()
+
+
class DeepEPDispatcher(BaseDispatcher):
    """Facade that routes dispatch/combine to the normal or low-latency impl.

    The concrete impl is picked per call by resolving ``deepep_mode``
    against the current batch (extend vs decode). Dispatch and combine are
    each split into ``_a`` (launch) and ``_b`` (finalize) halves; ``_Stage``
    asserts the halves are called in the correct order.
    """

    def __init__(
        self,
        group: torch.distributed.ProcessGroup,
        router_topk: int,
        permute_fusion: bool = False,
        num_experts: Optional[int] = None,
        num_local_experts: Optional[int] = None,
        hidden_size: Optional[int] = None,
        params_dtype: Optional[torch.dtype] = None,
        deepep_mode: DeepEPMode = DeepEPMode.AUTO,
        async_finish: bool = False,
        return_recv_hook: bool = False,
    ):
        super().__init__()

        self.deepep_mode = deepep_mode

        common_kwargs = dict(
            group=group,
            router_topk=router_topk,
            permute_fusion=permute_fusion,
            num_experts=num_experts,
            num_local_experts=num_local_experts,
            hidden_size=hidden_size,
            params_dtype=params_dtype,
            deepep_mode=deepep_mode,
        )

        # AUTO mode instantiates both impls and picks one per batch.
        if self.deepep_mode.enable_low_latency():
            self._low_latency_dispatcher = _DeepEPDispatcherImplLowLatency(
                return_recv_hook=return_recv_hook,
                **common_kwargs,
            )
        if self.deepep_mode.enable_normal():
            self._normal_dispatcher = _DeepEPDispatcherImplNormal(
                async_finish=async_finish,
                **common_kwargs,
            )

        self._stage = _Stage.INITIAL
        self._deepep_dispatch_hooks = DeepEPPDispatchHooks()

    def dispatch(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
    ) -> DispatchOutput:
        """Full dispatch: launch, run mid-dispatch hooks, then finalize."""
        self.dispatch_a(hidden_states, topk_output)
        if self._deepep_dispatch_hooks is not None:
            # Hooks run between the launch and the wait, i.e. while the
            # communication is (potentially) still in flight.
            self._deepep_dispatch_hooks(self)
        ret = self.dispatch_b()
        return ret

    def dispatch_a(
        self,
        hidden_states: torch.Tensor,
        topk_output: TopKOutput,
    ) -> None:
        """Launch the dispatch and stash the impl's intermediate state."""
        self._update_stage(_Stage.INITIAL, _Stage.AFTER_DISPATCH_A)
        inner_state = self._get_impl().dispatch_a(
            hidden_states=hidden_states,
            topk_output=topk_output,
        )
        self._dispatch_intermediate_state = inner_state

    def dispatch_b(self):
        """Finalize the dispatch launched by dispatch_a."""
        self._update_stage(_Stage.AFTER_DISPATCH_A, _Stage.AFTER_DISPATCH_B)
        inner_state = self._dispatch_intermediate_state
        del self._dispatch_intermediate_state
        return self._get_impl().dispatch_b(*inner_state)

    def combine(
        self,
        combine_input: CombineInput,
    ) -> torch.Tensor:
        """Full combine: launch then finalize."""
        self.combine_a(combine_input)
        ret = self.combine_b()
        return ret

    def combine_a(
        self,
        combine_input: CombineInput,
    ) -> None:
        """Launch the combine and stash the impl's intermediate state."""
        # Positional unpack — both DeepEP combine-input NamedTuples are
        # (hidden_states, topk_ids, topk_weights).
        hidden_states, topk_ids, topk_weights = combine_input
        self._update_stage(_Stage.AFTER_DISPATCH_B, _Stage.AFTER_COMBINE_A)
        inner_state = self._get_impl().combine_a(
            hidden_states=hidden_states,
            topk_ids=topk_ids,
            topk_weights=topk_weights,
        )
        self._combine_intermediate_state = inner_state

    def combine_b(self):
        """Finalize the combine launched by combine_a."""
        self._update_stage(_Stage.AFTER_COMBINE_A, _Stage.INITIAL)
        inner_state = self._combine_intermediate_state
        del self._combine_intermediate_state
        return self._get_impl().combine_b(*inner_state)

    def _get_impl(self) -> _DeepEPDispatcherImplBase:
        """Resolve AUTO mode against the current batch and pick an impl."""
        is_extend_in_batch = get_is_extend_in_batch()
        resolved_deepep_mode = self.deepep_mode.resolve(is_extend_in_batch)
        if resolved_deepep_mode == DeepEPMode.NORMAL:
            return self._normal_dispatcher
        elif resolved_deepep_mode == DeepEPMode.LOW_LATENCY:
            return self._low_latency_dispatcher
        else:
            raise ValueError(f"Invalid deepep_mode: {self.deepep_mode}")

    def _update_stage(self, old_stage: _Stage, new_stage: _Stage) -> None:
        """Assert the expected protocol stage before advancing it."""
        assert self._stage == old_stage
        self._stage = new_stage

    def set_quant_config(self, quant_config: dict) -> None:
        """Propagate quantization settings to every instantiated impl."""
        super().set_quant_config(quant_config)
        if self.deepep_mode.enable_low_latency():
            self._low_latency_dispatcher.set_quant_config(quant_config)
        if self.deepep_mode.enable_normal():
            self._normal_dispatcher.set_quant_config(quant_config)

    def set_overlap_args(
        self, combine_overlap_args: CombineOverlapArgs, meta_overlap_args: dict
    ) -> None:
        """Propagate combine-overlap settings to every instantiated impl."""
        super().set_overlap_args(combine_overlap_args, meta_overlap_args)
        if self.deepep_mode.enable_low_latency():
            self._low_latency_dispatcher.set_overlap_args(
                combine_overlap_args, meta_overlap_args
            )
        if self.deepep_mode.enable_normal():
            self._normal_dispatcher.set_overlap_args(
                combine_overlap_args, meta_overlap_args
            )

    def clear_overlap_args(self) -> None:
        """Clear combine-overlap settings on every instantiated impl."""
        super().clear_overlap_args()
        if self.deepep_mode.enable_low_latency():
            self._low_latency_dispatcher.clear_overlap_args()
        if self.deepep_mode.enable_normal():
            self._normal_dispatcher.clear_overlap_args()

    def register_deepep_dispatch_hook(self, hook):
        """Register a hook run between dispatch_a and dispatch_b; returns a handle."""
        return self._deepep_dispatch_hooks.register_hook(hook)
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/flashinfer.py b/sglang/python/sglang/srt/layers/moe/token_dispatcher/flashinfer.py
new file mode 100644
index 0000000000000000000000000000000000000000..72d5b2ea3754fac9a3e5dcf299bbf299a7815384
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/token_dispatcher/flashinfer.py
@@ -0,0 +1,263 @@
+from __future__ import annotations
+
+import logging
+from typing import NamedTuple, Optional
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.dp_attention import get_dp_global_num_tokens
+from sglang.srt.layers.moe.token_dispatcher import (
+ BaseDispatcher,
+ CombineInput,
+ CombineInputFormat,
+ DispatchOutput,
+ DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.token_dispatcher.flashinfer_utils import (
+ TorchDistributedCommBackend,
+)
+from sglang.srt.layers.moe.topk import StandardTopKOutput, TopKOutput
+from sglang.srt.layers.moe.utils import get_moe_runner_backend
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import get_int_env_var
+
+try:
+ from flashinfer import fp4_quantize, nvfp4_block_scale_interleave
+ from flashinfer.comm import MoeAlltoAll, moe_a2a_get_workspace_size_per_rank
+ from flashinfer.comm.mapping import Mapping
+ from flashinfer.comm.mnnvl import MnnvlConfig
+
+ use_flashinfer = True
+except ImportError:
+ use_flashinfer = False
+
+logger = logging.getLogger(__name__)
+
+MOE_NVFP4_DISPATCH = envs.SGLANG_MOE_NVFP4_DISPATCH.get()
+
+
+class FlashinferDispatchOutput(NamedTuple):
+ """Flashinfer EP dispatch output."""
+
+ hidden_states: torch.Tensor
+ hidden_states_scale: Optional[torch.Tensor]
+ topk_output: StandardTopKOutput
+ # Provide an output tensor to fused_moe so it writes directly to our buffer
+ moe_output: Optional[torch.Tensor] = None
+
+ @property
+ def format(self) -> DispatchOutputFormat:
+ return DispatchOutputFormat.FLASHINFER
+
+
+assert isinstance(FlashinferDispatchOutput, DispatchOutput)
+
+
+class FlashinferCombineInput(NamedTuple):
+ """Flashinfer combine input."""
+
+ hidden_states: torch.Tensor
+
+ @property
+ def format(self) -> CombineInputFormat:
+ return CombineInputFormat.FLASHINFER
+
+
+assert isinstance(FlashinferCombineInput, CombineInput)
+
+
+class FlashinferDispatcher(BaseDispatcher):
+ """Main dispatcher class for Flashinfer A2A backend."""
+
+ def __init__(
+ self,
+ group: torch.distributed.ProcessGroup,
+ router_topk: int,
+ num_experts: int = None,
+ num_local_experts: int = None, # Unused
+ hidden_size: int = None,
+ params_dtype: torch.dtype = None, # Unused
+ ):
+ super().__init__()
+ if not use_flashinfer:
+ raise ImportError(
+ "Flashinfer is not installed or does not support A2A. "
+ "Please install the appropriate version of Flashinfer."
+ )
+
+ self.ep_size = group.size()
+ self.ep_rank = group.rank()
+ self.router_topk = router_topk
+ self.hidden_size = hidden_size
+ self.num_experts = num_experts
+ self.num_local_experts = num_local_experts
+
+ # TODO: Can other moe runners use payload_in_workspace too?
+ self.payload_in_workspace = get_moe_runner_backend().is_flashinfer_cutlass()
+
+ # TODO: Can this be a server arg and shared with deepep/mooncakeep?
+ self.max_num_tokens = (
+ get_int_env_var("SGLANG_FLASHINFER_NUM_MAX_DISPATCH_TOKENS_PER_RANK", 1024)
+ * self.ep_size
+ )
+
+ # Calculate workspace size. For eagle mode, use the larger workspace size since nextn layer will be unquantized.
+ speculative_algo = SpeculativeAlgorithm.from_string(
+ get_global_server_args().speculative_algorithm
+ )
+ if MOE_NVFP4_DISPATCH and not speculative_algo.is_eagle():
+ total_dispatch_payload_size_per_token = (
+ hidden_size // 2 # nvfp4 hidden states
+ + hidden_size // 16 # fp8 scaling factors
+ + self.router_topk * 4 # int32 topks ids
+ + self.router_topk * 4 # float32 topk weights
+ )
+ else:
+ total_dispatch_payload_size_per_token = (
+ hidden_size * 2 # bf16 hidden states
+ + self.router_topk * 4 # int32 topks ids
+ + self.router_topk * 4 # float32 topk weights
+ )
+ combine_payload_size_per_token = hidden_size * 2 # bf16 hidden states
+ self.workspace_size = moe_a2a_get_workspace_size_per_rank(
+ ep_size=self.ep_size,
+ max_num_tokens=self.max_num_tokens,
+ total_dispatch_payload_size_per_token=total_dispatch_payload_size_per_token,
+ combine_payload_size_per_token=combine_payload_size_per_token,
+ )
+
+ self.mapping = Mapping(
+ rank=self.ep_rank,
+ tp_size=self.ep_size,
+ moe_ep_size=self.ep_size,
+ world_size=self.ep_size,
+ gpus_per_node=torch.cuda.device_count(),
+ pp_size=1,
+ cp_size=1,
+ )
+ self.moe_a2a = MoeAlltoAll(
+ mapping=self.mapping,
+ max_num_tokens=self.max_num_tokens,
+ top_k=self.router_topk,
+ num_experts=self.num_experts,
+ workspace_size_per_rank=self.workspace_size,
+ mnnvl_config=MnnvlConfig(comm_backend=TorchDistributedCommBackend(group)),
+ )
+
+ # Preallocate dummy tensors (to overcome numLocalTokens > 0 restriction)
+ self.dummy_x = torch.empty(
+ (1, hidden_size),
+ dtype=torch.bfloat16,
+ device="cuda",
+ )
+ # -1 will be ignored by flashinfer cutlass moe
+ self.dummy_topk_ids = torch.full(
+ (1, self.router_topk), -1, dtype=torch.int32, device="cuda"
+ )
+ # Hack for dispatch with dummy token - will route the dummy token to this rank so it doesn't require any transfer.
+ self.dummy_topk_ids_current_rank = torch.full(
+ (1, self.router_topk),
+ self.ep_rank * self.num_local_experts,
+ dtype=torch.int32,
+ device="cuda",
+ )
+ self.dummy_topk_weights = torch.zeros(
+ (1, self.router_topk), dtype=torch.float32, device="cuda"
+ )
+
+ def dispatch(
+ self, hidden_states: torch.Tensor, topk_output: TopKOutput
+ ) -> FlashinferDispatchOutput:
+ output_dtype = hidden_states.dtype
+ x = hidden_states
+ x_sf = None
+ topk_ids = topk_output.topk_ids
+ topk_weights = topk_output.topk_weights
+
+ # Handle case where there are no tokens on this DP worker
+ # moe_a2a.dispatch requires at least one token
+ self.has_dummy_token = False
+ if x.shape[0] == 0:
+ logger.warning("No tokens on this DP worker, using dummy token")
+ self.has_dummy_token = True
+ x = self.dummy_x
+ topk_ids = self.dummy_topk_ids
+ topk_weights = self.dummy_topk_weights
+
+ global_scale = self.quant_config.get("input_global_scale", None)
+ if global_scale is not None:
+ if x.shape[0] > 0:
+ x, x_sf = fp4_quantize(x, global_scale, is_sf_swizzled_layout=False)
+ else:
+ x = torch.zeros(
+ 0, self.hidden_size // 2, dtype=torch.uint8, device=x.device
+ )
+ x_sf = torch.zeros(
+ 0, self.hidden_size // 16, dtype=torch.uint8, device=x.device
+ )
+
+ payloads = []
+ payloads.append(x)
+ if x_sf is not None:
+ payloads.append(x_sf)
+ expert_id_payload_index = 2
+ else:
+ expert_id_payload_index = 1
+ payloads.append(topk_ids)
+ payloads.append(topk_weights)
+
+ self.runtime_max_tokens_per_rank = (
+ max(get_dp_global_num_tokens())
+ if get_dp_global_num_tokens() is not None
+ else x.shape[0]
+ )
+ recv_tensors = self.moe_a2a.dispatch(
+ self.dummy_topk_ids_current_rank if self.has_dummy_token else topk_ids,
+ payloads,
+ self.runtime_max_tokens_per_rank,
+ expert_id_payload_index=expert_id_payload_index,
+ )
+ if x_sf is not None:
+ x_recv, x_sf_recv, topk_ids_recv, topk_weights_recv = recv_tensors
+ x_sf = x_sf_recv.view(-1, x_sf_recv.shape[-1])
+ # TODO: fuse interleave into cutlass moe
+ x_sf = nvfp4_block_scale_interleave(x_sf)
+ else:
+ x_recv, topk_ids_recv, topk_weights_recv = recv_tensors
+ x = x_recv.view(-1, x_recv.shape[-1])
+ topk_ids = topk_ids_recv.view(-1, topk_ids_recv.shape[-1])
+ topk_weights = topk_weights_recv.view(-1, topk_weights_recv.shape[-1])
+
+ # Provide an output tensor to fused_moe so it writes directly to our buffer
+ moe_output = None
+ if self.payload_in_workspace:
+ moe_output = self.moe_a2a.get_combine_payload_tensor_in_workspace(
+ self.runtime_max_tokens_per_rank, self.hidden_size, output_dtype
+ ).view(-1, self.hidden_size)
+ return FlashinferDispatchOutput(
+ x,
+ x_sf,
+ StandardTopKOutput(topk_weights, topk_ids, topk_output.router_logits),
+ moe_output,
+ )
+
+ def combine(self, combine_input: FlashinferCombineInput) -> torch.Tensor:
+ hidden_states = combine_input.hidden_states
+ output_hidden_size = hidden_states.shape[-1]
+ hidden_states = self.moe_a2a.combine(
+ hidden_states.view(
+ self.ep_size, self.runtime_max_tokens_per_rank, output_hidden_size
+ ),
+ self.runtime_max_tokens_per_rank,
+ payload_in_workspace=self.payload_in_workspace,
+ )
+
+ # Remove dummy token if it was added in dispatch
+ if self.has_dummy_token:
+ hidden_states = hidden_states[1:, :]
+
+ del self.runtime_max_tokens_per_rank
+ del self.has_dummy_token
+ return hidden_states
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/flashinfer_utils.py b/sglang/python/sglang/srt/layers/moe/token_dispatcher/flashinfer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ba3071413b8ea553bc98cf62555146196f8a793
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/token_dispatcher/flashinfer_utils.py
@@ -0,0 +1,47 @@
+import torch.distributed as dist
+
+from sglang.srt.utils import is_flashinfer_available
+
+if is_flashinfer_available():
+ from flashinfer.comm.mnnvl import CommBackend
+else:
+
+ class CommBackend:
+ """
+ Placeholder base class when flashinfer is not available
+ """
+
+ pass
+
+
+class TorchDistributedCommBackend(CommBackend):
+ """
+ Use torch distributed instead of MPI to set up flashinfer MNNVL workspaces during initialization
+ """
+
+ def __init__(self, group: dist.ProcessGroup):
+ self._group = group
+
+ def Get_rank(self) -> int:
+ return self._group.rank()
+
+ def Get_size(self) -> int:
+ return self._group.size()
+
+ def allgather(self, data: int):
+ gathered = [None] * self.Get_size()
+ dist.all_gather_object(gathered, data, group=self._group)
+ return gathered
+
+ def bcast(self, data, root: int = 0):
+ obj_list = [data]
+ # broadcast_object_list mutates obj_list in-place
+ dist.broadcast_object_list(obj_list, src=root, group=self._group)
+ return obj_list[0]
+
+ def Split(self, color: int, key: int):
+ # No need to split, we already use the proper group
+ return self
+
+ def barrier(self):
+ dist.barrier(group=self._group)
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/fuseep.py b/sglang/python/sglang/srt/layers/moe/token_dispatcher/fuseep.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba29e13608600216b3b0e9956be2611709743eed
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/token_dispatcher/fuseep.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+import logging
+from typing import NamedTuple
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.moe.token_dispatcher.base import (
+ BaseDispatcher,
+ CombineInput,
+ CombineInputFormat,
+ DispatchOutput,
+ DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.token_dispatcher.deepep import DeepEPBuffer
+from sglang.srt.layers.moe.topk import TopKOutput
+from sglang.srt.layers.moe.utils import DeepEPMode
+
+logger = logging.getLogger(__name__)
+
+
+class FuseEPDispatchOutput(NamedTuple):
+    """NPU fused EP dispatch output."""
+
+ hidden_state: torch.Tensor
+
+ @property
+ def format(self) -> DispatchOutputFormat:
+ return DispatchOutputFormat.DEEPEP_LL
+
+
+class FuseEPCombineInput(NamedTuple):
+    """NPU fused EP combine input."""
+
+ hidden_state: torch.Tensor
+
+ @property
+ def format(self) -> CombineInputFormat:
+ return CombineInputFormat.DEEPEP_LL
+
+
+class NpuFuseEPDispatcher(BaseDispatcher):
+ def __init__(
+ self,
+ group: torch.distributed.ProcessGroup,
+ router_topk: int,
+ permute_fusion: bool = False,
+ num_experts: int = None,
+ num_local_experts: int = None,
+ hidden_size: int = None,
+ params_dtype: torch.dtype = None,
+ deepep_mode: DeepEPMode = DeepEPMode.LOW_LATENCY,
+ ):
+        super().__init__()
+        self.group, self.router_topk = group, router_topk
+ self.permute_fusion = permute_fusion
+ self.num_experts = num_experts
+ self.num_local_experts = num_local_experts
+ self.hidden_size = hidden_size
+ self.params_dtype = params_dtype
+ self.deepep_mode = deepep_mode
+
+ self.params_bytes = 2
+ self.num_max_dispatch_tokens_per_rank = (
+ envs.SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK.get()
+ )
+
+ def dispatch(
+ self, hidden_states: torch.Tensor, topk_output: TopKOutput, **kwargs
+ ) -> DispatchOutput:
+ hidden_states, _ = self._get_buffer().fused_deep_moe(
+ hidden_states,
+ topk_idx=topk_output.topk_ids,
+ topk_weights=topk_output.topk_weights,
+ gmm1_permuted_weight=kwargs["gmm1_permuted_weight"],
+ gmm1_permuted_weight_scale=kwargs["gmm1_permuted_weight_scale"],
+ gmm2_weight=kwargs["gmm2_weight"],
+ gmm2_weight_scale=kwargs["gmm2_weight_scale"],
+ num_max_dispatch_tokens_per_rank=self.num_max_dispatch_tokens_per_rank,
+ num_experts=self.num_experts,
+ )
+ return FuseEPDispatchOutput(hidden_states)
+
+ def combine(self, combine_input: CombineInput, **kwargs) -> torch.Tensor:
+ pass
+
+ def _get_buffer(self):
+ DeepEPBuffer.set_dispatch_mode_as_low_latency()
+ return DeepEPBuffer.get_deepep_buffer(
+ self.group,
+ self.hidden_size,
+ self.params_bytes,
+ self.deepep_mode,
+ self.num_max_dispatch_tokens_per_rank,
+ self.num_experts,
+ )
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py b/sglang/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py
new file mode 100644
index 0000000000000000000000000000000000000000..f475d69d2dd187ba1dbb8734447112e2559af7db
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/token_dispatcher/mooncake.py
@@ -0,0 +1,387 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import NamedTuple, Optional
+
+import torch
+import torch.distributed as dist
+
+from sglang.srt.elastic_ep.elastic_ep import ElasticEPStateManager
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.dp_attention import get_is_extend_in_batch
+from sglang.srt.layers.moe.token_dispatcher.base import (
+ BaseDispatcher,
+ CombineInput,
+ CombineInputFormat,
+ DispatchOutput,
+ DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.topk import TopKOutput
+from sglang.srt.layers.moe.utils import DeepEPMode
+from sglang.srt.utils import get_int_env_var
+
+logger = logging.getLogger(__name__)
+
+
+class MooncakeDispatchOutput(NamedTuple):
+ """Mooncake EP dispatch output."""
+
+ hidden_states: torch.Tensor
+ hidden_states_scale: Optional[torch.Tensor]
+ topk_ids: torch.Tensor
+ topk_weights: torch.Tensor
+ masked_m: torch.Tensor
+ expected_m: int
+
+ @property
+ def format(self) -> DispatchOutputFormat:
+ return DispatchOutputFormat.DEEPEP_LL
+
+
+assert isinstance(MooncakeDispatchOutput, DispatchOutput)
+
+
+class MooncakeCombineInput(NamedTuple):
+    hidden_states: torch.Tensor
+    topk_ids: torch.Tensor
+    topk_weights: torch.Tensor
+
+ @property
+ def format(self) -> CombineInputFormat:
+ return CombineInputFormat.DEEPEP_LL
+
+
+assert isinstance(MooncakeCombineInput, CombineInput)
+
+
+class EPBuffer:
+ _buffer = None
+ _hidden_size: Optional[int] = None
+ _num_max_dispatch_tokens_per_rank: Optional[int] = None
+ _num_experts: Optional[int] = None
+
+ @classmethod
+ def get_ep_buffer(
+ cls,
+ group: dist.ProcessGroup,
+ hidden_size: int,
+ param_bytes: int,
+ deepep_mode: DeepEPMode,
+ num_max_dispatch_tokens_per_rank: int = -1,
+ num_experts: int = -1,
+ ):
+ if cls._buffer is not None:
+ return cls._buffer
+
+ # Lazy import Buffer to avoid creating CUDA context at module import time
+ from mooncake.mooncake_ep_buffer import Buffer
+
+ cls._hidden_size = hidden_size
+ cls._num_max_dispatch_tokens_per_rank = num_max_dispatch_tokens_per_rank
+ cls._num_experts = num_experts
+
+ num_ep_buffer_bytes = 0
+ if deepep_mode.enable_normal():
+ raise NotImplementedError(
+ "Normal mode is not supported for Mooncake EP yet."
+ )
+ if deepep_mode.enable_low_latency():
+ assert num_max_dispatch_tokens_per_rank != -1
+ assert num_experts != -1 and num_experts % group.size() == 0
+ num_ep_buffer_bytes = Buffer.get_ep_buffer_size_hint(
+ num_max_dispatch_tokens_per_rank,
+ hidden_size,
+ group.size(),
+ num_experts,
+ )
+
+ cls._buffer = Buffer(group, num_ep_buffer_bytes)
+ return cls._buffer
+
+
+class _MooncakeEPDispatcherImpl:
+ def __init__(
+ self,
+ group: torch.distributed.ProcessGroup,
+ router_topk: int,
+ permute_fusion: bool,
+ num_experts: int,
+ num_local_experts: int,
+ hidden_size: int,
+ params_dtype: torch.dtype,
+ return_recv_hook: bool,
+ deepep_mode: DeepEPMode,
+ ):
+ try:
+ from mooncake.mooncake_ep_buffer import Buffer # noqa: F401
+ except ImportError:
+ raise ImportError(
+ "Mooncake EP is not installed. Please install Mooncake package at "
+ "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "
+ "with EP support to run SGLang with Mooncake EP."
+ )
+ self.group = group
+ self.router_topk = router_topk
+ self.permute_fusion = permute_fusion
+ self.num_experts = num_experts
+ self.num_local_experts = num_local_experts
+ self.hidden_size = hidden_size
+ self.params_dtype = params_dtype
+ self.return_recv_hook = return_recv_hook
+ self.deepep_mode = deepep_mode
+
+ self.params_bytes = 2
+ self.num_max_dispatch_tokens_per_rank = get_int_env_var(
+ "SGLANG_MOONCAKE_EP_NUM_MAX_DISPATCH_TOKENS_PER_RANK", 128
+ )
+ # Mooncake EP dispatch uses FINISHED_SUM_TAG=1024
+ # and the logic requires num-tokens-sent-from-one-rank-to-another-rank less than it
+ assert self.num_max_dispatch_tokens_per_rank <= 1024
+
+ self.first_execution = True
+ self.timeout_us = 10000000
+
+ self.handle = None
+
+ def dispatch_a(
+ self,
+ hidden_states: torch.Tensor,
+ topk_output: TopKOutput,
+ ):
+ topk_ids, topk_weights = topk_output.topk_ids, topk_output.topk_weights
+ buffer = self._get_buffer()
+ topk_ids = topk_ids.to(torch.int64)
+ expected_m = (
+ hidden_states.shape[0] * buffer.group_size * topk_ids.shape[1]
+ + self.num_experts
+ ) // self.num_experts
+ hidden_states, masked_m, event, hook = self._dispatch_core(
+ hidden_states,
+ topk_ids,
+ use_fp8=True,
+ )
+ return (
+ hidden_states,
+ topk_ids,
+ topk_weights,
+ masked_m,
+ expected_m,
+ event,
+ hook,
+ )
+
+ def dispatch_b(
+ self,
+ hidden_states,
+ topk_ids,
+ topk_weights,
+ masked_m,
+ expected_m,
+ event,
+ hook,
+ ):
+ hook() if self.return_recv_hook else event.current_stream_wait()
+
+ get_global_expert_distribution_recorder().on_deepep_dispatch_low_latency(
+ masked_m
+ )
+
+ if isinstance(hidden_states, tuple):
+ hidden_states, hidden_states_scale = hidden_states
+ else:
+ hidden_states_scale = None
+
+ return MooncakeDispatchOutput(
+ hidden_states,
+ hidden_states_scale,
+ topk_ids,
+ topk_weights,
+ masked_m,
+ expected_m,
+ )
+
+ def _dispatch_core(
+ self,
+ hidden_states: torch.Tensor,
+ topk_ids: torch.Tensor,
+ use_fp8: bool = False,
+ ):
+ buffer = self._get_buffer()
+ active_ranks = ElasticEPStateManager.instance().active_ranks
+ packed_recv_hidden, packed_recv_count, self.handle, event, hook = (
+ buffer.dispatch(
+ hidden_states,
+ topk_ids,
+ active_ranks,
+ self.num_max_dispatch_tokens_per_rank,
+ self.num_experts,
+ -1 if self.first_execution else self.timeout_us,
+ use_fp8=use_fp8,
+ async_finish=not self.return_recv_hook,
+ return_recv_hook=self.return_recv_hook,
+ )
+ )
+ return packed_recv_hidden, packed_recv_count, event, hook
+
+ def combine_a(
+ self,
+ hidden_states: torch.Tensor,
+ topk_ids: torch.Tensor,
+ topk_weights: torch.Tensor,
+ ):
+ hidden_states, event, hook = self._combine_core(
+ hidden_states,
+ topk_ids,
+ topk_weights,
+ )
+ return hidden_states, event, hook
+
+ def combine_b(self, hidden_states, event, hook):
+ hook() if self.return_recv_hook else event.current_stream_wait()
+ return hidden_states
+
+ def _combine_core(
+ self,
+ hidden_states: torch.Tensor,
+ topk_ids: torch.Tensor,
+ topk_weights: torch.Tensor,
+ ):
+ buffer = self._get_buffer()
+ active_ranks = ElasticEPStateManager.instance().active_ranks
+ combined_hidden_states, event, hook = buffer.combine(
+ hidden_states,
+ topk_ids,
+ topk_weights,
+ active_ranks,
+ -1 if self.first_execution else self.timeout_us,
+ self.handle,
+ async_finish=not self.return_recv_hook,
+ return_recv_hook=self.return_recv_hook,
+ )
+ self.first_execution = False
+ self.handle = None
+ return combined_hidden_states, event, hook
+
+ def _get_buffer(self):
+ return EPBuffer.get_ep_buffer(
+ self.group,
+ self.hidden_size,
+ self.params_bytes,
+ self.deepep_mode,
+ self.num_max_dispatch_tokens_per_rank,
+ self.num_experts,
+ )
+
+
+# NOTE: no @dataclass here — a field-less dataclass __eq__ would make all Enum members compare equal.
+class _Stage(Enum):
+ INITIAL = auto()
+ AFTER_DISPATCH_A = auto()
+ AFTER_DISPATCH_B = auto()
+ AFTER_COMBINE_A = auto()
+
+
+class MooncakeEPDispatcher(BaseDispatcher):
+ def __init__(
+ self,
+ group: torch.distributed.ProcessGroup,
+ router_topk: int,
+ permute_fusion: bool = False,
+ num_experts: int = None,
+ num_local_experts: int = None,
+ hidden_size: int = None,
+ params_dtype: torch.dtype = None,
+ deepep_mode: DeepEPMode = DeepEPMode.AUTO,
+ async_finish: bool = False,
+ return_recv_hook: bool = False,
+ ):
+ super().__init__()
+
+ self.deepep_mode = deepep_mode
+
+ if self.deepep_mode.enable_low_latency():
+ self._low_latency_dispatcher = _MooncakeEPDispatcherImpl(
+ group=group,
+ router_topk=router_topk,
+ permute_fusion=permute_fusion,
+ num_experts=num_experts,
+ num_local_experts=num_local_experts,
+ hidden_size=hidden_size,
+ params_dtype=params_dtype,
+ return_recv_hook=return_recv_hook,
+ deepep_mode=deepep_mode,
+ )
+ if self.deepep_mode.enable_normal():
+ raise NotImplementedError
+
+ self._stage = _Stage.INITIAL
+
+ def dispatch(
+ self,
+ hidden_states: torch.Tensor,
+ topk_output: TopKOutput,
+ ) -> DispatchOutput:
+ self.dispatch_a(hidden_states, topk_output)
+ ret = self.dispatch_b()
+ return ret
+
+ def dispatch_a(
+ self,
+ hidden_states: torch.Tensor,
+ topk_output: TopKOutput,
+ ):
+ self._update_stage(_Stage.INITIAL, _Stage.AFTER_DISPATCH_A)
+ inner_state = self._get_impl().dispatch_a(
+ hidden_states=hidden_states,
+ topk_output=topk_output,
+ )
+ self._dispatch_intermediate_state = inner_state
+
+ def dispatch_b(self):
+ self._update_stage(_Stage.AFTER_DISPATCH_A, _Stage.AFTER_DISPATCH_B)
+ inner_state = self._dispatch_intermediate_state
+ del self._dispatch_intermediate_state
+ return self._get_impl().dispatch_b(*inner_state)
+
+ def combine(
+ self,
+ combine_input: CombineInput,
+ ) -> torch.Tensor:
+ self.combine_a(combine_input)
+ ret = self.combine_b()
+ return ret
+
+ def combine_a(
+ self,
+ combine_input: CombineInput,
+ ):
+ hidden_states, topk_ids, topk_weights = combine_input
+ self._update_stage(_Stage.AFTER_DISPATCH_B, _Stage.AFTER_COMBINE_A)
+ inner_state = self._get_impl().combine_a(
+ hidden_states=hidden_states,
+ topk_ids=topk_ids,
+ topk_weights=topk_weights,
+ )
+ self._combine_intermediate_state = inner_state
+
+ def combine_b(self):
+ self._update_stage(_Stage.AFTER_COMBINE_A, _Stage.INITIAL)
+ inner_state = self._combine_intermediate_state
+ del self._combine_intermediate_state
+ return self._get_impl().combine_b(*inner_state)
+
+ def _get_impl(self) -> _MooncakeEPDispatcherImpl:
+ is_extend_in_batch = get_is_extend_in_batch()
+ resolved_deepep_mode = self.deepep_mode.resolve(is_extend_in_batch)
+ if resolved_deepep_mode == DeepEPMode.NORMAL:
+ raise NotImplementedError
+ elif resolved_deepep_mode == DeepEPMode.LOW_LATENCY:
+ return self._low_latency_dispatcher
+ else:
+ raise ValueError(f"Invalid deepep_mode: {self.deepep_mode}")
+
+ def _update_stage(self, old_stage, new_stage):
+ assert self._stage == old_stage
+ self._stage = new_stage
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/moriep.py b/sglang/python/sglang/srt/layers/moe/token_dispatcher/moriep.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c189d0f70796491cfe3a99849567ecda1c3565d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/token_dispatcher/moriep.py
@@ -0,0 +1,877 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, NamedTuple, Optional, Tuple
+
+from sglang.srt.layers.dp_attention import get_is_extend_in_batch
+from sglang.srt.layers.moe.token_dispatcher.base import (
+ BaseDispatcher,
+ CombineInput,
+ CombineInputFormat,
+ DispatchOutput,
+ DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.token_dispatcher.deepep import DeepEPPDispatchHooks
+from sglang.srt.layers.moe.topk import TopKOutput
+from sglang.srt.layers.moe.utils import (
+ DeepEPMode,
+ is_tbo_enabled,
+)
+from sglang.srt.utils import get_bool_env_var, get_int_env_var, is_hip
+
+if TYPE_CHECKING:
+ from sglang.srt.single_batch_overlap import CombineOverlapArgs
+ import mori
+
+from enum import Enum, auto
+from functools import lru_cache
+
+import torch
+
+from sglang.srt.distributed import (
+ get_moe_expert_parallel_rank,
+ get_moe_expert_parallel_world_size,
+)
+from sglang.srt.layers.quantization.fp8_kernel import fp8_dtype
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+ from aiter import QuantType, get_hip_quant
+
+logger = logging.getLogger(__name__)
+
+
+class MoriEPPDispatchHooks(DeepEPPDispatchHooks):
+
+ def __call__(self, dispatcher: BaseDispatcher):
+ for hook_fun in self.hook_dict.values():
+ hook_fun(dispatcher)
+
+
+class MoriEPNormalDispatchOutput(NamedTuple):
+ """Mori EP normal dispatch output."""
+
+ hidden_states: torch.Tensor
+ hidden_states_scale: Optional[torch.Tensor]
+ topk_ids: torch.Tensor
+ topk_weights: torch.Tensor
+ num_recv_tokens_per_expert: List[int]
+ origin_topk_ids: torch.Tensor
+ origin_topk_weights: torch.Tensor
+ out_dtype: torch.dtype
+
+ @property
+ def format(self) -> DispatchOutputFormat:
+ return DispatchOutputFormat.DEEPEP_NORMAL
+
+
+class MoriEPLLDispatchOutput(NamedTuple):
+ """Mori EP low latency dispatch output."""
+
+ hidden_states: torch.Tensor
+ hidden_states_scale: Optional[torch.Tensor]
+ topk_ids: torch.Tensor
+ topk_weights: torch.Tensor
+ num_recv_tokens_per_expert: List[int]
+ origin_topk_ids: torch.Tensor
+ origin_topk_weights: torch.Tensor
+ out_dtype: torch.dtype
+
+ @property
+ def format(self) -> DispatchOutputFormat:
+ return DispatchOutputFormat.DEEPEP_LL
+
+
+assert isinstance(MoriEPNormalDispatchOutput, DispatchOutput)
+assert isinstance(MoriEPLLDispatchOutput, DispatchOutput)
+
+
+class MoriEPNormalCombineInput(NamedTuple):
+    """Mori EP normal combine input."""
+
+ hidden_states: torch.Tensor
+ topk_ids: torch.Tensor
+ topk_weights: torch.Tensor
+
+ @property
+ def format(self) -> CombineInputFormat:
+ return CombineInputFormat.DEEPEP_NORMAL
+
+
+class MoriEPLLCombineInput(NamedTuple):
+    """Mori EP low latency combine input."""
+
+ hidden_states: torch.Tensor
+ topk_ids: torch.Tensor
+ topk_weights: torch.Tensor
+
+ @property
+ def format(self) -> CombineInputFormat:
+ return CombineInputFormat.DEEPEP_LL
+
+
+assert isinstance(MoriEPNormalCombineInput, CombineInput)
+assert isinstance(MoriEPLLCombineInput, CombineInput)
+
+
+class EpMode(Enum):
+ INTRA_NODE = "intra_node"
+ INTER_NODE = "inter_node"
+ LOW_LATENCY = "low_latency"
+
+
+@dataclass(frozen=True)
+class EpDispatchConfig:
+ kernel_type: mori.ops.EpDispatchCombineKernelType
+ warp_num_per_block: int
+ block_num: int
+ rdma_block_num: int
+
+
+def get_ep_dispatch_configs(num_max_dispatch_tokens_per_rank: int = 4096):
+ import mori
+
+ # Selects the inter-node kernel. `InterNodeV1LL` is used if `num_max_dispatch_tokens_per_rank`
+ # is less than or equal to the threshold, otherwise `InterNodeV1` is used. The threshold defaults to 256.
+ inter_kernel_switch_threshold = get_int_env_var(
+ "SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD", 256
+ )
+
+ inter_kernel_type = (
+ mori.ops.EpDispatchCombineKernelType.InterNodeV1LL
+ if num_max_dispatch_tokens_per_rank <= inter_kernel_switch_threshold
+ else mori.ops.EpDispatchCombineKernelType.InterNodeV1
+ )
+
+ return {
+ # TODO(billishyahao): need to tune different configs for intra node async
+ # Also could be tuned for different AMD platform
+ EpMode.INTRA_NODE: EpDispatchConfig(
+ kernel_type=mori.ops.EpDispatchCombineKernelType.IntraNode,
+ warp_num_per_block=16,
+ block_num=80,
+ rdma_block_num=0,
+ ),
+ EpMode.INTER_NODE: EpDispatchConfig(
+ kernel_type=inter_kernel_type,
+ warp_num_per_block=8,
+ block_num=64,
+ rdma_block_num=32,
+ ),
+ EpMode.LOW_LATENCY: EpDispatchConfig(
+ kernel_type=mori.ops.EpDispatchCombineKernelType.AsyncLL,
+ warp_num_per_block=8,
+ block_num=64,
+ rdma_block_num=32,
+ ),
+ }
+
+
+# init_mori_op only needs do once in model initial stage
+# use lru_cache to reuse the same mori_op instance to avoid the init overhead for mori
+@lru_cache(maxsize=2)
+def init_mori_op(
+ group,
+ router_topk,
+ num_experts,
+ num_local_experts,
+ hidden_size,
+ params_dtype,
+ num_max_dispatch_tokens_per_rank,
+ deepep_mode,
+):
+
+ import mori
+
+ world_size = get_moe_expert_parallel_world_size()
+ rank = get_moe_expert_parallel_rank()
+
+ gpu_per_node = 8 if world_size >= 8 else world_size
+
+ cpu_group = group.cpu_group
+ torch._C._distributed_c10d._register_process_group("mori", cpu_group)
+ mori.shmem.shmem_torch_process_group_init("mori")
+
+ mode = EpMode.INTRA_NODE if world_size <= 8 else EpMode.INTER_NODE
+ async_mode = deepep_mode.enable_low_latency()
+ if async_mode:
+ mode = EpMode.LOW_LATENCY
+
+ logger.info(
+ f"[MORI init] {world_size=} {rank=} {hidden_size=} {params_dtype=} {num_max_dispatch_tokens_per_rank=} {num_local_experts=} {router_topk=} {mode=}"
+ )
+
+ cfg = get_ep_dispatch_configs(num_max_dispatch_tokens_per_rank)[mode]
+
+ kernel_type = cfg.kernel_type
+ warp_num_per_block = cfg.warp_num_per_block
+ block_num = cfg.block_num
+ rdma_block_num = cfg.rdma_block_num
+
+ mori_config = mori.ops.EpDispatchCombineConfig(
+ rank=rank,
+ world_size=world_size,
+ data_type=fp8_dtype,
+ hidden_dim=hidden_size,
+ scale_dim=(
+ hidden_size // 128
+ if get_bool_env_var("SGLANG_MORI_FP8_DISP", "False")
+ else 1
+ ),
+ scale_type_size=torch.float32.itemsize,
+ max_token_type_size=params_dtype.itemsize,
+ max_num_inp_token_per_rank=num_max_dispatch_tokens_per_rank,
+ num_experts_per_rank=num_local_experts,
+ num_experts_per_token=router_topk,
+ warp_num_per_block=warp_num_per_block,
+ block_num=block_num,
+ kernel_type=kernel_type,
+ gpu_per_node=gpu_per_node,
+ rdma_block_num=rdma_block_num,
+ num_qp_per_pe=2,
+ )
+ mori_op = mori.ops.EpDispatchCombineOp(mori_config)
+ return mori_op
+
+
+class CommStreamPool:
+ _streams = {} # key -> torch.cuda.Stream
+
+ @classmethod
+ def _make_key(cls, group):
+ return (torch.cuda.current_device(), id(group))
+
+ @classmethod
+ def get_stream_from_pool(cls, group) -> torch.cuda.Stream:
+ key = cls._make_key(group)
+ stream = cls._streams.get(key)
+ if stream is None:
+ stream = torch.cuda.Stream(priority=0)
+ cls._streams[key] = stream
+ return stream
+
+ @classmethod
+ def clear_group(cls, group):
+ key = (torch.cuda.current_device(), id(group))
+ cls._streams.pop(key, None)
+
+
+class _MoriEPDispatcherImplBase:
+    """Common plumbing for Mori EP dispatch/combine implementations.
+
+    Validates that the optional ``mori`` package is installed, records the
+    EP configuration, builds the underlying mori op via ``init_mori_op``,
+    and defines the two-phase dispatch_a/dispatch_b and combine_a/combine_b
+    protocol that concrete subclasses implement.
+    """
+
+    def __init__(
+        self,
+        group: torch.distributed.ProcessGroup,
+        router_topk: int,
+        permute_fusion: bool,
+        num_experts: int,
+        num_local_experts: int,
+        hidden_size: int,
+        params_dtype: torch.dtype,
+        deepep_mode: DeepEPMode,
+    ):
+        # Fail fast with a clear message when mori is missing; everything
+        # below depends on it.
+        try:
+            import mori  # noqa: F401
+        except ImportError:
+            raise ImportError("Mori EP is not installed. Please install.")
+        self.group = group
+        self.router_topk = router_topk
+        self.permute_fusion = permute_fusion
+        self.num_experts = num_experts
+        self.num_local_experts = num_local_experts
+        self.hidden_size = hidden_size
+        self.params_dtype = params_dtype
+        self.deepep_mode = deepep_mode
+
+        # Upper bound on tokens dispatched per rank; tunable via env var.
+        self.num_max_dispatch_tokens_per_rank = get_int_env_var(
+            "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK", 4096
+        )
+
+        self.mori_op = init_mori_op(
+            self.group,
+            self.router_topk,
+            self.num_experts,
+            self.num_local_experts,
+            self.hidden_size,
+            self.params_dtype,
+            self.num_max_dispatch_tokens_per_rank,
+            self.deepep_mode,
+        )
+
+        # Optional quantization settings; populated via set_quant_config.
+        self.quant_config: Optional[dict] = None
+
+        # Optional combine-overlap settings; populated via set_overlap_args.
+        self.overlap_args: Optional[CombineOverlapArgs] = None
+        self.meta_overlap_args: Optional[dict] = None
+
+    def dispatch_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_output: TopKOutput,
+    ):
+        """Phase one of dispatch; subclasses return an intermediate state."""
+        raise NotImplementedError
+
+    def dispatch_b(self, *args, **kwargs):
+        """Phase two of dispatch; consumes dispatch_a's intermediate state."""
+        raise NotImplementedError
+
+    def combine_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        """Phase one of combine; subclasses return an intermediate state."""
+        raise NotImplementedError
+
+    def combine_b(self, *args, **kwargs):
+        """Phase two of combine; consumes combine_a's intermediate state."""
+        raise NotImplementedError
+
+    def set_quant_config(self, quant_config: dict) -> None:
+        self.quant_config = quant_config
+
+    def set_overlap_args(
+        self, combine_overlap_args: CombineOverlapArgs, meta_overlap_args: dict
+    ) -> None:
+        self.overlap_args = combine_overlap_args
+        self.meta_overlap_args = meta_overlap_args
+
+    def clear_overlap_args(self) -> None:
+        self.overlap_args = None
+        self.meta_overlap_args = None
+
+
+class _MoriEPDispatcherImplNormal(_MoriEPDispatcherImplBase):
+    """Normal-mode Mori EP dispatcher built on mori_op.dispatch/combine.
+
+    When TBO is enabled, communication runs on a dedicated CUDA stream
+    (dual-stream mode); when async_finish is also set, synchronization with
+    the compute stream is deferred via CUDA events.
+    """
+
+    def __init__(self, async_finish: bool, **kwargs):
+        super().__init__(**kwargs)
+
+        self.async_finish = async_finish
+        self.quant_config = {}
+        # [kk TODO] need to support mxfp4 type
+        self.quant_func = get_hip_quant(QuantType.per_1x128)
+        self.enable_dual_stream = is_tbo_enabled()
+        self._comm_stream = None
+        if self.enable_dual_stream:
+            self._comm_stream = CommStreamPool.get_stream_from_pool(self.group)
+
+    def _capture_event_if_async(self) -> Optional[torch.cuda.Event]:
+        """Record an event on the current stream when async_finish is set.
+
+        Returns None in synchronous mode. Callers hand the event to the comm
+        stream so it can order itself after prior compute work.
+        """
+        assert self.enable_dual_stream, "dual stream must be enabled"
+        if not self.async_finish:
+            return None
+        ev = torch.cuda.Event(blocking=False, interprocess=False)
+        ev.record(torch.cuda.current_stream())
+        return ev
+
+    def dispatch_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_output: TopKOutput,
+    ):
+        """Capture inputs (and an ordering event in async dual-stream mode)."""
+        topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
+
+        previous_event = self._capture_event_if_async() if self._comm_stream else None
+
+        return (hidden_states, topk_weights, topk_ids, previous_event)
+
+    def dispatch_b(
+        self,
+        hidden_states,
+        topk_weights,
+        topk_ids,
+        previous_event,
+    ):
+        """Optionally FP8-quantize, run the mori dispatch, package the output."""
+        num_token = hidden_states.shape[0]
+        output_dtype = hidden_states.dtype
+        scale = None
+
+        fp8_dispatch = get_bool_env_var("SGLANG_MORI_FP8_DISP", "False")
+
+        if fp8_dispatch:
+            # FP8 quant
+            if num_token > 0:
+                # NOTE: aiter is able to handle token=0 case in UT. But for some reason it failed at e2e case. Root cause TBD.
+                hidden_states, scale = self.quant_func(
+                    hidden_states, quant_dtype=fp8_dtype
+                )
+            else:
+                # Zero-token fallback: build empty fp8 payload/scale tensors
+                # so downstream shapes and dtypes stay consistent.
+                hidden_states = torch.empty(
+                    hidden_states.shape, dtype=fp8_dtype, device=hidden_states.device
+                )
+                scale = torch.empty(
+                    (0, self.hidden_size // 128),
+                    dtype=torch.float32,
+                    device=hidden_states.device,
+                )
+
+        (
+            packed_recv_hidden,
+            recv_topk_weights,
+            recv_scales,
+            recv_topk_ids,
+            packed_recv_count,
+            done_event,
+        ) = self._dispatch_core(
+            hidden_states,
+            topk_weights,
+            topk_ids,
+            scale=scale,
+            previous_event=previous_event,
+        )
+
+        # In async dual-stream mode, make the compute stream wait for the
+        # communication before consuming the received tensors.
+        if self._comm_stream and self.async_finish and done_event is not None:
+            torch.cuda.current_stream().wait_event(done_event)
+
+        return MoriEPNormalDispatchOutput(
+            hidden_states=packed_recv_hidden,
+            hidden_states_scale=recv_scales,
+            topk_ids=recv_topk_ids,
+            topk_weights=recv_topk_weights,
+            num_recv_tokens_per_expert=packed_recv_count,
+            origin_topk_ids=topk_ids,
+            origin_topk_weights=topk_weights,
+            out_dtype=output_dtype,
+        )
+
+    def _dispatch_core(
+        self,
+        hidden_states: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        previous_event: Optional[torch.cuda.Event] = None,
+    ):
+        """Run mori_op.dispatch, on the comm stream when dual-stream is on.
+
+        Returns the five dispatch outputs plus done_event, which is only set
+        in async dual-stream mode (otherwise the streams are joined here).
+        """
+        done_event: Optional[torch.cuda.Event] = None
+
+        if self._comm_stream:
+            compute_stream = torch.cuda.current_stream()
+            comm_stream = self._comm_stream  # comm stream
+
+            # Tell the caching allocator the inputs are consumed on
+            # comm_stream so their memory is not reused too early.
+            for t in (hidden_states, topk_weights, topk_ids):
+                t.record_stream(comm_stream)
+            if scale is not None:
+                scale.record_stream(comm_stream)
+
+            with torch.cuda.stream(comm_stream):
+                # if (previous_event) stream_wait(comm_stream, previous_event)
+                # else stream_wait(comm_stream, compute_stream)
+
+                if previous_event is not None:
+                    comm_stream.wait_event(previous_event)
+                else:
+                    comm_stream.wait_stream(compute_stream)
+
+                (
+                    packed_recv_hidden,
+                    recv_topk_weights,
+                    recv_scales,
+                    recv_topk_ids,
+                    packed_recv_count,
+                ) = self.mori_op.dispatch(hidden_states, topk_weights, scale, topk_ids)
+
+                if self.async_finish:
+                    # Defer the join: callers wait on done_event instead.
+                    done_event = torch.cuda.Event(blocking=False, interprocess=False)
+                    done_event.record(comm_stream)
+                else:
+                    compute_stream.wait_stream(comm_stream)
+
+                # NOTE(review): the outputs are recorded on comm_stream even
+                # though they are consumed on the compute stream afterwards —
+                # confirm this is the intended record_stream target.
+                for t in (
+                    packed_recv_hidden,
+                    recv_topk_weights,
+                    recv_scales,
+                    recv_topk_ids,
+                ):
+                    if t is not None:
+                        t.record_stream(comm_stream)
+        else:
+
+            (
+                packed_recv_hidden,
+                recv_topk_weights,
+                recv_scales,
+                recv_topk_ids,
+                packed_recv_count,
+            ) = self.mori_op.dispatch(hidden_states, topk_weights, scale, topk_ids)
+
+        # TODO(billishyahao): EPLB
+        # get_global_expert_distribution_recorder().on_deepep_dispatch_normal(
+
+        return (
+            packed_recv_hidden,
+            recv_topk_weights,
+            recv_scales,
+            recv_topk_ids,
+            packed_recv_count,
+            done_event,
+        )
+
+    def combine_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        """Capture inputs (and an ordering event in async dual-stream mode)."""
+        previous_event = self._capture_event_if_async() if self._comm_stream else None
+        return hidden_states, topk_ids, topk_weights, previous_event
+
+    def combine_b(self, hidden_states, topk_ids, topk_weights, previous_event):
+        """Run the combine and, in async mode, sync the compute stream."""
+        hidden_states, done_event = self._combine_core(
+            hidden_states, topk_ids, topk_weights, previous_event
+        )
+
+        if self._comm_stream and self.async_finish and done_event is not None:
+            torch.cuda.current_stream().wait_event(done_event)
+
+        return hidden_states
+
+    def _combine_core(
+        self,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        previous_event: Optional[torch.cuda.Event],
+    ):
+        """Run mori_op.combine, mirroring _dispatch_core's stream handling."""
+        done_event: Optional[torch.cuda.Event] = None
+
+        if self._comm_stream:
+            compute_stream = torch.cuda.current_stream()
+            comm_stream = self._comm_stream
+
+            for t in (hidden_states, topk_ids, topk_weights):
+                t.record_stream(comm_stream)
+
+            with torch.cuda.stream(comm_stream):
+                if previous_event is not None:
+                    comm_stream.wait_event(previous_event)
+                else:
+                    comm_stream.wait_stream(compute_stream)
+
+                combined_hidden_states = self.mori_op.combine(
+                    hidden_states, None, topk_ids
+                )[0]
+
+                if self.async_finish:
+                    done_event = torch.cuda.Event(blocking=False, interprocess=False)
+                    done_event.record(comm_stream)
+                else:
+                    compute_stream.wait_stream(comm_stream)
+
+                combined_hidden_states.record_stream(comm_stream)
+
+        else:
+            combined_hidden_states = self.mori_op.combine(
+                hidden_states, None, topk_ids
+            )[0]
+
+        return combined_hidden_states, done_event
+
+    def set_quant_config(self, quant_config: dict):
+        # Same behavior as the base-class hook; kept for explicitness.
+        self.quant_config = quant_config
+
+
+class _MoriEPDispatcherImplLowLatency(_MoriEPDispatcherImplBase):
+    """Low-latency Mori EP dispatcher using the AsyncLL send/recv split.
+
+    dispatch_a / combine_a issue the sends (dispatch_send / combine_send)
+    and dispatch_b / combine_b complete them (dispatch_recv / combine_recv),
+    allowing other work to overlap the communication.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.quant_config = {}
+        self.quant_func = get_hip_quant(QuantType.per_1x128)
+
+    def dispatch_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_output: TopKOutput,
+    ):
+        """Optionally FP8-quantize and start the async dispatch send."""
+        import mori
+
+        # The send/recv split below is only valid for the AsyncLL kernel.
+        assert (
+            self.mori_op.config.kernel_type
+            is mori.ops.EpDispatchCombineKernelType.AsyncLL
+        ), "mori asyncll mismatch"
+
+        num_tokens = hidden_states.shape[0]
+        output_dtype = hidden_states.dtype
+        scale = None
+
+        fp8_dispatch = get_bool_env_var("SGLANG_MORI_FP8_DISP", "False")
+
+        if fp8_dispatch:
+            # FP8 quant
+            if num_tokens > 0:
+                # NOTE: aiter is able to handle token=0 case in UT. But for some reason it failed at e2e case. Root cause TBD.
+                hidden_states, scale = self.quant_func(
+                    hidden_states, quant_dtype=fp8_dtype
+                )
+            else:
+                # Zero-token fallback keeps shapes and dtypes consistent.
+                hidden_states = torch.empty(
+                    hidden_states.shape, dtype=fp8_dtype, device=hidden_states.device
+                )
+                scale = torch.empty(
+                    (0, self.hidden_size // 128),
+                    dtype=torch.float32,
+                    device=hidden_states.device,
+                )
+
+        topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
+
+        (
+            packed_recv_hidden,
+            recv_topk_weights,
+            recv_scales,
+            recv_topk_ids,
+            packed_recv_count,
+        ) = self._dispatch_core(hidden_states, topk_weights, topk_ids, scale=scale)
+
+        return (
+            packed_recv_hidden,
+            recv_topk_weights,
+            recv_topk_ids,
+            recv_scales,
+            packed_recv_count,
+            topk_weights,
+            topk_ids,
+            output_dtype,
+        )
+
+    def dispatch_b(
+        self,
+        hidden_states,
+        recv_topk_weights,
+        recv_topk_ids,
+        recv_scales,
+        packed_recv_count,
+        topk_weights,
+        topk_ids,
+        output_dtype,
+    ):
+        """Complete the async dispatch recv and package the output."""
+        ##TODO(billishyahao): add assertion here to check async
+        import mori
+
+        assert (
+            self.mori_op.config.kernel_type
+            is mori.ops.EpDispatchCombineKernelType.AsyncLL
+        ), "mori asyncll mismatch"
+
+        self.mori_op.dispatch_recv()
+
+        return MoriEPLLDispatchOutput(
+            hidden_states=hidden_states,
+            hidden_states_scale=recv_scales,
+            topk_ids=recv_topk_ids,
+            topk_weights=recv_topk_weights,
+            num_recv_tokens_per_expert=packed_recv_count,
+            origin_topk_ids=topk_ids,
+            origin_topk_weights=topk_weights,
+            out_dtype=output_dtype,
+        )
+
+    def _dispatch_core(
+        self,
+        hidden_states: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+    ):
+        """Issue the async dispatch send; outputs are pending until
+        dispatch_recv() completes them in dispatch_b."""
+        ##TODO(billishyahao): add assertion here to check async
+
+        (
+            packed_recv_hidden,
+            recv_topk_weights,
+            recv_scales,
+            recv_topk_ids,
+            packed_recv_count,
+        ) = self.mori_op.dispatch_send(hidden_states, topk_weights, scale, topk_ids)
+
+        return (
+            packed_recv_hidden,
+            recv_topk_weights,
+            recv_scales,
+            recv_topk_ids,
+            packed_recv_count,
+        )
+
+    def combine_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        overlap_args: Optional[CombineOverlapArgs] = None,
+    ):
+        """Start the async combine send."""
+        hidden_states = self._combine_core(
+            hidden_states,
+            topk_ids,
+            topk_weights,
+            overlap_args=overlap_args,
+        )
+        return hidden_states, topk_ids, topk_weights, overlap_args
+
+    def combine_b(self, hidden_states, topk_ids, topk_weights, previous_event):
+        """Complete the async combine recv and return the combined tensor.
+
+        NOTE(review): hidden_states here is combine_send's return value and
+        is indexed with [0] — presumably a tuple whose first element is the
+        combined tensor; confirm against the mori API.
+        """
+        self.mori_op.combine_recv()
+
+        return hidden_states[0]
+
+    def _combine_core(
+        self,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        overlap_args: Optional[CombineOverlapArgs] = None,
+    ):
+        """Issue the async combine send; completed by combine_recv()."""
+        combined_hidden_states = self.mori_op.combine_send(
+            hidden_states, None, topk_ids
+        )
+
+        return combined_hidden_states
+
+    def set_quant_config(self, quant_config: dict):
+        # Same behavior as the base-class hook; kept for explicitness.
+        self.quant_config = quant_config
+
+
+@dataclass
+class _Stage(Enum):
+ INITIAL = auto()
+ AFTER_DISPATCH_A = auto()
+ AFTER_DISPATCH_B = auto()
+ AFTER_COMBINE_A = auto()
+
+
+class MoriEPDispatcher(BaseDispatcher):
+    """Top-level Mori EP dispatcher.
+
+    Instantiates a normal-mode and/or low-latency implementation according
+    to ``deepep_mode`` and routes each call to the one resolved for the
+    current batch. A four-stage state machine enforces the required
+    dispatch_a -> dispatch_b -> combine_a -> combine_b call order.
+    """
+
+    def __init__(
+        self,
+        group: torch.distributed.ProcessGroup,
+        router_topk: int,
+        permute_fusion: bool = False,
+        num_experts: Optional[int] = None,
+        num_local_experts: Optional[int] = None,
+        hidden_size: Optional[int] = None,
+        params_dtype: Optional[torch.dtype] = None,
+        deepep_mode: DeepEPMode = DeepEPMode.AUTO,
+        async_finish: bool = False,
+        return_recv_hook: bool = False,
+    ):
+        super().__init__()
+
+        self.deepep_mode = deepep_mode
+
+        # Constructor arguments shared by both implementations.
+        common_kwargs = dict(
+            group=group,
+            router_topk=router_topk,
+            permute_fusion=permute_fusion,
+            num_experts=num_experts,
+            num_local_experts=num_local_experts,
+            hidden_size=hidden_size,
+            params_dtype=params_dtype,
+            deepep_mode=deepep_mode,
+        )
+
+        if self.deepep_mode.enable_low_latency():
+            self._low_latency_dispatcher = _MoriEPDispatcherImplLowLatency(
+                **common_kwargs,
+            )
+
+        if self.deepep_mode.enable_normal():
+            self._normal_dispatcher = _MoriEPDispatcherImplNormal(
+                async_finish=async_finish,
+                **common_kwargs,
+            )
+
+        self._stage = _Stage.INITIAL
+        self._deepep_dispatch_hooks = MoriEPPDispatchHooks()
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        topk_output: TopKOutput,
+    ) -> DispatchOutput:
+        """Run both dispatch phases, invoking registered hooks in between."""
+        self.dispatch_a(hidden_states, topk_output)
+        if self._deepep_dispatch_hooks is not None:
+            self._deepep_dispatch_hooks(self)
+        ret = self.dispatch_b()
+        return ret
+
+    def dispatch_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_output: TopKOutput,
+    ):
+        """Phase one of dispatch; stashes the implementation's state."""
+        self._update_stage(_Stage.INITIAL, _Stage.AFTER_DISPATCH_A)
+        inner_state = self._get_impl().dispatch_a(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+        self._dispatch_intermediate_state = inner_state
+
+    def dispatch_b(self):
+        """Phase two of dispatch; consumes the stashed state."""
+        self._update_stage(_Stage.AFTER_DISPATCH_A, _Stage.AFTER_DISPATCH_B)
+        inner_state = self._dispatch_intermediate_state
+        # Delete eagerly so tensors held by the intermediate state can be
+        # released as soon as dispatch_b finishes with them.
+        del self._dispatch_intermediate_state
+        return self._get_impl().dispatch_b(*inner_state)
+
+    def combine(
+        self,
+        combine_input: CombineInput,
+    ) -> Tuple:
+        """Run both combine phases back to back."""
+        self.combine_a(combine_input)
+        ret = self.combine_b()
+        return ret
+
+    def combine_a(
+        self,
+        combine_input: CombineInput,
+    ):
+        """Phase one of combine; stashes the implementation's state."""
+        hidden_states, topk_ids, topk_weights = combine_input
+        self._update_stage(_Stage.AFTER_DISPATCH_B, _Stage.AFTER_COMBINE_A)
+        inner_state = self._get_impl().combine_a(
+            hidden_states=hidden_states,
+            topk_ids=topk_ids,
+            topk_weights=topk_weights,
+        )
+        self._combine_intermediate_state = inner_state
+
+    def combine_b(self):
+        """Phase two of combine; returns the dispatcher to INITIAL."""
+        self._update_stage(_Stage.AFTER_COMBINE_A, _Stage.INITIAL)
+        inner_state = self._combine_intermediate_state
+        del self._combine_intermediate_state
+        return self._get_impl().combine_b(*inner_state)
+
+    def _get_impl(self) -> _MoriEPDispatcherImplBase:
+        """Pick the implementation for the mode resolved for this batch."""
+        is_extend_in_batch = get_is_extend_in_batch()
+        resolved_deepep_mode = self.deepep_mode.resolve(is_extend_in_batch)
+        if resolved_deepep_mode == DeepEPMode.NORMAL:
+            return self._normal_dispatcher
+        elif resolved_deepep_mode == DeepEPMode.LOW_LATENCY:
+            return self._low_latency_dispatcher
+        else:
+            raise ValueError(f"Invalid deepep_mode: {self.deepep_mode}")
+
+    def _update_stage(self, old_stage, new_stage):
+        # Enforce the legal a->b call ordering; see _Stage.
+        assert self._stage == old_stage
+        self._stage = new_stage
+
+    def set_quant_config(self, quant_config: dict):
+        """Propagate quant config to every instantiated implementation."""
+        super().set_quant_config(quant_config)
+        if self.deepep_mode.enable_low_latency():
+            self._low_latency_dispatcher.set_quant_config(quant_config)
+        if self.deepep_mode.enable_normal():
+            self._normal_dispatcher.set_quant_config(quant_config)
+
+    def set_overlap_args(
+        self, combine_overlap_args: CombineOverlapArgs, meta_overlap_args: dict
+    ):
+        """Propagate combine-overlap args to every implementation."""
+        super().set_overlap_args(combine_overlap_args, meta_overlap_args)
+        if self.deepep_mode.enable_low_latency():
+            self._low_latency_dispatcher.set_overlap_args(
+                combine_overlap_args, meta_overlap_args
+            )
+        if self.deepep_mode.enable_normal():
+            self._normal_dispatcher.set_overlap_args(
+                combine_overlap_args, meta_overlap_args
+            )
+
+    def clear_overlap_args(self):
+        """Clear combine-overlap args on every implementation."""
+        super().clear_overlap_args()
+        if self.deepep_mode.enable_low_latency():
+            self._low_latency_dispatcher.clear_overlap_args()
+        if self.deepep_mode.enable_normal():
+            self._normal_dispatcher.clear_overlap_args()
+
+    def register_deepep_dispatch_hook(self, hook):
+        """Register a hook invoked between dispatch_a and dispatch_b."""
+        return self._deepep_dispatch_hooks.register_hook(hook)
diff --git a/sglang/python/sglang/srt/layers/moe/token_dispatcher/standard.py b/sglang/python/sglang/srt/layers/moe/token_dispatcher/standard.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ec839991a9042044811d25d6ea32dcb2f53ec16
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/token_dispatcher/standard.py
@@ -0,0 +1,193 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, NamedTuple, Optional
+
+import torch
+
+from sglang.srt.distributed import (
+ get_moe_expert_parallel_rank,
+ get_moe_expert_parallel_world_size,
+ get_tp_group,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+ use_symmetric_memory,
+)
+from sglang.srt.layers.dp_attention import (
+ get_dp_global_num_tokens,
+ get_local_dp_buffer,
+ is_allocation_symmetric,
+)
+from sglang.srt.layers.moe.moe_runner.base import MoeRunnerConfig
+from sglang.srt.layers.moe.token_dispatcher.base import (
+ BaseDispatcher,
+ CombineInput,
+ CombineInputFormat,
+ DispatchOutput,
+ DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.topk import StandardTopKOutput, TopKOutput, TopKOutputChecker
+from sglang.srt.layers.moe.utils import (
+ get_moe_runner_backend,
+ should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.utils.common import get_bool_env_var, is_hip, is_sm120_supported
+
+_is_hip = is_hip()
+# aiter kernels are only used on HIP and only when explicitly enabled.
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.topk import TopKOutput
+
+
+try:
+ if is_sm120_supported():
+ from flashinfer import fp4_quantize
+ else:
+ from sglang.jit_kernel.nvfp4 import scaled_fp4_quant as fp4_quantize
+
+ from flashinfer import fp4_quantize as fp4_quantize_flashinfer
+except ImportError:
+ fp4_quantize = None
+
+
+class StandardDispatchOutput(NamedTuple):
+    """Standard dispatch output.
+
+    hidden_states_scale carries the quantization scales when the hidden
+    states were quantized during dispatch; otherwise it is None.
+    """
+
+    hidden_states: torch.Tensor
+    hidden_states_scale: Optional[torch.Tensor]
+    topk_output: TopKOutput
+
+    @property
+    def format(self) -> DispatchOutputFormat:
+        return DispatchOutputFormat.STANDARD
+
+
+# Runtime sanity check that the class satisfies the DispatchOutput
+# interface (presumably a runtime-checkable protocol).
+assert isinstance(StandardDispatchOutput, DispatchOutput)
+
+
+class StandardCombineInput(NamedTuple):
+    """Standard combine input: just the MoE output hidden states."""
+
+    hidden_states: torch.Tensor
+
+    @property
+    def format(self) -> CombineInputFormat:
+        return CombineInputFormat.STANDARD
+
+
+# Runtime sanity check that the class satisfies the CombineInput interface.
+assert isinstance(StandardCombineInput, CombineInput)
+
+
+class StandardDispatcher(BaseDispatcher):
+
+ def __init__(self, moe_runner_config: MoeRunnerConfig):
+ super().__init__()
+ self.moe_ep_size = get_moe_expert_parallel_world_size()
+ self.enable_flashinfer_cutlass_moe = (
+ get_moe_runner_backend().is_flashinfer_cutlass()
+ )
+ self.num_experts = moe_runner_config.num_experts
+ self.num_local_shared_experts = moe_runner_config.num_fused_shared_experts
+ self.num_local_routed_experts = (
+ moe_runner_config.num_local_experts - self.num_local_shared_experts
+ )
+ self.moe_ep_rank = get_moe_expert_parallel_rank()
+ self.local_expert_mapping = None
+
+ def dispatch(
+ self, hidden_states: torch.Tensor, topk_output: TopKOutput
+ ) -> StandardDispatchOutput:
+
+ if should_use_flashinfer_cutlass_moe_fp4_allgather():
+ # all-gather fp4 hidden states
+ from flashinfer import nvfp4_block_scale_interleave
+
+ global_scale = self.quant_config.get("input_global_scale", None)
+ assert global_scale is not None, "input_global_scale is not set"
+ topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
+
+ # Quantize before comm, swizzle after.
+ with use_symmetric_memory(
+ get_tp_group(), disabled=not is_allocation_symmetric()
+ ):
+ if hidden_states.shape[0] > 0:
+ x, x_sf = fp4_quantize_flashinfer(
+ hidden_states, global_scale, is_sf_swizzled_layout=False
+ )
+ else:
+ x_col = hidden_states.shape[1]
+ x = torch.zeros(
+ 0, x_col // 2, dtype=torch.uint8, device=hidden_states.device
+ )
+ x_sf = torch.zeros(
+ 0, x_col // 16, dtype=torch.uint8, device=hidden_states.device
+ )
+ topk_weights, topk_ids, x, x_sf = get_tp_group().all_gatherv(
+ [topk_weights, topk_ids, x, x_sf], sizes=get_dp_global_num_tokens()
+ )
+ # TODO: fuse into cutlass moe
+ x_sf = nvfp4_block_scale_interleave(x_sf)
+
+ hidden_states = x
+ hidden_states_scale = x_sf
+ topk_output = StandardTopKOutput(
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ router_logits=topk_output.router_logits, # never tested
+ )
+ else:
+ hidden_states = hidden_states
+ hidden_states_scale = None
+
+ if (
+ self.moe_ep_size > 1
+ and not self.enable_flashinfer_cutlass_moe
+ and TopKOutputChecker.format_is_standard(topk_output)
+ ):
+ if self.local_expert_mapping is None:
+ self.local_expert_mapping = torch.full(
+ (self.num_experts,), -1, dtype=torch.int32, device="cuda"
+ )
+ self.local_expert_mapping[
+ self.moe_ep_rank
+ * self.num_local_routed_experts : (self.moe_ep_rank + 1)
+ * self.num_local_routed_experts
+ ] = torch.arange(
+ 0, self.num_local_routed_experts, dtype=torch.int32, device="cuda"
+ )
+
+ if self.num_local_shared_experts > 0:
+ self.local_expert_mapping[-self.num_local_shared_experts :] = (
+ torch.arange(
+ self.num_local_routed_experts,
+ self.num_local_routed_experts
+ + self.num_local_shared_experts,
+ dtype=torch.int32,
+ device="cpu",
+ )
+ )
+
+ if self.local_expert_mapping is not None and not _use_aiter:
+ if TopKOutputChecker.format_is_standard(topk_output):
+ topk_output = topk_output._replace(
+ topk_ids=self.local_expert_mapping[topk_output.topk_ids]
+ )
+ elif TopKOutputChecker.format_is_triton_kernels(topk_output):
+ raise NotImplementedError()
+
+ return StandardDispatchOutput(
+ hidden_states=hidden_states,
+ hidden_states_scale=hidden_states_scale,
+ topk_output=topk_output,
+ )
+
+ def combine(self, combine_input: StandardCombineInput) -> torch.Tensor:
+ (hidden_states,) = combine_input
+ if should_use_flashinfer_cutlass_moe_fp4_allgather():
+ hidden_states, global_hidden_states = get_local_dp_buffer(), hidden_states
+ get_tp_group().reduce_scatterv(
+ global_hidden_states,
+ output=hidden_states,
+ sizes=get_dp_global_num_tokens(),
+ )
+ return hidden_states
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/fp8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/fp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b65506406d671c8f93ecb8bd62c3242bab1db0f
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/fp8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7fb3d74eac34a2df1eda7e669ded6e615f98111
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/__pycache__/compressed_tensors.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..14f44cf02bb3f1c0360bc60f5e9cbb69cd13c15a
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f67e5ba5338eb492ce0df6f3a4255492031e1ad
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from .compressed_tensors_scheme import (
+ CompressedTensorsLinearScheme,
+ CompressedTensorsMoEScheme,
+)
+from .compressed_tensors_w4a4_mxint4_moe import CompressedTensorsMxInt4MoE
+from .compressed_tensors_w4a4_nvfp4 import CompressedTensorsW4A4Fp4
+from .compressed_tensors_w4a4_nvfp4_moe import CompressedTensorsW4A4Nvfp4MoE
+from .compressed_tensors_w4a8_int8_moe import NPUCompressedTensorsW4A8Int8DynamicMoE
+from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
+from .compressed_tensors_w8a8_fp8_moe import CompressedTensorsW8A8Fp8MoE
+from .compressed_tensors_w8a8_int8 import (
+ CompressedTensorsW8A8Int8,
+ NPUCompressedTensorsW8A8Int8,
+)
+from .compressed_tensors_w8a8_int8_moe import NPUCompressedTensorsW8A8Int8DynamicMoE
+from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
+from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16
+from .compressed_tensors_wNa16_moe import (
+ CompressedTensorsWNA16MoE,
+ CompressedTensorsWNA16TritonMoE,
+ NPUCompressedTensorsW4A16Int4DynamicMoE,
+)
+
+__all__ = [
+ "CompressedTensorsLinearScheme",
+ "CompressedTensorsMoEScheme",
+ "CompressedTensorsW8A8Fp8",
+ "CompressedTensorsW8A8Fp8MoE",
+ "CompressedTensorsW8A16Fp8",
+ "CompressedTensorsW8A8Int8",
+ "NPUCompressedTensorsW8A8Int8",
+ "NPUCompressedTensorsW8A8Int8DynamicMoE",
+ "CompressedTensorsWNA16",
+ "CompressedTensorsWNA16MoE",
+ "CompressedTensorsWNA16TritonMoE",
+ "NPUCompressedTensorsW4A16Int4DynamicMoE",
+ "WNA16_SUPPORTED_BITS",
+ "CompressedTensorsW4A4Fp4",
+ "CompressedTensorsW4A4Nvfp4MoE",
+ "NPUCompressedTensorsW4A8Int8DynamicMoE",
+ "CompressedTensorsMxInt4MoE",
+]
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d772baa65bfa1790b6c24d5683d0a6228abd0351
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2e9d2d7d2691b4d871c44eb20a50df74a597f0d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_scheme.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_mxint4_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_mxint4_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1dd371845562f4bc630e4f4f6f9da8ce5ea81b76
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_mxint4_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_nvfp4.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_nvfp4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..056a68273424e16ddae53ff635d923d23a565523
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_nvfp4.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_nvfp4_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_nvfp4_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8af559e174f8936b67788b74ad128d52360ec62d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a4_nvfp4_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a8_int8_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a8_int8_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27911ecc28f05adba61251cb21b7ac3c21149597
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w4a8_int8_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2872bd42780474767a3f23c509350c79593afcf3
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a16_fp8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e66671fb919b6594518cc9f30d4782491ca10e4d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..405b3155a512aa47dec9e5589557813bd87685a5
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_fp8_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b547e1f8bbef119f147fe21e32ea213d939ad448
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01ab5119de6fe18219b66ec4ee426421750f8279
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_w8a8_int8_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd3c1ab987cac5d43fbc0298dd5fade4e45f2351
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..540038b9cfaf52e6d325a943be4149dc550ab115
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__pycache__/compressed_tensors_wNa16_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
new file mode 100644
index 0000000000000000000000000000000000000000..917d417e76b6a0b1c7a03af9a320a7d2afbf3274
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -0,0 +1,115 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import abstractmethod
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.base_scheme import BaseLinearScheme, BaseMoEScheme
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import StandardDispatchOutput
+
+__all__ = ["CompressedTensorsLinearScheme", "CompressedTensorsMoEScheme"]
+
+
+class CompressedTensorsLinearScheme(BaseLinearScheme):
+ """
+ Abstract class used to describe the weight creation and forward pass
+ of different quantization schemes supported by CompressedTensors.
+ """
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ """
+ Get minimum device capability.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def create_weights(self, *args, **kwargs):
+ """
+ Weight creation for the particular scheme. Inputs to this function
+
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def apply_weights(
+ self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+ ):
+ """
+ Run the forward pass for the particular scheme. This is where
+ scheme-specific dequant/quant steps/kernels should be applied.
+
+ :param layer: torch.nn.Module with the registered weights and
+ other parameters relevant to the particular scheme.
+ :param x: input to the layer
+ :param bias: bias parameter
+
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def process_weights_after_loading(self, layer: torch.nn.Module):
+ """
+ Called after weight loading is complete for any cleanup that
+ needs to occur.
+ """
+ raise NotImplementedError
+
+
+class CompressedTensorsMoEScheme(BaseMoEScheme):
+ """
+ Abstract class used to describe the weight creation and forward pass
+ of different quantization schemes supported by CompressedTensors.
+ """
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ """
+ Get minimum device capability.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def create_weights(self, *args, **kwargs):
+ """
+ Weight creation for the particular scheme. Inputs to this function
+
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def create_moe_runner(
+ self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+ ):
+ raise NotImplementedError
+
+ @abstractmethod
+ def process_weights_after_loading(self, layer: torch.nn.Module):
+ """
+ Called after weight loading is complete for any cleanup that
+ needs to occur.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ dispatch_output: "StandardDispatchOutput",
+ ):
+ """
+ Run the forward pass for the particular scheme. This is where
+ scheme-specific dequant/quant steps/kernels should be applied.
+
+ :param layer: torch.nn.Module with the registered weights and
+ other parameters relevant to the particular scheme.
+ :param x: input to the layer
+ :param bias: bias parameter
+
+ """
+ raise NotImplementedError
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_mxint4_moe.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_mxint4_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..865f3de43849c2d13f0e57d944bef3664beff9fe
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_mxint4_moe.py
@@ -0,0 +1,357 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import torch
+from compressed_tensors import CompressionFormat
+
+from sglang.srt.distributed import get_moe_expert_parallel_rank, get_tp_group
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+ use_symmetric_memory,
+)
+from sglang.srt.layers.dp_attention import is_allocation_symmetric
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.moe.utils import RoutingMethodType, get_moe_runner_backend
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsMoEScheme,
+)
+from sglang.srt.layers.quantization.utils import replace_parameter
+from sglang.srt.utils import is_flashinfer_available, next_power_of_2, set_weight_attrs
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["CompressedTensorsMxInt4MoE"]
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ StandardDispatchOutput,
+ )
+ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
+ CompressedTensorsConfig,
+ )
+
+if is_flashinfer_available():
+ from flashinfer.fp4_quantization import block_scale_interleave
+ from flashinfer.fused_moe import (
+ convert_to_block_layout,
+ trtllm_mxint4_block_scale_moe,
+ )
+ from flashinfer.fused_moe.core import (
+ _maybe_get_cached_w3_w1_permute_indices,
+ get_w2_permute_indices_with_cache,
+ )
+
+
+class CompressedTensorsMxInt4MoE(CompressedTensorsMoEScheme):
+ def __init__(self, quant_config: CompressedTensorsConfig):
+ self.quant_config = quant_config
+ config = self.quant_config.target_scheme_map["Linear"].get("weights")
+ self.num_bits = config.num_bits
+ self.packed_factor = 32 // config.num_bits
+ self.strategy = config.strategy
+ self.group_size = config.group_size
+ self.actorder = config.actorder
+ assert (
+ config.strategy == "group"
+ and config.group_size == 32
+ and config.num_bits == 4
+ ), "MxInt4 only supports group strategy with group size 32"
+ assert config.symmetric, "Only symmetric quantization is supported for MoE"
+ assert (
+ get_moe_runner_backend().is_flashinfer_trtllm()
+ ), "MxInt4 only supports flashinfer_trtllm backend"
+ assert (
+ not config.actorder
+ ), "Actorder is not supported by flashinfer_trtllm backend"
+ self.moe_ep_rank = get_moe_expert_parallel_rank()
+
+ if self.quant_config.quant_format != CompressionFormat.pack_quantized.value:
+ raise ValueError(
+ f"For Fused MoE layers, only {CompressionFormat.pack_quantized.value} "
+ "is supported for the mxint4"
+ )
+ self._cache_permute_indices = {}
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # Requires sm100(blackwell) architecture
+ return 100
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ):
+ assert (
+ params_dtype == torch.bfloat16
+ ), f"Params dtype should be torch.bfloat16, but got: {params_dtype}"
+
+ extra_weight_attrs.update({"quant_method": self.strategy})
+ w13_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ hidden_size // self.packed_factor,
+ dtype=torch.int32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_packed", w13_weight)
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+
+ w2_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition // self.packed_factor,
+ dtype=torch.int32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_packed", w2_weight)
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ w2_scales_size = intermediate_size_per_partition
+ num_groups_w2 = w2_scales_size // self.group_size
+ num_groups_w13 = hidden_size // self.group_size
+
+ w13_scale = torch.nn.Parameter(
+ torch.ones(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ num_groups_w13,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_scale", w13_scale)
+ set_weight_attrs(w13_scale, extra_weight_attrs)
+
+ w2_scale = torch.nn.Parameter(
+ torch.ones(num_experts, hidden_size, num_groups_w2, dtype=params_dtype),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_scale", w2_scale)
+ set_weight_attrs(w2_scale, extra_weight_attrs)
+
+ w13_weight_shape = torch.nn.Parameter(
+ torch.empty(num_experts, 2), requires_grad=False
+ )
+
+ layer.register_parameter("w13_weight_shape", w13_weight_shape)
+ set_weight_attrs(w13_weight_shape, extra_weight_attrs)
+
+ w2_weight_shape = torch.nn.Parameter(
+ torch.empty(num_experts, 2), requires_grad=False
+ )
+ layer.register_parameter("w2_weight_shape", w2_weight_shape)
+ set_weight_attrs(w2_weight_shape, extra_weight_attrs)
+
+ layer.a13_scale = None
+ layer.a2_scale = None
+
+ # Adapted from https://github.com/flashinfer-ai/flashinfer/blob/main/tests/moe/test_trtllm_gen_fused_moe.py
+ def prepare_static_weights_for_kernel(
+ self,
+ gemm1_weights,
+ gemm2_weights,
+ gemm1_scales,
+ gemm2_scales,
+ num_experts,
+ ):
+ """Prepare quantized weights for kernel (done offline with weights)."""
+
+ epilogue_tile_m = 128
+ gemm1_weights_mxint4_shuffled = []
+ gemm1_scales_shuffled = []
+ gemm2_weights_mxint4_shuffled = []
+ gemm2_scales_shuffled = []
+
+ def repack(w):
+ assert w.dim() == 2 and w.dtype == torch.int32
+ shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=w.device)
+ w = (w.unsqueeze(2) >> shifts) & 0x0F
+ w = (w - 8).to(torch.int8).reshape(w.shape[0], -1, 2)
+ w = (w[..., 0] & 0x0F) | ((w[..., 1] & 0x0F) << 4)
+ w = w.to(torch.uint8)
+ return w
+
+ for i in range(num_experts):
+ # NOTE(HandH1998):
+ # the huggingface weight format follows (w/s + 8) to pack,
+ # however, trtllm requires (w/s) to pack
+ # we need to convert the weight to trtllm's format first
+ cur_expert_gemm1_weight = repack(gemm1_weights[i])
+ cur_expert_gemm2_weight = repack(gemm2_weights[i])
+
+ # Calculate the permute indices for the following:
+ # 1. Reorder rows of W1 and scales for fused gated activation
+ # 2. Shuffle weights and scaling factors for transposed mma output
+ # for both w3_w1 and w2 weights and scale factors
+ permute_indices = _maybe_get_cached_w3_w1_permute_indices(
+ self._cache_permute_indices,
+ cur_expert_gemm1_weight,
+ epilogue_tile_m,
+ )
+ gemm1_weights_shuffled = cur_expert_gemm1_weight[
+ permute_indices.to(gemm1_weights.device)
+ ].contiguous()
+ permute_sf_indices = _maybe_get_cached_w3_w1_permute_indices(
+ self._cache_permute_indices,
+ gemm1_scales[i].to(torch.bfloat16),
+ epilogue_tile_m,
+ num_elts_per_sf=32,
+ )
+ gemm1_scales_shuffled.append(
+ block_scale_interleave(
+ gemm1_scales[i]
+ .to(torch.bfloat16)[permute_sf_indices.to(gemm1_scales.device)]
+ .contiguous()
+ )
+ )
+
+ permute_indices = get_w2_permute_indices_with_cache(
+ self._cache_permute_indices,
+ cur_expert_gemm2_weight,
+ epilogue_tile_m,
+ )
+ gemm2_weights_shuffled = cur_expert_gemm2_weight[
+ permute_indices.to(gemm2_weights.device)
+ ].contiguous()
+
+ permute_sf_indices = get_w2_permute_indices_with_cache(
+ self._cache_permute_indices,
+ gemm2_scales[i].to(torch.bfloat16),
+ epilogue_tile_m,
+ num_elts_per_sf=16,
+ )
+ gemm2_scales_shuffled.append(
+ block_scale_interleave(
+ gemm2_scales[i]
+ .to(torch.bfloat16)[permute_sf_indices.to(gemm2_scales.device)]
+ .contiguous()
+ )
+ )
+
+ block_k = 128
+ gemm1_weights_shuffled = convert_to_block_layout(
+ gemm1_weights_shuffled.view(torch.uint8), block_k
+ )
+ gemm2_weights_shuffled = convert_to_block_layout(
+ gemm2_weights_shuffled.view(torch.uint8), block_k
+ )
+
+ gemm1_weights_mxint4_shuffled.append(gemm1_weights_shuffled)
+ gemm2_weights_mxint4_shuffled.append(gemm2_weights_shuffled)
+
+ gemm1_weights_mxint4_shuffled = torch.stack(gemm1_weights_mxint4_shuffled)
+ gemm2_weights_mxint4_shuffled = torch.stack(gemm2_weights_mxint4_shuffled)
+ gemm1_scales_shuffled = torch.stack(gemm1_scales_shuffled).view(torch.bfloat16)
+ gemm2_scales_shuffled = torch.stack(gemm2_scales_shuffled).view(torch.bfloat16)
+
+ return (
+ gemm1_weights_mxint4_shuffled,
+ gemm1_scales_shuffled,
+ gemm2_weights_mxint4_shuffled,
+ gemm2_scales_shuffled,
+ )
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+ num_experts = layer.w13_weight_packed.shape[0]
+ (
+ gemm1_weights_mxint4_shuffled,
+ gemm1_scales_shuffled,
+ gemm2_weights_mxint4_shuffled,
+ gemm2_scales_shuffled,
+ ) = self.prepare_static_weights_for_kernel(
+ layer.w13_weight_packed,
+ layer.w2_weight_packed,
+ layer.w13_weight_scale,
+ layer.w2_weight_scale,
+ num_experts=num_experts,
+ )
+ replace_parameter(layer, "w13_weight_packed", gemm1_weights_mxint4_shuffled)
+ replace_parameter(layer, "w2_weight_packed", gemm2_weights_mxint4_shuffled)
+ replace_parameter(layer, "w13_weight_scale", gemm1_scales_shuffled)
+ replace_parameter(layer, "w2_weight_scale", gemm2_scales_shuffled)
+
+ def create_moe_runner(
+ self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+ ):
+ self.moe_runner_config = moe_runner_config
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ dispatch_output: StandardDispatchOutput,
+ ) -> CombineInput:
+ from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+ assert (
+ self.moe_runner_config.is_gated
+ ), "Only gated MoEs are supported for flashinfer mxint4"
+
+ x = dispatch_output.hidden_states
+ topk_output = dispatch_output.topk_output
+
+ router_logits = topk_output.router_logits
+ topk_config = topk_output.topk_config
+ correction_bias = (
+ None
+ if topk_config.correction_bias is None
+ else topk_config.correction_bias.to(x.dtype)
+ )
+
+ local_num_experts = self.moe_runner_config.num_local_experts
+ routing_method_type = layer.routing_method_type
+ assert routing_method_type is not None
+ # DeepSeekV3 style routing requires float32 router logits,
+ # see this PR for details: https://github.com/flashinfer-ai/flashinfer/commit/d84e1d560da0a27961c19ca788d96c19cb9dcfb6
+ if routing_method_type == RoutingMethodType.DeepSeekV3:
+ router_logits = router_logits.to(torch.float32)
+ routed_scaling_factor = self.moe_runner_config.routed_scaling_factor
+ routed_scaling_factor = (
+ routed_scaling_factor if routed_scaling_factor is not None else 1.0
+ )
+
+ with use_symmetric_memory(
+ get_tp_group(), disabled=not is_allocation_symmetric()
+ ):
+ num_tokens = x.shape[0]
+ hidden_size = x.shape[-1]
+ symm_output = torch.empty(
+ num_tokens, hidden_size, dtype=torch.bfloat16, device=x.device
+ )
+
+ output = trtllm_mxint4_block_scale_moe(
+ routing_logits=router_logits, # float
+ routing_bias=correction_bias,
+ hidden_states=x,
+ gemm1_weights=layer.w13_weight_packed,
+ gemm1_weights_scale=layer.w13_weight_scale,
+ gemm1_alpha=self.moe_runner_config.gemm1_alpha,
+ gemm1_beta=None,
+ gemm1_clamp_limit=self.moe_runner_config.gemm1_clamp_limit,
+ gemm2_weights=layer.w2_weight_packed,
+ gemm2_weights_scale=layer.w2_weight_scale,
+ num_experts=self.moe_runner_config.num_experts,
+ top_k=topk_config.top_k,
+ n_group=topk_config.num_expert_group,
+ topk_group=topk_config.topk_group,
+ intermediate_size=self.moe_runner_config.intermediate_size_per_partition,
+ local_expert_offset=self.moe_ep_rank * local_num_experts,
+ local_num_experts=local_num_experts,
+ routed_scaling_factor=routed_scaling_factor,
+ routing_method_type=routing_method_type,
+ tune_max_num_tokens=next_power_of_2(x.shape[0]),
+ output=symm_output,
+ )
+
+ return StandardCombineInput(hidden_states=output)
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
new file mode 100644
index 0000000000000000000000000000000000000000..1072dac7b3ca952a4b9238b101b25274f25d1639
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -0,0 +1,168 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from collections.abc import Callable
+from typing import Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.parameter import (
+ GroupQuantScaleParameter,
+ ModelWeightParameter,
+ PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsLinearScheme,
+)
+from sglang.srt.layers.quantization.fp4_utils import get_fp4_gemm_runner_backend
+from sglang.srt.layers.quantization.modelopt_quant import (
+ enable_flashinfer_fp4_gemm,
+ fp4_gemm,
+ fp4_quantize,
+)
+from sglang.srt.layers.quantization.utils import swizzle_blockscale
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["CompressedTensorsW4A4Fp4"]
+
+
+class CompressedTensorsW4A4Fp4(CompressedTensorsLinearScheme):
+ def __init__(self):
+ self.group_size = 16
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ return 100
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ output_partition_sizes: list[int],
+ input_size_per_partition: int,
+ params_dtype: torch.dtype,
+ weight_loader: Callable,
+ **kwargs,
+ ):
+ output_size_per_partition = sum(output_partition_sizes)
+ layer.logical_widths = output_partition_sizes
+ layer.input_size_per_partition = input_size_per_partition
+ layer.output_size_per_partition = output_size_per_partition
+
+ # Weight
+ weight = ModelWeightParameter(
+ data=torch.empty(
+ sum(output_partition_sizes),
+ input_size_per_partition // 2,
+ dtype=torch.uint8,
+ ),
+ input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+ layer.register_parameter("weight_packed", weight)
+
+ # Global Weight Scale
+ weight_global_scale = PerTensorScaleParameter(
+ data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader,
+ )
+ layer.register_parameter("weight_global_scale", weight_global_scale)
+
+ # Per Group Weight Scale
+ weight_scale = GroupQuantScaleParameter(
+ data=torch.empty(
+ sum(output_partition_sizes),
+ input_size_per_partition // self.group_size,
+ dtype=torch.float8_e4m3fn,
+ ),
+ input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+
+ layer.register_parameter("weight_scale", weight_scale)
+
+ input_global_scale = PerTensorScaleParameter(
+ data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader,
+ )
+ layer.register_parameter("input_global_scale", input_global_scale)
+
+ def process_weights_after_loading(self, layer) -> None:
+ global_input_scale = layer.input_global_scale.max().to(torch.float32)
+ layer.input_global_scale = Parameter(global_input_scale, requires_grad=False)
+
+ layer.weight_global_scale = Parameter(
+ layer.weight_global_scale.max().to(torch.float32), requires_grad=False
+ )
+
+ if get_fp4_gemm_runner_backend().is_flashinfer_trtllm():
+ # FlashInfer TRTLLM FP4 GEMM requires a different weight layout.
+ # FlashInfer provides nvfp4_quantize to quantize + shuffle the
+ # layout but we use our own quantization so we have to call
+ # shuffles ourselves.
+ from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a
+
+ weight = layer.weight_packed.data
+ weight_scale = layer.weight_scale.data
+
+ epilogue_tile_m = 128
+ weight = shuffle_matrix_a(weight.view(torch.uint8), epilogue_tile_m)
+ weight_scale = (
+ shuffle_matrix_sf_a(weight_scale.view(torch.uint8), epilogue_tile_m)
+ .reshape(weight_scale.shape)
+ .view(torch.float8_e4m3fn)
+ )
+
+ layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+ layer.weight_packed = Parameter(weight, requires_grad=False)
+ else:
+ swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)
+ layer.weight_scale = Parameter(swizzled_weight_scale, requires_grad=False)
+ layer.weight_packed = Parameter(
+ layer.weight_packed.data, requires_grad=False
+ )
+
+ layer.alpha = Parameter(
+ 1 / (layer.input_global_scale * layer.weight_global_scale),
+ requires_grad=False,
+ )
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ output_dtype = x.dtype
+ w_n, _ = layer.weight_packed.shape
+ output_shape = [x.shape[0], w_n]
+
+ # quantize BF16 or FP16 to (FP4 and interleaved block scale)
+ x_fp4, x_blockscale = fp4_quantize(x, layer.input_global_scale)
+
+ assert x_fp4.dtype == torch.uint8
+ assert layer.weight_packed.dtype == torch.uint8
+ assert layer.weight_scale.dtype == torch.float8_e4m3fn
+ assert layer.alpha.dtype == torch.float32
+
+ w = layer.weight_packed
+ w_blockscale = layer.weight_scale
+ if enable_flashinfer_fp4_gemm:
+ w = layer.weight_packed.T
+ w_blockscale = layer.weight_scale.T
+
+ out = fp4_gemm(
+ x_fp4,
+ w,
+ x_blockscale,
+ w_blockscale,
+ layer.alpha,
+ output_dtype,
+ w_n,
+ )
+ if bias is not None:
+ out = out + bias
+ return out.view(*output_shape)
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4_moe.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..5898a078dbba4a8e87f5003bde6d067e24a183bf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4_moe.py
@@ -0,0 +1,423 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.distributed import get_tp_group
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+ use_symmetric_memory,
+)
+from sglang.srt.layers.dp_attention import is_allocation_symmetric
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
+from sglang.srt.layers.moe.utils import RoutingMethodType, get_moe_runner_backend
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsMoEScheme,
+)
+from sglang.srt.layers.quantization.fp8_utils import is_blackwell_supported
+from sglang.srt.layers.quantization.utils import (
+ prepare_static_weights_for_trtllm_fp4_moe,
+ reorder_w1w3_to_w3w1,
+ swizzle_blockscale,
+)
+from sglang.srt.utils import next_power_of_2, set_weight_attrs
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["CompressedTensorsW4A4Nvfp4MoE"]
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ StandardDispatchOutput,
+ )
+
+
+class CompressedTensorsW4A4Nvfp4MoE(CompressedTensorsMoEScheme):
+
+ def __init__(self):
+ if not is_blackwell_supported():
+ raise ValueError(
+ "Current platform does not support NVFP4"
+ " quantization. Please use Blackwell and"
+ " above."
+ )
+ self.group_size = 16
+ self.use_flashinfer_trtllm = get_moe_runner_backend().is_flashinfer_trtllm()
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # Requires sm100(blackwell) architecture
+ return 100
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ):
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+ layer.params_dtype = params_dtype
+
+ w13_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ # 2 fp4 items are packed in the input dimension
+ hidden_size // 2,
+ requires_grad=False,
+ dtype=torch.uint8,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_packed", w13_weight)
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+
+ w2_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ # 2 fp4 items are packed in the input dimension
+ intermediate_size_per_partition // 2,
+ dtype=torch.uint8,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_packed", w2_weight)
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ # Weight Scales
+ w13_weight_scale = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ # 2 fp4 items are packed in the input dimension
+ hidden_size // self.group_size,
+ dtype=torch.float8_e4m3fn,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_scale", w13_weight_scale)
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
+ )
+ set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+ w2_weight_scale = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ # 2 fp4 items are packed in the input dimension
+ intermediate_size_per_partition // self.group_size,
+ dtype=torch.float8_e4m3fn,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_scale", w2_weight_scale)
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
+ )
+ set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+ # Weight Global Scales
+ w13_weight_scale_2 = torch.nn.Parameter(
+ torch.empty(num_experts, 2, dtype=torch.float32), requires_grad=False
+ )
+ layer.register_parameter("w13_weight_global_scale", w13_weight_scale_2)
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+ )
+ set_weight_attrs(w13_weight_scale_2, extra_weight_attrs)
+
+ w2_weight_scale_2 = torch.nn.Parameter(
+ torch.empty(num_experts, dtype=torch.float32), requires_grad=False
+ )
+ layer.register_parameter("w2_weight_global_scale", w2_weight_scale_2)
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+ )
+ set_weight_attrs(w2_weight_scale_2, extra_weight_attrs)
+
+ # Input Global Scales
+ w13_input_scale = torch.nn.Parameter(
+ torch.empty(num_experts, 2, dtype=torch.float32), requires_grad=False
+ )
+ layer.register_parameter("w13_input_global_scale", w13_input_scale)
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+ )
+ set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+ w2_input_scale = torch.nn.Parameter(
+ torch.empty(num_experts, dtype=torch.float32), requires_grad=False
+ )
+ layer.register_parameter("w2_input_global_scale", w2_input_scale)
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+ )
+ set_weight_attrs(w2_input_scale, extra_weight_attrs)
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ # From packed to weight
+ layer.w13_weight = torch.nn.Parameter(
+ layer.w13_weight_packed.data, requires_grad=False
+ )
+ delattr(layer, "w13_weight_packed")
+
+ layer.w2_weight = torch.nn.Parameter(
+ layer.w2_weight_packed.data, requires_grad=False
+ )
+ delattr(layer, "w2_weight_packed")
+
+ if self.use_flashinfer_trtllm:
+ w, s = reorder_w1w3_to_w3w1(
+ layer.w13_weight.data, layer.w13_weight_scale.data, dim=-2
+ )
+ layer.w13_weight = torch.nn.Parameter(w, requires_grad=False)
+ layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False)
+
+ if not torch.allclose(
+ layer.w13_weight_global_scale[:, 0], layer.w13_weight_global_scale[:, 1]
+ ):
+ logger.warning_once(
+ "w1_weight_global_scale must match w3_weight_global_scale. "
+ "Accuracy may be affected."
+ )
+
+ # Take inverse of global scale saved to disk
+ layer.w13_weight_scale_2 = torch.nn.Parameter(
+ 1 / layer.w13_weight_global_scale[:, 0], requires_grad=False
+ )
+
+ layer.w2_weight_scale_2 = torch.nn.Parameter(
+ 1 / layer.w2_weight_global_scale.data, requires_grad=False
+ )
+
+ # w13
+ if self.use_flashinfer_trtllm:
+ w13_input_global_scale = (
+ layer.w13_input_global_scale.min()
+ .to(torch.float32)
+ .expand(layer.num_local_experts)
+ )
+ else:
+ w13_input_global_scale = layer.w13_input_global_scale.min(dim=1).values.to(
+ torch.float32
+ )
+ layer.g1_alphas = torch.nn.Parameter(
+ ((1 / w13_input_global_scale) * layer.w13_weight_scale_2),
+ requires_grad=False,
+ )
+
+ layer.w13_input_scale_quant = torch.nn.Parameter(
+ (w13_input_global_scale), requires_grad=False
+ )
+
+ # w2
+ if self.use_flashinfer_trtllm:
+ w2_input_global_scale = (
+ layer.w2_input_global_scale.min()
+ .to(torch.float32)
+ .expand(layer.num_local_experts)
+ )
+ else:
+ w2_input_global_scale = layer.w2_input_global_scale
+
+ layer.g2_alphas = torch.nn.Parameter(
+ ((1 / w2_input_global_scale) * layer.w2_weight_scale_2).to(torch.float32),
+ requires_grad=False,
+ )
+
+ layer.w2_input_scale_quant = torch.nn.Parameter(
+ (w2_input_global_scale), requires_grad=False
+ )
+
+ # TensorRT-LLM specific processing
+ if self.use_flashinfer_trtllm:
+ # Prepare static weights for TRT-LLM kernel
+ (
+ gemm1_weights_fp4_shuffled,
+ gemm1_scales_fp4_shuffled,
+ gemm2_weights_fp4_shuffled,
+ gemm2_scales_fp4_shuffled,
+ ) = prepare_static_weights_for_trtllm_fp4_moe(
+ layer.w13_weight,
+ layer.w2_weight,
+ layer.w13_weight_scale,
+ layer.w2_weight_scale,
+ layer.w2_weight.size(-2), # hidden_size
+ layer.w13_weight.size(-2) // 2, # intermediate_size
+ layer.w13_weight.size(0), # num_experts
+ )
+ logger.debug("Finished shuffling weights for TRT-LLM MOE")
+
+ layer.gemm1_weights_fp4_shuffled = torch.nn.Parameter(
+ gemm1_weights_fp4_shuffled, requires_grad=False
+ )
+ layer.gemm2_weights_fp4_shuffled = torch.nn.Parameter(
+ gemm2_weights_fp4_shuffled, requires_grad=False
+ )
+ layer.gemm1_scales_fp4_shuffled = torch.nn.Parameter(
+ gemm1_scales_fp4_shuffled, requires_grad=False
+ )
+ layer.gemm2_scales_fp4_shuffled = torch.nn.Parameter(
+ gemm2_scales_fp4_shuffled, requires_grad=False
+ )
+
+ # Additional parameter needed for TRT-LLM
+ layer.g1_scale_c = torch.nn.Parameter(
+ (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32),
+ requires_grad=False,
+ )
+
+ # Clean up weights that won't be used by TRT-LLM
+ del layer.w2_weight
+ del layer.w2_weight_scale
+ del layer.w13_weight
+ del layer.w13_weight_scale
+ else:
+ # swizzle weight scales
+ layer.w13_weight_scale = torch.nn.Parameter(
+ swizzle_blockscale(layer.w13_weight_scale), requires_grad=False
+ )
+
+ layer.w2_weight_scale = torch.nn.Parameter(
+ swizzle_blockscale(layer.w2_weight_scale), requires_grad=False
+ )
+
+ layer.cutlass_moe_params = CutlassMoEParams(
+ CutlassMoEType.BlockscaledFP4,
+ layer.w13_weight.device,
+ num_experts=layer.num_experts,
+ intermediate_size_per_partition=layer.w2_weight.shape[2] * 2,
+ hidden_size=layer.w13_weight.shape[2] * 2,
+ )
+
+ def create_moe_runner(
+ self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+ ):
+ self.moe_runner_config = moe_runner_config
+ self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ dispatch_output: StandardDispatchOutput,
+ ) -> CombineInput:
+
+ from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+ x = dispatch_output.hidden_states
+ topk_output = dispatch_output.topk_output
+
+ if self.use_flashinfer_trtllm:
+ from flashinfer import fp4_quantize, trtllm_fp4_block_scale_moe
+
+ router_logits = topk_output.router_logits
+ topk_config = topk_output.topk_config
+
+ # Quantize input hidden states using fp4_quantize
+ hs_fp4_bytes, hs_sf_bytes = fp4_quantize(
+ x,
+ layer.w13_input_scale_quant,
+ self.group_size, # sf_vec_size
+ False, # use_ue8m0
+ False, # is_sf_swizzled_layout
+ )
+ hs_fp4 = hs_fp4_bytes.reshape(x.shape[0], x.shape[1] // 2)
+ hs_scale = hs_sf_bytes.view(torch.float8_e4m3fn).reshape(
+ *hs_sf_bytes.shape[:-1], -1
+ )
+
+ correction_bias = (
+ None
+ if topk_config.correction_bias is None
+ else topk_config.correction_bias.to(x.dtype)
+ )
+
+ assert layer.routing_method_type is not None
+
+ # DeepSeekV3 style routing requires float32 router logits
+ if layer.routing_method_type == RoutingMethodType.DeepSeekV3:
+ router_logits = router_logits.to(torch.float32)
+
+ routed_scaling_factor = self.moe_runner_config.routed_scaling_factor
+ routed_scaling_factor = (
+ routed_scaling_factor if routed_scaling_factor is not None else 1.0
+ )
+
+ with use_symmetric_memory(
+ get_tp_group(), disabled=not is_allocation_symmetric()
+ ):
+ num_tokens = hs_fp4.shape[0]
+ hidden_size = (
+ hs_fp4.shape[-1] * 2
+ if hs_fp4.dtype == torch.uint8
+ else hs_fp4.shape[-1]
+ )
+ symm_output = torch.empty(
+ num_tokens, hidden_size, dtype=torch.bfloat16, device=hs_fp4.device
+ )
+
+ output = trtllm_fp4_block_scale_moe(
+ routing_logits=router_logits,
+ routing_bias=correction_bias,
+ hidden_states=hs_fp4,
+ hidden_states_scale=hs_scale,
+ gemm1_weights=layer.gemm1_weights_fp4_shuffled,
+ gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.view(
+ torch.float8_e4m3fn
+ ),
+ gemm1_bias=None,
+ gemm1_alpha=None,
+ gemm1_beta=None,
+ gemm1_clamp_limit=None,
+ gemm2_weights=layer.gemm2_weights_fp4_shuffled,
+ gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.view(
+ torch.float8_e4m3fn
+ ),
+ gemm2_bias=None,
+ output1_scale_scalar=layer.g1_scale_c,
+ output1_scale_gate_scalar=layer.g1_alphas,
+ output2_scale_scalar=layer.g2_alphas,
+ num_experts=layer.num_experts,
+ top_k=topk_config.top_k,
+ n_group=topk_config.num_expert_group,
+ topk_group=topk_config.topk_group,
+ intermediate_size=layer.intermediate_size_per_partition,
+ local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
+ local_num_experts=layer.num_local_experts,
+ routed_scaling_factor=routed_scaling_factor,
+ routing_method_type=layer.routing_method_type,
+ do_finalize=True,
+ tune_max_num_tokens=next_power_of_2(hs_fp4.shape[0]),
+ output=symm_output,
+ )[0]
+ else:
+ from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+
+ topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
+
+ output = cutlass_moe_fp4(
+ a=x,
+ a1_gscale=layer.w13_input_scale_quant,
+ w1_fp4=layer.w13_weight,
+ w1_blockscale=layer.w13_weight_scale,
+ w1_alphas=layer.g1_alphas,
+ a2_gscale=layer.w2_input_scale_quant,
+ w2_fp4=layer.w2_weight,
+ w2_blockscale=layer.w2_weight_scale,
+ w2_alphas=layer.g2_alphas,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ params=layer.cutlass_moe_params,
+ apply_router_weight_on_input=self.moe_runner_config.apply_router_weight_on_input,
+ ).to(x.dtype)
+
+ return StandardCombineInput(hidden_states=output)
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int8_moe.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int8_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..b45b63fc4e44f45ba59b4a7b0ed7c2561b74f07d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int8_moe.py
@@ -0,0 +1,293 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+ NPUW4A8Int8DynamicMoEMethod,
+)
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsMoEScheme,
+)
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ StandardDispatchOutput,
+ )
+
+__all__ = ["NPUCompressedTensorsW4A8Int8DynamicMoE"]
+
+
+logger = logging.getLogger(__name__)
+
+
+class NPUCompressedTensorsW4A8Int8DynamicMoE(CompressedTensorsMoEScheme):
+
+ ### TODO: Get rid of code duplication with python/sglang/srt/modelslim/modelslim_moe.py @OrangeRedeng @TamirBaydasov
+ def __init__(self, quantization_config) -> None:
+ self.group_size = 0
+ self.is_per_channel_weight = self.group_size == 0
+ self.tp_size = 1
+ self.activation_use_clip = (
+ quantization_config.get("config_groups", {})
+ .get("group_1", {})
+ .get("activation_use_clip", False)
+ )
+ self.kernel = NPUW4A8Int8DynamicMoEMethod()
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ) -> None:
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+ self.num_experts = num_experts
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+ )
+
+ # >> weight
+ w13_output_size = intermediate_size_per_partition
+ w2_output_size = hidden_size // 2
+ w13_weight = torch.nn.Parameter(
+ torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight", w13_weight)
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+ w2_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ w2_output_size,
+ intermediate_size_per_partition,
+ dtype=torch.int8,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight", w2_weight)
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ # >> scale
+ weight_scale_dtype = torch.int64 if self.activation_use_clip else torch.float32
+ w13_weight_scale = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ 1,
+ dtype=weight_scale_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_scale", w13_weight_scale)
+ set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+ w2_weight_scale = torch.nn.Parameter(
+ torch.empty(num_experts, hidden_size, 1, dtype=weight_scale_dtype),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_scale", w2_weight_scale)
+ set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+ # >> offset
+ w13_weight_offset = torch.nn.Parameter(
+ torch.empty(
+ num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_offset", w13_weight_offset)
+ set_weight_attrs(w13_weight_offset, extra_weight_attrs)
+
+ w2_weight_offset = torch.nn.Parameter(
+ torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_offset", w2_weight_offset)
+ set_weight_attrs(w2_weight_offset, extra_weight_attrs)
+
+ # >>> special param for w4a8
+ if self.activation_use_clip:
+ self._init_activation_clip_params(
+ layer,
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition,
+ extra_weight_attrs,
+ )
+ else:
+ self._init_extra_scale_params(
+ layer,
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition,
+ extra_weight_attrs,
+ )
+
+ def _init_activation_clip_params(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ extra_weight_attrs: dict,
+ ) -> None:
+ """
+ Initializes bias and alpha parameters for quantization schemes that use activation clipping.
+
+ This helper registers `w13_bias`, `w2_bias`, and `w2_alpha`, which are required to
+ shift and scale the activations or outputs to compensate for the precision loss
+ introduced by clamping activations.
+ """
+ w13_bias = torch.nn.Parameter(
+ torch.ones(
+ num_experts, 2 * intermediate_size_per_partition, dtype=torch.float
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_bias", w13_bias)
+ set_weight_attrs(w13_bias, extra_weight_attrs)
+
+ w2_bias = torch.nn.Parameter(
+ torch.ones(num_experts, hidden_size, dtype=torch.float),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_bias", w2_bias)
+ set_weight_attrs(w2_bias, extra_weight_attrs)
+
+ w2_alpha = torch.nn.Parameter(
+ torch.ones(num_experts, dtype=torch.float), requires_grad=False
+ )
+ layer.register_parameter("w2_alpha", w2_alpha)
+ set_weight_attrs(w2_alpha, extra_weight_attrs)
+
+ def _init_extra_scale_params(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ extra_weight_attrs: dict,
+ ) -> None:
+ """
+ Initializes additional scaling, offset, and bias parameters for quantization schemes without activation clipping.
+
+ This method registers the following parameters:
+ 1. Scale Biases: `w13_scale_bias` and `w2_scale_bias`.
+ 2. Secondary Quantization Params (initialized only for grouped quantization):
+ `w13_weight_scale_second`, `w13_weight_offset_second`,
+ `w2_weight_scale_second`, and `w2_weight_offset_second`.
+ """
+ if not self.is_per_channel_weight:
+ w13_weight_scale_second = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ hidden_size // self.group_size,
+ dtype=torch.float32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second)
+ set_weight_attrs(w13_weight_scale_second, extra_weight_attrs)
+
+ w13_weight_offset_second = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ hidden_size // self.group_size,
+ dtype=torch.float32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter(
+ "w13_weight_offset_second", w13_weight_offset_second
+ )
+ set_weight_attrs(w13_weight_offset_second, extra_weight_attrs)
+
+ w2_weight_scale_second = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition // self.group_size,
+ dtype=torch.float32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second)
+ set_weight_attrs(w2_weight_scale_second, extra_weight_attrs)
+
+ w2_weight_offset_second = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition // self.group_size,
+ dtype=torch.float32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second)
+ set_weight_attrs(w2_weight_offset_second, extra_weight_attrs)
+
+ w13_scale_bias = torch.nn.Parameter(
+ torch.empty(
+ num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_scale_bias", w13_scale_bias)
+ set_weight_attrs(w13_scale_bias, extra_weight_attrs)
+
+ w2_scale_bias = torch.nn.Parameter(
+ torch.empty(
+ num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_scale_bias", w2_scale_bias)
+ set_weight_attrs(w2_scale_bias, extra_weight_attrs)
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ self.kernel.process_weights_after_loading(
+ layer, self.is_per_channel_weight, self.activation_use_clip
+ )
+
+ def create_moe_runner(
+ self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+ ):
+ self.moe_runner_config = moe_runner_config
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ dispatch_output: StandardDispatchOutput,
+ ) -> CombineInput:
+
+ return self.kernel.apply(layer, dispatch_output)
+
+ def apply_weights_with_router_logits(
+ self,
+ layer,
+ hidden_states,
+ hidden_states_scale,
+ group_list_type,
+ group_list,
+ output_dtype,
+ ):
+ return self.kernel.apply_without_routing_weights(
+ layer,
+ hidden_states,
+ hidden_states_scale,
+ group_list_type,
+ group_list,
+ output_dtype,
+ )
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
new file mode 100644
index 0000000000000000000000000000000000000000..353f049b99537968eb9e82b6297fc1401f90f99a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -0,0 +1,135 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, List, Optional
+
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+
+from sglang.srt.layers.parameter import (
+ ChannelQuantScaleParameter,
+ ModelWeightParameter,
+ PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsLinearScheme,
+)
+from sglang.srt.layers.quantization.marlin_utils_fp8 import (
+ apply_fp8_marlin_linear,
+ prepare_fp8_layer_for_marlin,
+)
+from sglang.srt.layers.quantization.utils import convert_to_channelwise
+
+__all__ = ["CompressedTensorsW8A16Fp8"]
+
+SUPPORTED_STRATEGIES = [QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR]
+
+
+class CompressedTensorsW8A16Fp8(CompressedTensorsLinearScheme):
+ def __init__(self, strategy: str, is_static_input_scheme: bool):
+ self.strategy = strategy
+ self.is_static_input_scheme = is_static_input_scheme
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # ampere and up
+ return 80
+
+ # W8A8-Fp8 kernels support only per-tensor and per-channel cases.
+ # So if we have a fused module (QKV, MLP) with per tensor scales,
+ # we expand each scale to its shard's channels.
+ def process_weights_after_loading(self, layer) -> None:
+ if self.strategy == QuantizationStrategy.TENSOR:
+ ws_channelwise = convert_to_channelwise(
+ layer.weight_scale, layer.logical_widths
+ )
+ layer.weight_scale = torch.nn.Parameter(ws_channelwise, requires_grad=False)
+ else:
+ # required by torch.compile to be torch.nn.Parameter
+ layer.weight_scale = torch.nn.Parameter(
+ layer.weight_scale.data, requires_grad=False
+ )
+
+ # Weights must be transposed for marlin
+ layer.weight = torch.nn.Parameter(layer.weight.t(), requires_grad=False)
+
+ if self.is_static_input_scheme:
+ # required by torch.compile to be torch.nn.Parameter
+ layer.input_scale = torch.nn.Parameter(
+ layer.input_scale.data, requires_grad=False
+ )
+ prepare_fp8_layer_for_marlin(layer, size_k_first=True)
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ input_size: int,
+ output_partition_sizes: List[int],
+ input_size_per_partition: int,
+ params_dtype: torch.dtype,
+ weight_loader: Callable,
+ **kwargs,
+ ):
+ output_size_per_partition = sum(output_partition_sizes)
+ layer.logical_widths = output_partition_sizes
+ layer.input_size_per_partition = input_size_per_partition
+ layer.output_size_per_partition = output_size_per_partition
+ layer.orig_dtype = params_dtype
+
+ # WEIGHT
+ weight = ModelWeightParameter(
+ data=torch.empty(
+ output_size_per_partition,
+ input_size_per_partition,
+ dtype=torch.float8_e4m3fn,
+ ),
+ input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+ layer.register_parameter("weight", weight)
+
+ # WEIGHT SCALE
+ if self.strategy == QuantizationStrategy.CHANNEL:
+ weight_scale = ChannelQuantScaleParameter(
+ data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+ elif self.strategy == QuantizationStrategy.TENSOR:
+ weight_scale = PerTensorScaleParameter(
+ data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader,
+ )
+ else:
+ raise ValueError(
+ f"Unsupported weight strategy={self.strategy}, "
+ f"supported strategies are {SUPPORTED_STRATEGIES}"
+ )
+
+ weight_scale[:] = torch.finfo(torch.float32).min
+ layer.register_parameter("weight_scale", weight_scale)
+
+ # INPUT SCALE (to deal with converted checkpoints)
+ if self.is_static_input_scheme:
+ input_scale = PerTensorScaleParameter(
+ data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader,
+ )
+ layer.register_parameter("input_scale", input_scale)
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ return apply_fp8_marlin_linear(
+ input=x,
+ weight=layer.weight,
+ weight_scale=layer.weight_scale,
+ workspace=layer.workspace,
+ size_n=layer.output_size_per_partition,
+ size_k=layer.input_size_per_partition,
+ bias=bias,
+ )
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4334481a87cb4723708669e908f2f45809c6176
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -0,0 +1,244 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, Optional
+
+import torch
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
+from torch.nn import Parameter
+
+from sglang.srt.layers.parameter import (
+ BlockQuantScaleParameter,
+ ChannelQuantScaleParameter,
+ ModelWeightParameter,
+ PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsLinearScheme,
+)
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+ apply_fp8_linear,
+ apply_fp8_ptpc_linear,
+ dispatch_w8a8_block_fp8_linear,
+ normalize_e4m3fn_to_e4m3fnuz,
+ validate_fp8_block_shape,
+)
+from sglang.srt.layers.quantization.utils import requantize_with_max_scale
+from sglang.srt.utils import get_bool_env_var, is_hip
+
+__all__ = ["CompressedTensorsW8A8Fp8"]
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+if _use_aiter:
+ from aiter.ops.shuffle import shuffle_weight
+
+
+strategy_to_parameter_type = {
+ QuantizationStrategy.BLOCK: BlockQuantScaleParameter,
+ QuantizationStrategy.CHANNEL: ChannelQuantScaleParameter,
+ QuantizationStrategy.TENSOR: PerTensorScaleParameter,
+}
+
+
+class CompressedTensorsW8A8Fp8(CompressedTensorsLinearScheme):
+ def __init__(self, weight_quant: QuantizationArgs, is_static_input_scheme: bool):
+ self.weight_quant = weight_quant
+ self.strategy = self.weight_quant.strategy
+ self.is_static_input_scheme = is_static_input_scheme
+ self.weight_block_size = self.weight_quant.block_structure
+ if self.weight_block_size is not None:
+ self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear()
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # lovelace and up
+ return 89
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ input_size_per_partition: int,
+ output_partition_sizes: list[int],
+ input_size: int,
+ output_size: int,
+ params_dtype: torch.dtype,
+ weight_loader: Callable,
+ **kwargs,
+ ):
+ output_size_per_partition = sum(output_partition_sizes)
+ layer.logical_widths = output_partition_sizes
+ layer.weight_block_size = None
+ layer.orig_dtype = params_dtype
+
+ if self.strategy == QuantizationStrategy.BLOCK:
+ assert self.weight_block_size is not None
+ layer.weight_block_size = self.weight_block_size
+ # Validate block quantization shapes
+ validate_fp8_block_shape(
+ layer,
+ input_size,
+ output_size,
+ input_size_per_partition,
+ output_partition_sizes,
+ self.weight_block_size,
+ )
+
+ # WEIGHT
+ weight = ModelWeightParameter(
+ data=torch.empty(
+ output_size_per_partition,
+ input_size_per_partition,
+ dtype=torch.float8_e4m3fn,
+ ),
+ input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+ layer.register_parameter("weight", weight)
+
+ # WEIGHT SCALE
+ if self.strategy == QuantizationStrategy.CHANNEL:
+ weight_scale = ChannelQuantScaleParameter(
+ data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+ weight_scale[:] = torch.finfo(torch.float32).min
+ elif self.strategy == QuantizationStrategy.TENSOR:
+ weight_scale = PerTensorScaleParameter(
+ data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader,
+ )
+ weight_scale[:] = torch.finfo(torch.float32).min
+ elif self.strategy == QuantizationStrategy.BLOCK:
+ assert layer.weight_block_size is not None
+ block_n, block_k = layer.weight_block_size[0], layer.weight_block_size[1]
+ output_size_per_partition = sum(output_partition_sizes)
+ weight_scale = BlockQuantScaleParameter(
+ data=torch.empty(
+ (output_size_per_partition + block_n - 1) // block_n,
+ (input_size_per_partition + block_k - 1) // block_k,
+ dtype=torch.float32,
+ ),
+ input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+ weight_scale.format_ue8m0 = False
+ weight_scale[:] = torch.finfo(torch.float32).min
+
+ layer.register_parameter("weight_scale", weight_scale)
+ # INPUT SCALE
+ if self.is_static_input_scheme:
+ input_scale = PerTensorScaleParameter(
+ data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader,
+ )
+ input_scale[:] = torch.finfo(torch.float32).min
+ layer.register_parameter("input_scale", input_scale)
+
+ def process_weights_after_loading(self, layer) -> None:
+ if self.strategy == QuantizationStrategy.TENSOR:
+ max_w_scale, weight = requantize_with_max_scale(
+ weight=layer.weight,
+ weight_scale=layer.weight_scale,
+ logical_widths=layer.logical_widths,
+ )
+
+ if is_fp8_fnuz():
+ input_scale = getattr(layer, "input_scale", None)
+
+ weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+ weight=weight, weight_scale=max_w_scale, input_scale=input_scale
+ )
+ if input_scale is not None:
+ layer.input_scale = Parameter(input_scale, requires_grad=False)
+ layer.weight = Parameter(weight.t(), requires_grad=False)
+ layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+
+ elif self.strategy == QuantizationStrategy.CHANNEL:
+ weight = layer.weight
+
+ if is_fp8_fnuz():
+ input_scale = getattr(layer, "input_scale", None)
+
+ weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+ weight=weight,
+ weight_scale=layer.weight_scale,
+ input_scale=input_scale,
+ )
+ if input_scale is not None:
+ layer.input_scale = Parameter(input_scale, requires_grad=False)
+ else:
+ weight_scale = layer.weight_scale.data
+
+ if _use_aiter:
+ # keep the weight as (N, K)
+ layer.weight = Parameter(
+ shuffle_weight(weight, (16, 16)), requires_grad=False
+ )
+ else:
+ layer.weight = Parameter(weight.t(), requires_grad=False)
+
+ # required by torch.compile to be torch.nn.Parameter
+ layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+
+ elif self.strategy == QuantizationStrategy.BLOCK:
+ assert self.is_static_input_scheme is False
+ weight = layer.weight
+ weight_scale = layer.weight_scale
+
+ if is_fp8_fnuz():
+ weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+ weight=weight, weight_scale=weight_scale
+ )
+ layer.weight = Parameter(weight.data, requires_grad=False)
+ layer.weight_scale = Parameter(weight_scale.data, requires_grad=False)
+
+ else:
+ raise ValueError(f"Unknown quantization strategy {self.strategy}")
+
+ # INPUT SCALE
+ if self.is_static_input_scheme and hasattr(layer, "input_scale"):
+ layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
+ else:
+ layer.input_scale = None
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ if self.weight_block_size is not None:
+ return self.w8a8_block_fp8_linear(
+ input=x,
+ weight=layer.weight,
+ block_size=self.weight_block_size,
+ weight_scale=layer.weight_scale,
+ input_scale=layer.input_scale,
+ bias=bias,
+ )
+
+ if _use_aiter and self.strategy == QuantizationStrategy.CHANNEL:
+ return apply_fp8_ptpc_linear(
+ input=x,
+ weight=layer.weight,
+ weight_scale=layer.weight_scale,
+ input_scale=layer.input_scale,
+ bias=bias,
+ use_per_token_if_dynamic=True,
+ compressed_tensor_quant=True,
+ )
+ else:
+ return apply_fp8_linear(
+ input=x,
+ weight=layer.weight,
+ weight_scale=layer.weight_scale,
+ input_scale=layer.input_scale,
+ bias=bias,
+ use_per_token_if_dynamic=True,
+ compressed_tensor_quant=True,
+ )
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8_moe.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdd3c9622ad29b2fe693039588c0e2dfe1c81e6c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8_moe.py
@@ -0,0 +1,424 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.flashinfer_trtllm import (
+ FlashInferTrtllmFp8MoeQuantInfo,
+)
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.moe.utils import get_moe_runner_backend
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsMoEScheme,
+)
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
+from sglang.srt.layers.quantization.utils import (
+ all_close_1d,
+ per_tensor_dequantize,
+ swap_w13_to_w31,
+)
+from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ StandardDispatchOutput,
+ )
+
+__all__ = ["CompressedTensorsW8A8Fp8MoE"]
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+ from aiter import ActivationType, QuantType
+ from aiter.fused_moe import fused_moe
+ from aiter.ops.shuffle import shuffle_weight
+
+
+logger = logging.getLogger(__name__)
+
+
+class CompressedTensorsW8A8Fp8MoE(CompressedTensorsMoEScheme):
+
+ def __init__(self, weight_quant, input_quant):
+ self.weight_quant = weight_quant
+ self.input_quant = input_quant
+ self.use_flashinfer_trtllm = get_moe_runner_backend().is_flashinfer_trtllm()
+
+ per_tensor = (
+ self.weight_quant.strategy == QuantizationStrategy.TENSOR
+ and self.input_quant.strategy == QuantizationStrategy.TENSOR
+ )
+ per_channel = (
+ self.weight_quant.strategy == QuantizationStrategy.CHANNEL
+ and self.input_quant.strategy == QuantizationStrategy.TOKEN
+ )
+ if not (per_tensor or per_channel):
+ assert self.weight_quant.strategy == QuantizationStrategy.BLOCK
+ self.weight_block_size = self.weight_quant.block_structure
+ assert self.weight_quant.dynamic is not None
+ else:
+ self.weight_block_size = None
+ self.block_quant = self.weight_block_size is not None
+
+ self.static_input_scales = not self.input_quant.dynamic
+ if self.static_input_scales and per_channel:
+ raise ValueError(
+ "For FP8 Fused MoE layer, we require either per tensor or "
+ "channelwise, dynamic per token quantization."
+ )
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # ampere and up
+ return 80
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ):
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+ params_dtype = torch.float8_e4m3fn
+
+ if self.block_quant:
+ assert self.weight_block_size is not None
+ layer.weight_block_size = self.weight_block_size
+ tp_size = get_tensor_model_parallel_world_size()
+ block_n, block_k = (
+ self.weight_block_size[0],
+ self.weight_block_size[1],
+ )
+ # NOTE: To ensure proper alignment of the block-wise quantization
+ # scales, the output_size of the weights for both the gate and up
+ # layers must be divisible by block_n.
+ # Required by column parallel or enabling merged weights
+ if intermediate_size_per_partition % block_n != 0:
+ raise ValueError(
+ f"The output_size of gate's and up's weight = "
+ f"{intermediate_size_per_partition} is not divisible by "
+ f"weight quantization block_n = {block_n}."
+ )
+ if tp_size > 1 and intermediate_size_per_partition % block_k != 0:
+ # Required by row parallel
+ raise ValueError(
+ f"The input_size of down's weight = "
+ f"{intermediate_size_per_partition} is not divisible by "
+ f"weight quantization block_k = {block_k}."
+ )
+
+ # WEIGHTS
+ w13_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ hidden_size,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight", w13_weight)
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+
+ w2_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight", w2_weight)
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ # WEIGHT_SCALES
+ # per-tensor quantization
+ if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+ # Allocate 2 scales for w1 and w3 respectively.
+ # They will be combined to a single scale after weight loading.
+ w13_weight_scale = torch.nn.Parameter(
+ torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+ )
+ w2_weight_scale = torch.nn.Parameter(
+ torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+ )
+ weight_quant_method = FusedMoeWeightScaleSupported.TENSOR.value
+ elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL:
+ w13_weight_scale = torch.nn.Parameter(
+ torch.ones(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ 1,
+ dtype=torch.float32,
+ ),
+ requires_grad=False,
+ )
+ w2_weight_scale = torch.nn.Parameter(
+ torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+ requires_grad=False,
+ )
+ weight_quant_method = FusedMoeWeightScaleSupported.CHANNEL.value
+ elif self.weight_quant.strategy == QuantizationStrategy.BLOCK:
+ w13_weight_scale = torch.nn.Parameter(
+ torch.ones(
+ num_experts,
+ 2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
+ (hidden_size + block_k - 1) // block_k,
+ dtype=torch.float32,
+ ),
+ requires_grad=False,
+ )
+ w2_weight_scale = torch.nn.Parameter(
+ torch.ones(
+ num_experts,
+ (hidden_size + block_n - 1) // block_n,
+ (intermediate_size_per_partition + block_k - 1) // block_k,
+ dtype=torch.float32,
+ ),
+ requires_grad=False,
+ )
+ weight_quant_method = FusedMoeWeightScaleSupported.BLOCK.value
+ else:
+ raise ValueError(
+ f"Unsupported weight quantization strategy: {self.weight_quant.strategy}"
+ )
+
+ layer.register_parameter("w13_weight_scale", w13_weight_scale)
+ layer.register_parameter("w2_weight_scale", w2_weight_scale)
+ # Add the quantization method used (per tensor/grouped/channel)
+ # to ensure the weight scales are loaded in properly
+ extra_weight_attrs.update({"quant_method": weight_quant_method})
+ set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+ set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+ # INPUT_SCALES
+ if self.static_input_scales:
+ assert (
+ self.input_quant.strategy == QuantizationStrategy.TENSOR
+ ), "Only per-tensor quantization is supported for static input scales"
+ w13_input_scale = torch.nn.Parameter(
+ torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+ )
+ layer.register_parameter("w13_input_scale", w13_input_scale)
+ set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+ w2_input_scale = torch.nn.Parameter(
+ torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+ )
+ layer.register_parameter("w2_input_scale", w2_input_scale)
+ set_weight_attrs(w2_input_scale, extra_weight_attrs)
+ else:
+ layer.w13_input_scale = None
+ layer.w2_input_scale = None
+
+ def process_weights_after_loading(self, layer: torch.nn.Module | FusedMoE) -> None:
+ # Fp8 moe kernels require a single activation scale.
+ # We take the max of all the scales in case they differ.
+ if self.static_input_scales:
+ if layer.w13_input_scale is None or layer.w2_input_scale is None:
+ raise ValueError(
+ "QuantConfig has static quantization, but found "
+ "activation scales are None."
+ )
+ if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
+ layer.w2_input_scale
+ ):
+ logger.warning(
+ "Found input_scales that are not equal for "
+ "fp8 MoE layer. Using the maximum across experts "
+ "for each layer."
+ )
+ layer.w13_input_scale = torch.nn.Parameter(
+ layer.w13_input_scale.max(), requires_grad=False
+ )
+ layer.w2_input_scale = torch.nn.Parameter(
+ layer.w2_input_scale.max(), requires_grad=False
+ )
+
+ if is_fp8_fnuz():
+ # Normalize the weights and scales
+ w13_weight, w13_weight_scale, w13_input_scale = (
+ normalize_e4m3fn_to_e4m3fnuz(
+ layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
+ )
+ )
+ w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+ layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
+ )
+ # Reset the parameter
+ layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+ layer.w13_weight_scale = torch.nn.Parameter(
+ w13_weight_scale, requires_grad=False
+ )
+ if w13_input_scale is not None:
+ layer.w13_input_scale = torch.nn.Parameter(
+ w13_input_scale, requires_grad=False
+ )
+ layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+ layer.w2_weight_scale = torch.nn.Parameter(
+ w2_weight_scale, requires_grad=False
+ )
+ if w2_input_scale is not None:
+ layer.w2_input_scale = torch.nn.Parameter(
+ w2_input_scale, requires_grad=False
+ )
+ if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+ # Fp8 moe kernel needs single weight scale for w13 per expert.
+ # We take the max then dequant and requant each expert.
+ assert layer.w13_weight_scale is not None
+ shard_size = layer.intermediate_size_per_partition
+ max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+ for expert_id in range(layer.num_local_experts):
+ start = 0
+ for shard_id in range(2):
+ dq_weight = per_tensor_dequantize(
+ layer.w13_weight[expert_id][start : start + shard_size, :],
+ layer.w13_weight_scale[expert_id][shard_id],
+ )
+ (
+ layer.w13_weight[expert_id][start : start + shard_size, :],
+ _,
+ ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+
+ start += shard_size
+
+ layer.w13_weight_scale = torch.nn.Parameter(
+ max_w13_scales, requires_grad=False
+ )
+
+ if self.weight_quant.strategy == QuantizationStrategy.CHANNEL and _use_aiter:
+ with torch.no_grad():
+ # Pre-shuffle weights
+ layer.w13_weight = torch.nn.Parameter(
+ shuffle_weight(layer.w13_weight.data, (16, 16)),
+ requires_grad=False,
+ )
+ torch.cuda.empty_cache()
+ layer.w2_weight = torch.nn.Parameter(
+ shuffle_weight(layer.w2_weight.data, (16, 16)),
+ requires_grad=False,
+ )
+ torch.cuda.empty_cache()
+
+ if (
+ self.weight_quant.strategy == QuantizationStrategy.BLOCK
+ and self.use_flashinfer_trtllm
+ ):
+ layer.w13_weight = torch.nn.Parameter(
+ swap_w13_to_w31(layer.w13_weight.data),
+ requires_grad=False,
+ )
+ layer.w13_weight_scale = torch.nn.Parameter(
+ swap_w13_to_w31(layer.w13_weight_scale.data),
+ requires_grad=False,
+ )
+
+ def create_moe_runner(
+ self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+ ):
+ self.moe_runner_config = moe_runner_config
+ moe_runner_backend = get_moe_runner_backend()
+ if moe_runner_backend.is_auto():
+ moe_runner_backend = MoeRunnerBackend.TRITON
+ self.runner = MoeRunner(moe_runner_backend, moe_runner_config)
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ dispatch_output: StandardDispatchOutput,
+ ) -> CombineInput:
+
+ from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+ x = dispatch_output.hidden_states
+ topk_output = dispatch_output.topk_output
+
+ moe_runner_config = self.moe_runner_config
+
+ if _use_aiter and self.weight_quant.strategy == QuantizationStrategy.CHANNEL:
+ assert not moe_runner_config.no_combine, "unsupported"
+ topk_weights, topk_ids, _ = topk_output
+ if moe_runner_config.apply_router_weight_on_input:
+ assert (
+ topk_weights.dim() == 2
+ ), "`topk_weights` should be in shape (num_tokens, topk)"
+ _, topk = topk_weights.shape
+ assert (
+ topk == 1
+ ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+ x = x * topk_weights.to(x.dtype)
+ topk_weights = torch.ones_like(
+ topk_weights, dtype=torch.float32
+ ) # topk_weights must be FP32 (float32)
+ output = fused_moe(
+ x,
+ layer.w13_weight,
+ layer.w2_weight,
+ topk_weights,
+ topk_ids,
+ activation=(
+ ActivationType.Silu
+ if moe_runner_config.activation == "silu"
+ else ActivationType.Gelu
+ ),
+ quant_type=QuantType.per_Token,
+ w1_scale=layer.w13_weight_scale,
+ w2_scale=layer.w2_weight_scale,
+ a1_scale=layer.w13_input_scale,
+ a2_scale=layer.w2_input_scale,
+ )
+ return StandardCombineInput(hidden_states=output)
+ elif self.weight_quant.strategy == QuantizationStrategy.BLOCK:
+ if self.use_flashinfer_trtllm:
+ quant_info = FlashInferTrtllmFp8MoeQuantInfo(
+ w13_weight=layer.w13_weight,
+ w2_weight=layer.w2_weight,
+ global_num_experts=layer.num_experts,
+ local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
+ local_num_experts=layer.num_local_experts,
+ intermediate_size=layer.w2_weight.shape[2],
+ routing_method_type=layer.routing_method_type,
+ block_quant=self.block_quant,
+ weight_block_k=self.weight_block_size[1],
+ w13_weight_scale_inv=layer.w13_weight_scale,
+ w2_weight_scale_inv=layer.w2_weight_scale,
+ )
+ else:
+ quant_info = TritonMoeQuantInfo(
+ w13_weight=layer.w13_weight,
+ w2_weight=layer.w2_weight,
+ use_fp8_w8a8=True,
+ w13_scale=layer.w13_weight_scale,
+ w2_scale=layer.w2_weight_scale,
+ a13_scale=layer.w13_input_scale,
+ a2_scale=layer.w2_input_scale,
+ block_shape=self.weight_block_size,
+ )
+ return self.runner.run(dispatch_output, quant_info)
+ else:
+ quant_info = TritonMoeQuantInfo(
+ w13_weight=layer.w13_weight,
+ w2_weight=layer.w2_weight,
+ use_fp8_w8a8=True,
+ per_channel_quant=self.weight_quant.strategy
+ == QuantizationStrategy.CHANNEL,
+ w13_scale=layer.w13_weight_scale,
+ w2_scale=layer.w2_weight_scale,
+ a13_scale=layer.w13_input_scale,
+ a2_scale=layer.w2_input_scale,
+ )
+ return self.runner.run(dispatch_output, quant_info)
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
new file mode 100644
index 0000000000000000000000000000000000000000..05c5410b575134252bfa982f61edb0bf756dcc59
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -0,0 +1,203 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, Optional
+
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+from torch.nn import Parameter
+
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+ NPUW8A8Int8DynamicLinearMethod,
+)
+from sglang.srt.layers.parameter import (
+ ChannelQuantScaleParameter,
+ ModelWeightParameter,
+ PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsLinearScheme,
+)
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.layers.quantization.utils import requantize_with_max_scale
+from sglang.srt.utils import is_cuda
+
+__all__ = ["CompressedTensorsW8A8Int8", "NPUCompressedTensorsW8A8Int8"]
+
+_is_cuda = is_cuda()
+if _is_cuda:
+ from sgl_kernel import int8_scaled_mm
+
+
+class CompressedTensorsW8A8Int8(CompressedTensorsLinearScheme):
+
+ def __init__(
+ self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
+ ):
+ self.strategy = strategy
+ self.is_static_input_scheme = is_static_input_scheme
+ self.input_symmetric = input_symmetric
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ output_partition_sizes: list[int],
+ input_size_per_partition: int,
+ params_dtype: torch.dtype,
+ weight_loader: Callable,
+ **kwargs,
+ ):
+ output_size_per_partition = sum(output_partition_sizes)
+ layer.logical_widths = output_partition_sizes
+
+ # WEIGHT
+ weight = ModelWeightParameter(
+ data=torch.empty(
+ output_size_per_partition, input_size_per_partition, dtype=torch.int8
+ ),
+ input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+
+ layer.register_parameter("weight", weight)
+
+ # WEIGHT SCALE
+ if self.strategy == QuantizationStrategy.CHANNEL:
+ weight_scale = ChannelQuantScaleParameter(
+ data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+ else:
+ assert self.strategy == QuantizationStrategy.TENSOR
+ weight_scale = PerTensorScaleParameter(
+ data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader,
+ )
+ layer.register_parameter("weight_scale", weight_scale)
+
+ # INPUT SCALE
+ if self.is_static_input_scheme:
+ input_scale = PerTensorScaleParameter(
+ data=torch.empty(1, dtype=torch.float32), weight_loader=weight_loader
+ )
+ layer.register_parameter("input_scale", input_scale)
+
+ if not self.input_symmetric:
+ # Note: compressed-tensors stores the zp using the same dtype
+ # as the weights
+ # AZP loaded as int8 but used as int32
+ input_zero_point = PerTensorScaleParameter(
+ data=torch.empty(1, dtype=torch.int8), weight_loader=weight_loader
+ )
+ layer.register_parameter("input_zero_point", input_zero_point)
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # ampere and up
+ return 80
+
+ def process_weights_after_loading(self, layer) -> None:
+ # If per tensor, when we have a fused module (e.g. QKV) with per
+ # tensor scales (thus N scales being passed to the kernel),
+ # requantize so we can always run per channel
+ if self.strategy == QuantizationStrategy.TENSOR:
+ max_w_scale, weight = requantize_with_max_scale(
+ weight=layer.weight,
+ weight_scale=layer.weight_scale,
+ logical_widths=layer.logical_widths,
+ )
+
+ layer.weight = Parameter(weight.t(), requires_grad=False)
+ layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+
+ # If channelwise, scales are already lined up, so just transpose.
+ elif self.strategy == QuantizationStrategy.CHANNEL:
+ weight = layer.weight
+ weight_scale = layer.weight_scale.data
+
+ layer.weight = Parameter(weight.t(), requires_grad=False)
+ # required by torch.compile to be torch.nn.Parameter
+ layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+
+ else:
+ raise ValueError(f"Unknown quantization strategy {self.strategy}")
+
+ # INPUT SCALE
+ if self.is_static_input_scheme and hasattr(layer, "input_scale"):
+ if self.input_symmetric:
+ layer.input_scale = Parameter(
+ layer.input_scale.max(), requires_grad=False
+ )
+ else:
+ input_scale = layer.input_scale
+ input_zero_point = layer.input_zero_point
+
+ # reconstruct the ranges
+ int8_traits = torch.iinfo(torch.int8)
+ azps = input_zero_point.to(dtype=torch.int32)
+ range_max = (input_scale * (int8_traits.max - azps)).max()
+ range_min = (input_scale * (int8_traits.min - azps)).min()
+
+ scale = (range_max - range_min) / (int8_traits.max - int8_traits.min)
+
+ # AZP loaded as int8 but used as int32
+ azp = (int8_traits.min - range_min / scale).to(dtype=torch.int32)
+
+ layer.input_scale = Parameter(scale, requires_grad=False)
+ layer.input_zero_point = Parameter(azp, requires_grad=False)
+ else:
+ layer.input_scale = None
+ layer.input_zero_point = None
+
+ # azp_adj is the AZP adjustment term, used to account for weights.
+ # It does not depend on scales or azp, so it is the same for
+ # static and dynamic quantization.
+ # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md
+ # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md
+ if not self.input_symmetric:
+ weight = layer.weight
+ azp_adj = weight.sum(dim=0, keepdim=True, dtype=torch.int32)
+ if self.is_static_input_scheme:
+ # cutlass_w8a8 requires azp to be folded into azp_adj
+ # in the per-tensor case
+ azp_adj = layer.input_zero_point * azp_adj
+ layer.azp_adj = Parameter(azp_adj, requires_grad=False)
+ else:
+ layer.azp_adj = None
+
+ def apply_weights(
+ self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+ ) -> torch.Tensor:
+ # TODO: add cutlass_scaled_mm_azp support
+ x_q, x_scale = per_token_quant_int8(x)
+
+ return int8_scaled_mm(
+ x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias
+ )
+
+
+class NPUCompressedTensorsW8A8Int8(CompressedTensorsW8A8Int8):
+
+ def __init__(
+ self, strategy: str, is_static_input_scheme: bool, input_symmetric: bool
+ ):
+ super().__init__(strategy, is_static_input_scheme, input_symmetric)
+ # TODO: Currently, NPU kernel for static quant requires quant_bias field,
+ # which can't be replicated in compressed-tensors.
+ if self.is_static_input_scheme:
+ raise NotImplementedError(
+ "Static compressed-tensors scheme is not yet supported on NPU."
+ )
+ self.kernel = NPUW8A8Int8DynamicLinearMethod()
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+        raise NotImplementedError
+
+ def process_weights_after_loading(self, layer):
+ return self.kernel.process_weights_after_loading(layer)
+
+ def apply_weights(self, layer, x, bias):
+ return self.kernel.apply(layer, x, bias)
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8_moe.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..b391a1c6999a970dab153d25fe2986eeb0497d02
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8_moe.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+ NPUW8A8Int8DynamicMoEMethod,
+)
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsMoEScheme,
+)
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ StandardDispatchOutput,
+ )
+
+__all__ = ["NPUCompressedTensorsW8A8Int8DynamicMoE"]
+
+logger = logging.getLogger(__name__)
+
+
+class NPUCompressedTensorsW8A8Int8DynamicMoE(CompressedTensorsMoEScheme):
+
+ def __init__(self, weight_quant, input_quant):
+ self.weight_quant = weight_quant
+ self.input_quant = input_quant
+ self.kernel = NPUW8A8Int8DynamicMoEMethod()
+
+ self.static_input_scales = not self.input_quant.dynamic
+ per_channel = (
+ self.weight_quant.strategy == QuantizationStrategy.CHANNEL
+ and self.input_quant.strategy == QuantizationStrategy.TOKEN
+ )
+ if not per_channel:
+ raise ValueError(
+ "For INT8 Fused MoE layers, we require channelwise, "
+ "dynamic per token quantization. Found "
+ f"{self.weight_quant}, {self.input_quant}"
+ )
+
+ self.static_input_scales = not self.input_quant.dynamic
+ if self.static_input_scales:
+ raise ValueError(
+ "For INT8 Fused MoE layers, we require channelwise, "
+ "dynamic per token quantization. Found static input scales."
+ )
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ):
+
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+ params_dtype = torch.int8
+
+ # WEIGHTS
+ w13_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ hidden_size,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight", w13_weight)
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+
+ w2_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight", w2_weight)
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ # WEIGHT_SCALES
+ assert self.weight_quant.strategy == QuantizationStrategy.CHANNEL
+ w13_weight_scale = torch.nn.Parameter(
+ torch.ones(
+ num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_scale", w13_weight_scale)
+ w2_weight_scale = torch.nn.Parameter(
+ torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_scale", w2_weight_scale)
+ # Add PER-CHANNEL quantization for FusedMoE.weight_loader.
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+ )
+ set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+ set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+ # INPUT_SCALES
+ assert not self.static_input_scales
+ layer.w13_input_scale = None
+ layer.w2_input_scale = None
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ self.kernel.process_weights_after_loading(layer)
+
+ def create_moe_runner(
+ self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+ ):
+ self.moe_runner_config = moe_runner_config
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ dispatch_output: StandardDispatchOutput,
+ ) -> CombineInput:
+
+ return self.kernel.apply(layer, dispatch_output)
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
new file mode 100644
index 0000000000000000000000000000000000000000..15375212c30b1d45a53baf92dd37b88e3aa182e4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -0,0 +1,339 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import Callable, Optional
+
+import torch
+from compressed_tensors.quantization import ActivationOrdering
+
+# yapf conflicts with isort for this block
+# yapf: disable
+from sglang.srt.layers.parameter import (
+ BasevLLMParameter,
+ ChannelQuantScaleParameter,
+ GroupQuantScaleParameter,
+ PackedColumnParameter,
+ PackedvLLMParameter,
+ RowvLLMParameter,
+ permute_param_layout_,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ CompressedTensorsLinearScheme,
+)
+from sglang.srt.layers.quantization.marlin_utils import (
+ MarlinLinearLayerConfig,
+ apply_gptq_marlin_linear,
+ check_marlin_supports_shape,
+ marlin_is_k_full,
+ marlin_make_empty_g_idx,
+ marlin_make_workspace,
+ marlin_permute_scales,
+ marlin_repeat_scales_on_all_ranks,
+ marlin_sort_g_idx,
+ marlin_zero_points,
+)
+from sglang.srt.layers.quantization.utils import (
+ get_scalar_types,
+ replace_parameter,
+ unpack_cols,
+)
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+ from sglang.jit_kernel.gptq_marlin_repack import gptq_marlin_repack
+
+
+ScalarType, scalar_types = get_scalar_types()
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["CompressedTensorsWNA16"]
+WNA16_SUPPORTED_TYPES_MAP = {
+ 4: scalar_types.uint4b8,
+ 8: scalar_types.uint8b128
+}
+WNA16_ZP_SUPPORTED_TYPES_MAP = {4: scalar_types.uint4, 8: scalar_types.uint8}
+WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys())
+
+
+class CompressedTensorsWNA16(CompressedTensorsLinearScheme):
+ _kernel_backends_being_used: set[str] = set()
+
+ def __init__(self,
+ strategy: str,
+ num_bits: int,
+ group_size: Optional[int] = None,
+ symmetric: Optional[bool] = True,
+ actorder: Optional[ActivationOrdering] = None):
+
+ self.pack_factor = 32 // num_bits
+ self.strategy = strategy
+ self.symmetric = symmetric
+ self.group_size = -1 if group_size is None else group_size
+ self.has_g_idx = actorder == ActivationOrdering.GROUP
+
+ if self.group_size == -1 and self.strategy != "channel":
+ raise ValueError("Marlin kernels require group quantization or "
+ "channelwise quantization, but found no group "
+ "size and strategy is not channelwise.")
+
+ if num_bits not in WNA16_SUPPORTED_TYPES_MAP:
+ raise ValueError(
+ f"Unsupported num_bits = {num_bits}. "
+ f"Supported num_bits = {WNA16_SUPPORTED_TYPES_MAP.keys()}")
+
+ self.quant_type = (WNA16_ZP_SUPPORTED_TYPES_MAP[num_bits]
+ if not self.symmetric else
+ WNA16_SUPPORTED_TYPES_MAP[num_bits])
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # ampere and up
+ return 80
+
+ def create_weights(self, layer: torch.nn.Module, output_size: int,
+ input_size: int, output_partition_sizes: list[int],
+ input_size_per_partition: int,
+ params_dtype: torch.dtype, weight_loader: Callable,
+ **kwargs):
+
+ output_size_per_partition = sum(output_partition_sizes)
+
+ self.kernel_config = MarlinLinearLayerConfig(
+ full_weight_shape=(input_size, output_size),
+ partition_weight_shape=(
+ input_size_per_partition,
+ output_size_per_partition,
+ ),
+ weight_type=self.quant_type,
+ act_type=params_dtype,
+ group_size=self.group_size,
+ zero_points=not self.symmetric,
+ has_g_idx=self.has_g_idx
+ )
+
+ # If group_size is -1, we are in channelwise case.
+ group_size = self.group_size if self.group_size != -1 else input_size
+ row_parallel = (input_size != input_size_per_partition)
+ partition_scales = not marlin_repeat_scales_on_all_ranks(
+ self.has_g_idx, self.group_size, row_parallel)
+
+ scales_and_zp_size = input_size // group_size
+
+ if partition_scales:
+ assert input_size_per_partition % group_size == 0
+ scales_and_zp_size = input_size_per_partition // group_size
+
+ weight = PackedvLLMParameter(input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader,
+ packed_factor=self.pack_factor,
+ packed_dim=1,
+ data=torch.empty(
+ output_size_per_partition,
+ input_size_per_partition //
+ self.pack_factor,
+ dtype=torch.int32,
+ ))
+
+ weight_scale_args = {
+ "weight_loader":
+ weight_loader,
+ "data":
+ torch.empty(
+ output_size_per_partition,
+ scales_and_zp_size,
+ dtype=params_dtype,
+ )
+ }
+
+ zeros_args = {
+ "weight_loader":
+ weight_loader,
+ "data":
+ torch.zeros(
+ output_size_per_partition // self.pack_factor,
+ scales_and_zp_size,
+ dtype=torch.int32,
+ )
+ }
+
+ if not partition_scales:
+ weight_scale = ChannelQuantScaleParameter(output_dim=0,
+ **weight_scale_args)
+
+ if not self.symmetric:
+ qzeros = PackedColumnParameter(output_dim=0,
+ packed_dim=0,
+ packed_factor=self.pack_factor,
+ **zeros_args)
+ else:
+ weight_scale = GroupQuantScaleParameter(output_dim=0,
+ input_dim=1,
+ **weight_scale_args)
+ if not self.symmetric:
+ qzeros = PackedvLLMParameter(input_dim=1,
+ output_dim=0,
+ packed_dim=0,
+ packed_factor=self.pack_factor,
+ **zeros_args)
+
+ # A 2D array defining the original shape of the weights
+ # before packing
+ weight_shape = BasevLLMParameter(data=torch.empty(2,
+ dtype=torch.int64),
+ weight_loader=weight_loader)
+
+ layer.register_parameter("weight_packed", weight)
+ layer.register_parameter("weight_scale", weight_scale)
+ layer.register_parameter("weight_shape", weight_shape)
+
+ if not self.symmetric:
+ layer.register_parameter("weight_zero_point", qzeros)
+
+ # group index (for activation reordering)
+ if self.has_g_idx:
+ weight_g_idx = RowvLLMParameter(data=torch.empty(
+ input_size_per_partition,
+ dtype=torch.int32,
+ ),
+ input_dim=0,
+ weight_loader=weight_loader)
+ layer.register_parameter("weight_g_idx", weight_g_idx)
+
+ # Checkpoints are serialized in compressed-tensors format, which is
+ # different from the format the kernel may want. Handle repacking here.
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ # Default names since marlin requires empty parameters for these,
+ # TODO: remove this requirement from marlin (allow optional tensors)
+ self.w_q_name = "weight_packed"
+ self.w_s_name = "weight_scale"
+ self.w_zp_name = "weight_zero_point"
+ self.w_gidx_name = "weight_g_idx"
+
+ device = getattr(layer, self.w_q_name).device
+ c = self.kernel_config
+
+ check_marlin_supports_shape(
+ c.partition_weight_shape[1], # out_features
+ c.partition_weight_shape[0], # in_features
+ c.full_weight_shape[0], # in_features
+ c.group_size,
+ )
+
+ row_parallel = c.partition_weight_shape[0] != c.full_weight_shape[0]
+ self.is_k_full = marlin_is_k_full(c.has_g_idx, row_parallel)
+
+ # Allocate marlin workspace.
+ self.workspace = marlin_make_workspace(device)
+
+ def _transform_param(
+ layer: torch.nn.Module, name: Optional[str], fn: Callable
+ ) -> None:
+ if name is not None and getattr(layer, name, None) is not None:
+
+ old_param = getattr(layer, name)
+ new_param = fn(old_param)
+ # replace the parameter with torch.nn.Parameter for TorchDynamo
+ # compatibility
+ replace_parameter(
+ layer, name, torch.nn.Parameter(new_param.data, requires_grad=False)
+ )
+
+ def transform_w_q(x):
+ assert isinstance(x, BasevLLMParameter)
+ permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+ x.data = gptq_marlin_repack(
+ x.data.contiguous(),
+ perm=layer.g_idx_sort_indices,
+ size_k=c.partition_weight_shape[0],
+ size_n=c.partition_weight_shape[1],
+ num_bits=c.weight_type.size_bits,
+ )
+ return x
+
+ def transform_w_s(x):
+ assert isinstance(x, BasevLLMParameter)
+ permute_param_layout_(x, input_dim=0, output_dim=1)
+ x.data = marlin_permute_scales(
+ x.data.contiguous(),
+ size_k=c.partition_weight_shape[0],
+ size_n=c.partition_weight_shape[1],
+ group_size=c.group_size,
+ )
+ return x
+
+ if c.has_g_idx:
+ g_idx, g_idx_sort_indices = marlin_sort_g_idx(
+ getattr(layer, self.w_gidx_name)
+ )
+ _transform_param(layer, self.w_gidx_name, lambda _: g_idx)
+ layer.g_idx_sort_indices = g_idx_sort_indices
+ else:
+ setattr(layer, self.w_gidx_name, marlin_make_empty_g_idx(device))
+ layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+ if c.zero_points:
+ grouped_k = (
+ c.partition_weight_shape[0] // c.group_size if c.group_size != -1 else 1
+ )
+ _transform_param(
+ layer,
+ self.w_zp_name,
+ lambda x: marlin_zero_points(
+ unpack_cols(
+ x.t(),
+ c.weight_type.size_bits,
+ grouped_k,
+ c.partition_weight_shape[1],
+ ),
+ size_k=grouped_k,
+ size_n=c.partition_weight_shape[1],
+ num_bits=c.weight_type.size_bits,
+ ),
+ )
+ else:
+ setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))
+ _transform_param(layer, self.w_q_name, transform_w_q)
+ _transform_param(layer, self.w_s_name, transform_w_s)
+
+ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+ bias: Optional[torch.Tensor]) -> torch.Tensor:
+ c = self.kernel_config
+
+ def _get_weight_params(
+ layer: torch.nn.Module,
+ ) -> tuple[
+ torch.Tensor, # w_q
+ torch.Tensor, # w_s
+ Optional[torch.Tensor], # w_zp,
+ Optional[torch.Tensor], # w_gidx
+ ]:
+ return (
+ getattr(layer, self.w_q_name),
+ getattr(layer, self.w_s_name),
+ getattr(layer, self.w_zp_name or "", None),
+ getattr(layer, self.w_gidx_name or "", None),
+ )
+
+ w_q, w_s, w_zp, w_gidx = _get_weight_params(layer)
+
+ # `process_weights_after_loading` will ensure w_zp and w_gidx are not
+ # None for marlin
+ return apply_gptq_marlin_linear(
+ input=x,
+ weight=w_q,
+ weight_scale=w_s,
+ weight_zp=w_zp, # type: ignore
+ g_idx=w_gidx, # type: ignore
+ g_idx_sort_indices=layer.g_idx_sort_indices,
+ workspace=self.workspace,
+ wtype=c.weight_type,
+ input_size_per_partition=c.partition_weight_shape[0],
+ output_size_per_partition=c.partition_weight_shape[1],
+ is_k_full=self.is_k_full,
+ bias=bias,
+ )
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16_moe.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a8fb6542189bd2c419dc591e420d5e3f8530cc8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16_moe.py
@@ -0,0 +1,621 @@
+from __future__ import annotations
+
+import enum
+import logging
+from enum import Enum
+from typing import TYPE_CHECKING
+
+import torch
+from compressed_tensors import CompressionFormat
+
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+ NPUW4A16Int4DynamicMoEMethod,
+)
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+ WNA16_SUPPORTED_BITS,
+ CompressedTensorsMoEScheme,
+)
+from sglang.srt.layers.quantization.gptq import gptq_marlin_moe_repack
+from sglang.srt.layers.quantization.marlin_utils import marlin_moe_permute_scales
+from sglang.srt.layers.quantization.utils import replace_parameter
+from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip, set_weight_attrs
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ StandardDispatchOutput,
+ )
+ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
+ CompressedTensorsConfig,
+ )
+
+
+__all__ = [
+ "CompressedTensorsWNA16MoE",
+ "CompressedTensorsWNA16TritonMoE",
+ "NPUCompressedTensorsW4A16Int4DynamicMoE",
+]
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+ pass
+
+
+logger = logging.getLogger(__name__)
+
+
+class GPTQMarlinState(Enum):
+ REPACK = enum.auto()
+ READY = enum.auto()
+
+
+class CompressedTensorsWNA16MoE(CompressedTensorsMoEScheme):
+
+ def __init__(self, quant_config: CompressedTensorsConfig, num_gpu_experts=-1):
+ self.quant_config = quant_config
+ config = self.quant_config.target_scheme_map["Linear"].get("weights")
+ self.num_bits = config.num_bits
+ self.packed_factor = 32 // config.num_bits
+ self.strategy = config.strategy
+ self.group_size = config.group_size
+ self.actorder = config.actorder
+ assert config.symmetric, "Only symmetric quantization is supported for MoE"
+
+ if not (
+ self.quant_config.quant_format == CompressionFormat.pack_quantized.value
+ and self.num_bits in WNA16_SUPPORTED_BITS
+ ):
+            raise ValueError(
+                "For Fused MoE layers, only "
+                f"{CompressionFormat.pack_quantized.value} "
+                "is supported for the following bits: "
+                f"{WNA16_SUPPORTED_BITS}"
+            )
+ self.num_gpu_experts = num_gpu_experts
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # ampere and up
+ return 80
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ):
+ # Will transpose the loaded weight along the
+ # intermediate and hidden dim sizes. Will
+ # shard for TP along the transposed dims
+ extra_weight_attrs.update(
+ {"is_transposed": True, "quant_method": self.strategy}
+ )
+ w13_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size // self.packed_factor,
+ 2 * intermediate_size_per_partition,
+ dtype=torch.int32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_packed", w13_weight)
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+
+ w2_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ intermediate_size_per_partition // self.packed_factor,
+ hidden_size,
+ dtype=torch.int32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_packed", w2_weight)
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ # In the case where we have actorder/g_idx,
+ # we do not partition the w2 scales
+ load_full_w2 = self.actorder and self.group_size != -1
+
+ if load_full_w2:
+ w2_scales_size = intermediate_size_per_partition * layer.moe_tp_size
+ else:
+ w2_scales_size = intermediate_size_per_partition
+
+ self.is_k_full = (not self.actorder) or layer.moe_tp_size == 1
+
+ if self.strategy == "channel":
+ num_groups_w2 = num_groups_w13 = 1
+ self.group_size = -1
+ else:
+ num_groups_w2 = w2_scales_size // self.group_size
+ num_groups_w13 = hidden_size // self.group_size
+
+ w13_scale = torch.nn.Parameter(
+ torch.ones(
+ num_experts,
+ num_groups_w13,
+ 2 * intermediate_size_per_partition,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_scale", w13_scale)
+ set_weight_attrs(w13_scale, extra_weight_attrs)
+
+ w2_scale = torch.nn.Parameter(
+ torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_scale", w2_scale)
+ set_weight_attrs(w2_scale, extra_weight_attrs)
+ set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2})
+
+ w2_weight_shape = torch.nn.Parameter(
+ torch.empty(num_experts, 2), requires_grad=False
+ )
+ layer.register_parameter("w2_weight_shape", w2_weight_shape)
+ set_weight_attrs(w2_weight_shape, extra_weight_attrs)
+ w13_weight_shape = torch.nn.Parameter(
+ torch.empty(num_experts, 2), requires_grad=False
+ )
+
+ layer.register_parameter("w13_weight_shape", w13_weight_shape)
+ set_weight_attrs(w13_weight_shape, extra_weight_attrs)
+
+ w13_g_idx = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ dtype=torch.int32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_g_idx", w13_g_idx)
+ set_weight_attrs(w13_g_idx, extra_weight_attrs)
+
+ w2_g_idx = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ intermediate_size_per_partition,
+ dtype=torch.int32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight_g_idx", w2_g_idx)
+ set_weight_attrs(w2_g_idx, extra_weight_attrs)
+
+ w13_g_idx_sort_indices = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ dtype=torch.int32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+ set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
+
+ w2_g_idx_sort_indices = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ intermediate_size_per_partition,
+ dtype=torch.int32,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+ set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
+
+ layer.a13_scale = None
+ layer.a2_scale = None
+ layer.marlin_state = GPTQMarlinState.REPACK
+
+ if not hasattr(layer, "_original_shapes"):
+ layer._original_shapes = {}
+
+ # Force record: these are the target GPTQ shapes for rollback.
+ layer._original_shapes["w13_weight_packed"] = tuple(w13_weight.shape)
+ layer._original_shapes["w2_weight_packed"] = tuple(w2_weight.shape)
+
+ # Also record the shapes of the scales.
+ layer._original_shapes["w2_weight_scale"] = tuple(w2_scale.shape)
+ layer._original_shapes["w13_weight_scale"] = tuple(w13_scale.shape)
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+ # Skip if the layer is already converted to Marlin format to prevent double-packing.
+ if getattr(layer, "is_marlin_converted", False):
+ return
+
+ if not hasattr(layer, "_original_shapes"):
+ layer._original_shapes = {}
+
+ def replace_tensor(name, new_t):
+ target_attr = getattr(layer, name)
+
+ # Only save if the key doesn't exist to prevent overwriting with Marlin shapes.
+ if name not in layer._original_shapes:
+ # This is a safety check; `create_weights` usually handles this already.
+ layer._original_shapes[name] = tuple(target_attr.shape)
+
+ # It is important to use resize_() here since it ensures
+ # the same buffer is reused
+ target_attr.resize_(new_t.shape)
+ target_attr.copy_(new_t)
+ del new_t
+
+ num_experts = layer.w13_weight_g_idx.shape[0]
+ device = layer.w13_weight_g_idx.device
+
+ # when running models with grouped act order,
+ # resort to g_idx values provided in checkpoint
+ if self.actorder == "group":
+ w13_g_idx_sort_indices = torch.empty_like(layer.w13_weight_g_idx)
+ w2_g_idx_sort_indices = torch.empty_like(layer.w2_weight_g_idx)
+ w13_sorted_g_idx = torch.empty_like(layer.w13_weight_g_idx)
+ w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx)
+
+ for e in range(num_experts):
+ w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_weight_g_idx[e]).to(
+ torch.int32
+ )
+ w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_weight_g_idx[e]).to(
+ torch.int32
+ )
+ w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][
+ w13_g_idx_sort_indices[e]
+ ]
+ w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][w2_g_idx_sort_indices[e]]
+
+ replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx)
+ replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx)
+ replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+ replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+
+ else:
+ layer.w13_weight_g_idx = torch.nn.Parameter(
+ torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+ requires_grad=False,
+ )
+ layer.w2_weight_g_idx = torch.nn.Parameter(
+ torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+ requires_grad=False,
+ )
+ layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+ torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+ requires_grad=False,
+ )
+ layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+ torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+ requires_grad=False,
+ )
+
+ marlin_w13_qweight = gptq_marlin_moe_repack(
+ layer.w13_weight_packed,
+ layer.w13_g_idx_sort_indices,
+ layer.w13_weight_packed.shape[1] * self.packed_factor,
+ layer.w13_weight_packed.shape[2],
+ self.num_bits,
+ )
+ replace_tensor("w13_weight_packed", marlin_w13_qweight)
+ marlin_w2_qweight = gptq_marlin_moe_repack(
+ layer.w2_weight_packed,
+ layer.w2_g_idx_sort_indices,
+ layer.w2_weight_packed.shape[1] * self.packed_factor,
+ layer.w2_weight_packed.shape[2],
+ self.num_bits,
+ )
+ replace_tensor("w2_weight_packed", marlin_w2_qweight)
+ # Repack scales
+ marlin_w13_scales = marlin_moe_permute_scales(
+ layer.w13_weight_scale,
+ layer.w13_weight_packed.shape[2],
+ layer.w13_weight_scale.shape[2],
+ self.group_size,
+ )
+ replace_tensor("w13_weight_scale", marlin_w13_scales)
+
+ marlin_w2_scales = marlin_moe_permute_scales(
+ layer.w2_weight_scale,
+ layer.w2_weight_scale.shape[1]
+ * (self.group_size if self.group_size != -1 else self.packed_factor),
+ layer.w2_weight_scale.shape[2],
+ self.group_size,
+ )
+ replace_tensor("w2_weight_scale", marlin_w2_scales)
+
+ layer.is_marlin_converted = True
+
+ def restore_weights_before_loading(self, layer: torch.nn.Module):
+ """Forcibly resize parameters back to their original shapes (e.g., GPTQ format) before loading weights."""
+
+ if not hasattr(layer, "_original_shapes"):
+ return
+
+ for name, orig_shape in layer._original_shapes.items():
+ param = getattr(layer, name, None)
+
+ if param is not None and param.shape != orig_shape:
+ param.resize_(orig_shape)
+
+ layer.is_marlin_converted = False
+
    def create_moe_runner(
        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
    ):
        """Store the runner config; the Marlin path calls its fused kernel directly in apply_weights, so no MoeRunner object is built."""
        self.moe_runner_config = moe_runner_config
+
    def apply_weights(
        self,
        layer: torch.nn.Module,
        dispatch_output: StandardDispatchOutput,
    ) -> CombineInput:
        """Run the fused Marlin MoE kernel on the dispatched hidden states.

        Only SiLU activation is supported (asserted below). The kernel output
        is wrapped in a StandardCombineInput for the combine stage.
        """
        # Deferred imports, resolved at call time.
        from sglang.srt.layers.moe.fused_moe_triton.fused_marlin_moe import (
            fused_marlin_moe,
        )
        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput

        assert (
            self.moe_runner_config.activation == "silu"
        ), "Only SiLU activation is supported."

        x = dispatch_output.hidden_states
        topk_output = dispatch_output.topk_output

        # topk_output unpacks as (per-token expert weights, expert ids, raw router logits).
        topk_weights, topk_ids, router_logits = topk_output

        # Get expert_map for EP support: when the layer's dispatcher provides a
        # local-expert mapping, pass it plus the global expert count so the
        # kernel can translate global expert ids to this rank's local experts.
        expert_map = None
        global_num_experts = -1
        if hasattr(layer, "dispatcher") and hasattr(
            layer.dispatcher, "local_expert_mapping"
        ):
            expert_map = layer.dispatcher.local_expert_mapping
            if expert_map is not None:
                global_num_experts = self.moe_runner_config.num_experts

        output = fused_marlin_moe(
            x,
            layer.w13_weight_packed,
            layer.w2_weight_packed,
            layer.w13_weight_scale,
            layer.w2_weight_scale,
            router_logits,
            topk_weights,
            topk_ids,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            g_idx1=layer.w13_weight_g_idx,
            g_idx2=layer.w2_weight_g_idx,
            sort_indices1=layer.w13_g_idx_sort_indices,
            sort_indices2=layer.w2_g_idx_sort_indices,
            num_bits=self.num_bits,
            is_k_full=self.is_k_full,
            routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
        )
        return StandardCombineInput(hidden_states=output)
+
+
class CompressedTensorsWNA16TritonMoE(CompressedTensorsWNA16MoE):
    """W4A16 MoE scheme that runs on Triton kernels (ROCm/HIP compatible).

    Weight creation is inherited from CompressedTensorsWNA16MoE; after
    loading, weights are repacked into the uint8 layout consumed by the
    Triton fused MoE kernel rather than the Marlin-specific format.
    """

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Transpose loaded weights/scales into the Triton kernel layout (idempotent)."""
        if getattr(layer, "is_triton_converted", False):
            return

        def _repacked(attr: str, reinterpret_uint8: bool) -> torch.nn.Parameter:
            # Move K to the innermost axis ([E, K', N] -> [E, N, K']); packed
            # weights are additionally reinterpreted from int32 to raw uint8.
            tensor = getattr(layer, attr).data.transpose(1, 2).contiguous()
            if reinterpret_uint8:
                tensor = tensor.view(torch.uint8)
            return torch.nn.Parameter(tensor, requires_grad=False)

        # Weights: [E, K//8, N] int32 -> [E, N, K//2] uint8
        layer.w13_weight_packed = _repacked("w13_weight_packed", True)
        layer.w2_weight_packed = _repacked("w2_weight_packed", True)
        # Scales: [E, K//group_size, N] -> [E, N, K//group_size]
        layer.w13_weight_scale = _repacked("w13_weight_scale", False)
        layer.w2_weight_scale = _repacked("w2_weight_scale", False)

        layer.is_triton_converted = True

    def create_moe_runner(
        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
    ):
        """Remember the runner config and construct a Triton-backed MoeRunner."""
        self.moe_runner_config = moe_runner_config
        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)

    def apply_weights(
        self,
        layer: torch.nn.Module,
        dispatch_output: "StandardDispatchOutput",
    ) -> "CombineInput":
        """Execute the int4-w4a16 Triton fused MoE on the dispatched tokens."""
        from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo

        activation = self.moe_runner_config.activation
        assert activation == "silu", "Only SiLU activation is supported."

        quant_info = TritonMoeQuantInfo(
            w13_weight=layer.w13_weight_packed,
            w2_weight=layer.w2_weight_packed,
            w13_scale=layer.w13_weight_scale,
            w2_scale=layer.w2_weight_scale,
            use_int4_w4a16=True,
            # assumes [N-block, K-block] ordering with quantization grouped
            # along K only — confirm against TritonMoeQuantInfo's contract
            block_shape=[0, self.group_size],
        )
        return self.runner.run(dispatch_output, quant_info)
+
+
class NPUCompressedTensorsW4A16Int4DynamicMoE(CompressedTensorsMoEScheme):
    """W4A16 (int4 weight, 16-bit activation) dynamic MoE scheme for NPU.

    Creates packed int4 weight tensors with per-group scales/offsets and
    delegates post-load processing and execution to
    NPUW4A16Int4DynamicMoEMethod (``self.kernel``).
    """

    def __init__(self, quantization_config) -> None:
        # int4 weights are packed 8 per element: weight dtype is int4,
        # but the storage tensors are created as int32.
        self.pack_factor = 8
        # Prefer a dedicated "MoEGMM" scheme entry when present; otherwise
        # read the group size from the generic "Linear" entry.
        target = (
            "MoEGMM" if "MoEGMM" in quantization_config.target_scheme_map else "Linear"
        )
        if target in quantization_config.target_scheme_map:
            self.group_size = quantization_config.target_scheme_map[target][
                "weights"
            ].group_size
        else:
            # Neither entry found: fall back to a default group size.
            self.group_size = 128

        self.kernel = NPUW4A16Int4DynamicMoEMethod()

    # TODO: See if we can merge this method's logic
    # with CompressedTensorsWNA16MoE. Need more models and tests.
    # @OrangeRedeng @TamirBaydasov
    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        """Register packed int4 weights plus per-group scale/offset parameters on ``layer``."""
        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported

        self.num_experts = num_experts
        # NOTE(review): a full-to-partition intermediate-size ratio > 1 appears
        # to indicate the intermediate dim is sharded, selecting GROUP-wise
        # scale loading; otherwise CHANNEL-wise — confirm against the loader.
        if (
            extra_weight_attrs.get(
                "moe_intermediate_size", intermediate_size_per_partition
            )
            // intermediate_size_per_partition
            > 1
        ):
            quant_method = FusedMoeWeightScaleSupported.GROUP.value
        else:
            quant_method = FusedMoeWeightScaleSupported.CHANNEL.value
        extra_weight_attrs.update({"quant_method": quant_method})
        # weight: int4 values packed pack_factor-per-int32 along the K axis
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                2 * intermediate_size_per_partition,
                hidden_size // self.pack_factor,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)
        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition // self.pack_factor,
                dtype=torch.int32,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # scale: one value per group of group_size input elements
        weight_scale_dtype = torch.bfloat16
        w13_weight_scale = torch.nn.Parameter(
            torch.empty(
                num_experts,
                2 * intermediate_size_per_partition,
                hidden_size // self.group_size,
                dtype=weight_scale_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_scale", w13_weight_scale)
        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
        w2_weight_scale = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition // self.group_size,
                dtype=weight_scale_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_scale", w2_weight_scale)
        set_weight_attrs(w2_weight_scale, extra_weight_attrs)

        # offset: zero-initialized, same shape/dtype as the scales
        w13_weight_offset = torch.nn.Parameter(
            torch.zeros(
                num_experts,
                2 * intermediate_size_per_partition,
                hidden_size // self.group_size,
                dtype=weight_scale_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight_offset", w13_weight_offset)
        set_weight_attrs(w13_weight_offset, extra_weight_attrs)

        w2_weight_offset = torch.nn.Parameter(
            torch.zeros(
                num_experts,
                hidden_size,
                intermediate_size_per_partition // self.group_size,
                dtype=weight_scale_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight_offset", w2_weight_offset)
        set_weight_attrs(w2_weight_offset, extra_weight_attrs)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Delegate post-load weight transformation to the NPU kernel method."""
        self.kernel.process_weights_after_loading(layer)

    def create_moe_runner(
        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
    ):
        """Store the runner config; execution goes through ``self.kernel``, not a MoeRunner."""
        self.moe_runner_config = moe_runner_config

    def apply_weights(
        self,
        layer: torch.nn.Module,
        dispatch_output: StandardDispatchOutput,
    ) -> CombineInput:
        """Forward the dispatched tokens to the NPU kernel implementation."""
        return self.kernel.apply(layer, dispatch_output)

    def apply_without_routing_weights(
        self,
        layer,
        hidden_states,
        hidden_states_scale,
        group_list_type,
        group_list,
        output_dtype,
    ):
        """Run the NPU kernel on pre-grouped tokens, skipping router-weight application."""
        return self.kernel.apply_without_routing_weights(
            layer,
            hidden_states,
            hidden_states_scale,
            group_list_type,
            group_list,
            output_dtype,
        )
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1280,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1280,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5c0c8d76195f18ff01a3107c335a155fc3df9d98
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1280,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6ae89c7571a5dc1e866a9c1c698c8be5d79eea78
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..ae4c397ef37ff57f17e66484275f2880f738fbec
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6496a38fba8ae09b3025a75f357815b9d6a5e3f4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..effb7bda707ac38654851aa0b05785a44e8fd488
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..eb18488946020293ac3cc04cb96d6ac2d4c6df00
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c098ef2dbb9a17666cd3a17425ffb57578252278
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c098ef2dbb9a17666cd3a17425ffb57578252278
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..3633e363f3034c90834c028aa7cca790f68e2c2c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..653bb997ee114a81b4cdf7f0760341e3a050087c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..31d29e10c1c80de92dcec9ca135d6381d96e0cbb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..3618053b65831b95c4bb0f20ef3b9aa816b2d637
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..cb573ccdff98acf08726d763d81ec13dd892f0cc
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..7d8044772f81da17c7889fac3c2e9c964ac2bc61
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..eb11c1696d113057c8d027dda35a136517b51a9f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..035ec027fa56622196b24a03a5042ce010deaebf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..8b49f2781cb54d19a2789767ebb7e8c3fb55b981
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..dd66019ccf317803eef5066296a499248da0f606
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5011fd572c49c863a2f33c2cd51c7b4ddcdf2b59
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..1957a7f3287fda176f642c7989b5c342cdf59919
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..d1227c2157990216d2ca51c69ad0944017f53b6a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..448daf360889b82057a62a49ac8edca82bebdf16
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6f5adbb9361229bc24b19098e3711a3fd15e939b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6f5adbb9361229bc24b19098e3711a3fd15e939b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..7b10d6d781484473282d8fc81c0170c58a678708
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..66ca4ee165e350212f4d8233622fc1a34b9f2fd0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..48b64a87d24a0dcb837f67969760b82c11b4aed0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..d72288781e40ad7d51bc84d496406b81102325b0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..cf354037903c0d1fcd077c4647aabce026a723fb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6f5adbb9361229bc24b19098e3711a3fd15e939b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..eccb86a76df0d7302b760ab6d83a8ceb9fa9d0d9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..cb91a279d423d0ca25197e0edd5e8c2f4da58720
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..88af48431d8b8791af8df03429704606b670f1f7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a794fc821b502b79e4144b80b0599d9c19afc0f7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..dd069726d7ed4dcbb449af243f4f4af21815f854
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4225c78eb72cb55c7a65b72d27fab9b4f33cc87a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4225c78eb72cb55c7a65b72d27fab9b4f33cc87a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..1c36fe95169b72f7387deee716d683e6bcbc56d2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..30932a1ccab41b97e3cb409fcd3fcd6a61eaa1e0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..63d9a0bf5d79ddaaad547d44338ad4b959ad72b1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..65989f012523036746302e174071ab68550d07df
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4225c78eb72cb55c7a65b72d27fab9b4f33cc87a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..cd3e07804fdec10c2cfb291c1ede3ba67b753f9c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..9d5a329d7466a37c0ca68a65a089fbb99f9327a9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5e6789d00e0ae0a47ef2cc5aad489b24586f255f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5e6789d00e0ae0a47ef2cc5aad489b24586f255f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..03dba5ad15ba5f7f49100a5c78e8685e64334b2a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..96e1594a3eabbaedc792b84b07f05ae8752b7251
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..017dca003117d65aa4a82f57b6fbfb1deee58096
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..49ac14d2a57613b59a61d9138bcf0378f75175ec
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..49ac14d2a57613b59a61d9138bcf0378f75175ec
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..09502f05cc075e2d0a85c46b3f2025f7b2a44e85
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..9a5ff48b8942957dde9b862aed848390dd267948
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b57588f9bb6d90b8daadaa12a9db7c1fb08543de
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..eabc423949a24c2a1fb2368a73e5249caf8d07df
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=5120,K=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=5120,K=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..15e91cde59a3fe4d56320ca50e29acca3ca3aca6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=5120,K=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..7fd942e3ed320a1f26d04ac16b5c105015f453dd
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..13e3c20b74828ebe9d54c533eb54e3c6c3cdf48d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..ecbca9d8dfa5de3d5ce4ee89a1993b3be5583292
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..51e237b91b8e775a36bcf783c078c2c1cecbcbd2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..22ca71e2ca24398c74d5d77302c3236f4b93efce
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6280219c9ee7d26f7e2fd3625dc92d847ddc7982
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..78db7307852d52067c602319f3af6ec013771077
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=6400,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=6400,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..f33809b0ad0563644216261ffa100b8b6dba0299
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=6400,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5e40bbe6471ace395efa41df05eaa2acee185bf6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..40c01c0b92b4b26fe480879dda33f18c5eb59a6d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b96981dec4b2d9371bd731fe26041eb5142a3d63
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c6fd3659799bc31e17f3577e7f0e8d7268faf1fb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..f4e026497eb077bac680699fe6e59095f5d6d9a5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..0b24efa37c6f54322825098bdf2064ef436a5334
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..160f12ed3f95a6967439ff53bc3e3a2cdc97c700
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e5c4a1d2c94e5c7864f462e083ea5f530b8efe3f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..d694716c3e2c9ca65a265acf7557ee2762635ec5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..f211a5d38ab200c9b31ac4181fd9650b0b1bd06c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..dfe5c1e43d687e22d066295483e3324a803feb87
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..dfe5c1e43d687e22d066295483e3324a803feb87
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b2cab0831118e9a8df2d2ea482af3e865e267a80
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..9de2ba9c8fcd8969ad925ceecf7452050999d04d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..0a1e14cffbb2a894a701352193947d272427db0d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c4b1bd4358c65926c92c266a2824b19d070b8516
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c87cb4f957f6b7166d338dad001f8b2b9e606032
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5bf208ce4b73085838afc23b69b5aeb9082c12a7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6ae00329eccf0e6a4c9aedad5e2f812aacdca1cf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e997126f94a98392d3387b099398e8e8252a7b4a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b474e6265bc152f30f3120c5a04fd792b4dba3fd
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4532f93681e2be175b1bf94f81bfde711821cd60
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..dfe5c1e43d687e22d066295483e3324a803feb87
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a87f5de1b1830890ec665b9b37975c78e21e1caf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..37208577d4506167c9d03cebfd7453fe0161a288
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 16,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..58cdd93e90b8c29bc7a211861711565dbeeb529a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a5ce5e49b5923b8430e1ed359986a73ad2f588fa
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b72e0371d1421a1decc9d57860f83eea8f790942
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..d8cc0f8967e9358b28e575e9c062d0b2d11422a7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,164 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 8,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 4,
+ "num_warps": 4,
+ "num_stages": 2,
+ "waves_per_eu": 0
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..3cb7eaa07c745fd3aa2b3242780a7061bedac1de
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..129f59e0c9a911e19be3d3772e0c48484e938079
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..293adce387e066fce75b6e606d4b8b6a5aa10bdb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/README.md b/sglang/python/sglang/srt/layers/quantization/configs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..718c9adb93d3e6314dd72a867fd8317ce1a5c2e3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/README.md
@@ -0,0 +1,16 @@
+# W8A8 Block Quantization Kernel Configurations
+
+This directory contains optimized kernel configurations for W8A8 block-quantized matrix multiplication kernels (e.g., FP8 and INT8 variants).
+
+## Configuration File Format
+
+Configuration files are named using the following pattern:
+```
+N={N},K={K},device_name={DEVICE_NAME},dtype={DTYPE},block_shape=[{BLOCK_N}, {BLOCK_K}].json
+```
+
+Where:
+- `N`: Output dimension (number of columns in weight matrix)
+- `K`: Input dimension (number of columns in activation matrix)
+- `DEVICE_NAME`: GPU device name with spaces replaced by underscores (e.g., `NVIDIA_H100_80GB_HBM3`)
+- `DTYPE`: quantization scheme (e.g., `fp8_w8a8`, `int8_w8a8`); `BLOCK_N`, `BLOCK_K`: block quantization granularity (typically `[128, 128]`)
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86f489e235b95f2a1ec984c78ffccd1787e50709
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w8a8_int8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w8a8_int8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1df883c22498567cbc61ef487fec7439ec39a03
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w8a8_int8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..184426da457d3de43dd8a9c567f9c620461cf191
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_scheme.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_scheme.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..211fb30e1a1f3c52f38fc165635d18555bb4e31d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_scheme.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w4a4_mxfp4.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w4a4_mxfp4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b0ebd7f3f30bcb946b0b448e7f024a913d4dade
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w4a4_mxfp4.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w4a4_mxfp4_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w4a4_mxfp4_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2fea9a395a4bc7f9bd071e528a332baa57114830
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w4a4_mxfp4_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w8a8_fp8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w8a8_fp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1618c31a99d7b4032f61dfa37ec5204af1e1ea84
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w8a8_fp8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w8a8_fp8_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w8a8_fp8_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..832e8699a0e022d34b7e12f6804d3ab47421eb15
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__pycache__/quark_w8a8_fp8_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4_moe.py b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..df6650be8bd475c5093fb3bd9385db15dd12a95e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4_moe.py
@@ -0,0 +1,213 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.quark.schemes import QuarkMoEScheme
+from sglang.srt.utils import (
+ get_bool_env_var,
+ is_gfx95_supported,
+ is_hip,
+ set_weight_attrs,
+)
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ StandardDispatchOutput,
+ )
+
+logger = logging.getLogger(__name__)
+
+_is_shuffle_moe_mxfp4 = is_gfx95_supported()
+
+__all__ = ["QuarkW4A4MXFp4MoE"]
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+if _use_aiter:
+ from aiter import ActivationType, QuantType
+ from aiter.fused_moe import fused_moe
+ from aiter.ops.shuffle import shuffle_weight
+ from aiter.utility.fp4_utils import e8m0_shuffle
+
+OCP_MX_BLOCK_SIZE = 32
+
+
+class QuarkW4A4MXFp4MoE(QuarkMoEScheme):
+
+ def __init__(self, weight_config: dict[str, Any], input_config: dict[str, Any]):
+ self.weight_quant = weight_config
+ self.input_quant = input_config
+
+ weight_qscheme = self.weight_quant.get("qscheme")
+ input_qscheme = self.input_quant.get("qscheme")
+ if not (weight_qscheme == "per_group" and input_qscheme == "per_group"):
+ raise ValueError(
+ "For MX(FP4) Fused MoE layers, only per-group scales "
+ "for weights and activations are supported. Found "
+ f"{weight_qscheme}, {input_qscheme}"
+ ) # noqa E501
+
+ self.static_input_scales = not self.input_quant.get("is_dynamic")
+ self.with_bias = False
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ return 70
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ):
+
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+ # Add the quantization method used (per tensor/grouped/channel)
+ # to ensure the weight scales are loaded in properly
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+ )
+
+ params_dtype = torch.uint8
+
+ # WEIGHTS
+ w13_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ hidden_size // 2,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight", w13_weight)
+
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+
+ w2_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition // 2,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight", w2_weight)
+
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ # WEIGHT_SCALES
+ w13_weight_scale = torch.nn.Parameter(
+ torch.ones(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ hidden_size // OCP_MX_BLOCK_SIZE,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ w2_weight_scale = torch.nn.Parameter(
+ torch.ones(
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition // OCP_MX_BLOCK_SIZE,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+ set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+ layer.register_parameter("w13_weight_scale", w13_weight_scale)
+ layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ float_dtype = torch.get_default_dtype()
+
+ # Pre-shuffle weight scales
+ s0, s1, _ = layer.w13_weight_scale.shape
+ w13_weight_scale = layer.w13_weight_scale.view(s0 * s1, -1)
+ w13_weight_scale = e8m0_shuffle(w13_weight_scale)
+ # layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale, requires_grad=False)
+ layer.w13_weight_scale.data = w13_weight_scale.view(s0, s1, -1)
+
+ s0, s1, _ = layer.w2_weight_scale.shape
+ w2_weight_scale = layer.w2_weight_scale.view(s0 * s1, -1)
+ w2_weight_scale = e8m0_shuffle(w2_weight_scale)
+ # layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, requires_grad=False)
+ layer.w2_weight_scale.data = w2_weight_scale.view(s0, s1, -1)
+
+ # Pre-shuffle weight
+ if _is_shuffle_moe_mxfp4:
+ layer.w13_weight.data = shuffle_weight(
+ layer.w13_weight.contiguous(), (16, 16)
+ )
+ layer.w2_weight.data = shuffle_weight(
+ layer.w2_weight.contiguous(), (16, 16)
+ )
+ layer.w13_weight.is_shuffled = True
+ layer.w2_weight.is_shuffled = True
+
+ def create_moe_runner(
+ self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+ ):
+ self.moe_runner_config = moe_runner_config
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ dispatch_output: StandardDispatchOutput,
+ ) -> CombineInput:
+
+ from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+ x = dispatch_output.hidden_states
+ topk_output = dispatch_output.topk_output
+ moe_runner_config = self.moe_runner_config
+ topk_weights, topk_ids, _ = topk_output
+ if _is_hip:
+ topk_weights = topk_weights.to(
+ torch.float32
+ ) # aiter's moe_sorting requires topk_weights to be FP32
+
+ if hasattr(torch, "float4_e2m1fn_x2"):
+ w13_weight = layer.w13_weight.view(torch.float4_e2m1fn_x2)
+ w2_weight = layer.w2_weight.view(torch.float4_e2m1fn_x2)
+ else:
+ w13_weight = layer.w13_weight
+ w2_weight = layer.w2_weight
+
+ if hasattr(layer.w13_weight, "is_shuffled"):
+ w13_weight.is_shuffled = True
+ w2_weight.is_shuffled = True
+
+ output = fused_moe(
+ x,
+ w13_weight,
+ w2_weight,
+ topk_weights,
+ topk_ids,
+ quant_type=QuantType.per_1x32,
+ w1_scale=layer.w13_weight_scale,
+ w2_scale=layer.w2_weight_scale,
+ activation=(
+ ActivationType.Silu
+ if moe_runner_config.activation == "silu"
+ else ActivationType.Gelu
+ ),
+ doweight_stage1=False,
+ expert_mask=layer.expert_mask_gpu,
+ )
+ return StandardCombineInput(hidden_states=output)
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w8a8_fp8.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8b1dde597086b172b2bbdc46c39408f8982cd17
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w8a8_fp8.py
@@ -0,0 +1,186 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Optional, cast
+
+import torch
+from torch.nn import Parameter
+
+from sglang.srt.layers.parameter import (
+ ChannelQuantScaleParameter,
+ ModelWeightParameter,
+ PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+ apply_fp8_linear,
+ cutlass_fp8_supported,
+ normalize_e4m3fn_to_e4m3fnuz,
+)
+from sglang.srt.layers.quantization.quark.schemes import QuarkLinearScheme
+from sglang.srt.layers.quantization.utils import requantize_with_max_scale
+from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs
+
+__all__ = ["QuarkW8A8Fp8"]
+
+_is_fp8_fnuz = is_fp8_fnuz()
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+if _use_aiter:
+ from aiter.ops.shuffle import shuffle_weight
+
+
+class QuarkW8A8Fp8(QuarkLinearScheme):
+
+ def __init__(
+ self, weight_config: dict[str, Any], input_config: Optional[dict[str, Any]]
+ ):
+ self.cutlass_fp8_supported = cutlass_fp8_supported()
+ self.weight_qscheme = cast(str, weight_config.get("qscheme"))
+ self.is_static_input_scheme: bool = False
+ self.input_qscheme: Optional[str] = None
+ if input_config is not None:
+ self.is_static_input_scheme = not cast(bool, input_config.get("is_dynamic"))
+ self.input_qscheme = cast(str, input_config.get("qscheme"))
+
+ self.per_token = (
+ not self.is_static_input_scheme and self.input_qscheme == "per_channel"
+ )
+ self.out_dtype = torch.get_default_dtype()
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # lovelace and up
+ return 89
+
+ def process_weights_after_loading(self, layer) -> None:
+ # If per tensor, when we have a fused module (e.g. QKV) with per
+ # tensor scales (thus N scales being passed to the kernel),
+ # requantize so we can always run per tensor
+ if self.weight_qscheme == "per_tensor":
+ if _is_fp8_fnuz:
+ input_scale = getattr(layer, "input_scale", None)
+ weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+ weight=layer.weight,
+ weight_scale=layer.weight_scale,
+ input_scale=input_scale,
+ )
+ if input_scale is not None:
+ layer.input_scale = Parameter(input_scale, requires_grad=False)
+ else:
+ max_w_scale = layer.weight_scale
+ weight = layer.weight
+
+ max_w_scale, weight = requantize_with_max_scale(
+ weight=weight,
+ weight_scale=max_w_scale,
+ logical_widths=layer.logical_widths,
+ )
+
+ layer.weight = Parameter(weight.t(), requires_grad=False)
+ layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+
+ # If channelwise, scales are already lined up, so just transpose.
+ elif self.weight_qscheme == "per_channel":
+ weight = layer.weight
+
+ if _is_fp8_fnuz:
+ input_scale = getattr(layer, "input_scale", None)
+ weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+ weight=weight,
+ weight_scale=layer.weight_scale,
+ input_scale=input_scale,
+ )
+ if input_scale is not None:
+ layer.input_scale = Parameter(input_scale, requires_grad=False)
+ else:
+ weight_scale = layer.weight_scale.data
+ if self.per_token:
+ weight_scale = weight_scale.view(-1, 1)
+ if _use_aiter:
+ layer.weight = Parameter(
+ shuffle_weight(weight, (16, 16)).t(), requires_grad=False
+ )
+ else:
+ layer.weight = Parameter(weight.t(), requires_grad=False)
+ # required by torch.compile to be torch.nn.Parameter
+ layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+
+ else:
+ raise ValueError(f"Unknown quantization scheme {self.weight_qscheme}")
+
+ # INPUT SCALE
+ if self.is_static_input_scheme:
+ layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
+ else:
+ layer.input_scale = None
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ output_partition_sizes: list[int],
+ input_size_per_partition: int,
+ params_dtype: torch.dtype,
+ weight_loader: Callable,
+ **kwargs,
+ ):
+ output_size_per_partition = sum(output_partition_sizes)
+ layer.logical_widths = output_partition_sizes
+
+ # WEIGHT
+ weight = ModelWeightParameter(
+ data=torch.empty(
+ output_size_per_partition,
+ input_size_per_partition,
+ dtype=torch.float8_e4m3fn,
+ ),
+ input_dim=1,
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+ layer.register_parameter("weight", weight)
+
+ # WEIGHT SCALE
+ if self.weight_qscheme == "per_channel":
+ weight_scale = ChannelQuantScaleParameter(
+ data=torch.empty((sum(output_partition_sizes)), dtype=torch.float32),
+ output_dim=0,
+ weight_loader=weight_loader,
+ )
+ else:
+ assert self.weight_qscheme == "per_tensor"
+ weight_scale = PerTensorScaleParameter(
+ data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader,
+ )
+ set_weight_attrs(weight_scale, {"needs_scalar_to_array": True})
+
+ # min requirement for fp8 kernels
+ weight_scale[:] = torch.finfo(torch.float32).min
+ layer.register_parameter("weight_scale", weight_scale)
+
+ # INPUT SCALE
+ if self.is_static_input_scheme:
+ input_scale = PerTensorScaleParameter(
+ data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+ weight_loader=weight_loader,
+ )
+ input_scale[:] = torch.finfo(torch.float32).min
+ set_weight_attrs(input_scale, {"needs_scalar_to_array": True})
+ layer.register_parameter("input_scale", input_scale)
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+
+ return apply_fp8_linear(
+ x,
+ layer.weight,
+ layer.weight_scale,
+ input_scale=layer.input_scale,
+ bias=bias,
+ cutlass_fp8_supported=self.cutlass_fp8_supported,
+ use_per_token_if_dynamic=self.per_token,
+ )
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w8a8_fp8_moe.py b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w8a8_fp8_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d55c9816b9752cdeac1c5ea77d539d3186f2c463
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w8a8_fp8_moe.py
@@ -0,0 +1,312 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
+from sglang.srt.layers.quantization.quark.schemes import QuarkMoEScheme
+from sglang.srt.layers.quantization.utils import all_close_1d, per_tensor_dequantize
+from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs
+
+if TYPE_CHECKING:
+ from sglang.srt.layers.moe.token_dispatcher import (
+ CombineInput,
+ StandardDispatchOutput,
+ )
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["QuarkW8A8FP8MoE"]
+
+_is_fp8_fnuz = is_fp8_fnuz()
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+if _use_aiter:
+ from aiter.ops.shuffle import shuffle_weight
+
+ from sglang.srt.layers.moe.rocm_moe_utils import rocm_fused_experts_tkw1
+
+
+class QuarkW8A8FP8MoE(QuarkMoEScheme):
+
+ def __init__(self, weight_config: dict[str, Any], input_config: dict[str, Any]):
+ self.is_static_input_scheme: bool = False
+ self.input_qscheme = None
+
+ if input_config is not None:
+ self.is_static_input_scheme = not input_config.get("is_dynamic")
+ self.input_qscheme = input_config.get("qscheme")
+
+ self.input_per_token = (
+ not self.is_static_input_scheme and self.input_qscheme == "per_channel"
+ )
+ self.weight_qscheme = weight_config.get("qscheme")
+ self.is_weight_per_channel = self.weight_qscheme == "per_channel"
+ self.out_dtype = torch.get_default_dtype()
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # lovelace and up
+ return 89
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size_per_partition: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ):
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+ params_dtype = torch.float8_e4m3fn
+
+ # WEIGHTS
+ w13_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ hidden_size,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight", w13_weight)
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+
+ w2_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts,
+ hidden_size,
+ intermediate_size_per_partition,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight", w2_weight)
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ # WEIGHT_SCALES
+ # per-tensor quantization
+ if self.weight_qscheme == "per_tensor":
+ # Allocate 2 scales for w1 and w3 respectively.
+ # They will be combined to a single scale after weight loading.
+ w13_weight_scale = torch.nn.Parameter(
+ torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+ )
+ w2_weight_scale = torch.nn.Parameter(
+ torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+ )
+ weight_quant_method = FusedMoeWeightScaleSupported.TENSOR.value
+ elif self.weight_qscheme == "per_channel":
+ w13_weight_scale = torch.nn.Parameter(
+ torch.ones(
+ num_experts,
+ 2 * intermediate_size_per_partition,
+ dtype=torch.float32,
+ ),
+ requires_grad=False,
+ )
+ w2_weight_scale = torch.nn.Parameter(
+ torch.ones(num_experts, hidden_size, dtype=torch.float32),
+ requires_grad=False,
+ )
+ weight_quant_method = FusedMoeWeightScaleSupported.CHANNEL.value
+ else:
+ raise ValueError(
+ f"Unsupported weight quantization strategy: {self.weight_qscheme}."
+ )
+
+ layer.register_parameter("w13_weight_scale", w13_weight_scale)
+ layer.register_parameter("w2_weight_scale", w2_weight_scale)
+ # Add the quantization method used (per tensor/grouped/channel)
+ # to ensure the weight scales are loaded in properly
+ extra_weight_attrs.update({"quant_method": weight_quant_method})
+ set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+ set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+ # INPUT_SCALES
+ if self.is_static_input_scheme:
+ assert (
+ self.input_qscheme == "per_tensor"
+ ), "Only per-tensor quantization is supported for static input scales"
+ w13_input_scale = torch.nn.Parameter(
+ torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+ )
+ layer.register_parameter("w13_input_scale", w13_input_scale)
+ set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+ w2_input_scale = torch.nn.Parameter(
+ torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+ )
+ layer.register_parameter("w2_input_scale", w2_input_scale)
+ set_weight_attrs(w2_input_scale, extra_weight_attrs)
+ else:
+ layer.w13_input_scale = None
+ layer.w2_input_scale = None
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ # Fp8 moe kernels require a single activation scale.
+ # We take the max of all the scales in case they differ.
+ if self.is_static_input_scheme:
+ if layer.w13_input_scale is None or layer.w2_input_scale is None:
+ raise ValueError(
+ "QuantConfig has static quantization, but found "
+ "activation scales are None."
+ )
+ if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
+ layer.w2_input_scale
+ ):
+ logger.warning(
+ "Found input_scales that are not equal for "
+ "fp8 MoE layer. Using the maximum across experts "
+ "for each layer."
+ )
+ layer.w13_input_scale = torch.nn.Parameter(
+ layer.w13_input_scale.max(), requires_grad=False
+ )
+ layer.w2_input_scale = torch.nn.Parameter(
+ layer.w2_input_scale.max(), requires_grad=False
+ )
+
+ if _is_fp8_fnuz:
+ # Normalize the weights and scales
+ w13_weight, w13_weight_scale, w13_input_scale = (
+ normalize_e4m3fn_to_e4m3fnuz(
+ layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
+ )
+ )
+ w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+ layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
+ )
+ # Reset the parameter
+ layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+ layer.w13_weight_scale = torch.nn.Parameter(
+ w13_weight_scale, requires_grad=False
+ )
+ if w13_input_scale is not None:
+ layer.w13_input_scale = torch.nn.Parameter(
+ w13_input_scale, requires_grad=False
+ )
+ layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+ layer.w2_weight_scale = torch.nn.Parameter(
+ w2_weight_scale, requires_grad=False
+ )
+ if w2_input_scale is not None:
+ layer.w2_input_scale = torch.nn.Parameter(
+ w2_input_scale, requires_grad=False
+ )
+ if self.weight_qscheme == "per_tensor":
+ # Fp8 moe kernel needs single weight scale for w13 per expert.
+ # We take the max then dequant and requant each expert.
+ assert layer.w13_weight_scale is not None
+ shard_size = layer.intermediate_size_per_partition
+ max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+ for expert_id in range(layer.num_local_experts):
+ start = 0
+ for shard_id in range(2):
+ dq_weight = per_tensor_dequantize(
+ layer.w13_weight[expert_id][start : start + shard_size, :],
+ layer.w13_weight_scale[expert_id][shard_id],
+ )
+ (
+ layer.w13_weight[expert_id][start : start + shard_size, :],
+ _,
+ ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+
+ start += shard_size
+
+ layer.w13_weight_scale = torch.nn.Parameter(
+ max_w13_scales, requires_grad=False
+ )
+ elif self.weight_qscheme == "per_channel":
+ layer.w13_weight_scale = torch.nn.Parameter(
+ layer.w13_weight_scale.unsqueeze(-1), requires_grad=False
+ )
+ layer.w2_weight_scale = torch.nn.Parameter(
+ layer.w2_weight_scale.unsqueeze(-1), requires_grad=False
+ )
+ else:
+ raise ValueError(
+ f"Unsupported weight quantization strategy: {self.weight_qscheme}."
+ )
+
+ if (
+ _use_aiter
+ and self.is_weight_per_channel
+ and self.moe_runner_config.apply_router_weight_on_input
+ ):
+ with torch.no_grad():
+ # Pre-shuffle weights
+ layer.w13_weight = torch.nn.Parameter(
+ shuffle_weight(layer.w13_weight.data, (16, 16)),
+ requires_grad=False,
+ )
+ torch.cuda.empty_cache()
+ layer.w2_weight = torch.nn.Parameter(
+ shuffle_weight(layer.w2_weight.data, (16, 16)),
+ requires_grad=False,
+ )
+ torch.cuda.empty_cache()
+
+ def create_moe_runner(
+ self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+ ):
+ self.moe_runner_config = moe_runner_config
+ self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ dispatch_output: StandardDispatchOutput,
+ ) -> CombineInput:
+
+ from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+ x = dispatch_output.hidden_states
+ topk_output = dispatch_output.topk_output
+
+ moe_runner_config = self.moe_runner_config
+
+ if (
+ _use_aiter
+ and self.is_weight_per_channel
+ and moe_runner_config.apply_router_weight_on_input
+ ):
+ topk_weights, topk_ids, _ = topk_output
+ output = rocm_fused_experts_tkw1(
+ hidden_states=x,
+ w1=layer.w13_weight,
+ w2=layer.w2_weight,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ activation=moe_runner_config.activation,
+ apply_router_weight_on_input=moe_runner_config.apply_router_weight_on_input,
+ use_fp8_w8a8=True,
+ per_channel_quant=self.is_weight_per_channel,
+ w1_scale=layer.w13_weight_scale,
+ w2_scale=layer.w2_weight_scale,
+ a1_scale=layer.w13_input_scale,
+ a2_scale=layer.w2_input_scale,
+ )
+ return StandardCombineInput(hidden_states=output)
+ else:
+ quant_info = TritonMoeQuantInfo(
+ w13_weight=layer.w13_weight,
+ w2_weight=layer.w2_weight,
+ use_fp8_w8a8=True,
+ per_channel_quant=self.is_weight_per_channel,
+ w13_scale=layer.w13_weight_scale,
+ w2_scale=layer.w2_weight_scale,
+ a13_scale=layer.w13_input_scale,
+ a2_scale=layer.w2_input_scale,
+ )
+ return self.runner.run(dispatch_output, quant_info)
diff --git a/sglang/python/sglang/srt/models/__pycache__/afmoe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/afmoe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6dbe98fd4816453662a7609f0bd397b8f1b5e96b
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/afmoe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/apertus.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/apertus.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96c7243d9ea4fd999f987251598b630ca8523fbf
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/apertus.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/arcee.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/arcee.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f0b474833f9fc3e498515efc08eda35547b7edb
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/arcee.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/baichuan.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/baichuan.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0174b35f84f84b8272516aacf94738cddd56908
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/baichuan.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/bailing_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/bailing_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3b2666666422f72f1d8c4b7ba69deb4528eb80e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/bailing_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/bailing_moe_linear.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/bailing_moe_linear.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a7c39ee9e94cc549e3b0fa19b884989ab4a13a8
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/bailing_moe_linear.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/bailing_moe_nextn.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/bailing_moe_nextn.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4041c8316f1ce51cd8d47a4f506eda09612e066c
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/bailing_moe_nextn.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/bert.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/bert.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65598b3fbd40467f19b1e40f81a9bc67aba5062a
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/bert.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/chatglm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/chatglm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92c0f15b13e4f64468622027117088de234fda66
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/chatglm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/clip.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/clip.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae9093330dcdb570e4838f1a2abdf1668430298e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/clip.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/commandr.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/commandr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fce42bc16cc8fcf186e5c82bdec1000a332b26d
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/commandr.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/dbrx.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/dbrx.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c77fba21b92250485ee12d98269211596a2b42a
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/dbrx.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/deepseek.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/deepseek.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af20f701356c1c5b3190846ad455228e47b8ef10
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/deepseek.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/deepseek_janus_pro.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/deepseek_janus_pro.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1fe2d72858a73580eb3a1a60f389628d96e6b7e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/deepseek_janus_pro.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/deepseek_nextn.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/deepseek_nextn.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b12be1b4b109e0aa2fcd24105994ba247bab75e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/deepseek_nextn.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/deepseek_ocr.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/deepseek_ocr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..612819cec0f7a54f1a593a6fe90b62218f31eee1
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/deepseek_ocr.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/deepseek_v2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/deepseek_v2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb91d4c07156482bff1b4a4fc8bb05355714f5e6
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/deepseek_v2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/deepseek_vl2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/deepseek_vl2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35f8fcd7b28b85dfeadcd526eb9343e99757970f
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/deepseek_vl2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/dots_ocr.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/dots_ocr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26f503a728d24c35a75566dd9fde2dfe0b27e580
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/dots_ocr.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/dots_vlm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/dots_vlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a740c51fc17a10806f2da8093988aefd43b981bd
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/dots_vlm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/dots_vlm_vit.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/dots_vlm_vit.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..585166b754adf54910695fbcf8018cc2c46bba11
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/dots_vlm_vit.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/ernie4.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/ernie4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..572d83cadf0053b531f3b6bc0c8d73d667b25c42
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/ernie4.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/ernie45_moe_vl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/ernie45_moe_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c136ff4807db68c185e19a623eb3a832919c4579
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/ernie45_moe_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/ernie45_vl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/ernie45_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c37695ca2d60e1847c1e0ad403e2d9a5e7d759e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/ernie45_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/ernie4_eagle.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/ernie4_eagle.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbf7833aee20be80460797f6bce22fc661af066a
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/ernie4_eagle.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/exaone.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/exaone.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..487d4cf25d3a06e44343c1df734d1da547e8affe
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/exaone.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/exaone4.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/exaone4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6a56a82620113acab22dc110c4c96bea43a9ff6
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/exaone4.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/exaone_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/exaone_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93bdcb9d5ecbbfccd6c3788e22f6d8380c5036ef
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/exaone_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/exaone_moe_mtp.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/exaone_moe_mtp.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce463698546245ab8ebbd49d9bc5948ecfc8385e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/exaone_moe_mtp.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/falcon_h1.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/falcon_h1.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1fb73f556dba0dc2980107439a403a2ca67b43a6
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/falcon_h1.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gemma.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gemma.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ee4d4e89ecaa7fc10b91ec5ed6f7d3b93f1f84e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gemma.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gemma2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gemma2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd485fb5da9823fb5045327a072af90df84e4cf3
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gemma2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gemma2_reward.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gemma2_reward.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..668657b11388c84ae2374165ac6531a835c7b424
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gemma2_reward.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gemma3_causal.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gemma3_causal.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b852bbf3d5437d9df9ab7f5497c16c669a4af207
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gemma3_causal.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gemma3_mm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gemma3_mm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3dd606e4c6be28eef5bb03f43551fdc4c63a8c9c
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gemma3_mm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gemma3n_audio.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gemma3n_audio.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0d6c7474de9c8345f9b34b1c03b84d801258ead
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gemma3n_audio.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gemma3n_causal.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gemma3n_causal.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..120b6f887071215d359d537ba46b44953734dcba
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gemma3n_causal.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gemma3n_mm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gemma3n_mm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a21b48477094fe7a88a4a65203fbafc2d5e909d
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gemma3n_mm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/glm4.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/glm4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bb36f8a23507d3e3ea455d2895c426e24d29239
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/glm4.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/glm4_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/glm4_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bda8bc894aee716a16d607e569dffa3523064368
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/glm4_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/glm4_moe_lite.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/glm4_moe_lite.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8889cda748705b79245b52c62ee22b6196b743b3
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/glm4_moe_lite.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/glm4_moe_nextn.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/glm4_moe_nextn.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..39be0c13548e8764018a037838ff1a5a7538c0f5
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/glm4_moe_nextn.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/glm4v.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/glm4v.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c44f7a3299a1c2cb7e971961b5a30404447f91f
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/glm4v.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/glm4v_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/glm4v_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7211808306a628c72927a6ed8a76642e092b3623
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/glm4v_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/glm_ocr.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/glm_ocr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ab237e8eb0db1b9fd181f68145205f33e01a7c3
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/glm_ocr.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/glm_ocr_nextn.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/glm_ocr_nextn.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af7ab9892e4813bfb1841281a5725f11566e4c05
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/glm_ocr_nextn.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/glmasr.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/glmasr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29d1a6214f4168b7df75c73817758a8738a74d80
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/glmasr.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gpt2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gpt2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b51d10dc914134d499840f0aaaf1502e7d6d85d
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gpt2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gpt_bigcode.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gpt_bigcode.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93e15779f980f99f92a6c4e69243ded303632a25
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gpt_bigcode.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gpt_j.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gpt_j.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb0f81e369d8142db84a270d93a9ab1d43c48118
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gpt_j.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/gpt_oss.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/gpt_oss.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3813a368b5e19625a7c989a3a012840fdb189a07
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/gpt_oss.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/granite.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/granite.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0049c69532fe48eed07fb5f962ca125517486e57
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/granite.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/granitemoe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/granitemoe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8064d3d403a96ef8a2a51aa4d4765c8b20ee61b
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/granitemoe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/granitemoehybrid.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/granitemoehybrid.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..498604310b1ef705b2264d005c9112e9d0c6d85c
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/granitemoehybrid.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/grok.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/grok.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52eefe65c0fc1f0a33de4358f71305d990c08d1b
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/grok.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/hunyuan.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/hunyuan.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6048d4fb98e570c3b03127f8898ead14bfca0aee
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/hunyuan.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/idefics2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/idefics2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3aa886c075e1b80b0a579450412174cff85ec609
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/idefics2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/internlm2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/internlm2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04a7c15072a76520ac6eb452a252da08007a1815
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/internlm2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/internlm2_reward.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/internlm2_reward.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdce7d72b5375a7f08fe6ccdeba3d002d9e27456
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/internlm2_reward.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/interns1.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/interns1.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65258366895390f933d6cfac3d7e343af66ec363
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/interns1.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/interns1pro.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/interns1pro.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5f3f738bd1684cf9f7efb5204e34719f489a980
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/interns1pro.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/internvl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/internvl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4125cc4be3a61e259fd57dc74a2ccf44c608ac69
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/internvl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/iquest_loopcoder.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/iquest_loopcoder.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1932a950f464f617a07385387f5b4b0825e3354
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/iquest_loopcoder.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/jet_nemotron.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/jet_nemotron.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8b64414912cec74b3e8f784297171802f91cb0c
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/jet_nemotron.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/jet_vlm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/jet_vlm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b21fdc6e0fa3ac3241bb4e734de67dbdd01033ff
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/jet_vlm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/kimi_k25.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/kimi_k25.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c16ca8abdc1a0eb468b9a41c8016a8a4cb69a450
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/kimi_k25.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/kimi_linear.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/kimi_linear.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df5a470ecaf53b12d0a7111019cbc07de1521270
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/kimi_linear.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/kimi_vl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/kimi_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26f26c84df730ffbb0b9c7a549cb996ccea56fa8
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/kimi_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/kimi_vl_moonvit.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/kimi_vl_moonvit.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7db5ab9d63ad4dac3d35f0fa75af3df4cb4d629d
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/kimi_vl_moonvit.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/lfm2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/lfm2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e925bae2cb90b77211c3594b90b5a5163bb68f1c
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/lfm2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/lfm2_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/lfm2_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..275b7ff4b45cd382bf4c3c8303ff23ae9d022105
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/lfm2_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/lightonocr.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/lightonocr.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e652cc5322e0fda5ccc3cecf1bfe252db0653338
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/lightonocr.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llada2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llada2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4ff0dfd89c7378a546afe4397f0d4cce6cc08ea
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llada2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llama.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llama.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ada278122769fbf463c5b28f4211f9ae2412a9e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llama.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llama4.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llama4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad2c15a0c1ef06b441aa0c9e3a115ff516b76d7a
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llama4.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llama_classification.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llama_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..922b722ecb3f059ee5bf32ceb58f650e16595c05
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llama_classification.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llama_eagle.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llama_eagle.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15d7930189cf2d161b0fc137351d0467949887f2
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llama_eagle.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llama_eagle3.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llama_eagle3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..882851f557be4b70c634e3c8a0098baa03e0df93
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llama_eagle3.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llama_embedding.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llama_embedding.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e24c119ad6da5926b0e7ebec022d15ef4b29aec
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llama_embedding.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llama_reward.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llama_reward.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5047b470ce286b7431e14a8bed5c81af8789260
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llama_reward.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llava.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llava.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0bf3e6aa909956d8761b46f107ac076ceb45f360
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llava.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/llavavid.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/llavavid.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f8c87c5afad2ccc9925df7ed983e7e4041ff684
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/llavavid.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/longcat_flash.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/longcat_flash.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67e22c846f61a94428f386a4f0a7cbc379917c03
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/longcat_flash.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/longcat_flash_nextn.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/longcat_flash_nextn.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a6a218cab0eaee9003e36c3783e72bf1fab6af1
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/longcat_flash_nextn.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/midashenglm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/midashenglm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65be1e7876dee307840ec17a7decc430c3dd619d
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/midashenglm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mimo.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mimo.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c6ca477f46b57b59ddecc5eeea61e70b330cd38
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mimo.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mimo_mtp.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mimo_mtp.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65f8e1a52cc602f0c714a1a4e8e7bea922488481
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mimo_mtp.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mimo_v2_flash.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mimo_v2_flash.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ee2ba4431d2995b8839c2a896f30481378e80e3
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mimo_v2_flash.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mimo_v2_flash_nextn.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mimo_v2_flash_nextn.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c38b05c0f0747d1c95322689dbcbdfaf951fcae3
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mimo_v2_flash_nextn.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mindspore.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mindspore.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5fafed1865e7f898cd6b1d4a1cb6efd192d6bc3
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mindspore.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/minicpm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/minicpm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..706fc69883bc31abea4978744a7a4a09566353d5
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/minicpm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/minicpm3.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/minicpm3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1e9a7669d5739158057140f6029e302b4069fac
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/minicpm3.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/minicpmo.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/minicpmo.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4da30985384653cf98d2bdcd7c14d1fc577a3282
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/minicpmo.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/minicpmv.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/minicpmv.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c643757621af29e64c27cd2f1be293639ec1ef4
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/minicpmv.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/minimax_m2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/minimax_m2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..791663814bcb772e5d2d0452836bb93e4a9dc0cd
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/minimax_m2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/ministral3.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/ministral3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e052ee9660ed4542f85b1867fd140eb221c4ca4e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/ministral3.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mistral.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mistral.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9af4eb18b09cd95dfdfdd29da25ae1af6f060d80
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mistral.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mistral_large_3.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mistral_large_3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ed4cbb4de075bb5b5f869d9d1dda4bfbd894882
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mistral_large_3.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mistral_large_3_eagle.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mistral_large_3_eagle.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b840d7fd23a7603343c65b8c1292525e77db959d
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mistral_large_3_eagle.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mixtral.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mixtral.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe7d152b9cce32dccaccf445fd8b7ce0025bb21f
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mixtral.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mixtral_quant.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mixtral_quant.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5e0ca9971c6b90ec779de19c29906005f198768
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mixtral_quant.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mllama.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mllama.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..391273dd9bbf65886f2be032ee84f5c55fcc50d9
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mllama.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/mllama4.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/mllama4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10f49620ce6362335526a15d834c9a18d9fa0c3d
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/mllama4.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/nano_nemotron_vl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/nano_nemotron_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55f0e9b5a3c3fed21583d89c55f7b11c2a721e4b
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/nano_nemotron_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/nemotron_h.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/nemotron_h.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c888b2466a01c97a7b05606c43fcab6609c2a5f
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/nemotron_h.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/nemotron_h_mtp.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/nemotron_h_mtp.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff1c52e09e9234f24268f39b3eecd654c098de97
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/nemotron_h_mtp.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/nemotron_nas.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/nemotron_nas.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..555be0ebfe61765bea064cf1b88e89572d5081e1
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/nemotron_nas.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/nvila.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/nvila.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1628763aac25877d425327daf0f989e2dbbb4793
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/nvila.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/nvila_lite.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/nvila_lite.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c01ae85a7b0c16a032a82950a9b08ea2f30e5d85
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/nvila_lite.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/olmo.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/olmo.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db9436ed0c31d2a77811291d5d27a1b7ea0b3de9
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/olmo.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/olmo2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/olmo2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1e9ca7b6b660a17142e1c265d316f5905afc0fd
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/olmo2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/olmoe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/olmoe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea785c6b9362af29f80f57eb8b25452da25bd05f
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/olmoe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/opt.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/opt.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b2f5d61da8f02f3d724a9018b7a3e6bf2818a27
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/opt.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/orion.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/orion.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af01a5749565bc003b65dc5d44771d09b435fc56
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/orion.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/paddleocr_vl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/paddleocr_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf6c51134ca11ce47a6d8a6f7acfc6bd8e357d2c
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/paddleocr_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/persimmon.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/persimmon.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..216f6c183bd6ec15f3255a6c604456bf4a718433
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/persimmon.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/phi.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/phi.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2b31d3b4dfe0e0c3eb6328a4933d775f9e97ef6
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/phi.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/phi3_small.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/phi3_small.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74d9c1b37965692cedb686d4f8fd7c7e9cc44515
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/phi3_small.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/phi4mm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/phi4mm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3bbd19f61ab2ca686f849d210edd793ebd531287
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/phi4mm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/phi4mm_audio.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/phi4mm_audio.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..abd02db92fbec00fa68afdd516c1465f6008c577
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/phi4mm_audio.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/phi4mm_utils.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/phi4mm_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b56889b9bf992e3cdbe99c5efe07ff998bbd55f
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/phi4mm_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/phimoe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/phimoe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54d23d1de0413922f1b15ca38cdffa2e9911b753
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/phimoe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/pixtral.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/pixtral.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4b8e8cc29b6c4672c6ec62e49f61aab1c481d90
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/pixtral.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/points_v15_chat.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/points_v15_chat.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b31f561d78d1933dec1aaad069ef08bd87c33a9d
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/points_v15_chat.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f7c4e53a86ee4c3c2dfb7d4f3394994d9805ee1
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1f4770a6863c3c240885f03846019518721a955
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen2_5_vl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen2_5_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebf425d4c86df19489d375066fd9a7e486dad243
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen2_5_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen2_audio.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen2_audio.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74251d81954c65a45af69620fe3e26dc012b01e8
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen2_audio.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen2_classification.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen2_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28a286d8989b9ab6d730649791e372dbb2194574
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen2_classification.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen2_eagle.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen2_eagle.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a00079bb7f5063d679fde2bfc1c358dde1733b3b
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen2_eagle.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen2_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen2_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b681513a8cbfbc671d5a4d302a8b3dc9a3edfff0
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen2_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen2_rm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen2_rm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a39c874e9b41488bd18fb460725e4a38bd14fb67
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen2_rm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen2_vl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen2_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58e4250e3703094b68649f3e5a251aa846da0f3a
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen2_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75e10dd71b016434b9e53e54c3cc5615df73fcf7
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_5.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_5.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03cf23c4ba91bb85ac09c85d4c26e2376b756752
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_5.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_5_mtp.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_5_mtp.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70a2670da274a45acfc78804ad13f63fbadd69a0
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_5_mtp.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_classification.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..09e75255031ccba0b30bc487193d06a4bdc7039e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_classification.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e9f9573b7d1937988bfcf2c603ced499bf3ba7b
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_next.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_next.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1a9f64b5dbd09eaa0e28eb19a7db66e04d0170a
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_next.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_next_mtp.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_next_mtp.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..735b53f375d1b26c01f6760538caecaaecfcc910
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_next_mtp.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_omni_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_omni_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..46258dbaa1bb9cfc4478b92f4ac9778358ce208b
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_omni_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_rm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_rm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5ae03a0de6be55cafec92217bc27eab466d2ac6
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_rm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_vl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91a3e0574769ef3585b8a571028d30b357eac989
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/qwen3_vl_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/qwen3_vl_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c67ee7a722529eba60252853b731f4a445447411
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/qwen3_vl_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/radio.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/radio.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..922e8b84c48d8a97112d4f7c81a6b823671ca1b9
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/radio.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/registry.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/registry.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f7f2f51b5f4d14026434edee784bad7d5a68cb5
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/registry.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/roberta.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/roberta.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb040d3958e9dd2088eb72230062daa2c946672e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/roberta.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/sarashina2_vision.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/sarashina2_vision.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f89b64be597dad8d5e0e8b2c552292bcc775bd8f
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/sarashina2_vision.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/sarvam_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/sarvam_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bae3915d443af8032b928593a609229ab85c418
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/sarvam_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/sdar.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/sdar.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ab0cc6834860905353fb163b305cd2e93261224
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/sdar.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/sdar_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/sdar_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54720cc40f725c7ec3b06e39bd5fd89248d3f687
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/sdar_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/siglip.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/siglip.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1fb1c23ffd0bcce0ecb8e7a24ef62f1ff4a35062
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/siglip.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/solar.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/solar.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0eb5ca904a91e42f780b55246e9fdce17f2857d2
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/solar.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/stablelm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/stablelm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee51bbabb121ffd1a674721332f5c4451811914b
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/stablelm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/starcoder2.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/starcoder2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78bcb6a7377af4b751f79e09dfa37593e0ec4239
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/starcoder2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/step3_vl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/step3_vl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd47835f0fb8ffd55c6bd84da3de9c573011cf85
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/step3_vl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/step3_vl_10b.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/step3_vl_10b.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f99d2a55b644afa814c29d3b7b27cfb558b20adb
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/step3_vl_10b.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/step3p5.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/step3p5.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8a8658a9d7d306570206b8bc081d75c745b9f14
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/step3p5.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/step3p5_mtp.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/step3p5_mtp.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c29cb670c9391212e28a74e50aac561c26c4184
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/step3p5_mtp.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/teleflm.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/teleflm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..827f5cbf1c14a24ea3ab0d4e4aadd63dd7bce939
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/teleflm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/torch_native_llama.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/torch_native_llama.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc7deef8f1a7e3531448b452d9b4fef7d24510e3
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/torch_native_llama.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/transformers.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/transformers.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0061bdff23a0c9c8534edd572b0c5795fc77092a
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/transformers.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c74e605ff59bb10f1c555d4cb89b08a039dc81f
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/whisper.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/whisper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1f813510fd900a49c3ca5d17b7bafa0e01b177e
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/whisper.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/xverse.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/xverse.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27c93ee02b69bf5359022214d22c59ff9fbb1774
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/xverse.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/xverse_moe.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/xverse_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c78f36f88253b9467881de07031bf9c7ea8120d2
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/xverse_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/__pycache__/yivl.cpython-311.pyc b/sglang/python/sglang/srt/models/__pycache__/yivl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62fccd4914e1f067daa17eb6ccc70881a39ea958
Binary files /dev/null and b/sglang/python/sglang/srt/models/__pycache__/yivl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/__init__.py b/sglang/python/sglang/srt/models/deepseek_common/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/models/deepseek_common/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a13191ea40b7fcda4f065bd90ed2f820d524cea
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/__pycache__/attention_backend_handler.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/__pycache__/attention_backend_handler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e3c0ad0fd172a3c090c6c98795af053661a9bd7
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/__pycache__/attention_backend_handler.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/__pycache__/deepseek_weight_loader.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/__pycache__/deepseek_weight_loader.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5eed4ebcdca0e8027c8da6e9cc1430ac22ea47d9
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/__pycache__/deepseek_weight_loader.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f72421f664214064bdcbf57cd10a791bceabb7d0
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_backend_handler.py b/sglang/python/sglang/srt/models/deepseek_common/attention_backend_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfc1d4c97241e060286a9611aba2f48e4425d9d4
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_common/attention_backend_handler.py
@@ -0,0 +1,184 @@
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
+from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
+from sglang.srt.models.deepseek_common.attention_forward_methods.forward_methods import (
+ AttnForwardMethod,
+)
+from sglang.srt.models.deepseek_common.utils import _is_hip
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import use_intel_amx_backend
+
+# Attention backends able to run prefix + extended KV in one MHA kernel pass
+# (AttnForwardMethod.MHA_ONE_SHOT); see _support_mha_one_shot below.
+MHA_ONE_SHOT_SUPPORTED_BACKENDS = ["fa3", "flashinfer", "flashmla"]
+
+
+class AttentionBackendRegistry:
+    """Registry mapping an attention backend name to its dispatch handler.
+
+    Each handler takes ``(attn, forward_batch)`` and returns an
+    ``AttnForwardMethod`` that decides how attention is executed for the batch.
+    """
+
+    # backend name -> handler_func(attn, forward_batch) -> AttnForwardMethod
+    _handlers = {}
+
+    @classmethod
+    def register(cls, backend_name, handler_func):
+        # Registering the same name twice silently overwrites the old handler.
+        cls._handlers[backend_name] = handler_func
+
+    @classmethod
+    def get_handler(cls, backend_name):
+        # Unknown backend names fall back to the "triton" handler.
+        return cls._handlers.get(backend_name, cls._handlers.get("triton"))
+
+
+def _dispatch_mla_subtype(attn, forward_batch):
+    # Pick the concrete MLA variant for the current platform:
+    # - ROCm: fused-RoPE decode kernel when enabled and the batch is decode-only.
+    # - CPU with Intel AMX: fused-RoPE CPU path when the fused qkv_a projection
+    #   exists on the module.
+    # - Otherwise: plain absorbed MLA.
+    if _is_hip:
+        if attn.rocm_fused_decode_mla and forward_batch.forward_mode.is_decode():
+            return AttnForwardMethod.MLA_FUSED_ROPE_ROCM
+        else:
+            return AttnForwardMethod.MLA
+    else:
+        if hasattr(attn, "fused_qkv_a_proj_with_mqa") and use_intel_amx_backend(attn):
+            return AttnForwardMethod.MLA_FUSED_ROPE_CPU
+        else:
+            return AttnForwardMethod.MLA
+
+
+def handle_attention_ascend(attn, forward_batch):
+    """Dispatch for the Ascend NPU backend.
+
+    Sparse attention (DSA_NPU) is chosen whenever the layer has an NSA
+    ``indexer``; otherwise plain extend batches use MHA_NPU and everything
+    else uses MLA_NPU.
+    """
+    # "Plain" extend: a prefill batch that is not any speculative-decoding
+    # variant (target-verify / draft-extend / draft-extend-v2).
+    if (
+        forward_batch.forward_mode.is_extend()
+        and not forward_batch.forward_mode.is_target_verify()
+        and not forward_batch.forward_mode.is_draft_extend()
+        and not forward_batch.forward_mode.is_draft_extend_v2()
+    ):
+        if hasattr(attn, "indexer"):
+            return AttnForwardMethod.DSA_NPU
+        else:
+            return AttnForwardMethod.MHA_NPU
+    else:
+        if hasattr(attn, "indexer"):
+            return AttnForwardMethod.DSA_NPU
+        else:
+            return AttnForwardMethod.MLA_NPU
+
+
+def _get_sum_extend_prefix_lens(forward_batch):
+    # Total number of cached prefix tokens across the batch; treats an
+    # unpopulated CPU copy (None) as 0 so callers never see a TypeError.
+    return (
+        sum(forward_batch.extend_prefix_lens_cpu)
+        if forward_batch.extend_prefix_lens_cpu is not None
+        else 0
+    )
+
+
+def _support_mha_one_shot(attn, forward_batch, backend_name):
+    # One-shot MHA needs (a) a backend from MHA_ONE_SHOT_SUPPORTED_BACKENDS and
+    # (b) the whole batch's KV (prefix + extend) to fit within one chunk.
+    attn_supported = backend_name in MHA_ONE_SHOT_SUPPORTED_BACKENDS
+    sum_seq_lens = (
+        sum(forward_batch.seq_lens_cpu) if forward_batch.seq_lens_cpu is not None else 0
+    )
+    return attn_supported and sum_seq_lens <= forward_batch.get_max_chunk_capacity()
+
+
+def _handle_attention_backend(attn, forward_batch, backend_name):
+    """Shared dispatch for fa3 / flashinfer / flashmla / cutlass_mla.
+
+    Prefers MHA (one-shot or chunked-KV) for non-speculative extend batches
+    whose prefix is either empty or long enough to amortize chunking; falls
+    back to an MLA subtype otherwise.
+    """
+    # Piecewise CUDA graph capture always replays the MLA path.
+    if is_in_piecewise_cuda_graph():
+        return AttnForwardMethod.MLA
+
+    sum_extend_prefix_lens = _get_sum_extend_prefix_lens(forward_batch)
+    # flashinfer/flashmla expose a flag to disable the ragged (prefix==0) path.
+    disable_ragged = (
+        backend_name in ["flashinfer", "flashmla"]
+    ) and attn.flashinfer_mla_disable_ragged
+
+    if (
+        not disable_ragged
+        and forward_batch.forward_mode.is_extend_without_speculative()
+        and (
+            (
+                sum_extend_prefix_lens >= attn.chunked_prefix_cache_threshold
+                and not attn.disable_chunked_prefix_cache
+            )
+            or sum_extend_prefix_lens == 0
+        )
+    ):
+        # One-shot when everything fits in a single chunk, else chunked KV.
+        if _support_mha_one_shot(attn, forward_batch, backend_name):
+            return AttnForwardMethod.MHA_ONE_SHOT
+        return AttnForwardMethod.MHA_CHUNKED_KV
+    else:
+        return _dispatch_mla_subtype(attn, forward_batch)
+
+
+def handle_attention_flashinfer(attn, forward_batch):
+    """Dispatch for the flashinfer backend via the shared MHA/MLA policy."""
+    return _handle_attention_backend(attn, forward_batch, "flashinfer")
+
+
+def handle_attention_fa3(attn, forward_batch):
+    """Dispatch for the fa3 backend; forces MLA under deterministic inference."""
+    # when deterministic inference is enabled, use MLA
+    if get_global_server_args().enable_deterministic_inference:
+        return _dispatch_mla_subtype(attn, forward_batch)
+    else:
+        return _handle_attention_backend(attn, forward_batch, "fa3")
+
+
+def handle_attention_flashmla(attn, forward_batch):
+    """Dispatch for the flashmla backend via the shared MHA/MLA policy."""
+    return _handle_attention_backend(attn, forward_batch, "flashmla")
+
+
+def handle_attention_cutlass_mla(attn, forward_batch):
+    """Dispatch for the cutlass_mla backend via the shared MHA/MLA policy."""
+    return _handle_attention_backend(attn, forward_batch, "cutlass_mla")
+
+
+def handle_attention_fa4(attn, forward_batch):
+    """Dispatch for the fa4 backend; both arguments are currently unused."""
+    # TODO(cicirori): use FA4 MHA for DeepSeekV3 for now
+    return AttnForwardMethod.MHA_CHUNKED_KV
+
+
+def handle_attention_trtllm_mla(attn, forward_batch):
+    """Dispatch for the trtllm_mla backend.
+
+    Unlike the shared policy, there is no chunked-prefix length threshold:
+    any non-speculative extend batch uses chunked-KV MHA unless the chunked
+    prefix cache is disabled with a non-empty prefix.
+    """
+    if is_in_piecewise_cuda_graph():
+        return AttnForwardMethod.MLA
+
+    sum_extend_prefix_lens = _get_sum_extend_prefix_lens(forward_batch)
+    if forward_batch.forward_mode.is_extend_without_speculative() and (
+        not attn.disable_chunked_prefix_cache or sum_extend_prefix_lens == 0
+    ):
+        return AttnForwardMethod.MHA_CHUNKED_KV
+    else:
+        return _dispatch_mla_subtype(attn, forward_batch)
+
+
+def handle_attention_aiter(attn, forward_batch):
+    """Dispatch for the ROCm aiter backend: MHA on plain extend, MLA otherwise."""
+    if forward_batch.forward_mode.is_extend_without_speculative():
+        return AttnForwardMethod.MHA
+    else:
+        return AttnForwardMethod.MLA
+
+
+def handle_attention_nsa(attn, forward_batch):
+    """
+    Dispatch for native sparse attention (NSA).
+
+    The actual decision is centralized in
+    NativeSparseAttnBackend.set_nsa_prefill_impl and executed in
+    init_forward_metadata; this function only reads the result from
+    ``backend.use_mha``.
+    """
+
+    backend = forward_batch.attn_backend
+    if isinstance(backend, TboAttnBackend):  # with TBO enabled, unwrap to the primary backend
+        backend = backend.primary
+    if hasattr(backend, "use_mha") and backend.use_mha:
+        return AttnForwardMethod.MHA_ONE_SHOT
+    return AttnForwardMethod.MLA
+
+
+def handle_attention_triton(attn, forward_batch):
+ if is_in_piecewise_cuda_graph():
+ return AttnForwardMethod.MLA
+
+ # when deterministic inference is enabled, use MLA
+ if get_global_server_args().enable_deterministic_inference:
+ return _dispatch_mla_subtype(attn, forward_batch)
+
+ if (
+ forward_batch.forward_mode.is_extend_without_speculative()
+ and sum(forward_batch.extend_prefix_lens_cpu) == 0
+ ):
+ return AttnForwardMethod.MHA
+ else:
+ return _dispatch_mla_subtype(attn, forward_batch)
+
+
+AttentionBackendRegistry.register("ascend", handle_attention_ascend)
+AttentionBackendRegistry.register("flashinfer", handle_attention_flashinfer)
+AttentionBackendRegistry.register("fa3", handle_attention_fa3)
+AttentionBackendRegistry.register("flashmla", handle_attention_flashmla)
+AttentionBackendRegistry.register("cutlass_mla", handle_attention_cutlass_mla)
+AttentionBackendRegistry.register("fa4", handle_attention_fa4)
+AttentionBackendRegistry.register("trtllm_mla", handle_attention_trtllm_mla)
+AttentionBackendRegistry.register("aiter", handle_attention_aiter)
+AttentionBackendRegistry.register("nsa", handle_attention_nsa)
+AttentionBackendRegistry.register("triton", handle_attention_triton)
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__init__.py b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a9508fe22a569bf8fac3b298be02017daff01a8
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__init__.py
@@ -0,0 +1,13 @@
+# Re-export the forward-method enum and the per-method mixin classes so that
+# callers can import them from the package root.
+from .forward_methods import AttnForwardMethod
+from .forward_mha import DeepseekMHAForwardMixin
+from .forward_mla import DeepseekMLAForwardMixin
+from .forward_mla_fused_rope_cpu import DeepseekMLACpuForwardMixin
+from .forward_mla_fused_rope_rocm import DeepseekMLARocmForwardMixin
+
+__all__ = [
+    "AttnForwardMethod",
+    "DeepseekMHAForwardMixin",
+    "DeepseekMLACpuForwardMixin",
+    "DeepseekMLAForwardMixin",
+    "DeepseekMLARocmForwardMixin",
+]
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ce3f926619917e8f7b2440b62e1f9f0addedcbe
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_methods.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_methods.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23ab17741982abb6d1b9a86d3d749855c506688a
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_methods.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mha.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mha.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4dddc3cf73acf0c67d98e2390ef07e55be15b111
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mha.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mla.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mla.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0808fb4b8764ea54404bc2835b2aab34e57323b9
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mla.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mla_fused_rope_cpu.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mla_fused_rope_cpu.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68129d16cde8ac6ca0de937d70161f920b9d6de3
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mla_fused_rope_cpu.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mla_fused_rope_rocm.cpython-311.pyc b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mla_fused_rope_rocm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85d7f5dd7efc43a13f84fd5c88338189a7a00843
Binary files /dev/null and b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/__pycache__/forward_mla_fused_rope_rocm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_methods.py b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..c234a6b38c121908263410a06e487299100adac2
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_methods.py
@@ -0,0 +1,32 @@
+from enum import IntEnum, auto
+
+
+class AttnForwardMethod(IntEnum):
+    """How the DeepSeek attention layer executes for a given forward batch."""
+
+    # Use multi-head attention
+    MHA = auto()
+
+    # Use absorbed multi-latent attention
+    MLA = auto()
+
+    # Use multi-head attention, but with KV cache chunked.
+    # This method can avoid OOM when prefix lengths are long.
+    MHA_CHUNKED_KV = auto()
+
+    # Use multi-head attention, execute the MHA for prefix and extended kv in a single kernel
+    # when the sequence lengths are below the threshold.
+    MHA_ONE_SHOT = auto()
+
+    # Use MLA but with fused RoPE (ROCm decode kernel)
+    MLA_FUSED_ROPE_ROCM = auto()
+
+    # Use MLA with fused RoPE kernel for CPU
+    MLA_FUSED_ROPE_CPU = auto()
+
+    # Use multi-head attention for NPU
+    MHA_NPU = auto()
+
+    # Use absorbed multi-latent attention for NPU
+    MLA_NPU = auto()
+
+    # Use Deepseek V3.2 sparse multi-latent attention for NPU
+    DSA_NPU = auto()
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mha.py b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mha.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c83904f87a5f940c9681cbaaff67509b0db3375
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mha.py
@@ -0,0 +1,524 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.attention.nsa.dequant_k_cache import dequantize_k_cache_paged
+from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
+from sglang.srt.layers.attention.utils import concat_and_cast_mha_k_triton
+from sglang.srt.layers.communicator import get_attn_tp_context
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.deepseek_common.utils import (
+ _is_cuda,
+ _is_hip,
+ _is_npu,
+ _use_aiter_gfx95,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import BumpAllocator, get_bool_env_var, next_power_of_2
+
+# FP8 prefill attention is used only on aiter/gfx95 and can be turned off via
+# the SGLANG_AITER_FP8_PREFILL_ATTN env var (defaults to enabled).
+_use_fp8_prefill_attn = (
+    get_bool_env_var("SGLANG_AITER_FP8_PREFILL_ATTN", "True") and _use_aiter_gfx95
+)
+
+if TYPE_CHECKING:
+ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+
+if _is_cuda:
+ from sgl_kernel import concat_mla_k, merge_state_v2
+
+if _use_aiter_gfx95:
+ from aiter.ops.triton.fused_fp8_quant import fused_rms_fp8_group_quant
+
+ from sglang.srt.layers.quantization.fp8_kernel import fp8_dtype
+ from sglang.srt.layers.quantization.rocm_mxfp4_utils import fused_rms_mxfp4_quant
+
+# Configs for DeepSeek-V3:
+# num_local_heads = 128
+# qk_nope_head_dim = 128
+# qk_rope_head_dim = 64
+# qk_head_dim = qk_nope_head_dim + qk_rope_head_dim = 192
+# v_head_dim = 128
+
+# Configs for kv chunking strategy:
+# sum_prefix_length:
+# Total number of tokens to be fetched from kv cache for current batch.
+# e.g: For batch with 2 sequences, seq_lens_kv = [1024, 2048], seq_lens_q = [512, 1024], then sum_prefix_length = (1024 - 512) + (2048 - 1024) = 1536
+# sum_extended_length:
+# Total number of tokens in the extended part of the current batch. (=sum(seq_lens_q))
+# chunked_prefix_cache_threshold:
+# The minimum sum_prefix_length to enable mha with kv chunking, 8192 by default (can be changed with SGLANG_CHUNKED_PREFIX_CACHE_THRESHOLD)
+# For batches with smaller sum_prefix_length > 0, MLA kernel with absorption will be used instead.
+# max_kv_chunk_capacity:
+# The maximum number of tokens in each kv chunk, 128 * 1024 by default (can be get with forward_batch.get_max_chunk_capacity())
+
+# The forward methods for MHA in DeepSeek models:
+#
+# 1. forward_normal: AttnForwardMethod.MHA
+# use multi-head attention with empty kv cache (the first batch of chunked prefill, prefix lens = 0)
+# q: [sum_extended_length, num_local_heads, qk_head_dim]
+# k: [sum_extended_length, num_local_heads, qk_head_dim]
+# v: [sum_extended_length, num_local_heads, v_head_dim]
+#
+# 2. forward_normal_one_shot: AttnForwardMethod.MHA_ONE_SHOT
+# use multi-head attention with short kv prefix length (chunked_prefix_cache_threshold <= sum_prefix_lens <= max_kv_chunk_capacity)
+# the kv latent vectors are fetched from memory pool, with combined kv_indices of prefix part and extended part
+# q: [batch_size, num_local_heads, qk_head_dim]
+# k: [sum_extended_length + sum_prefix_length, num_local_heads, qk_head_dim]
+# v: [sum_extended_length + sum_prefix_length, num_local_heads, v_head_dim]
+#
+# 3. forward_normal_chunked_kv: AttnForwardMethod.MHA_CHUNKED_KV
+# multiple phases of multi-head attention with chunked kv cache (sum_prefix_length > max_kv_chunk_capacity)
+# For the first phase, it will execute normal forward method, and returns output o_1 and lse_1,
+# q_1: [sum_extended_length, num_local_heads, qk_head_dim],
+# k_1: [sum_extended_length, num_local_heads, qk_head_dim],
+# v_1: [sum_extended_length, num_local_heads, qk_head_dim],
+# acc_o_1, acc_lse_1 = o_1, lse_1
+# For i in range(2, n), (n-1 is the number of prefix chunks), kv latent vectors are fetched from memory pool with prefix kv indices
+# q_i: [sum_extended_length, num_local_heads, qk_head_dim],
+# k_i: [chunk_size, num_local_heads, qk_head_dim],
+# v_i: [chunk_size, num_local_heads, v_head_dim],
+# acc_o_i, acc_lse_i = merge_state(acc_o_{i-1}, acc_lse_{i-1}, o_i, lse_i)
+# The final output is the accumulated output acc_o_n
+
+
+class DeepseekMHAForwardMixin:
+
+    def init_mha_forward(self: DeepseekV2AttentionMLA):
+        """Cache MHA-related server settings on the attention module.
+
+        Reads global server args / env once so per-forward dispatch does not
+        re-fetch them.
+        """
+        self.disable_chunked_prefix_cache = (
+            get_global_server_args().disable_chunked_prefix_cache
+        )
+
+        # TODO: Design a finer way to determine the threshold
+        self.chunked_prefix_cache_threshold = (
+            envs.SGLANG_CHUNKED_PREFIX_CACHE_THRESHOLD.get()
+        )
+
+    def forward_normal_prepare(
+        self: DeepseekV2AttentionMLA,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ):
+        """Project hidden states into per-head q/k/v tensors for normal MHA.
+
+        Computes q through the (optionally LoRA-compressed) q projection,
+        applies RoPE, writes the latent KV into the MLA KV buffer, then
+        expands the latent KV through kv_b_proj into per-head k and v. The
+        gfx95/aiter branches fuse RMSNorm with FP8/MXFP4 quantization.
+
+        Returns:
+            Tuple ``(q, k, v, forward_batch)`` ready for ``attn_mha``.
+        """
+        if self.q_lora_rank is not None:
+            q, latent_cache = (
+                get_attn_tp_context()
+                .fetch_qkv_latent()
+                .split(
+                    [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
+                    dim=-1,
+                )
+            )
+
+            # NSA Indexer: cache quantized keys, auto-skip topk for sequences <= nsa_index_topk
+
+            if self.use_nsa:
+                # NSA requires unquantized q_lora for the indexer. When q_b_proj is FP8
+                # on gfx95, we can still use fused RMSNorm+FP8 quant, but MUST request
+                # the unquantized output for q_lora; otherwise q_lora becomes the (fp8,scale)
+                # tuple.
+                if (
+                    _use_aiter_gfx95
+                    and self.q_b_proj.weight.dtype == torch.float8_e4m3fn
+                ):
+                    q_quanted, q_lora, _, _ = fused_rms_fp8_group_quant(
+                        q,
+                        self.q_a_layernorm.weight,
+                        self.q_a_layernorm.variance_epsilon,
+                        None,
+                        None,
+                        None,
+                        group_size=128,
+                        dtype_quant=torch.float8_e4m3fn,
+                        res1=None,
+                        output_unquantized_inp1=True,
+                    )
+                    q = self.q_b_proj(q_quanted)[0].view(
+                        -1, self.num_local_heads, self.qk_head_dim
+                    )
+                else:
+                    q_lora = self.q_a_layernorm(q)
+                    q = self.q_b_proj(q_lora)[0].view(
+                        -1, self.num_local_heads, self.qk_head_dim
+                    )
+                # Indexer is run for its KV-cache side effect only; top-k
+                # indices are not needed here (return_indices=False).
+                _ = self.indexer(
+                    x=hidden_states,
+                    q_lora=q_lora,
+                    positions=positions,
+                    forward_batch=forward_batch,
+                    layer_id=self.layer_id,
+                    return_indices=False,
+                )
+            elif _use_aiter_gfx95 and self.q_b_proj.weight.dtype == torch.uint8:
+                # MXFP4: fused RMSNorm + quant
+                q, _, _, _ = fused_rms_mxfp4_quant(
+                    q,
+                    self.q_a_layernorm.weight,
+                    self.q_a_layernorm.variance_epsilon,
+                    None,
+                    None,
+                    None,
+                )
+                q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+            elif _use_aiter_gfx95 and self.q_b_proj.weight.dtype == torch.float8_e4m3fn:
+                # FP8 weights without NSA: quantized output only is needed.
+                q, _, _, _ = fused_rms_fp8_group_quant(
+                    q,
+                    self.q_a_layernorm.weight,
+                    self.q_a_layernorm.variance_epsilon,
+                    None,
+                    None,
+                    None,
+                    group_size=128,
+                    dtype_quant=torch.float8_e4m3fn,
+                    res1=None,
+                    output_unquantized_inp1=False,
+                )
+                q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+            else:
+                q = self.q_a_layernorm(q)
+                q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+
+        else:
+            # No q LoRA: full-rank q projection straight from hidden states.
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+            latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+
+        _, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        latent_cache = latent_cache.unsqueeze(1)
+
+        if _use_aiter_gfx95 and self.kv_b_proj.weight.dtype == torch.float8_e4m3fn:
+            # Fused RMSNorm + FP8 quant for kv_a; keeps both quantized and
+            # unquantized copies (the latter is written to the KV buffer).
+            kv_a_quanted, kv_a, _, _ = fused_rms_fp8_group_quant(
+                kv_a,
+                self.kv_a_layernorm.weight,
+                self.kv_a_layernorm.variance_epsilon,
+                None,
+                None,
+                None,
+                group_size=128,
+                dtype_quant=torch.float8_e4m3fn,
+                res1=None,
+                output_unquantized_inp1=True,  # return unquantized kv_a
+            )
+
+        else:
+            kv_a = self.kv_a_layernorm(kv_a)
+
+        k_pe = latent_cache[:, :, self.kv_lora_rank :]
+        if self.rotary_emb is not None:
+            q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+        q[..., self.qk_nope_head_dim :] = q_pe
+
+        self._set_mla_kv_buffer(latent_cache, kv_a, k_pe, forward_batch)
+        if (
+            forward_batch.mha_one_shot
+            and sum(forward_batch.extend_prefix_lens_cpu) != 0
+        ):
+            if (
+                self.use_nsa
+                and self.kv_cache_dtype == "fp8_e4m3"
+                and (
+                    not get_global_server_args().nsa_decode_backend == "trtllm"
+                    or not get_global_server_args().nsa_prefill_backend == "trtllm"
+                )
+            ):
+                # FP8 path: dequantize NSA-specific FP8 format to BF16
+                kv_a, k_pe = self._get_mla_kv_buffer_from_fp8_for_nsa(forward_batch)
+            else:
+                # BF16/FP16 path: directly fetch from cache
+                kv_a, k_pe = self._get_mla_kv_buffer(
+                    forward_batch.fetch_mha_one_shot_kv_indices(),
+                    q.dtype,
+                    forward_batch,
+                )
+        if _use_fp8_prefill_attn and self.kv_b_proj.weight.dtype == torch.uint8:
+            # MXFP4 weights + FP8 prefill: fuse GEMM, nope/v split, and k_pe cat
+            # into a single kernel (fused_gemm_afp4wfp4_split_cat) that writes k and v
+            # directly in FP8, avoiding a separate elementwise cast
+            k, v = self.kv_b_proj(
+                (
+                    kv_a,
+                    k_pe.expand(-1, self.num_local_heads, -1),
+                    self.qk_nope_head_dim,
+                    self.v_head_dim,
+                    fp8_dtype,
+                )
+            )[0]
+        else:
+            if _use_aiter_gfx95 and self.kv_b_proj.weight.dtype == torch.float8_e4m3fn:
+                kv = self.kv_b_proj(kv_a_quanted)[0]
+            else:
+                kv = self.kv_b_proj(kv_a)[0]
+            kv = kv.view(
+                -1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim
+            )
+            k_nope = kv[..., : self.qk_nope_head_dim]
+            v = kv[..., self.qk_nope_head_dim :]
+
+            k = self._concat_and_cast_mha_k(k_nope, k_pe, forward_batch)
+        return q, k, v, forward_batch
+
+    def forward_normal_core(
+        self: DeepseekV2AttentionMLA,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        """Run plain MHA on prepared q/k/v and apply the output projection.
+
+        ``save_kv_cache=False`` because the latent KV was already written to
+        the MLA buffer during the prepare step.
+        """
+        attn_output = self.attn_mha(q, k, v, forward_batch, save_kv_cache=False)
+        attn_output = attn_output.reshape(-1, self.num_local_heads * self.v_head_dim)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward_normal_chunked_kv_prepare(
+        self: DeepseekV2AttentionMLA,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ) -> tuple:
+        """Prepare q/k/v for chunked-KV MHA; delegates to forward_normal_prepare."""
+        # In normal mha, the k and v tensors will become overly large when the prefix length is long.
+        # To avoid this, we split the kv cache into chunks and process them one after another.
+        # Since mha is compute friendly, the for loop induced here will not introduce significant overhead.
+        # The top comments in https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/mla/common.py
+        # will be helpful for understanding the purpose of this function.
+
+        # First do normal mha forward to get output for extended part
+        return self.forward_normal_prepare(
+            positions, hidden_states, forward_batch, zero_allocator
+        )
+
def forward_normal_chunked_kv_core(
    self: DeepseekV2AttentionMLA,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    forward_batch: ForwardBatch,
) -> torch.Tensor:
    """MHA core that folds the cached prefix in chunk by chunk.

    First attends the extended (new) tokens without the prefix; if any
    sequence has a non-empty prefix, the prefix kv cache is then merged in
    chunk by chunk via ``_chunked_prefix_attn_mha``.
    """
    has_extend_prefix = forward_batch.extend_prefix_lens_cpu is not None and any(
        forward_batch.extend_prefix_lens_cpu
    )
    # Only initialize the info once
    if has_extend_prefix and forward_batch.num_prefix_chunks is None:
        forward_batch.prepare_chunked_prefix_cache_info(q.device)
        if hasattr(forward_batch.attn_backend, "init_mha_chunk_metadata"):
            forward_batch.attn_backend.init_mha_chunk_metadata(forward_batch)

    # LSE is only needed when partial results must later be merged with the
    # prefix chunks.
    forward_batch.mha_return_lse = has_extend_prefix
    # Do mha for extended part without prefix
    forward_batch.set_attn_attend_prefix_cache(False)
    attn_output = self.attn_mha(q, k, v, forward_batch, save_kv_cache=False)

    # Do mha attention with chunked prefix cache if there are any sequence with prefix
    if has_extend_prefix:
        attn_output, lse = attn_output
        forward_batch.set_attn_attend_prefix_cache(True)
        attn_output = self._chunked_prefix_attn_mha(
            q=q,
            accum_output=attn_output,
            accum_lse=lse,
            forward_batch=forward_batch,
        )

    attn_output = attn_output.reshape(-1, self.num_local_heads * self.v_head_dim)
    output, _ = self.o_proj(attn_output)
    return output
+
def forward_normal_one_shot_prepare(
    self: DeepseekV2AttentionMLA,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    forward_batch: ForwardBatch,
    zero_allocator: BumpAllocator,
):
    """One-shot MHA prepare: mark the batch so prepare fetches the whole kv
    sequence (prefix + extend) at once, then delegate to the normal prepare.
    """
    forward_batch.mha_one_shot = True
    return self.forward_normal_prepare(
        positions, hidden_states, forward_batch, zero_allocator
    )
+
def forward_normal_one_shot_core(
    self: DeepseekV2AttentionMLA,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    forward_batch: ForwardBatch,
) -> torch.Tensor:
    """One-shot MHA core: the full kv was fetched up front, so no chunked
    prefix pass is needed and no LSE has to be returned.
    """
    # NOTE(review): unlike forward_normal_chunked_kv_core this assumes
    # extend_prefix_lens_cpu is not None on this path — confirm one-shot
    # callers always populate it.
    prefix_present = any(forward_batch.extend_prefix_lens_cpu)
    # Initialize the chunk info only once per batch.
    if prefix_present and forward_batch.num_prefix_chunks is None:
        forward_batch.num_prefix_chunks = 0
        backend = forward_batch.attn_backend
        if hasattr(backend, "init_mha_chunk_metadata"):
            backend.init_mha_chunk_metadata(forward_batch)
    forward_batch.mha_return_lse = False
    # The extend part already covers the whole sequence, so do not attend the
    # prefix cache separately.
    forward_batch.set_attn_attend_prefix_cache(False)
    return self.forward_normal_core(q, k, v, forward_batch)
+
def _chunked_prefix_attn_mha(
    self: DeepseekV2AttentionMLA,
    q: torch.Tensor,
    accum_output: torch.Tensor,
    accum_lse: torch.Tensor,
    forward_batch: ForwardBatch,
) -> torch.Tensor:
    """Fold the cached prefix into the attention result, chunk by chunk.

    For each prefix chunk: fetch the latent kv from the pool, expand it
    through kv_b_proj into per-head k/v, attend against ``q``, and merge the
    chunk's (output, lse) pair into the running accumulator.
    """

    assert forward_batch.num_prefix_chunks is not None
    for i in range(forward_batch.num_prefix_chunks):
        forward_batch.set_prefix_chunk_idx(i)

        kv_indices = forward_batch.prefix_chunk_kv_indices[i]
        # Fetch latent cache from memory pool with precomputed chunked kv indices
        kv_a_normed, k_pe = self._get_mla_kv_buffer(
            kv_indices, q.dtype, forward_batch
        )
        kv = self.kv_b_proj(kv_a_normed)[0]
        kv = kv.view(
            -1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim
        )
        v = kv[..., self.qk_nope_head_dim :]
        k_nope = kv[..., : self.qk_nope_head_dim]

        # Assemble the full key [k_nope | k_pe] per head.
        k = torch.empty(
            (
                k_nope.shape[0],
                self.num_local_heads,
                self.qk_nope_head_dim + self.qk_rope_head_dim,
            ),
            dtype=v.dtype,
            device=v.device,
        )
        k[..., : self.qk_nope_head_dim] = k_nope
        k[..., self.qk_nope_head_dim :] = k_pe

        output, lse = self.attn_mha(q, k, v, forward_batch, save_kv_cache=False)
        # Numerically-stable merge of this chunk's result into the accumulator.
        tmp_output = torch.empty_like(accum_output)
        tmp_lse = torch.empty_like(accum_lse)
        merge_state_v2(output, lse, accum_output, accum_lse, tmp_output, tmp_lse)
        accum_output, accum_lse = tmp_output, tmp_lse
        # Drop chunk-sized intermediates eagerly to bound peak memory.
        del kv, k, v, output, lse, tmp_output, tmp_lse

    return accum_output
+
def _set_mla_kv_buffer(
    self: DeepseekV2AttentionMLA,
    latent_cache: torch.Tensor,
    kv_a: torch.Tensor,
    k_pe: torch.Tensor,
    forward_batch: ForwardBatch,
):
    """Write the MLA latent kv (kv_a + k_pe) for the current tokens into the
    kv pool at ``forward_batch.out_cache_loc``.

    ``latent_cache`` is a scratch tensor wide enough for both parts; it is
    only used (and mutated) on the generic fallback path.
    """
    if _is_cuda or _use_aiter_gfx95:
        # Save latent cache
        forward_batch.token_to_kv_pool.set_mla_kv_buffer(
            self.attn_mha, forward_batch.out_cache_loc, kv_a.unsqueeze(1), k_pe
        )
    elif _is_npu:
        # To reduce a time-costing split operation
        forward_batch.token_to_kv_pool.set_kv_buffer(
            self.attn_mha, forward_batch.out_cache_loc, kv_a.unsqueeze(1), k_pe
        )
    else:
        # Generic path: pack kv_a (first kv_lora_rank cols) and k_pe (rest)
        # into the scratch buffer, then store it as one contiguous entry.
        latent_cache[:, :, : self.kv_lora_rank] = kv_a.unsqueeze(1)
        latent_cache[:, :, self.kv_lora_rank :] = k_pe

        # Save latent cache
        forward_batch.token_to_kv_pool.set_kv_buffer(
            self.attn_mha, forward_batch.out_cache_loc, latent_cache, None
        )
+
def _get_mla_kv_buffer(
    self: DeepseekV2AttentionMLA,
    kv_indices: torch.Tensor,
    dst_dtype: torch.dtype,
    forward_batch: ForwardBatch,
):
    """Fetch (kv_a, k_pe) for the given kv indices from the pool, converted
    to ``dst_dtype``.
    """
    pool = forward_batch.token_to_kv_pool
    if _is_cuda or _use_aiter_gfx95:
        # Fast path: the pool splits and casts in one fused fetch.
        kv_a, k_pe = pool.get_mla_kv_buffer(self.attn_mha, kv_indices, dst_dtype)
        return kv_a.squeeze(1), k_pe
    # Generic path: gather the latent rows from the key buffer, cast, then
    # split into the lora part and the rope part.
    latent = (
        pool.get_key_buffer(self.attn_mha.layer_id)[kv_indices]
        .contiguous()
        .to(dst_dtype)
    )
    kv_a, k_pe = latent.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
    return kv_a.squeeze(1).contiguous(), k_pe
+
def _get_mla_kv_buffer_from_fp8_for_nsa(
    self: DeepseekV2AttentionMLA,
    forward_batch: ForwardBatch,
):
    """
    Dequantize FP8 KV cache to BF16 for MLA attention (NSA-specific format).

    Returns: (kv_a, k_pe) both in BF16
    """
    backend = forward_batch.attn_backend
    if isinstance(backend, TboAttnBackend):  # if enable tbo, get primary backend
        backend = backend.primary
    kv_indices = backend.forward_metadata.page_table_1_flattened
    assert (
        kv_indices is not None
    ), "page_table_1_flattened should have been generated for FP8 MHA path"

    kv_cache_fp8 = forward_batch.token_to_kv_pool.get_key_buffer(
        self.attn_mha.layer_id
    )

    # Paged dequantization: gathers the indexed pages and converts to BF16.
    kv_latent_bf16 = dequantize_k_cache_paged(kv_cache_fp8, kv_indices)

    # Split the latent row into the lora part (kv_a) and the rope part (k_pe).
    kv_a = kv_latent_bf16[:, :, : self.kv_lora_rank].squeeze(1).contiguous()
    k_pe = kv_latent_bf16[:, :, self.kv_lora_rank :]

    return kv_a, k_pe
+
def _concat_and_cast_mha_k(
    self: DeepseekV2AttentionMLA,
    k_nope: torch.Tensor,
    k_pe: torch.Tensor,
    forward_batch: ForwardBatch,
):
    """Build the full MHA key [k_nope | k_pe] per head, casting if needed.

    Dispatches to fused concat kernels when the head geometry allows it,
    otherwise falls back to plain slice assignment.
    """
    # Temporary for DeepSeek V3/R1 only, but can generalize if needed
    k_shape = (k_nope.shape[0], self.num_local_heads, self.qk_head_dim)
    if (
        _is_cuda
        and (self.num_local_heads == 128)
        and (self.qk_nope_head_dim == 128)
        and (self.qk_rope_head_dim == 64)
    ):
        # Specialized kernel for the exact DeepSeek V3/R1 geometry.
        k = k_nope.new_empty(*k_shape)
        concat_mla_k(k=k, k_nope=k_nope, k_rope=k_pe)
    elif (
        _is_cuda
        and next_power_of_2(self.num_local_heads) == self.num_local_heads
        and next_power_of_2(self.qk_nope_head_dim) == self.qk_nope_head_dim
        and next_power_of_2(self.qk_rope_head_dim) == self.qk_rope_head_dim
    ):
        # Generic Triton concat+cast kernel; requires power-of-two dims.
        # fa3 mha support fp8 inputs
        if (
            self.current_attention_backend == "fa3"
            and self.kv_cache_dtype != "auto"
        ):
            attn_dtype = forward_batch.token_to_kv_pool.dtype
        else:
            attn_dtype = k_nope.dtype
        k = k_nope.new_empty(*k_shape, dtype=attn_dtype)
        concat_and_cast_mha_k_triton(k, k_nope, k_pe)
    elif _is_hip and self.current_attention_backend == "aiter":
        k = k_nope.new_empty(*k_shape)
        concat_and_cast_mha_k_triton(k, k_nope, k_pe)
    else:
        # Fallback: plain slice writes into a freshly allocated key tensor.
        k = k_nope.new_empty(*k_shape)
        k[..., : self.qk_nope_head_dim] = k_nope
        k[..., self.qk_nope_head_dim :] = k_pe
    return k
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mla.py b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mla.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eca43ce7439bccd6cb098c5bf62f7e8512afad1
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mla.py
@@ -0,0 +1,518 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.attention.nsa.utils import nsa_use_prefill_cp
+from sglang.srt.layers.communicator import get_attn_tp_context
+from sglang.srt.layers.quantization.fp8_kernel import (
+ fp8_dtype,
+ per_tensor_quant_mla_fp8,
+ per_token_group_quant_mla_deep_gemm_masked_fp8,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.deepseek_common.utils import (
+ FORWARD_ABSORB_CORE_ATTENTION_BACKENDS,
+ _is_cublas_ge_129,
+ _is_cuda,
+ _is_gfx95_supported,
+ _is_hip,
+ _use_aiter,
+ _use_aiter_gfx95,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import BumpAllocator
+
+if TYPE_CHECKING:
+ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+
+if _is_cuda:
+ from sgl_kernel import bmm_fp8 as _raw_bmm_fp8
+
+ from sglang.srt.utils.custom_op import register_custom_op
+
+ # TODO(yuwei): remove this wrapper after sgl-kernel registers its own fake/meta impl
+ # Wrap bmm_fp8 as a custom op so torch.compile does not trace into
+ # torch.cuda.current_blas_handle() (which returns a non-Tensor).
@register_custom_op(mutates_args=["out"])
def _bmm_fp8_op(
    A: torch.Tensor,
    B: torch.Tensor,
    out: torch.Tensor,
    A_scale: torch.Tensor,
    B_scale: torch.Tensor,
) -> None:
    # Custom-op shim that writes the fp8 batched matmul result into ``out``,
    # keeping the op surface Tensor-only so torch.compile does not trace into
    # torch.cuda.current_blas_handle() inside sgl_kernel.bmm_fp8 (see the
    # module-level TODO above).
    _raw_bmm_fp8(A, B, A_scale, B_scale, out.dtype, out)

def bmm_fp8(A, B, A_scale, B_scale, dtype, out=None):
    # Drop-in replacement for sgl_kernel.bmm_fp8: allocates ``out`` with the
    # batched-matmul shape (A @ B) when the caller does not supply one, then
    # dispatches to the custom op above.
    if out is None:
        out = torch.empty(
            (A.shape[0], A.shape[1], B.shape[2]),
            device=A.device,
            dtype=dtype,
        )
    _bmm_fp8_op(A, B, out, A_scale, B_scale)
    return out
+
+
+if _use_aiter:
+ from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import (
+ batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant,
+ )
+if _use_aiter_gfx95:
+ from aiter.ops.triton.fused_fp8_quant import (
+ fused_flatten_fp8_group_quant,
+ fused_rms_fp8_group_quant,
+ )
+
+ from sglang.srt.layers.quantization.rocm_mxfp4_utils import (
+ batched_gemm_afp4wfp4_pre_quant,
+ fused_flatten_mxfp4_quant,
+ fused_rms_mxfp4_quant,
+ )
+ from sglang.srt.layers.rocm_linear_utils import fused_qk_rope_cat_and_cache_mla
+
+
+class DeepseekMLAForwardMixin:
+
def init_mla_forward(self: DeepseekV2AttentionMLA):
    # Cache the server flag that disables the ragged prefill path of the
    # flashinfer MLA backend, so forward paths read an attribute instead of
    # re-querying the global server args.
    self.flashinfer_mla_disable_ragged = (
        get_global_server_args().flashinfer_mla_disable_ragged
    )
+
def forward_absorb_prepare(
    self: DeepseekV2AttentionMLA,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    forward_batch: ForwardBatch,
    zero_allocator: BumpAllocator,
    llama_4_scaling: Optional[torch.Tensor] = None,
):
    """Prepare absorbed-MLA inputs: normalize/project q, build the latent k,
    absorb w_kc into q_nope, and (optionally) apply rope.

    Returns the tuple consumed by ``forward_absorb_core``:
    (q_pe, k_pe, q_nope_out, k_nope, forward_batch, zero_allocator,
    positions, topk_indices, llama_4_scaling).
    """
    from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode

    q_lora = None
    topk_indices = None
    if self.q_lora_rank is not None:
        # The fused qkv+latent projection was already computed by the attn TP
        # context; split it into the q lora part and the kv latent part.
        q, latent_cache = (
            get_attn_tp_context()
            .fetch_qkv_latent()
            .split(
                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
                dim=-1,
            )
        )
        k_nope = latent_cache[..., : self.kv_lora_rank]

        # overlap qk norm
        if self.alt_stream is not None and get_is_capture_mode():
            # Run q norm on the current stream and kv norm on the alternate
            # stream, then rejoin.
            current_stream = torch.cuda.current_stream()
            self.alt_stream.wait_stream(current_stream)
            q = self.q_a_layernorm(q)
            with torch.cuda.stream(self.alt_stream):
                k_nope = self.kv_a_layernorm(k_nope)
            current_stream.wait_stream(self.alt_stream)
        else:
            if _use_aiter_gfx95 and self.q_b_proj.weight.dtype == torch.uint8:
                # MXFP4 weights: fuse both RMS norms with mxfp4 quantization.
                q, _, k_nope, *_ = fused_rms_mxfp4_quant(
                    q,
                    self.q_a_layernorm.weight,
                    self.q_a_layernorm.variance_epsilon,
                    k_nope,
                    self.kv_a_layernorm.weight,
                    self.kv_a_layernorm.variance_epsilon,
                )
            else:
                q_lora = None
                if (
                    _use_aiter_gfx95
                    and self.q_b_proj.weight.dtype == torch.float8_e4m3fn
                ):
                    if self.use_nsa:
                        # NSA also needs the unquantized q (q_lora) for the
                        # indexer, so ask the fused kernel to return it.
                        q_quanted, q_lora, k_nope, _ = fused_rms_fp8_group_quant(
                            q,
                            self.q_a_layernorm.weight,
                            self.q_a_layernorm.variance_epsilon,
                            k_nope,
                            self.kv_a_layernorm.weight,
                            self.kv_a_layernorm.variance_epsilon,
                            group_size=128,
                            dtype_quant=torch.float8_e4m3fn,
                            res1=None,
                            output_unquantized_inp1=True,
                        )
                        q = q_quanted
                    else:
                        q, _, k_nope, _ = fused_rms_fp8_group_quant(
                            q,
                            self.q_a_layernorm.weight,
                            self.q_a_layernorm.variance_epsilon,
                            k_nope,
                            self.kv_a_layernorm.weight,
                            self.kv_a_layernorm.variance_epsilon,
                            group_size=128,
                            dtype_quant=torch.float8_e4m3fn,
                            res1=None,
                            output_unquantized_inp1=False,
                        )

                else:
                    q = self.q_a_layernorm(q)
                    k_nope = self.kv_a_layernorm(k_nope)

        # q_lora needed by indexer
        if self.use_nsa:
            if q_lora is None:
                q_lora = q

        # overlap q_b_proj and indexer during decode
        if (
            self.alt_stream is not None
            and get_is_capture_mode()
            and forward_batch.forward_mode.is_decode_or_idle()
            and q_lora is not None
        ):
            current_stream = torch.cuda.current_stream()
            self.alt_stream.wait_stream(current_stream)
            # q_b_proj runs on the alternate stream while the indexer runs on
            # the current stream.
            with torch.cuda.stream(self.alt_stream):
                k_nope = k_nope.unsqueeze(1)
                q = self.q_b_proj(q)[0].view(
                    -1, self.num_local_heads, self.qk_head_dim
                )
            topk_indices = self.indexer(
                x=hidden_states,
                q_lora=q_lora,
                positions=positions,
                forward_batch=forward_batch,
                layer_id=self.layer_id,
            )
            current_stream.wait_stream(self.alt_stream)
        else:
            k_nope = k_nope.unsqueeze(1)
            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
            if q_lora is not None:
                topk_indices = self.indexer(
                    x=hidden_states,
                    q_lora=q_lora,
                    positions=positions,
                    forward_batch=forward_batch,
                    layer_id=self.layer_id,
                )
    else:
        # No q lora: direct q projection plus a separate latent projection.
        q = self.q_proj(hidden_states)[0].view(
            -1, self.num_local_heads, self.qk_head_dim
        )
        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
        k_nope = latent_cache[..., : self.kv_lora_rank]
        k_nope = self.kv_a_layernorm(k_nope).unsqueeze(1)

    q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
    k_pe = latent_cache[..., self.kv_lora_rank :].unsqueeze(1)

    if self.use_deep_gemm_bmm:
        # DeepGEMM masked grouped GEMM in fp8: quantize q_nope per token
        # group, multiply by w_kc, then drop the alignment padding rows.
        q_nope_val, q_nope_scale, masked_m, expected_m, aligned_m = (
            per_token_group_quant_mla_deep_gemm_masked_fp8(q_nope.transpose(0, 1))
        )
        q_nope_out = q_nope.new_empty(
            (self.num_local_heads, aligned_m, self.kv_lora_rank)
        )
        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
            (q_nope_val, q_nope_scale),
            (self.w_kc, self.w_scale_k),
            q_nope_out,
            masked_m,
            expected_m,
        )
        q_nope_out = q_nope_out[:, :expected_m, :]
    elif _is_hip:
        # TODO(haishaw): add bmm_fp8 to ROCm
        if _use_aiter_gfx95 and self.w_kc.dtype == torch.uint8:
            # MXFP4 weights: batched GEMM with on-the-fly input quantization.
            x = q_nope.transpose(0, 1)
            q_nope_out = torch.empty(
                x.shape[0],
                x.shape[1],
                self.w_kc.shape[2],
                device=x.device,
                dtype=torch.bfloat16,
            )
            batched_gemm_afp4wfp4_pre_quant(
                x,
                self.w_kc.transpose(-2, -1),
                self.w_scale_k.transpose(-2, -1),
                torch.bfloat16,
                q_nope_out,
            )
        else:
            if (_use_aiter_gfx95 and self.w_kc.dtype == torch.float8_e4m3fn) or (
                get_is_capture_mode() and self.w_kc.dtype == torch.float8_e4m3fnuz
            ):
                # fp8 Triton kernel: always on gfx950,
                # cudagraph-only on gfx942 (hides launch overhead)
                q_nope_out = batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(
                    X=q_nope,
                    WQ=self.w_kc.transpose(-1, -2),
                    w_scale=self.w_scale,
                    group_size=128,
                    YQ=None,  # allocate (B, M, N)
                    transpose_bm=False,  # (B, M, N)
                    transpose_bm_in=True,  # (M, B, K)
                    dtype=torch.bfloat16,
                )

            else:
                # Generic ROCm path: upcast and fold the weight scale in.
                q_nope_out = torch.bmm(
                    q_nope.to(torch.bfloat16).transpose(0, 1),
                    self.w_kc.to(torch.bfloat16) * self.w_scale,
                )

    elif self.w_kc.dtype == torch.float8_e4m3fn:
        # fix bmm_fp8 error under cublas12.9 caused by bumpallocator, detail in pr#11612
        q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
            q_nope.transpose(0, 1),
            (
                torch.zeros((1,), dtype=torch.float32, device=q_nope.device)
                if _is_cublas_ge_129
                else zero_allocator.allocate(1)
            ),
        )
        q_nope_out = bmm_fp8(
            q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
        )
    else:
        q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)

    q_nope_out = q_nope_out.transpose(0, 1)

    # Apply rope here unless a later stage fuses it (TRTLLM MLA, or the
    # aiter gfx95 fused rope+cache kernel in forward_absorb_core).
    if (
        self.rotary_emb is not None
        and (not self._fuse_rope_for_trtllm_mla(forward_batch))
        and (not _use_aiter or not _is_gfx95_supported or self.use_nsa)
    ):
        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)

    if nsa_use_prefill_cp(forward_batch):
        # support allgather + rearrange under NSA prefill context parallelism
        k_nope, k_pe = self.rebuild_cp_kv_cache(
            latent_cache, forward_batch, k_nope, k_pe
        )

    return (
        q_pe,
        k_pe,
        q_nope_out,
        k_nope,
        forward_batch,
        zero_allocator,
        positions,
        topk_indices,
        llama_4_scaling,
    )
+
def forward_absorb_core(
    self: DeepseekV2AttentionMLA,
    q_pe,
    k_pe,
    q_nope_out,
    k_nope,
    forward_batch,
    zero_allocator,
    positions,
    topk_indices,
    llama_4_scaling,
):
    """Absorbed-MLA core: run MQA over the latent kv, then expand the latent
    attention output through w_vc and apply the output projection.
    """
    save_kv_cache = True

    if self.current_attention_backend in FORWARD_ABSORB_CORE_ATTENTION_BACKENDS:
        extra_args = {}
        if self._fuse_rope_for_trtllm_mla(forward_batch):
            # Rope is fused into the attention kernel; pass the rope tables.
            extra_args = {
                "cos_sin_cache": self.rotary_emb.cos_sin_cache,
                "is_neox": self.rotary_emb.is_neox_style,
                "llama_4_scaling": llama_4_scaling,
            }

        attn_output = self.attn_mqa(
            q_nope_out,
            k_nope,
            k_nope,
            forward_batch,
            q_rope=q_pe,
            k_rope=k_pe,
            **extra_args,
            **(dict(topk_indices=topk_indices) if topk_indices is not None else {}),
        )
    else:
        if _use_aiter_gfx95:
            cos = self.rotary_emb.cos_cache
            sin = self.rotary_emb.sin_cache

            kv_cache_dtype = (
                fp8_dtype if self.kv_cache_dtype == "fp8_e4m3" else q_nope_out.dtype
            )

            # Fused kernel: applies rope, concatenates [nope | rope] and
            # writes the latent kv directly into the pool.
            q, _, _, k = fused_qk_rope_cat_and_cache_mla(
                q_nope_out,
                q_pe,
                k_nope,
                k_pe,
                forward_batch.token_to_kv_pool.get_key_buffer(
                    self.attn_mqa.layer_id
                ),
                forward_batch.out_cache_loc,
                positions,
                cos,
                sin,
                self.attn_mqa.k_scale,
                self.rotary_emb.is_neox_style,
                q_out_dtype=kv_cache_dtype,
            )

            # The fused kernel already persisted the kv cache.
            save_kv_cache = False
        else:
            q = torch.cat([q_nope_out, q_pe], dim=-1)
            k = torch.cat([k_nope, k_pe], dim=-1)

        # Apply llama 4 scaling if provided
        if llama_4_scaling is not None:
            q *= llama_4_scaling

        attn_output = self.attn_mqa(
            q,
            k,
            k_nope,
            forward_batch,
            save_kv_cache=save_kv_cache,
            **(dict(topk_indices=topk_indices) if topk_indices is not None else {}),
        )
    attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)

    if self.use_deep_gemm_bmm:
        # DeepGEMM masked grouped GEMM in fp8 for the w_vc expansion.
        attn_output_val, attn_output_scale, masked_m, expected_m, aligned_m = (
            per_token_group_quant_mla_deep_gemm_masked_fp8(
                attn_output.transpose(0, 1)
            )
        )
        attn_bmm_output = attn_output.new_empty(
            (self.num_local_heads, aligned_m, self.v_head_dim)
        )
        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
            (attn_output_val, attn_output_scale),
            (self.w_vc, self.w_scale_v),
            attn_bmm_output,
            masked_m,
            expected_m,
        )
        attn_bmm_output = (
            attn_bmm_output[:, :expected_m, :].transpose(0, 1).flatten(1, 2)
        )
    elif _is_hip:
        # TODO(haishaw): add bmm_fp8 to ROCm
        if _use_aiter_gfx95 and self.w_vc.dtype == torch.uint8:
            # MXFP4 weights: batched GEMM with on-the-fly input quantization.
            x = attn_output.transpose(0, 1)
            attn_bmm_output = torch.empty(
                x.shape[0],
                x.shape[1],
                self.w_vc.shape[2],
                device=x.device,
                dtype=torch.bfloat16,
            )
            batched_gemm_afp4wfp4_pre_quant(
                x,
                self.w_vc.transpose(-2, -1),
                self.w_scale_v.transpose(-2, -1),
                torch.bfloat16,
                attn_bmm_output,
            )
        else:
            if _use_aiter_gfx95 and self.w_kc.dtype == torch.float8_e4m3fn:
                # NOTE(review): the dtype gate reads w_kc while the GEMM uses
                # w_vc — presumably both halves of kv_b_proj share a dtype
                # (the prepare path gates the same way); confirm, and
                # consider checking w_vc here for symmetry.
                attn_bmm_output = batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(
                    X=attn_output,
                    WQ=self.w_vc.transpose(-1, -2),
                    w_scale=self.w_scale,
                    group_size=128,
                    YQ=None,
                    transpose_bm=False,
                    transpose_bm_in=True,
                    dtype=torch.bfloat16,
                )
            else:
                attn_bmm_output = torch.bmm(
                    attn_output.to(torch.bfloat16).transpose(0, 1),
                    self.w_vc.to(torch.bfloat16) * self.w_scale,
                )

        if self.o_proj.weight.dtype == torch.uint8:
            # Quantize the bmm output to match an MXFP4 o_proj.
            attn_bmm_output = attn_bmm_output.transpose(0, 1)
            attn_bmm_output = fused_flatten_mxfp4_quant(attn_bmm_output)
        elif self.o_proj.weight.dtype == torch.float8_e4m3fn:
            # Quantize the bmm output to match an fp8 o_proj.
            attn_bmm_output = attn_bmm_output.transpose(0, 1)
            attn_bmm_output = fused_flatten_fp8_group_quant(
                attn_bmm_output, group_size=128, dtype_quant=torch.float8_e4m3fn
            )
        else:
            attn_bmm_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)

    elif self.w_vc.dtype == torch.float8_e4m3fn:
        # Same cublas-12.9 workaround as in forward_absorb_prepare (pr#11612).
        attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
            attn_output.transpose(0, 1),
            (
                torch.zeros((1,), dtype=torch.float32, device=attn_output.device)
                if _is_cublas_ge_129
                else zero_allocator.allocate(1)
            ),
        )
        attn_bmm_output = bmm_fp8(
            attn_output_val,
            self.w_vc,
            attn_output_scale,
            self.w_scale,
            torch.bfloat16,
        )
        attn_bmm_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
    else:
        if is_in_piecewise_cuda_graph():
            # Under piecewise cuda graph, use the functional form: torch
            # dynamo has trouble with bmm's out= variant when the target is a
            # non-contiguous view (used in the branch below).
            attn_bmm_output = (
                torch.bmm(attn_output.transpose(0, 1), self.w_vc)
                .transpose(0, 1)
                .flatten(1, 2)
            )
        else:
            # Write directly into the flattened output buffer through a
            # transposed view, avoiding an extra copy.
            attn_bmm_output = torch.empty(
                (attn_output.shape[0], self.num_local_heads * self.v_head_dim),
                dtype=attn_output.dtype,
                device=attn_output.device,
            )
            torch.bmm(
                attn_output.transpose(0, 1),
                self.w_vc,
                out=attn_bmm_output.view(
                    -1, self.num_local_heads, self.v_head_dim
                ).transpose(0, 1),
            )
    output, _ = self.o_proj(attn_bmm_output)

    return output
+
def _fuse_rope_for_trtllm_mla(
    self: DeepseekV2AttentionMLA, forward_batch: ForwardBatch
) -> bool:
    """
    Check if we should skip rope and do fused rope+quantize for TRTLLM MLA decode in fp8_e4m3 path.
    """
    backend_name = self.current_attention_backend
    if backend_name == "nsa":
        args = get_global_server_args()
        trtllm_in_use = "trtllm" in (args.nsa_decode_backend, args.nsa_prefill_backend)
        return (
            trtllm_in_use
            and forward_batch.attn_backend.kv_cache_dtype == torch.float8_e4m3fn
        )

    if backend_name != "trtllm_mla":
        return False
    mode = forward_batch.forward_mode
    return (
        mode.is_decode_or_idle() or mode.is_target_verify()
    ) and forward_batch.attn_backend.data_type == torch.float8_e4m3fn
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mla_fused_rope_cpu.py b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mla_fused_rope_cpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..43a5eed37501f231149d9d17e727e43cbcb918b1
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mla_fused_rope_cpu.py
@@ -0,0 +1,152 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.layers.amx_utils import PackWeightMethod
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.deepseek_common.utils import (
+ _is_cpu,
+ _is_cpu_amx_available,
+)
+from sglang.srt.utils import BumpAllocator, use_intel_amx_backend
+
+if TYPE_CHECKING:
+ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+
+
+class DeepseekMLACpuForwardMixin:
+
def init_mla_fused_rope_cpu_forward(self: DeepseekV2AttentionMLA):
    """Configure the CPU (Intel AMX) fused qkv+rope path: pack w_kc/w_vc when
    the fused kernel will be used, and cache quantization flags it needs."""
    assert hasattr(self, "has_fused_proj") and hasattr(self, "is_packed_weight")

    # If we have self.fused_qkv_a_proj_with_mqa and we're running on CPU, we will choose the torch.ops.sgl_kernel.qkv_proj_with_rope_fused_weight kernel
    # which requires self.w_kc and self.w_vc to be packed.
    # If not, we will use torch.bmm and weight shouldn't be packed in this case
    if self.has_fused_proj and _is_cpu and _is_cpu_amx_available:
        self.quant_method = PackWeightMethod(
            weight_names=["w_kc", "w_vc"], transpose_dims=[[1, 2], [1, 2]]
        )

    # int8/fp8 detection for the fused qkv+rope kernel; packed weights take a
    # different code path and are excluded.
    self.qkv_proj_with_rope_is_int8 = (
        self.has_fused_proj
        and not self.is_packed_weight
        and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.int8
    )
    self.qkv_proj_with_rope_is_fp8 = (
        self.has_fused_proj
        and not self.is_packed_weight
        and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.float8_e4m3fn
    )

    self.weight_block_size = None
    if self.qkv_proj_with_rope_is_fp8 and _is_cpu and _is_cpu_amx_available:
        # The fused kernel assumes both projections are quantized the same
        # way; verify the block-quant settings match before caching them.
        assert getattr(
            self.fused_qkv_a_proj_with_mqa.quant_method, "block_quant", False
        ) == getattr(self.q_b_proj.quant_method, "block_quant", False)
        use_block_quant = getattr(
            self.fused_qkv_a_proj_with_mqa.quant_method, "block_quant", False
        )

        if use_block_quant:
            assert (
                self.fused_qkv_a_proj_with_mqa.quant_method.quant_config.weight_block_size
                == self.q_b_proj.quant_method.quant_config.weight_block_size
            )
            self.weight_block_size = (
                self.fused_qkv_a_proj_with_mqa.quant_method.quant_config.weight_block_size
            )
+
def forward_absorb_fused_mla_rope_cpu_prepare(
    self: DeepseekV2AttentionMLA,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    forward_batch: ForwardBatch,
    zero_allocator: BumpAllocator,
):
    """Run the single fused AMX kernel that produces q/k/v for absorbed MLA.

    The sgl_kernel op takes the fused qkv_a weights, both layernorm weights,
    w_kc and the rope cos/sin cache, so the projection + norm + absorption +
    rope pipeline appears to run in one call — the argument order below is
    positional and must match the kernel signature exactly.
    """
    assert self.q_lora_rank is not None and use_intel_amx_backend(
        self
    ), "forward_absorb_fused_mla_rope_cpu_prepare requires q_lora_rank is not None and use_intel_amx_backend"

    q_input, k_input, v_input = (
        torch.ops.sgl_kernel.qkv_proj_with_rope_fused_weight(
            hidden_states,
            self.fused_qkv_a_proj_with_mqa.weight,
            self.q_b_proj.weight,
            self.w_kc,
            self.q_a_layernorm.weight,
            self.kv_a_layernorm.weight,
            positions,
            self.rotary_emb.cos_sin_cache,
            self.kv_a_layernorm.variance_epsilon,
            self.qkv_proj_with_rope_is_int8,
            self.qkv_proj_with_rope_is_fp8,
            # Weight scales: int8 uses weight_scale, fp8 uses weight_scale_inv.
            (
                self.fused_qkv_a_proj_with_mqa.weight_scale
                if self.qkv_proj_with_rope_is_int8
                else (
                    self.fused_qkv_a_proj_with_mqa.weight_scale_inv
                    if self.qkv_proj_with_rope_is_fp8
                    else None
                )
            ),
            (
                self.q_b_proj.weight_scale
                if self.qkv_proj_with_rope_is_int8
                else (
                    self.q_b_proj.weight_scale_inv
                    if self.qkv_proj_with_rope_is_fp8
                    else None
                )
            ),
            True,  # is_vnni
            self.weight_block_size,
            self.q_lora_rank,
            self.kv_lora_rank,
            self.qk_rope_head_dim,
        )
    )
    return (q_input, k_input, v_input, forward_batch, zero_allocator)
+
def forward_absorb_fused_mla_rope_cpu_core(
    self: DeepseekV2AttentionMLA,
    q_input,
    k_input,
    v_input,
    forward_batch,
    zero_allocator,
):
    """Attend over the latent kv on CPU, expand the result through w_vc via
    the AMX bmm kernel, then apply the output projection."""
    assert self.q_lora_rank is not None and use_intel_amx_backend(
        self
    ), "forward_absorb_fused_mla_rope_cpu_core requires q_lora_rank is not None and use_intel_amx_backend"

    attn_output = self.attn_mqa(q_input, k_input, v_input, forward_batch)
    attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)

    # [Note] Align shapes of bmm inputs.
    # Shapes of inputs:
    # q_nope: [M, B, K]
    # original self.w_kc: [B, K, N]
    # current self.w_kc (which has been converted in PackWeightMethod): [B, N, K]

    # Shapes of inputs to sgl_kernel.cpu.bmm:
    # out: [B, M, N]
    # mat1: [B, M, K]
    # mat2: [B, N, K]
    B = self.w_vc.size(0)
    N = self.w_vc.size(1)
    M = attn_output.size(0)
    # CPU-only path: the default device of torch.empty is correct here.
    output = torch.empty([M, int(B * N)], dtype=attn_output.dtype)
    # View the flat buffer as [M, B, N] and swap dims in place so the kernel
    # writes its [B, M, N] result directly into ``output``'s storage.
    attn_bmm_output = output.view([M, B, N]).transpose_(0, 1)
    torch.ops.sgl_kernel.bmm_cpu(
        attn_bmm_output,
        attn_output.transpose(0, 1),
        self.w_vc,
        True,  # is_vnni
        None,  # scale
    )
    attn_output = output
    output, _ = self.o_proj(attn_output)

    return output
diff --git a/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mla_fused_rope_rocm.py b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mla_fused_rope_rocm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8868897af2b8c0f69078bcbb4c39b51b44e3a0ae
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mla_fused_rope_rocm.py
@@ -0,0 +1,227 @@
+from __future__ import annotations
+
+import os
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.layers.quantization.fp8_kernel import per_tensor_quant_mla_fp8
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.deepseek_common.utils import (
+ _is_cuda,
+ _is_hip,
+)
+from sglang.srt.utils import BumpAllocator, get_bool_env_var
+
+if TYPE_CHECKING:
+ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+
+if _is_cuda:
+ from sgl_kernel import bmm_fp8
+
+if _is_hip:
+ from sglang.srt.layers.attention.triton_ops.rocm_mla_decode_rope import (
+ decode_attention_fwd_grouped_rope,
+ )
+
+
+class DeepseekMLARocmForwardMixin:
+
def init_mla_fused_rope_rocm_forward(self: DeepseekV2AttentionMLA):
    # Opt-in flag (env var) for the fused ROCm decode MLA+rope Triton kernel.
    self.rocm_fused_decode_mla = get_bool_env_var(
        "SGLANG_ROCM_FUSED_DECODE_MLA", "false"
    )
+
def forward_absorb_fused_mla_rope_prepare(
    self: DeepseekV2AttentionMLA,
    positions: torch.Tensor,
    hidden_states: torch.Tensor,
    forward_batch: ForwardBatch,
    zero_allocator: BumpAllocator,
):
    """Prepare inputs for the ROCm grouped decode MLA(+rope) kernel.

    Builds q_input = [absorbed q_nope | q_pe], stores the latent kv, and
    gathers the metadata the kernel needs. When rope fusion is enabled
    (SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION=1), rope is deferred to the kernel
    and ``k_pe_output`` is allocated to receive the roped keys.
    """
    enable_rope_fusion = (
        os.getenv("SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION", "1") == "1"
    )
    # NOTE: hidden_states can be a tuple for some quantization paths.
    # For shape/device/dtype, use the first tensor; still pass the original
    # hidden_states through linear ops which may accept tuple inputs.
    hidden_states_tensor = (
        hidden_states[0] if isinstance(hidden_states, tuple) else hidden_states
    )

    q_len = hidden_states_tensor.shape[0]
    q_input = hidden_states_tensor.new_empty(
        q_len, self.num_local_heads, self.kv_lora_rank + self.qk_rope_head_dim
    )
    if self.q_lora_rank is not None:
        q, latent_cache = self.fused_qkv_a_proj_with_mqa(hidden_states)[0].split(
            [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], dim=-1
        )
        q = self.q_a_layernorm(q)
        q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
    else:
        q = self.q_proj(hidden_states)[0].view(
            -1, self.num_local_heads, self.qk_head_dim
        )
        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
    q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)

    if _is_hip:
        # TODO(haishaw): add bmm_fp8 to ROCm
        q_nope_out = torch.bmm(
            q_nope.to(torch.bfloat16).transpose(0, 1),
            self.w_kc.to(torch.bfloat16) * self.w_scale,
        )
    elif self.w_kc.dtype == torch.float8_e4m3fn:
        q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
            q_nope.transpose(0, 1),
            zero_allocator.allocate(1),
            dtype=torch.float8_e4m3fn,
        )
        q_nope_out = bmm_fp8(
            q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
        )
    else:
        q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
    q_input[..., : self.kv_lora_rank] = q_nope_out.transpose(0, 1)
    # v is the normalized lora part of the latent; k_input aliases the full
    # latent row and gets its lora portion overwritten with the normed values.
    v_input = latent_cache[..., : self.kv_lora_rank]
    v_input = self.kv_a_layernorm(v_input.contiguous()).unsqueeze(1)
    k_input = latent_cache.unsqueeze(1)
    k_input[..., : self.kv_lora_rank] = v_input

    if not enable_rope_fusion:
        # Apply rope here; the kernel then runs with use_rope disabled.
        k_pe = k_input[..., self.kv_lora_rank :]
        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
        q_input[..., self.kv_lora_rank :] = q_pe
        k_input[..., self.kv_lora_rank :] = k_pe
        k_pe_output = None
    else:
        # The kernel applies rope and writes the roped k_pe into this buffer.
        k_pe_output = torch.empty_like(k_input[..., self.kv_lora_rank :])

        q_input[..., self.kv_lora_rank :] = q_pe

    # attn_output = self.attn_mqa(q_input, k_input, v_input, forward_batch)
    # Use Fused ROPE with use_rope=OFF.
    attn_output = torch.empty(
        (q_len, self.num_local_heads, self.kv_lora_rank),
        dtype=q.dtype,
        device=q.device,
    )
    attn_logits, _, kv_indptr, kv_indices, _, _, _ = (
        forward_batch.attn_backend.forward_metadata
    )
    cos_sin_cache = self.rotary_emb.cos_sin_cache
    num_kv_split = forward_batch.attn_backend.num_kv_splits
    sm_scale = self.attn_mqa.scaling
    if attn_logits is None:
        # Allocate the split-k scratch (per-split partial results + lse).
        attn_logits = torch.empty(
            (
                forward_batch.batch_size,
                self.num_local_heads,
                num_kv_split,
                self.kv_lora_rank + 1,
            ),
            dtype=torch.float32,
            device=q.device,
        )

    # save current latent cache.
    forward_batch.token_to_kv_pool.set_kv_buffer(
        self.attn_mqa, forward_batch.out_cache_loc, k_input, None
    )
    key_cache_buf = forward_batch.token_to_kv_pool.get_key_buffer(
        self.attn_mqa.layer_id
    )
    val_cache_buf = key_cache_buf[..., : self.kv_lora_rank]

    return (
        q_input,
        key_cache_buf,
        val_cache_buf,
        attn_output,
        kv_indptr,
        kv_indices,
        k_pe_output,
        cos_sin_cache,
        positions,
        attn_logits,
        num_kv_split,
        sm_scale,
        enable_rope_fusion,
        k_input,
        forward_batch,
        zero_allocator,
    )
+
def forward_absorb_fused_mla_rope_core(
    self: DeepseekV2AttentionMLA,
    q_input,
    key_cache_buf,
    val_cache_buf,
    attn_output,
    kv_indptr,
    kv_indices,
    k_pe_output,
    cos_sin_cache,
    positions,
    attn_logits,
    num_kv_split,
    sm_scale,
    enable_rope_fusion,
    k_input,
    forward_batch,
    zero_allocator,
):
    """Run the grouped ROCm decode kernel (optionally fusing rope), then
    expand the latent attention output through w_vc and project out."""
    decode_attention_fwd_grouped_rope(
        q_input,
        key_cache_buf,
        val_cache_buf,
        attn_output,
        kv_indptr,
        kv_indices,
        k_pe_output,
        self.kv_lora_rank,
        self.rotary_emb.rotary_dim,
        cos_sin_cache,
        positions,
        attn_logits,
        num_kv_split,
        sm_scale,
        logit_cap=self.attn_mqa.logit_cap,
        use_rope=enable_rope_fusion,
        is_neox_style=self.rotary_emb.is_neox_style,
    )

    if enable_rope_fusion:
        # The kernel roped k_pe out-of-place; fold the result back into
        # k_input and rewrite the kv pool so the cache holds roped keys.
        k_input[..., self.kv_lora_rank :] = k_pe_output
        forward_batch.token_to_kv_pool.set_kv_buffer(
            self.attn_mqa, forward_batch.out_cache_loc, k_input, None
        )

    attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)

    if _is_hip:
        # TODO(haishaw): add bmm_fp8 to ROCm
        attn_bmm_output = torch.bmm(
            attn_output.to(torch.bfloat16).transpose(0, 1),
            self.w_vc.to(torch.bfloat16) * self.w_scale,
        )
    elif self.w_vc.dtype == torch.float8_e4m3fn:
        attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
            attn_output.transpose(0, 1),
            zero_allocator.allocate(1),
            dtype=torch.float8_e4m3fn,
        )
        attn_bmm_output = bmm_fp8(
            attn_output_val,
            self.w_vc,
            attn_output_scale,
            self.w_scale,
            torch.bfloat16,
        )
    else:
        attn_bmm_output = torch.bmm(attn_output.transpose(0, 1), self.w_vc)
    attn_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
    output, _ = self.o_proj(attn_output)

    return output
diff --git a/sglang/python/sglang/srt/models/deepseek_common/deepseek_weight_loader.py b/sglang/python/sglang/srt/models/deepseek_common/deepseek_weight_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..deda4f97e889a6cebdf8940f5eb2cd2ed2442f57
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_common/deepseek_weight_loader.py
@@ -0,0 +1,694 @@
+# Copyright 2026 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import concurrent.futures
+import logging
+from dataclasses import dataclass
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import tqdm
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.environ import envs
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_utils import (
+ block_quant_dequant,
+ block_quant_to_tensor_quant,
+ channel_quant_to_tensor_quant,
+ inverse_transform_scale_ue8m0,
+ normalize_e4m3fn_to_e4m3fnuz,
+ quant_weight_ue8m0,
+)
+from sglang.srt.layers.quantization.int8_utils import (
+ block_dequant as int8_block_dequant,
+)
+from sglang.srt.layers.utils import get_layer_id
+from sglang.srt.model_loader.utils import (
+ maybe_executor_submit,
+ should_async_load,
+ should_deepgemm_weight_requant_ue8m0,
+)
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_common.utils import (
+ _is_cpu,
+ _is_cpu_amx_available,
+ _is_cuda,
+ _is_fp8_fnuz,
+ _is_hip,
+ _is_npu,
+ _use_aiter_gfx95,
+ awq_dequantize_func,
+ enable_nextn_moe_bf16_cast_to_fp8,
+)
+from sglang.srt.utils import bind_or_assign, get_bool_env_var, log_info_on_rank0
+
+if _use_aiter_gfx95:
+ from sglang.srt.layers.quantization.quark.utils import quark_post_load_weights
+
+logger = logging.getLogger(__name__)
+
+# Optional quantization for DeepSeek nvfp4 checkpoint
+NVFP4_CKPT_FP8_ATTN_QUANT_MODULES = ["q_b_proj"]
+
+
+@dataclass(frozen=True)
+class NextNEnabledConfig:
+ num_nextn_layers: int
+ nextn_layer_id: int
+ nextn_layer_prefix: str
+ nextn_spec_weight_names: List[str]
+
+
+@dataclass(frozen=True)
+class NextNDisabledConfig:
+ pass
+
+
+"""Union type for NextN configuration, including enabled and disabled configurations."""
+NextNConfig = NextNEnabledConfig | NextNDisabledConfig
+
+
+class DeepseekV2WeightLoaderMixin:
+ """Mixin for loading weights in DeepSeek V2/V3 models."""
+
+ model: nn.Module
+ config: PretrainedConfig
+ quant_config: Optional[QuantizationConfig]
+ pp_group: GroupCoordinator
+ num_fused_shared_experts: int
+
+ def do_load_weights(
+ self,
+ weights: Iterable[Tuple[str, torch.Tensor]],
+ is_nextn: bool = False,
+ ):
+ """Load model weights from checkpoint.
+
+ Args:
+ weights: Iterable of (weight_name, weight_tensor) pairs
+ is_nextn: Whether loading NextN speculative decoding weights
+ """
+ nextn_conf = self._initialize_nextn_conf(is_nextn)
+
+ weights = self._maybe_quant_weights_to_fp8_ue8m0(
+ weights, NVFP4_CKPT_FP8_ATTN_QUANT_MODULES, nextn_conf
+ )
+
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+
+ # Params for weights, fp8 weight scales, fp8 activation scales
+ # (param_name, weight_name, expert_id, shard_id)
+ expert_params_mapping = FusedMoE.make_expert_params_mapping(
+ ckpt_gate_proj_name="gate_proj",
+ ckpt_down_proj_name="down_proj",
+ ckpt_up_proj_name="up_proj",
+ num_experts=self.config.n_routed_experts + self.num_fused_shared_experts,
+ )
+ # Params for special naming rules in mixed-precision models, for example:
+ # model.layers.xx.mlp.experts.xx.w1.input_scale. For details,
+ # see https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main.
+ if self.quant_config and self.quant_config.get_name() == "w4afp8":
+ expert_params_mapping += FusedMoE.make_expert_input_scale_params_mapping(
+ num_experts=self.config.n_routed_experts
+ )
+
+ # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+ fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+ self.config.q_lora_rank is not None
+ )
+ cached_a_proj = {} if fuse_qkv_a_proj else None
+
+ if self.num_fused_shared_experts > 0:
+ assert self.num_fused_shared_experts == 1
+ log_info_on_rank0(logger, "Shared experts fusion optimization enabled.")
+
+ with concurrent.futures.ThreadPoolExecutor() as executor:
+ futures = []
+ params_dict = dict(self.named_parameters())
+ weight_names = []
+ for name, loaded_weight in weights:
+ use_async_loading = should_async_load(loaded_weight)
+ layer_id = get_layer_id(name)
+ if (
+ layer_id is not None
+ and hasattr(self.model, "start_layer")
+ and (
+ layer_id < self.model.start_layer
+ or layer_id >= self.model.end_layer
+ )
+ ):
+ continue
+ if self.num_fused_shared_experts > 0 and "mlp.shared_experts" in name:
+ name = name.replace(
+ "mlp.shared_experts",
+ f"mlp.experts.{self.config.n_routed_experts}",
+ )
+
+ weight_names.append(name)
+
+ match nextn_conf:
+ case NextNEnabledConfig(
+ nextn_layer_prefix=layer_prefix,
+ nextn_spec_weight_names=spec_weight_names,
+ ):
+ if not name.startswith(layer_prefix):
+ continue
+
+ # Use shared head and embed weights from target model
+ if "shared_head.head" in name or "embed_tokens" in name:
+ continue
+
+ # Transform name: NextN-specific → "model.*", decoder → "model.decoder.*"
+ if any(s in name for s in spec_weight_names):
+ name = name.replace(layer_prefix, "model")
+ else:
+ name = name.replace(layer_prefix, "model.decoder")
+ case NextNDisabledConfig():
+ if hasattr(self.config, "num_nextn_predict_layers"):
+ num_nextn_layers = self.config.num_nextn_predict_layers
+ if num_nextn_layers > 0 and name.startswith("model.layers"):
+ name_list = name.split(".")
+ if (
+ len(name_list) >= 3
+ and int(name_list[2])
+ >= self.config.num_hidden_layers
+ ):
+ continue
+
+ if "rotary_emb.inv_freq" in name:
+ continue
+
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ # Skip non-stacked layers and experts (experts handled below).
+ if weight_name not in name:
+ continue
+ if _is_npu:
+ name = name.replace("weight_packed", "weight")
+ # We have mlp.experts[0].gate_proj in the checkpoint.
+ # Since we handle the experts below in expert_params_mapping,
+ # we need to skip here BEFORE we update the name, otherwise
+ # name will be updated to mlp.experts[0].gate_up_proj, which
+ # will then be updated below in expert_params_mapping
+ # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+ if ("mlp.experts." in name) and name not in params_dict:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ maybe_executor_submit(
+ executor=executor,
+ futures=futures,
+ use_async=use_async_loading,
+ func=weight_loader,
+ func_args=(param, loaded_weight, shard_id),
+ )
+ break
+ else:
+ for mapping in expert_params_mapping:
+ param_name, weight_name, expert_id, shard_id = mapping
+ if weight_name not in name:
+ continue
+ if _is_npu:
+ name = name.replace("weight_packed", "weight")
+ name = name.replace(weight_name, param_name)
+ if name not in params_dict:
+ continue
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ maybe_executor_submit(
+ executor=executor,
+ futures=futures,
+ use_async=use_async_loading,
+ func=weight_loader,
+ func_args=(
+ param,
+ loaded_weight,
+ name,
+ ),
+ func_kwargs={
+ "shard_id": shard_id,
+ "expert_id": expert_id,
+ },
+ )
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ # Skip loading embed_tokens if not first rank in pipeline parallelism
+ if ".embed_tokens." in name and not self.pp_group.is_first_rank:
+ continue
+ # Skip loading norm if not last rank in pipeline parallelism
+ if ".norm." in name and not self.pp_group.is_last_rank:
+ continue
+ if fuse_qkv_a_proj and (
+ "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+ ):
+ cached_a_proj[name] = loaded_weight
+ q_a_proj_name = (
+ name
+ if "q_a_proj" in name
+ else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+ )
+ kv_a_proj_name = (
+ name
+ if "kv_a_proj_with_mqa" in name
+ else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+ )
+
+                        # When both q_a_proj and kv_a_proj_with_mqa have been cached, load the fused weight into the parameter
+ if (
+ q_a_proj_name in cached_a_proj
+ and kv_a_proj_name in cached_a_proj
+ ):
+ q_a_proj_weight = cached_a_proj[q_a_proj_name]
+ kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+
+ if q_a_proj_weight.shape == torch.Size(
+ []
+ ) and kv_a_proj_weight.shape == torch.Size([]):
+ fused_weight = q_a_proj_weight
+ else:
+ cat_dim = 0
+ if self.quant_config is not None and (
+ self.quant_config.get_name() == "awq"
+ or self.quant_config.get_name() == "awq_marlin"
+ or self.quant_config.get_name() == "moe_wna16"
+ ):
+ cat_dim = 1
+
+ fused_weight = torch.cat(
+ [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
+ )
+
+ param_name = (
+ name.replace(
+ "q_a_proj", "fused_qkv_a_proj_with_mqa"
+ )
+ if "q_a_proj" in name
+ else name.replace(
+ "kv_a_proj_with_mqa",
+ "fused_qkv_a_proj_with_mqa",
+ )
+ )
+ param = params_dict[param_name]
+
+ weight_loader = getattr(
+ param, "weight_loader", default_weight_loader
+ )
+ maybe_executor_submit(
+ executor=executor,
+ futures=futures,
+ use_async=use_async_loading,
+ func=weight_loader,
+ func_args=(param, fused_weight),
+ )
+ cached_a_proj.pop(q_a_proj_name)
+ cached_a_proj.pop(kv_a_proj_name)
+ else:
+ if (
+ "k_scale" in name or "v_scale" in name
+ ) and name not in params_dict:
+ # modelopt attn kv scale is named differently
+ for scale in ["k_scale", "v_scale"]:
+ if scale in name:
+ name = name.replace(
+ f"{scale[0]}_proj", "attn_mqa"
+ )
+ break
+ if name not in params_dict:
+                            # modelopt ckpt contains unneeded weights for the MTP module:
+                            # model.decoder.self_attn.attn_mqa.v_scale and
+                            # model.decoder.self_attn.attn_mqa.k_scale
+ logger.warning(f"{name} not found in params_dict.")
+ continue
+ param = params_dict[name]
+ weight_loader = getattr(
+ param, "weight_loader", default_weight_loader
+ )
+ maybe_executor_submit(
+ executor=executor,
+ futures=futures,
+ use_async=use_async_loading,
+ func=weight_loader,
+ func_args=(param, loaded_weight),
+ )
+
+ # Wait for all tasks to complete and raise any exceptions.
+ for future in concurrent.futures.as_completed(futures):
+ future.result()
+
+ self.post_load_weights(is_nextn=is_nextn, weight_names=weight_names)
+
+ def _initialize_nextn_conf(self, is_nextn: bool) -> NextNConfig:
+ """
+ Initialize the nextn configuration.
+
+ Raises:
+ ValueError: If num_nextn_predict_layers is not in the config.
+ AssertionError: If num_nextn_predict_layers is not equal to 1.
+ """
+ if not is_nextn:
+ return NextNDisabledConfig()
+
+ if not hasattr(self.config, "num_nextn_predict_layers"):
+ raise ValueError("num_nextn_predict_layers is not in the config")
+
+ num_nextn_layers = self.config.num_nextn_predict_layers
+ assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+
+ # compatible with old design
+ nextn_layer_id = (
+ 0 if self.config.num_hidden_layers == 1 else self.config.num_hidden_layers
+ )
+
+ return NextNEnabledConfig(
+ num_nextn_layers=num_nextn_layers,
+ nextn_layer_id=nextn_layer_id,
+ nextn_layer_prefix=f"model.layers.{nextn_layer_id}",
+ nextn_spec_weight_names=[
+ "shared_head.norm",
+ "eh_proj",
+ "enorm",
+ "hnorm",
+ ],
+ )
+
+ def post_load_weights(
+ self,
+ is_nextn: bool = False,
+ weight_names: Optional[Iterable[str]] = None,
+ ) -> None:
+ """Post-process weights after loading.
+
+ Handles kv_b_proj weight processing including:
+ - AWQ dequantization
+ - FP8/INT8 requantization and block-wise to tensor-wise conversion
+ - Splitting weights into w_kc and w_vc components for MLA
+
+ Args:
+ is_nextn: Whether processing NextN weights
+ weight_names: Optional list of loaded weight names to determine which layers to process
+ """
+ if is_nextn:
+ layer_ids = [self.config.num_hidden_layers]
+ else:
+ if weight_names is None:
+ layer_ids = range(self.model.start_layer, self.model.end_layer)
+ else:
+ layer_ids = set()
+ for name in weight_names:
+ if "kv_b_proj" in name:
+ layer_id = int(name.split(".")[2])
+ if layer_id < self.config.num_hidden_layers:
+ layer_ids.add(layer_id)
+
+ for layer_id in layer_ids:
+ self_attn = (
+ self.model.layers[layer_id].self_attn
+ if not is_nextn
+ else self.model.decoder.self_attn
+ )
+
+ if hasattr(self_attn.kv_b_proj, "qweight"):
+ # awq compatible, dequantize the weight if supported
+ awq_dequantize_f = awq_dequantize_func()
+ if awq_dequantize_f is not None:
+ w = awq_dequantize_f(
+ self_attn.kv_b_proj.qweight,
+ self_attn.kv_b_proj.scales,
+ self_attn.kv_b_proj.qzeros,
+ ).T
+ else:
+ raise ValueError(
+ "AWQ dequantize function is not supported for the current device"
+ )
+ else:
+ w = self_attn.kv_b_proj.weight
+
+ # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
+ # This may affect the accuracy of fp8 model.
+ # Fix deepseek v3 blockwise bmm by using deep_gemm
+ use_deep_gemm_bmm = False
+
+ if w.dtype in (
+ torch.float8_e4m3fn,
+ torch.float8_e4m3fnuz,
+ ):
+ # For mixed quantization (experts int4, linear fp8), use linear_fp8_config
+ selected_quant_config = getattr(
+ self.quant_config, "linear_fp8_config", None
+ )
+ if selected_quant_config is None:
+ selected_quant_config = self.quant_config
+ weight_block_size = getattr(
+ selected_quant_config, "weight_block_size", None
+ )
+ if weight_block_size is not None:
+ assert hasattr(self_attn.kv_b_proj, "weight_scale_inv") or hasattr(
+ self_attn.kv_b_proj, "weight_scale"
+ )
+ weight_scale = (
+ self_attn.kv_b_proj.weight_scale
+ if hasattr(self_attn.kv_b_proj, "weight_scale")
+ else self_attn.kv_b_proj.weight_scale_inv
+ )
+ if _is_fp8_fnuz:
+ weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+ weight=w,
+ weight_scale=weight_scale,
+ input_scale=None,
+ )
+ else:
+ weight = w
+
+ # In multiple weight loading scenarios (e.g. RL), we need to inverse the scale of the weights after the requantization happened at the first loading.
+ if (
+ should_deepgemm_weight_requant_ue8m0(
+ weight_block_size=getattr(
+ self.quant_config, "weight_block_size", None
+ )
+ )
+ and weight_scale.format_ue8m0
+ ):
+ weight_scale = inverse_transform_scale_ue8m0(
+ weight_scale, mn=weight.shape[-2]
+ )
+
+ if (
+ _is_cuda
+ and weight_block_size[0] == 128
+ and weight_block_size[1] == 128
+ ):
+ if (
+ deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+ and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
+ and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
+ ):
+ block_scale = weight_scale
+ use_deep_gemm_bmm = True
+ else:
+ w = block_quant_dequant(
+ weight,
+ weight_scale,
+ weight_block_size,
+ torch.bfloat16,
+ )
+ else:
+ w, scale = block_quant_to_tensor_quant(
+ weight, weight_scale, weight_block_size
+ )
+ self_attn.w_scale = scale
+ else:
+ if _is_fp8_fnuz:
+ weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+ weight=w,
+ weight_scale=self_attn.kv_b_proj.weight_scale,
+ input_scale=None,
+ )
+ else:
+ weight = w
+ weight_scale = self_attn.kv_b_proj.weight_scale
+
+ w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
+ self_attn.w_scale = scale
+
+ if w.dtype == torch.int8:
+ if hasattr(self.quant_config, "weight_block_size"):
+ # block-wise int8 need it
+ weight_block_size = self.quant_config.weight_block_size
+ if weight_block_size is not None:
+ assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+ weight = w
+ weight_scale = self_attn.kv_b_proj.weight_scale_inv
+ w = int8_block_dequant(
+ weight, weight_scale, weight_block_size
+ ).to(torch.bfloat16)
+ else:
+ # channel-wise int8 need it
+ w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+ torch.bfloat16
+ )
+
+ w_kc, w_vc = w.unflatten(
+ 0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+ ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+
+ if (
+ _use_aiter_gfx95
+ and self.quant_config is not None
+ and self.quant_config.get_name() == "quark"
+ ):
+ w_kc, self_attn.w_scale_k, w_vc, self_attn.w_scale_v = (
+ quark_post_load_weights(self_attn, w, "mxfp4")
+ )
+
+ if not use_deep_gemm_bmm:
+ self_attn.w_kc = bind_or_assign(
+ self_attn.w_kc, w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+ )
+ w_vc = w_vc.contiguous().transpose(1, 2)
+ if _is_npu:
+ w_vc = w_vc.contiguous()
+ self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc)
+ if (
+ hasattr(self_attn.kv_b_proj, "weight_scale")
+ and self_attn.w_scale is None
+ ):
+ self_attn.w_scale = bind_or_assign(
+ self_attn.w_scale, self_attn.kv_b_proj.weight_scale
+ )
+ if _is_hip:
+ self_attn.w_scale *= 2.0
+ # TODO: remove this after adding FP8 support in bmm cpu kernel
+ if _is_cpu and _is_cpu_amx_available and w.dtype == torch.float8_e4m3fn:
+ self_attn.w_kc = (
+ self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
+ )
+ self_attn.w_vc = (
+ self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
+ )
+ else:
+ num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
+ num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
+ ws_kc, ws_vc = block_scale.unflatten(
+ 0, (-1, (num_tiles_k + num_tiles_n))
+ ).split([num_tiles_k, num_tiles_n], dim=1)
+ self_attn.w_scale_k = bind_or_assign(
+ self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
+ )
+ self_attn.w_scale_v = bind_or_assign(
+ self_attn.w_scale_v, ws_vc.contiguous()
+ )
+ self_attn.w_kc = bind_or_assign(
+ self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
+ )
+ self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
+ self_attn.use_deep_gemm_bmm = True
+
+ def _maybe_quant_weights_to_fp8_ue8m0(
+ self,
+ weights,
+ attn_quant_modules,
+ nextn_conf: NextNConfig,
+ ):
+ """Optionally quantize weights to FP8 UE8M0 format for DeepSeek nvfp4 checkpoints.
+
+ Args:
+ weights: Iterable of (name, tensor) weight pairs
+ attn_quant_modules: List of attention module names to quantize
+ nextn_conf: NextN configuration
+
+ Returns:
+ Original weights iterator if no quantization needed,
+ otherwise list of (name, tensor) pairs with quantized weights
+ """
+ weight_block_size = [128, 128]
+ partial_names = []
+
+ match nextn_conf:
+ case NextNEnabledConfig(nextn_layer_id=layer_id):
+ if envs.SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN.get():
+ for stem in attn_quant_modules:
+ partial_names.append(
+ f"model.layers.{layer_id}.self_attn.{stem}"
+ )
+
+ if enable_nextn_moe_bf16_cast_to_fp8(self.quant_config):
+ expert_sub_names = ["shared_experts"] + [
+ f"experts.{i}" for i in range(self.config.n_routed_experts)
+ ]
+ for expert_sub_name in expert_sub_names:
+ for stem in ["gate_proj", "up_proj", "down_proj"]:
+ partial_names.append(
+ f"model.layers.{layer_id}.mlp.{expert_sub_name}.{stem}"
+ )
+
+ case NextNDisabledConfig():
+ if envs.SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN.get():
+ for layer_id in range(self.config.num_hidden_layers):
+ for stem in attn_quant_modules:
+ partial_names.append(
+ f"model.layers.{layer_id}.self_attn.{stem}"
+ )
+
+ # Early return if no quantization needed - avoid materializing all weights into memory
+ if not partial_names:
+ return weights
+
+ # Only materialize weights dict when quantization is actually needed
+ weights_dict = dict(weights)
+
+ for partial_name in tqdm.tqdm(partial_names, desc="quant weights to fp8 ue8m0"):
+ original_weight = weights_dict[f"{partial_name}.weight"]
+ out_w, out_s = quant_weight_ue8m0(
+ original_weight, weight_block_size=weight_block_size
+ )
+ weights_dict[f"{partial_name}.weight"] = out_w
+ weights_dict[f"{partial_name}.weight_scale_inv"] = out_s
+
+ if isinstance(
+ nextn_conf, NextNEnabledConfig
+ ) and enable_nextn_moe_bf16_cast_to_fp8(self.quant_config):
+ self._mark_nextn_moe_weights_as_ue8m0()
+
+ return list(weights_dict.items())
+
+ def _mark_nextn_moe_weights_as_ue8m0(self):
+ """Mark NextN MoE weight scales as UE8M0 format to avoid requantization."""
+ experts = self.model.decoder.mlp.experts
+ w13_scale = (
+ experts.w13_weight_scale_inv
+ if hasattr(experts, "w13_weight_scale_inv")
+ else experts.w13_weight_scale
+ )
+ w2_scale = (
+ experts.w2_weight_scale_inv
+ if hasattr(experts, "w2_weight_scale_inv")
+ else experts.w2_weight_scale
+ )
+ w13_scale.format_ue8m0 = True
+ w2_scale.format_ue8m0 = True
diff --git a/sglang/python/sglang/srt/models/deepseek_common/utils.py b/sglang/python/sglang/srt/models/deepseek_common/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5be323f29d4f2eb46ecba0139899b01339a2788e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_common/utils.py
@@ -0,0 +1,115 @@
+# Copyright 2026 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import logging
+import math
+from typing import Optional
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.moe.fused_moe_triton.layer import get_moe_runner_backend
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.utils import (
+ cpu_has_amx_support,
+ get_bool_env_var,
+ get_device_sm,
+ is_cpu,
+ is_cuda,
+ is_gfx95_supported,
+ is_hip,
+ is_npu,
+ is_nvidia_cublas_version_ge_12_9,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_device_sm = get_device_sm()
+_is_gfx95_supported = is_gfx95_supported()
+_use_aiter_gfx95 = _use_aiter and _is_gfx95_supported
+
+
+_is_cublas_ge_129 = is_nvidia_cublas_version_ge_12_9()
+
+logger = logging.getLogger(__name__)
+
+NVFP4_CKPT_FP8_ATTN_QUANT_MODULES = ["q_b_proj"]
+
+FORWARD_ABSORB_CORE_ATTENTION_BACKENDS = [
+ "fa3",
+ "nsa",
+ "flashinfer",
+ "cutlass_mla",
+ "trtllm_mla",
+ "ascend",
+]
+
+
+def awq_dequantize_func():
+ """
+ Get the AWQ dequantize function for the current device
+
+    Returns:
+        - The AWQ dequantize function for the current device.
+        - None if the current device is not supported.
+    """
+ if _is_cuda:
+ from sgl_kernel import awq_dequantize
+
+ return awq_dequantize
+ elif _is_hip:
+ from sglang.srt.layers.quantization.awq_triton import (
+ awq_dequantize_triton as awq_dequantize,
+ )
+
+ return awq_dequantize
+ elif _is_npu:
+ from sglang.srt.layers.quantization.awq_triton import (
+ awq_dequantize_decomposition as awq_dequantize,
+ )
+
+ return awq_dequantize
+ else:
+ return None
+
+
+def enable_nextn_moe_bf16_cast_to_fp8(
+ quant_config: Optional[QuantizationConfig],
+) -> bool:
+ return (
+ envs.SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE.get()
+ and quant_config is not None
+ and quant_config.get_name() == "modelopt_fp4"
+ and get_moe_runner_backend().is_deep_gemm()
+ )
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+ if scale <= 1:
+ return 1.0
+ return 0.1 * mscale * math.log(scale) + 1.0
+
+
+def _get_llama_4_scaling(
+ original_max_position_embeddings: int, scaling_beta: float, positions: torch.Tensor
+) -> torch.Tensor:
+ scaling = 1 + scaling_beta * torch.log(
+ 1 + torch.floor(positions / original_max_position_embeddings)
+ )
+ return scaling[..., None, None]
diff --git a/sglang/python/sglang/srt/multimodal/__pycache__/vit_cuda_graph_runner.cpython-311.pyc b/sglang/python/sglang/srt/multimodal/__pycache__/vit_cuda_graph_runner.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1790ef1a7b33108da49a002f1c4be9af65aefe5c
Binary files /dev/null and b/sglang/python/sglang/srt/multimodal/__pycache__/vit_cuda_graph_runner.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/multimodal/evs/__pycache__/evs_module.cpython-311.pyc b/sglang/python/sglang/srt/multimodal/evs/__pycache__/evs_module.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a95ac4f88db48aea7bbb9a53c8bcbd9c90a6a29b
Binary files /dev/null and b/sglang/python/sglang/srt/multimodal/evs/__pycache__/evs_module.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/multimodal/evs/__pycache__/evs_processor.cpython-311.pyc b/sglang/python/sglang/srt/multimodal/evs/__pycache__/evs_processor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e35d36bcd8adfcdd0fe91aa6464dd91c6db90da4
Binary files /dev/null and b/sglang/python/sglang/srt/multimodal/evs/__pycache__/evs_processor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/multimodal/evs/evs_module.py b/sglang/python/sglang/srt/multimodal/evs/evs_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7ab6296e074a93bc56fe2cb9261b78bbe144eee
--- /dev/null
+++ b/sglang/python/sglang/srt/multimodal/evs/evs_module.py
@@ -0,0 +1,201 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+import dataclasses
+import typing
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+import torch
+from transformers import PretrainedConfig
+
+from sglang.srt.managers.schedule_batch import MultimodalDataItem
+from sglang.srt.mem_cache.multimodal_cache import EmbeddingResult
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
+from sglang.utils import logger
+
+from .evs_core import compute_retention_mask, replace_offsets_with_tokens_per_frame
+
+
+@dataclasses.dataclass(kw_only=True)
+class EVSDataItem(MultimodalDataItem):
+ thw_grids: list[tuple[int, int, int]]
+
+
+@dataclasses.dataclass(kw_only=True)
+class VideoEVSDataItem(EVSDataItem):
+ pre_chunked_input_ids: torch.Tensor
+
+ def __post_init__(self):
+ assert self.is_video()
+
+
+@dataclass(kw_only=True)
+class EVSEmbeddingResult(EmbeddingResult):
+ """
+ Embedding result that includes per-frame token counts after EVS pruning.
+
+ After pruning, each frame retains a different number of tokens based on its
+ dissimilarity to the previous frame. This metadata is needed downstream to
+ adjust the input_ids placeholder spans to match the actual embedding sizes.
+
+ Attributes:
+ embedding: The pruned video embeddings tensor.
+ num_tokens_per_frame: Actual retained token count for each frame.
+ For example, [256, 180, 195, 256] means frame 0 kept all 256 tokens
+ (first frame is never pruned), while frames 1-2 were pruned.
+ """
+
+ num_tokens_per_frame: list[int]
+
+ def redistribute_pruned_frames_placeholders(
+ self,
+ input_ids: torch.Tensor,
+ offsets: list[tuple[int, int]],
+ *,
+ item: VideoEVSDataItem,
+ extend_prefix_len: int,
+ extend_seq_len: int,
+ ) -> tuple[torch.Tensor, list[tuple[int, int]]]:
+ assert len(input_ids) == extend_seq_len
+ assert isinstance(
+ item, VideoEVSDataItem
+ ), f"Expected VideoEVSDataItem, got {type(item)}"
+ pre_chunked_input_ids = item.pre_chunked_input_ids
+ filler_token_id = item.pad_value
+ input_ids_list = replace_offsets_with_tokens_per_frame(
+ pre_chunked_input_ids=pre_chunked_input_ids,
+ num_tokens_per_frame=self.num_tokens_per_frame,
+ frame_offsets_inclusive=offsets,
+ filler_token_id=filler_token_id,
+ )
+ input_ids = torch.tensor(
+ input_ids_list, dtype=input_ids.dtype, device=input_ids.device
+ )
+ offsets = BaseMultimodalProcessor.get_mm_items_offset(
+ input_ids, filler_token_id
+ )
+ input_ids = input_ids[extend_prefix_len : extend_prefix_len + extend_seq_len]
+ assert (
+ len(input_ids) == extend_seq_len
+ ), f"Input ids length changed after redistribution, got {len(input_ids)} != {extend_seq_len}"
+ return input_ids, offsets
+
+
+@dataclass(frozen=True, kw_only=True)
+class EVSConfig:
+ video_pruning_rate: float
+ spatial_merge_size: int = 1
+
+ def __post_init__(self):
+ assert (
+ self.video_pruning_rate >= 0.0 and self.video_pruning_rate < 1.0
+ ), f"Video pruning rate must be between 0.0 and 1.0, got {self.video_pruning_rate=}"
+
+
+class EVS(torch.nn.Module, ABC):
+ """
+ Base class for video models that support EVS pruning.
+
+ Subclass this alongside your model class and implement the static `create_evs_config`.
+ On initialization, if video_pruning_rate > 0, this mixin replaces the model's
+ get_video_feature() method with a wrapper that applies EVS pruning.
+
+ Example: See `NemotronH_Nano_VL_V2`
+ """
+
+ @staticmethod
+ @abstractmethod
+ def create_evs_config(config: PretrainedConfig) -> EVSConfig:
+ """Extract EVS parameters from model config. Must be implemented by subclass."""
+ raise NotImplementedError
+
+ @abstractmethod
+ def get_video_feature(self, items: list[MultimodalDataItem]) -> torch.Tensor:
+        """Extract video feature embeddings for the given data items. Must be implemented by subclass."""
+ raise NotImplementedError
+
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ *args: typing.Any,
+ **kwargs: typing.Any,
+ ) -> None:
+ super().__init__()
+ model_name = self.__class__.__name__
+ self.original_get_video_feature = self.get_video_feature
+ self.evs_config = self.create_evs_config(config)
+ self.evs_enabled = self.evs_config.video_pruning_rate > 0.0
+ if self.evs_enabled:
+ logger.info(f"[EVS] enabled for {model_name} [{self.evs_config}]")
+ self.get_video_feature = self.evs_video
+ else:
+ logger.info(
+ f"[EVS] requested on model {model_name} but is disabled for pruning_rate == 0.0."
+ )
+
+ def evs_video(self, items: list[MultimodalDataItem]) -> EVSEmbeddingResult:
+ """
+ Apply EVS pruning to video embeddings.
+
+ Args:
+ items: List containing a single VideoEVSDataItem with video features.
+
+ Returns:
+ EVSEmbeddingResult with pruned embeddings and actual token counts per frame.
+ """
+ logger.debug(
+ f"[EVS] beginning for model {self.__class__.__name__} [evs_config={self.evs_config=}]"
+ )
+ assert len(items) == 1, f"Expected 1 item, got {len(items)}"
+ item = items[0]
+ assert isinstance(
+ item, VideoEVSDataItem
+ ), f"Expected VideoEVSDataItem with modality VIDEO, got {item}"
+
+ q = self.evs_config.video_pruning_rate
+ merge = self.evs_config.spatial_merge_size
+ videos_features = self.original_get_video_feature([item])
+ if videos_features.ndim == 3:
+ videos_features = videos_features.flatten(0, 1)
+ assert videos_features.ndim == 2, videos_features.ndim
+
+ final_embeddings: list[torch.Tensor] = []
+ num_tokens_per_frame: list[int] = []
+
+ sizes = [(t * h * w // merge**2) for t, h, w in item.thw_grids]
+ for single_video, video_size_thw in zip(
+ videos_features.split(sizes),
+ item.thw_grids,
+ strict=True,
+ ):
+ retention_mask = compute_retention_mask(
+ single_video,
+ video_size_thw=video_size_thw,
+ spatial_merge_size=merge,
+ q=q,
+ )
+ preserved = single_video[retention_mask]
+ final_embeddings.append(preserved)
+ num_frames = video_size_thw[0]
+ tokens_per_frame = (
+ retention_mask.reshape(num_frames, -1).sum(dim=-1).tolist()
+ )
+ num_tokens_per_frame.extend(tokens_per_frame)
+ final_embeddings_tensor = torch.cat(final_embeddings)
+ return EVSEmbeddingResult(
+ embedding=final_embeddings_tensor,
+ num_tokens_per_frame=num_tokens_per_frame,
+ )
diff --git a/sglang/python/sglang/srt/multimodal/processors/__pycache__/base_processor.cpython-311.pyc b/sglang/python/sglang/srt/multimodal/processors/__pycache__/base_processor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81f187f0879eef2baaae8176d60aa0dbdec712ef
Binary files /dev/null and b/sglang/python/sglang/srt/multimodal/processors/__pycache__/base_processor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/observability/__pycache__/cpu_monitor.cpython-311.pyc b/sglang/python/sglang/srt/observability/__pycache__/cpu_monitor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdd00bbc8a64abe6a4ef05ec6d8a1da858e76a33
Binary files /dev/null and b/sglang/python/sglang/srt/observability/__pycache__/cpu_monitor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/observability/__pycache__/func_timer.cpython-311.pyc b/sglang/python/sglang/srt/observability/__pycache__/func_timer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d12a0107449e9872f8fc1f9567b9f086b87b722
Binary files /dev/null and b/sglang/python/sglang/srt/observability/__pycache__/func_timer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/observability/__pycache__/metrics_collector.cpython-311.pyc b/sglang/python/sglang/srt/observability/__pycache__/metrics_collector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..690f5b8b711eb71dc1f7f77ebeadbcf7e34a5861
Binary files /dev/null and b/sglang/python/sglang/srt/observability/__pycache__/metrics_collector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/observability/__pycache__/req_time_stats.cpython-311.pyc b/sglang/python/sglang/srt/observability/__pycache__/req_time_stats.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc4569d50aa2de37f72472fd3b6dfb70a3823162
Binary files /dev/null and b/sglang/python/sglang/srt/observability/__pycache__/req_time_stats.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/observability/__pycache__/request_metrics_exporter.cpython-311.pyc b/sglang/python/sglang/srt/observability/__pycache__/request_metrics_exporter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0ada339356e6ae9caf0f27e7d4793cc511c8802
Binary files /dev/null and b/sglang/python/sglang/srt/observability/__pycache__/request_metrics_exporter.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/observability/__pycache__/scheduler_metrics_mixin.cpython-311.pyc b/sglang/python/sglang/srt/observability/__pycache__/scheduler_metrics_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1cfd6a979bd11eff8edc56ff4025837623b6be5
Binary files /dev/null and b/sglang/python/sglang/srt/observability/__pycache__/scheduler_metrics_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/observability/__pycache__/trace.cpython-311.pyc b/sglang/python/sglang/srt/observability/__pycache__/trace.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa14bc4738327e0acc62d65b18d453588869e1ec
Binary files /dev/null and b/sglang/python/sglang/srt/observability/__pycache__/trace.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/observability/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/observability/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..703bd6447cd37579b33456892dcf548b87b96883
Binary files /dev/null and b/sglang/python/sglang/srt/observability/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/draft_utils.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/draft_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..865d1dd10a33fac6fff5fab0c8b192cec65d4f41
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/draft_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/eagle_draft_cuda_graph_runner.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/eagle_draft_cuda_graph_runner.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8749fed8a21e72ebc136fc61a9a60ec358e417e7
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/eagle_draft_cuda_graph_runner.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/eagle_draft_extend_cuda_graph_runner.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/eagle_draft_extend_cuda_graph_runner.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95a181fff41837c1e3b661f22c9de0e2214c98a4
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/eagle_draft_extend_cuda_graph_runner.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/eagle_info.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/eagle_info.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ef7c54d7203af52a570ce5b730f5a4611ed2530
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/eagle_info.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/eagle_info_v2.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/eagle_info_v2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9c445e7f740a03a6ec86cba601ebcaa054d217e
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/eagle_info_v2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/eagle_utils.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/eagle_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b69a362598050021c6cea30f939d69b139612bb
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/eagle_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/eagle_worker.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/eagle_worker.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fcfbcf9c7ceea2219b78eeeef34fc345931a0a9
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/eagle_worker.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/spec_info.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/spec_info.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..536dd2428c40c1c46cabf23cf51818e9bd6c087b
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/spec_info.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/spec_utils.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/spec_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40a0e44678464653beacd2a2e6e1286db2c4ce5d
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/spec_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/__pycache__/standalone_worker.cpython-311.pyc b/sglang/python/sglang/srt/speculative/__pycache__/standalone_worker.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd2ca8dcc1b391aa528783976eb63c25239819f4
Binary files /dev/null and b/sglang/python/sglang/srt/speculative/__pycache__/standalone_worker.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/speculative/cpp_ngram/.clang-format b/sglang/python/sglang/srt/speculative/cpp_ngram/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..be44d89a697d41a1418fb8db64d2dbb66aa8f9ee
--- /dev/null
+++ b/sglang/python/sglang/srt/speculative/cpp_ngram/.clang-format
@@ -0,0 +1,15 @@
+BasedOnStyle: Google
+IndentWidth: 2
+ColumnLimit: 120
+AllowShortFunctionsOnASingleLine: Empty
+DerivePointerAlignment: false
+PointerAlignment: Left
+NamespaceIndentation: None
+SortIncludes: true
+AllowShortLoopsOnASingleLine: false
+BinPackParameters: false # Prevents packing parameters in declarations
+BinPackArguments: false # Prevents packing arguments in function calls
+AlignAfterOpenBracket: AlwaysBreak # Forces a break after the opening parenthesis
+AlignOperands: Align # Aligns arguments vertically
+PenaltyBreakBeforeFirstCallParameter: 1 # Encourages breaking before the first argument
+PenaltyReturnTypeOnItsOwnLine: 100 # Keeps return type with function name
diff --git a/sglang/python/sglang/srt/speculative/cpp_ngram/ngram.cpp b/sglang/python/sglang/srt/speculative/cpp_ngram/ngram.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e7f0297e2e1b8bb1125c33890e82118877be7eb4
--- /dev/null
+++ b/sglang/python/sglang/srt/speculative/cpp_ngram/ngram.cpp
@@ -0,0 +1,381 @@
+#include "ngram.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace ngram {
+
+struct Node {
+ std::unordered_map next;
+};
+
+Ngram::Result fillResult(int last_token, int draft_token_num, std::vector& tree, int root) {
+ Ngram::Result info;
+ std::vector prevs;
+ info.token.reserve(draft_token_num);
+ prevs.reserve(draft_token_num);
+ std::queue> queue;
+ info.token.emplace_back(last_token);
+ prevs.emplace_back(-1);
+
+ for (auto [token, next] : tree[root].next) {
+ queue.emplace(token, next, 0);
+ }
+ while (queue.size()) {
+ auto [token, next, prev] = queue.front();
+ queue.pop();
+ info.token.emplace_back(token);
+ prevs.emplace_back(prev);
+ for (auto [t, n] : tree[next].next) {
+ queue.emplace(t, n, info.token.size() - 1);
+ }
+ }
+
+ // zero padding to length
+ while (info.token.size() < draft_token_num) {
+ info.token.emplace_back(0);
+ prevs.emplace_back(0);
+ }
+
+ int n = info.token.size();
+ info.mask.resize(n * n, 0);
+ info.mask[0] = 1;
+ for (int i = 0; i < n; ++i) {
+ if (prevs[i] != -1) {
+ memcpy(&info.mask[i * n], &info.mask[prevs[i] * n], prevs[i] + 1);
+ }
+ info.mask[i * n + i] = 1;
+ }
+
+ return info;
+}
+
+Ngram::Ngram(size_t capacity, const Param& param) {
+ param_ = param;
+ nodes_.resize(capacity);
+ for (auto& node : nodes_) {
+ node_pool_.emplace_back(&node);
+ }
+ free_node_count_ = node_pool_.size();
+ root_ = getNode();
+
+ if (!(param_.branch_length > 1)) {
+ throw std::runtime_error(
+ "param_.branch_length must be greater than 1, current value: " + std::to_string(param_.branch_length));
+ }
+ if (!(param_.min_match_window_size > 0)) {
+ throw std::runtime_error(
+ "min_match_window_size must be greater than 0, current value: " + std::to_string(param_.min_match_window_size));
+ }
+ if (!(param_.min_match_window_size <= param_.max_match_window_size)) {
+ throw std::runtime_error(
+ "min_match_window_size must be less than or equal to max_match_window_size, current min_match_window_size: " +
+ std::to_string(param_.min_match_window_size) +
+ ", max_match_window_size: " + std::to_string(param_.max_match_window_size));
+ }
+ if (!(param_.max_match_window_size < param_.branch_length)) {
+ throw std::runtime_error(
+ "max_match_window_size must be less than branch_length, current max_match_window_size: " +
+ std::to_string(param_.max_match_window_size) + ", branch_length: " + std::to_string(param_.branch_length));
+ }
+ if (!(param_.min_bfs_breadth > 0)) {
+ throw std::runtime_error(
+ "min_bfs_breadth must be greater than 0, current value: " + std::to_string(param_.min_bfs_breadth));
+ }
+ if (!(param_.min_bfs_breadth <= param_.max_bfs_breadth)) {
+ throw std::runtime_error(
+ "min_bfs_breadth must be less than or equal to max_bfs_breadth, current min_bfs_breadth: " +
+ std::to_string(param_.min_bfs_breadth) + ", max_bfs_breadth: " + std::to_string(param_.max_bfs_breadth));
+ }
+ if (!(param_.draft_token_num > 0)) {
+ throw std::runtime_error(
+ "draft_token_num must be greater than 0, current value: " + std::to_string(param_.draft_token_num));
+ }
+ for (auto config : param_.batch_draft_token_num) {
+ if (config != std::numeric_limits::max()) {
+ if (!(config <= param_.draft_token_num)) {
+ throw std::runtime_error(
+ "batch_draft_token_num config value " + std::to_string(config) +
+ " must be less than or equal to draft_token_num: " + std::to_string(param_.draft_token_num));
+ }
+ }
+ }
+ for (auto config : param_.batch_min_match_window_size) {
+ if (config != std::numeric_limits::max()) {
+ if (!(config >= param_.min_match_window_size)) {
+ throw std::runtime_error(
+ "batch_min_match_window_size config value " + std::to_string(config) +
+ " must be greater than or equal to min_match_window_size: " + std::to_string(param_.min_match_window_size));
+ }
+ if (!(config <= param_.max_match_window_size)) {
+ throw std::runtime_error(
+ "batch_min_match_window_size config value " + std::to_string(config) +
+ " must be less than or equal to max_match_window_size: " + std::to_string(param_.max_match_window_size));
+ }
+ }
+ }
+
+ quit_flag_ = false;
+ insert_worker_ = std::thread(&Ngram::insert, this);
+}
+
+Ngram::~Ngram() {
+ quit_flag_ = true;
+ insert_queue_.close();
+ insert_worker_.join();
+}
+
+std::vector> Ngram::match(const std::vector& tokens, size_t batch_size) const {
+ auto draft_token_num = param_.get_draft_token_num(batch_size);
+ auto min_match_window_size = param_.get_min_match_window_size(batch_size);
+ auto max_match_window_size = param_.max_match_window_size;
+ std::vector> result;
+ result.reserve(param_.max_match_window_size - param_.min_match_window_size);
+ for (int32_t match_window_size = std::min(tokens.size(), param_.max_match_window_size);
+ match_window_size >= param_.min_match_window_size;
+ --match_window_size) {
+ auto start = tokens.data() + tokens.size() - match_window_size;
+ auto end = start + match_window_size;
+ auto cursor = root_;
+ while (start != end) {
+ auto iter = cursor->child.find(*start);
+ if (iter == cursor->child.end()) {
+ cursor = nullptr;
+ break;
+ }
+ ++start;
+ cursor = iter->second;
+ }
+ if (cursor) {
+ result.emplace_back(std::make_pair(cursor, match_window_size));
+ }
+ }
+ return result;
+}
+
+void Ngram::squeeze(size_t count) {
+ if (!(node_pool_.size() >= free_node_count_ + count)) {
+ throw std::runtime_error(
+ "Insufficient node size to release required nodes. "
+ "available to release: " +
+ std::to_string(node_pool_.size() - free_node_count_) + ", required to release: " + std::to_string(count));
+ }
+ while (count--) {
+ auto last = global_lru_.back();
+ global_lru_.pop_back();
+
+ if (!last->child.empty()) {
+ throw std::runtime_error("The node to be released still has child nodes and cannot be released. ");
+ }
+
+ last->parent->lru.erase(last->parent_lru_pos);
+ last->parent->sorted_children.erase(last);
+ last->parent->child.erase(last->token);
+
+ node_pool_[free_node_count_++] = last;
+ }
+}
+
+void Ngram::synchronize() const {
+ while (!insert_queue_.empty()) {
+ std::this_thread::sleep_for(std::chrono::microseconds(10));
+ }
+}
+
+void Ngram::insert() {
+ while (!quit_flag_) {
+ std::vector data;
+ if (!insert_queue_.dequeue(data)) {
+ continue;
+ }
+ const auto* token = data.data();
+ size_t size = data.size();
+ std::unique_lock lock(mutex_);
+
+ for (size_t i = 0; i + param_.min_match_window_size < size; ++i) {
+ auto start = token + i;
+ auto end = start + std::min(size - i, param_.branch_length);
+
+ if (end - start > free_node_count_) {
+ squeeze(end - start - free_node_count_);
+ }
+
+ TrieNode* cursor = root_;
+ path_.clear();
+ while (start != end) {
+ auto token = *start;
+ auto iter = cursor->child.find(token);
+ if (iter == cursor->child.end()) {
+ iter = cursor->child.insert({token, getNode()}).first;
+ auto node = iter->second;
+
+ cursor->lru.emplace_front(node);
+ global_lru_.emplace_back(node);
+
+ node->token = token;
+ node->parent = cursor;
+ node->parent_lru_pos = cursor->lru.begin();
+ node->global_lru_pos = --global_lru_.end();
+ node->freq = 1;
+ cursor->sorted_children.insert(node);
+ } else {
+ auto node = iter->second;
+ cursor->sorted_children.erase(node);
+ node->freq++;
+ cursor->sorted_children.insert(node);
+ cursor->lru.splice(cursor->lru.begin(), cursor->lru, node->parent_lru_pos);
+ }
+ cursor = iter->second;
+ path_.emplace_back(cursor);
+ ++start;
+ }
+
+ for (auto it = path_.rbegin(); it != path_.rend(); ++it) {
+ TrieNode* node = *it;
+ global_lru_.splice(global_lru_.begin(), global_lru_, node->global_lru_pos);
+ }
+ }
+ }
+}
+
+void Ngram::asyncInsert(std::vector>&& tokens) {
+ for (auto&& token : tokens) {
+ insert_queue_.enqueue(std::move(token));
+ }
+}
+
+Ngram::Result Ngram::matchBFS(const std::vector& tokens, size_t batch_size) const {
+ std::vector> nodes = match(tokens, batch_size);
+
+ double bfs_breadth_scale = double(param_.max_bfs_breadth - param_.min_bfs_breadth) /
+ (param_.max_match_window_size - param_.min_match_window_size + 1);
+
+ auto draft_token_num = param_.get_draft_token_num(batch_size);
+ std::vector tree(draft_token_num + 1);
+ int root = 0;
+ int cursor = 1;
+
+ for (auto [node, depth] : nodes) {
+ std::queue> queue; // parent, bfs_breadth, node
+ queue.push({root, (param_.max_match_window_size - depth) * bfs_breadth_scale + param_.min_bfs_breadth, node});
+ while (queue.size() && cursor <= draft_token_num) {
+ auto front = queue.front();
+ queue.pop();
+
+ auto parent = std::get<0>(front);
+ auto cur_breadth = std::get<1>(front);
+ auto iter = std::get<2>(front)->lru.begin();
+
+ auto breadth = std::max(1, int32_t(cur_breadth));
+ for (int i = 0; i < breadth && iter != std::get<2>(front)->lru.end() && cursor <= draft_token_num; ++i, ++iter) {
+ auto token = (*iter)->token;
+ auto pos = -1;
+ if (auto tit = tree[parent].next.find(token); tit != tree[parent].next.end()) {
+ pos = tit->second;
+ } else {
+ pos = tree[parent].next.insert(std::make_pair(token, cursor++)).first->second;
+ }
+ queue.emplace(pos, cur_breadth - bfs_breadth_scale, *iter);
+ }
+ }
+ }
+
+ return fillResult(tokens.back(), draft_token_num + 1, tree, root);
+}
+
+Ngram::Result Ngram::matchProb(const std::vector& tokens, size_t batch_size) const {
+ std::vector> nodes = match(tokens, batch_size);
+ auto draft_token_num = param_.get_draft_token_num(batch_size);
+
+ struct CompareByLastDouble {
+ bool operator()(
+ const std::tuple& a, // parent_pos, node, final_prob
+ const std::tuple& b) const {
+ return std::get<2>(a) < std::get<2>(b);
+ }
+ };
+
+ std::priority_queue<
+ std::tuple,
+ std::vector>,
+ CompareByLastDouble>
+ heap;
+
+ std::vector tree(draft_token_num + 1);
+
+ int root = 0;
+ int cursor = 1;
+ int top_k = param_.max_bfs_breadth;
+
+ auto addToHeap = [&heap, &top_k](int parent, const TrieNode* trie_node, double prob) -> void {
+ double sum_freq = 0.0;
+ int count = 0;
+ std::list> topk_children;
+ for (auto* child : trie_node->sorted_children) {
+ sum_freq += static_cast(child->freq);
+ topk_children.emplace_back(child, child->freq);
+ if (++count >= top_k) break;
+ }
+ if (sum_freq <= 0) sum_freq = 1.0;
+ for (const auto& [child, freq] : topk_children) {
+ double norm_freq = static_cast(freq) / sum_freq * prob;
+ heap.emplace(parent, child, norm_freq);
+ }
+ };
+
+ for (auto [node, _] : nodes) {
+ addToHeap(root, node, 1.0);
+
+ while (!heap.empty() && cursor <= draft_token_num) {
+ auto [parent, trie_node, prob] = heap.top(); // parent_pos, node, final_prob
+ heap.pop();
+ auto token = trie_node->token;
+ int pos = -1;
+ auto tit = tree[parent].next.find(token);
+ if (tit != tree[parent].next.end()) {
+ pos = tit->second;
+ } else {
+ pos = cursor++;
+ tree[parent].next[token] = pos;
+ }
+ addToHeap(pos, trie_node, prob);
+ }
+ }
+
+ return fillResult(tokens.back(), draft_token_num + 1, tree, root);
+}
+
+Ngram::Result Ngram::batchMatch(const std::vector>& tokens) const {
+ std::unique_lock lock(mutex_);
+ Result merged_result;
+ auto match_func = param_.match_type == "BFS" ? &Ngram::matchBFS : &Ngram::matchProb;
+ for (const auto& tks : tokens) {
+ Result res = (this->*match_func)(tks, tokens.size());
+ merged_result.token.insert(merged_result.token.end(), res.token.begin(), res.token.end());
+ merged_result.mask.insert(merged_result.mask.end(), res.mask.begin(), res.mask.end());
+ }
+ return merged_result;
+}
+
+void Ngram::Result::truncate(size_t n) {
+ if (n < token.size()) {
+ int full_n = token.size();
+ for (int i = 1; i < n; ++i) {
+ memcpy(&mask[i * n], &mask[i * full_n], sizeof(mask[0]) * n);
+ }
+ token.resize(n);
+ mask.resize(n * n);
+ }
+}
+
+} // namespace ngram
diff --git a/sglang/python/sglang/srt/speculative/cpp_ngram/ngram.h b/sglang/python/sglang/srt/speculative/cpp_ngram/ngram.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c9a9380ecab2385a68f898c9dd19db007927933
--- /dev/null
+++ b/sglang/python/sglang/srt/speculative/cpp_ngram/ngram.h
@@ -0,0 +1,111 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "param.h"
+#include "queue.h"
+
+namespace ngram {
+
+struct TrieNode {
+ std::unordered_map child;
+ std::list::const_iterator global_lru_pos;
+ std::list::const_iterator parent_lru_pos;
+ int32_t token;
+ TrieNode* parent;
+ std::list lru;
+ int32_t freq = 0;
+
+ struct CompareByFreq {
+ bool operator()(TrieNode* a, TrieNode* b) const {
+ return std::tie(b->freq, a->token, a) < std::tie(a->freq, b->token, b);
+ }
+ };
+ std::multiset sorted_children;
+};
+
+class Ngram {
+ std::vector nodes_;
+ std::vector node_pool_;
+ size_t free_node_count_;
+ std::list global_lru_;
+ TrieNode* root_;
+ std::vector path_;
+ Param param_;
+
+ std::vector> match(const std::vector& tokens, size_t batch_size) const;
+
+ void squeeze(size_t count);
+
+ TrieNode* getNode() {
+ auto node = node_pool_[--free_node_count_];
+ node->~TrieNode();
+ new (node) TrieNode();
+ return node;
+ }
+
+ mutable std::mutex mutex_;
+ bool quit_flag_;
+ utils::Queue> insert_queue_;
+ std::thread insert_worker_;
+ std::vector> match_tmp_data_;
+
+ public:
+ Ngram(size_t capacity, const Param& param);
+ Ngram() = default;
+ ~Ngram();
+
+ static Ngram& instance() {
+ static Ngram instance;
+ return instance;
+ }
+
+ void synchronize() const;
+
+ void asyncInsert(std::vector>&& tokens);
+
+ struct Result {
+ std::vector token;
+ std::vector mask;
+
+ void truncate(size_t n);
+ };
+
+ Result batchMatch(const std::vector>& tokens) const;
+
+ void reset() {
+ std::unique_lock lock(mutex_);
+
+ global_lru_.clear();
+ path_.clear();
+ node_pool_.clear();
+ for (auto& node : nodes_) {
+ node_pool_.emplace_back(&node);
+ }
+ free_node_count_ = node_pool_.size();
+ root_ = getNode();
+ }
+
+ const Param& param() const {
+ return param_;
+ }
+
+ private:
+ Result matchBFS(const std::vector& tokens, size_t batch_size) const;
+ Result matchProb(const std::vector& tokens, size_t batch_size) const;
+
+ void insert();
+};
+
+} // namespace ngram
diff --git a/sglang/python/sglang/srt/speculative/cpp_ngram/ngram_cache.py b/sglang/python/sglang/srt/speculative/cpp_ngram/ngram_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b1eb8eea7880eda09943231bcd386c2496770b9
--- /dev/null
+++ b/sglang/python/sglang/srt/speculative/cpp_ngram/ngram_cache.py
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+
+import logging
+import os
+from typing import List, Tuple
+
+import numpy as np
+from torch.utils.cpp_extension import load
+
+logger = logging.getLogger(__name__)
+
+_abs_path = os.path.dirname(os.path.abspath(__file__))
+ngram_cache_cpp = load(
+ name="ngram_cache_cpp",
+ sources=[
+ f"{_abs_path}/ngram_cache_binding.cpp",
+ f"{_abs_path}/ngram.cpp",
+ ],
+ extra_cflags=["-O3", "-std=c++20"],
+)
+
+
+class NgramCache:
+ def __init__(
+ self,
+ branch_length=18,
+ min_match_window_size=1,
+ max_match_window_size=10,
+ min_bfs_breadth=1,
+ max_bfs_breadth=8,
+ draft_token_num=8,
+ match_type="BFS",
+ capacity=1000000,
+ ):
+ param = ngram_cache_cpp.Param()
+ param.branch_length = branch_length
+ param.min_match_window_size = min_match_window_size
+ param.max_match_window_size = max_match_window_size
+ param.min_bfs_breadth = min_bfs_breadth
+ param.max_bfs_breadth = max_bfs_breadth
+ param.draft_token_num = draft_token_num
+ param.match_type = match_type
+ self.cache = ngram_cache_cpp.Ngram(capacity, param)
+
+ self.default_mask = np.ones((1, 1), dtype=np.int64)
+ self.draft_token_num = draft_token_num
+
+ def batch_put(self, batch_tokens: List[List[int]]):
+ self.cache.asyncInsert(batch_tokens)
+
+ def synchronize(self):
+ self.cache.synchronize()
+
+ def reset(self):
+ self.cache.reset()
+
+ def batch_get(self, batch_tokens: List[List[int]]) -> Tuple[np.ndarray, np.ndarray]:
+ result = self.cache.batchMatch(batch_tokens)
+ return np.array(result.token), np.array(result.mask)
+
+ def leaf_paths_from_mask(
+ self, tokens: List[int], tree_mask: List[List[int]]
+ ) -> List[List[int]]:
+ """
+ Find all leaf paths according to the binary tree_mask (i.e., paths that are not prefixes of any other path).
+
+        Args:
+            tokens : List[int] # token list corresponding to columns
+            tree_mask : List[List[int]] # nxn binary matrix
+
+ Returns:
+ List[List[int]] # token lists of only the leaf paths, preserving their order of appearance
+ """
+
+ row_sets = [
+ (i, {idx for idx, v in enumerate(row) if v == 1})
+ for i, row in enumerate(tree_mask)
+ ]
+ leaf_sets = []
+ leaf_rows = []
+
+ for i, cur_set in reversed(row_sets):
+ if any(cur_set <= kept for kept in leaf_sets):
+ continue
+ leaf_sets.append(cur_set)
+ leaf_rows.append(i)
+
+ leaf_rows.reverse()
+ result = []
+ for r in leaf_rows:
+ path = [tokens[col] for col in range(len(tokens)) if tree_mask[r][col] == 1]
+ result.append(path)
+
+ return result
+
+ def debug_result(
+ self, decoding_ids: np.ndarray, decoding_masks: np.ndarray, tokenizer=None
+ ):
+ decoding_ids = decoding_ids.reshape(-1, self.draft_token_num)
+ decoding_masks = decoding_masks.reshape(
+ -1, self.draft_token_num, self.draft_token_num
+ )
+ logger.info(f"\n{decoding_ids=}\n{decoding_masks=}")
+ for i in range(decoding_ids.shape[0]):
+ leaf_paths = self.leaf_paths_from_mask(
+ decoding_ids[i].tolist(), decoding_masks[i].tolist()
+ )
+ if tokenizer is None:
+ logger.info(f"draft path {i}: {leaf_paths}")
+ else:
+ logger.info(f"result {i}:")
+ for leaf_path in leaf_paths:
+ logger.info(
+ f"draft path {i}: {leaf_path} -> {tokenizer.decode(leaf_path, ensure_ascii=False)}"
+ )
+
+
+# main function
+if __name__ == "__main__":
+ format = f"%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
+ logging.basicConfig(
+ level=logging.DEBUG,
+ format=format,
+ datefmt="%Y-%m-%d %H:%M:%S",
+ force=True,
+ )
+
+ token_ids = [
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+ [1, 2, 3, 44, 55, 66, 77, 88, 99, 100],
+ ]
+ cache = NgramCache(branch_length=12, draft_token_num=8)
+ cache.batch_put(token_ids)
+
+ cache.synchronize()
+ decoding_ids, decoding_masks = cache.batch_get([[1, 2, 3], [3, 44], [3, 6, 999]])
+
+ cache.debug_result(decoding_ids, decoding_masks)
diff --git a/sglang/python/sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp b/sglang/python/sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ac5b931f9a4a18484b3ec810fb67fed153831127
--- /dev/null
+++ b/sglang/python/sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp
@@ -0,0 +1,43 @@
+#include
+#include
+
+#include "ngram.h"
+
+PYBIND11_MODULE(ngram_cache_cpp, m) {
+ using namespace ngram;
+ namespace py = pybind11;
+ m.doc() = "";
+
+ py::class_(m, "Ngram")
+ .def(py::init(), py::arg("capacity"), py::arg("param"))
+ .def("asyncInsert", &Ngram::asyncInsert, "")
+ .def("batchMatch", &Ngram::batchMatch, "")
+ .def("reset", &Ngram::reset, "")
+ .def("synchronize", &Ngram::synchronize, "");
+
+ py::class_(m, "Param")
+ .def(py::init<>())
+ .def_readwrite("enable", &Param::enable)
+ .def_readwrite("enable_router_mode", &Param::enable_router_mode)
+ .def_readwrite("min_bfs_breadth", &Param::min_bfs_breadth)
+ .def_readwrite("max_bfs_breadth", &Param::max_bfs_breadth)
+ .def_readwrite("min_match_window_size", &Param::min_match_window_size)
+ .def_readwrite("max_match_window_size", &Param::max_match_window_size)
+ .def_readwrite("branch_length", &Param::branch_length)
+ .def_readwrite("draft_token_num", &Param::draft_token_num)
+ .def_readwrite("match_type", &Param::match_type)
+ .def_readwrite("batch_min_match_window_size", &Param::batch_min_match_window_size)
+ .def_readwrite("batch_draft_token_num", &Param::batch_draft_token_num)
+ .def("get_draft_token_num", &Param::get_draft_token_num, "")
+ .def("get_min_match_window_size", &Param::get_min_match_window_size, "")
+ .def("parse", &Param::parse, "")
+ .def("resetBatchMinMatchWindowSize", &Param::resetBatchMinMatchWindowSize, "")
+ .def("resetBatchReturnTokenNum", &Param::resetBatchReturnTokenNum, "")
+ .def("detail", &Param::detail, "");
+
+ py::class_(m, "Result")
+ .def(py::init<>())
+ .def_readwrite("token", &Ngram::Result::token)
+ .def_readwrite("mask", &Ngram::Result::mask)
+ .def("truncate", &Ngram::Result::truncate);
+}
diff --git a/sglang/python/sglang/srt/speculative/cpp_ngram/param.h b/sglang/python/sglang/srt/speculative/cpp_ngram/param.h
new file mode 100644
index 0000000000000000000000000000000000000000..08b975bb18bc89b296d49a56e898cc289dae5131
--- /dev/null
+++ b/sglang/python/sglang/srt/speculative/cpp_ngram/param.h
@@ -0,0 +1,126 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace ngram {
+
+struct Param {
+ bool enable;
+ bool enable_router_mode;
+ size_t min_bfs_breadth;
+ size_t max_bfs_breadth;
+ size_t min_match_window_size;
+ size_t max_match_window_size;
+ size_t branch_length;
+ size_t draft_token_num;
+ std::string match_type;
+
+ std::vector batch_min_match_window_size;
+ std::vector batch_draft_token_num;
+
+ size_t get_draft_token_num(size_t batch_size) const {
+ if (batch_size < batch_draft_token_num.size()) {
+ if (batch_draft_token_num[batch_size] !=
+ std::numeric_limits::max()) {
+ return batch_draft_token_num[batch_size];
+ }
+ }
+ return draft_token_num - 1;
+ }
+
+ size_t get_min_match_window_size(size_t batch_size) const {
+ if (batch_size < batch_min_match_window_size.size()) {
+ if (batch_min_match_window_size[batch_size] !=
+ std::numeric_limits::max()) {
+ return batch_min_match_window_size[batch_size];
+ }
+ }
+ return min_match_window_size;
+ }
+
+ std::vector parse(const std::string& value) {
+ // 0-1|10,2-3|20,
+ std::vector result;
+ if (value.empty()) {
+ return result;
+ }
+ std::vector mark;
+ std::regex comma_re(",");
+ std::sregex_token_iterator first{value.begin(), value.end(), comma_re, -1}, last;
+ for (auto p : std::vector(first, last)) {
+ std::cerr << "seg " << p << std::endl;
+ }
+ for (const auto& seg : std::vector(first, last)) {
+ std::regex pipe_re("\\|");
+ std::sregex_token_iterator seg_first{seg.begin(), seg.end(), pipe_re, -1}, seg_last;
+ std::vector part(seg_first, seg_last);
+ for (auto p : part) {
+ std::cerr << "part " << p << std::endl;
+ }
+ if (part.size() != 2) {
+ throw std::runtime_error(
+ "failed to get config, invalid config: " + seg + ", part's size = " + std::to_string(part.size()));
+ }
+ std::regex endash_re("-");
+ std::sregex_token_iterator range_first{part[0].begin(), part[0].end(), endash_re, -1}, range_last;
+ std::vector range(range_first, range_last);
+ if (range.size() != 2) {
+ throw std::runtime_error("failed to get range, invalid config: " + value);
+ }
+ size_t L = std::atoi(range[0].c_str());
+ size_t R = std::atoi(range[1].c_str());
+ if (L > R || R > 128) {
+ throw std::runtime_error("invalid range, config: " + value);
+ }
+ if (R >= result.size()) {
+ result.resize(R + 1, std::numeric_limits::max());
+ mark.resize(result.size(), false);
+ }
+ size_t config = std::atoi(part[1].c_str());
+ do {
+ if (mark[L]) {
+ throw std::runtime_error("repeated position " + std::to_string(L) + ", config : " + value);
+ }
+ mark[L] = true;
+ result[L] = config;
+ } while (++L <= R);
+ }
+ return result;
+ }
+
+ void resetBatchMinMatchWindowSize(const std::string& value) {
+ batch_min_match_window_size = parse(value);
+ }
+
+ void resetBatchReturnTokenNum(const std::string& value) {
+ batch_draft_token_num = parse(value);
+ }
+
+ std::string detail() {
+ std::stringstream ss;
+ ss << "enable = " << enable << ", enable_router_mode = " << enable_router_mode
+ << ", min_bfs_breadth = " << min_bfs_breadth << ", max_bfs_breadth = " << max_bfs_breadth
+ << ", min_match_window_size = " << min_match_window_size << ", max_match_window_size = " << max_match_window_size
+ << ", branch_length = " << branch_length << ", draft_token_num = " << draft_token_num
+ << ", match_type = " << match_type;
+ ss << ", batch_min_match_window_size(" << batch_min_match_window_size.size() << ") = ";
+ for (int i = 0; i < batch_min_match_window_size.size(); ++i) {
+ ss << i << "|" << batch_min_match_window_size[i] << ",";
+ }
+ ss << ", batch_draft_token_num(" << batch_draft_token_num.size() << ") = ";
+ for (int i = 0; i < batch_draft_token_num.size(); ++i) {
+ ss << i << "|" << batch_draft_token_num[i] << ",";
+ }
+ return ss.str();
+ }
+};
+
+} // namespace ngram
diff --git a/sglang/python/sglang/srt/speculative/cpp_ngram/queue.h b/sglang/python/sglang/srt/speculative/cpp_ngram/queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..f08aa5cecc552730f9a368860b6ec133e277cad7
--- /dev/null
+++ b/sglang/python/sglang/srt/speculative/cpp_ngram/queue.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <condition_variable>
+#include <cstddef>
+#include <mutex>
+#include <queue>
+#include <utility>
+
+namespace utils {
+
+// Minimal thread-safe blocking FIFO with close-to-shutdown semantics.
+// (Template arguments below were stripped by an angle-bracket-eating
+// transformation in the original patch text; restored so the header is
+// well-formed C++.)
+template <typename T>
+class Queue {
+ public:
+  // Move-enqueues an element. Returns false without enqueuing if the
+  // queue has been closed.
+  bool enqueue(T&& rhs) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (closed_) {
+        return false;
+      }
+      queue_.emplace(std::move(rhs));
+    }
+    // Notify outside the lock so the woken consumer doesn't immediately
+    // block on mutex_.
+    cv_.notify_one();
+    return true;
+  }
+
+  // Copy-enqueue overload; same closed-queue semantics as above.
+  bool enqueue(const T& rhs) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (closed_) {
+        return false;
+      }
+      queue_.emplace(rhs);
+    }
+    cv_.notify_one();
+    return true;
+  }
+
+  // Blocks until an element is available or the queue is closed; returns
+  // false on close. NOTE(review): elements still queued when close() is
+  // called are discarded, not drained — confirm this is the intended
+  // shutdown behavior.
+  bool dequeue(T& rhs) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cv_.wait(lock, [this] { return !queue_.empty() || closed_; });
+    if (closed_) {
+      return false;
+    }
+    rhs = std::move(queue_.front());
+    queue_.pop();
+    return true;
+  }
+
+  // Thread-safe snapshot of the element count.
+  size_t size() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return queue_.size();
+  }
+
+  // Thread-safe snapshot of emptiness.
+  bool empty() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return queue_.empty();
+  }
+
+  // Marks the queue closed and wakes every waiting consumer.
+  void close() {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      closed_ = true;
+    }
+    cv_.notify_all();
+  }
+
+ private:
+  std::queue<T> queue_;         // underlying FIFO storage
+  mutable std::mutex mutex_;    // guards queue_ and closed_; mutable for const size()/empty()
+  std::condition_variable cv_;  // signaled on enqueue and on close
+  bool closed_{false};          // set once by close(); never reset
+};
+
+} // namespace utils
diff --git a/sglang/python/sglang/srt/utils/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e9e9c69fe24200c3a72608cb5cd3590064c1cc4
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/aio_rwlock.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/aio_rwlock.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06c327376756b9f83f24c2894a235c687ed09acd
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/aio_rwlock.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/auth.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/auth.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..741cc640246b94699e83fd64ad95e6538a79734d
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/auth.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/cuda_ipc_transport_utils.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/cuda_ipc_transport_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdde5a6ddeba8399dab9ed4046f5a2bf3c58f170
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/cuda_ipc_transport_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/custom_op.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/custom_op.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4647ac676ee8ef3c2533626569e2c2d69be9019e
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/custom_op.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/device_timer.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/device_timer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f8c283a2a9406f6c2a167cf5f7dd42a3be0a2ff
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/device_timer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/gauge_histogram.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/gauge_histogram.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e3eb857017f533db9c33ad70e1b8e81802a3a23a
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/gauge_histogram.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/hf_transformers_utils.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/hf_transformers_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4d1c5fba7a3db8678b38db0a41630f31b6ae107
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/hf_transformers_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/host_shared_memory.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/host_shared_memory.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c22581272eb603879d5580a5cc62b3c335fa70b
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/host_shared_memory.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/log_utils.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/log_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef205a1f256266b5210d395ec318f525d81bc4a3
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/log_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/mistral_utils.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/mistral_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1fc50403a7991614a6db2489238c361f18a5b9b
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/mistral_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/multi_stream_utils.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/multi_stream_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3364bfc0772794b308d4ffd80578c7d508c638f1
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/multi_stream_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/numa_utils.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/numa_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0967224d008ba9ab3f2fdc78db18ed22eaf6f0d7
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/numa_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/nvtx_pytorch_hooks.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/nvtx_pytorch_hooks.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea68fe75befb5ca66120f2d07d034a1aa9c6a926
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/nvtx_pytorch_hooks.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/offloader.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/offloader.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cbbcb54adca0c936f81619ff8b1098491645cd59
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/offloader.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/patch_tokenizer.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/patch_tokenizer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54f3e95457488d1a3975c4548dbb3d66679d796a
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/patch_tokenizer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/patch_torch.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/patch_torch.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13a483611dd2e8968013c7596b4ba2005aac7963
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/patch_torch.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/poll_based_barrier.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/poll_based_barrier.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a798732b9402a98e9d8c1254643f49fea8ea92c9
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/poll_based_barrier.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/profile_merger.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/profile_merger.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..916cd4bc5f19cfb30f4a6c566f02d8a64d280e8c
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/profile_merger.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/profile_utils.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/profile_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f97795f15cea8f0d35648a4f78ce4fdf97a4d3c4
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/profile_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/request_logger.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/request_logger.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fe68b88b2dbedd288fba98db272095206df65b0
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/request_logger.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/scheduler_status_logger.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/scheduler_status_logger.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..694061d796164a1390b1b66d9c5c4a890f516155
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/scheduler_status_logger.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/slow_rank_detector.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/slow_rank_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8045ed49c6022767e9c7c73da5abb2cbe81ce09
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/slow_rank_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/torch_memory_saver_adapter.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/torch_memory_saver_adapter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a51f0ab58b49ffd943a08bf39f7ef402553bc61a
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/torch_memory_saver_adapter.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/watchdog.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/watchdog.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ac499f96b5b90c95f7c704cc60171e55499fb98
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/watchdog.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/utils/__pycache__/weight_checker.cpython-311.pyc b/sglang/python/sglang/srt/utils/__pycache__/weight_checker.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..650efbcd8f67676bc4e166083717cfde6e5fe37a
Binary files /dev/null and b/sglang/python/sglang/srt/utils/__pycache__/weight_checker.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/weight_sync/__pycache__/tensor_bucket.cpython-311.pyc b/sglang/python/sglang/srt/weight_sync/__pycache__/tensor_bucket.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e5ec3383d71120506a4a58461b3827e02ad86fc
Binary files /dev/null and b/sglang/python/sglang/srt/weight_sync/__pycache__/tensor_bucket.cpython-311.pyc differ