Lekr0 committed on
Commit 6268841 · verified · Parent(s): 5513247

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. sglang/.github/workflows/open-pr-copy-from-oss.yml +28 -0
  2. sglang/.github/workflows/release-branch-cut.yml +213 -0
  3. sglang/.github/workflows/rerun-ut.yml +71 -0
  4. sglang/docs/_static/css/custom_log.css +29 -0
  5. sglang/docs/_static/css/readthedocs.css +9 -0
  6. sglang/docs/_static/image/logo.ico +0 -0
  7. sglang/docs/advanced_features/checkpoint_engine.md +254 -0
  8. sglang/docs/advanced_features/structured_outputs.ipynb +997 -0
  9. sglang/docs/advanced_features/tool_parser.ipynb +856 -0
  10. sglang/docs/advanced_features/vlm_query.ipynb +388 -0
  11. sglang/docs/basic_usage/deepseek_ocr.md +54 -0
  12. sglang/docs/basic_usage/deepseek_v32.md +459 -0
  13. sglang/docs/basic_usage/glm45.md +70 -0
  14. sglang/docs/basic_usage/glmv.md +136 -0
  15. sglang/docs/basic_usage/gpt_oss.md +147 -0
  16. sglang/docs/basic_usage/llama4.md +92 -0
  17. sglang/docs/basic_usage/minimax_m2.md +85 -0
  18. sglang/docs/basic_usage/native_api.ipynb +667 -0
  19. sglang/docs/basic_usage/offline_engine_api.ipynb +235 -0
  20. sglang/docs/basic_usage/ollama_api.md +91 -0
  21. sglang/docs/basic_usage/openai_api.rst +9 -0
  22. sglang/docs/basic_usage/openai_api_completions.ipynb +552 -0
  23. sglang/docs/basic_usage/openai_api_embeddings.ipynb +193 -0
  24. sglang/docs/basic_usage/openai_api_vision.ipynb +252 -0
  25. sglang/docs/basic_usage/popular_model_usage.rst +19 -0
  26. sglang/docs/basic_usage/qwen3.md +39 -0
  27. sglang/docs/basic_usage/qwen3_vl.md +130 -0
  28. sglang/docs/basic_usage/sampling_params.md +347 -0
  29. sglang/docs/basic_usage/send_request.ipynb +251 -0
  30. sglang/docs/developer_guide/bench_serving.md +355 -0
  31. sglang/docs/developer_guide/benchmark_and_profiling.md +467 -0
  32. sglang/docs/developer_guide/contribution_guide.md +147 -0
  33. sglang/docs/developer_guide/development_guide_using_docker.md +108 -0
  34. sglang/docs/developer_guide/development_jit_kernel_guide.md +259 -0
  35. sglang/docs/developer_guide/evaluating_new_models.md +146 -0
  36. sglang/docs/developer_guide/release_process.md +18 -0
  37. sglang/docs/developer_guide/setup_github_runner.md +51 -0
  38. sglang/docs/diffusion/api/cli.md +332 -0
  39. sglang/docs/diffusion/api/openai_api.md +420 -0
  40. sglang/docs/diffusion/ci_perf.md +29 -0
  41. sglang/docs/diffusion/compatibility_matrix.md +78 -0
  42. sglang/docs/diffusion/contributing.md +67 -0
  43. sglang/docs/diffusion/environment_variables.md +36 -0
  44. sglang/docs/diffusion/index.md +98 -0
  45. sglang/docs/diffusion/installation.md +95 -0
  46. sglang/docs/diffusion/performance/attention_backends.md +131 -0
  47. sglang/docs/diffusion/performance/cache/cache_dit.md +273 -0
  48. sglang/docs/diffusion/performance/cache/index.md +60 -0
  49. sglang/docs/diffusion/performance/cache/teacache.md +84 -0
  50. sglang/docs/diffusion/performance/index.md +72 -0
sglang/.github/workflows/open-pr-copy-from-oss.yml ADDED
@@ -0,0 +1,28 @@
+ name: Open A PR to Copy Code From OSS
+
+ on:
+   workflow_dispatch:
+   # schedule:
+   #   - cron: '0 10 * * *'
+
+ permissions:
+   contents: write
+
+ jobs:
+   copy:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           ref: 'main'
+
+       - name: Install GitHub CLI (if not present)
+         run: |
+           bash scripts/code_sync/install_github_cli.sh
+
+       - name: Copy from OSS code
+         env:
+           GH_TOKEN: ${{ secrets.GH_PAT_FOR_OPEN_PR_TO_PRIVATE }}
+         run: |
+           python3 scripts/code_sync/copy_from_oss.py
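The "Install GitHub CLI (if not present)" step above delegates to `scripts/code_sync/install_github_cli.sh`, whose contents are not shown in this diff. A minimal sketch of the guard pattern that the step name implies (the real script may differ):

```shell
# Hypothetical install-if-missing guard; the actual
# scripts/code_sync/install_github_cli.sh is not part of this diff.
install_if_missing() {
  if command -v "$1" >/dev/null 2>&1; then
    echo "$1 already installed"
  else
    echo "installing $1"   # a real script would download/apt-get here
  fi
}

install_if_missing sh   # prints "sh already installed" on any POSIX system
```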
sglang/.github/workflows/release-branch-cut.yml ADDED
@@ -0,0 +1,213 @@
+ name: Release Branch Cut
+
+ on:
+   workflow_dispatch:
+     inputs:
+       branch_name:
+         description: 'Branch name to create (e.g., release/v0.5.7)'
+         required: true
+         type: string
+       commit_sha:
+         description: 'Commit SHA from main to cut the release branch from (defaults to latest main)'
+         required: false
+         type: string
+         default: ''
+
+ permissions:
+   actions: write
+   contents: write
+   pull-requests: read
+
+ jobs:
+   cut-release-branch:
+     if: github.repository == 'sgl-project/sglang'
+     runs-on: ubuntu-latest
+     environment: 'prod'
+     outputs:
+       branch_name: ${{ steps.set_output.outputs.branch_name }}
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           ref: main
+           fetch-depth: 0
+           token: ${{ secrets.GITHUB_TOKEN }}
+
+       - name: Validate branch name
+         run: |
+           BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+
+           if [ -z "$BRANCH_NAME" ]; then
+             echo "::error::Branch name is required"
+             exit 1
+           fi
+
+           # Validate branch name format (should start with release/)
+           if [[ ! "$BRANCH_NAME" =~ ^release/ ]]; then
+             echo "::warning::Branch name '$BRANCH_NAME' does not follow convention 'release/vX.Y.Z'"
+           fi
+
+           echo "Branch name: $BRANCH_NAME"
+
+       - name: Validate commit SHA
+         id: validate
+         run: |
+           COMMIT_SHA="${{ github.event.inputs.commit_sha }}"
+
+           # If no commit SHA provided, use latest main
+           if [ -z "$COMMIT_SHA" ]; then
+             COMMIT_SHA=$(git rev-parse HEAD)
+             echo "No commit SHA provided, using latest main: $COMMIT_SHA"
+           fi
+
+           # Verify the commit exists and is on main
+           if ! git cat-file -t "$COMMIT_SHA" > /dev/null 2>&1; then
+             echo "::error::Commit SHA '$COMMIT_SHA' does not exist"
+             exit 1
+           fi
+
+           # Check if commit is an ancestor of main (i.e., is on main branch)
+           if ! git merge-base --is-ancestor "$COMMIT_SHA" main; then
+             echo "::error::Commit SHA '$COMMIT_SHA' is not on the main branch"
+             exit 1
+           fi
+
+           echo "COMMIT_SHA=$COMMIT_SHA" >> $GITHUB_OUTPUT
+           echo "Validated commit SHA: $COMMIT_SHA"
+
+       - name: Check if branch already exists
+         run: |
+           BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+
+           if git ls-remote --heads origin "$BRANCH_NAME" | grep -q "$BRANCH_NAME"; then
+             echo "::error::Branch '$BRANCH_NAME' already exists"
+             exit 1
+           fi
+
+           echo "Branch '$BRANCH_NAME' does not exist, proceeding with creation"
+
+       - name: Create release branch
+         id: set_output
+         run: |
+           COMMIT_SHA="${{ steps.validate.outputs.COMMIT_SHA }}"
+           BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+
+           git config user.name "sglang-bot"
+           git config user.email "sglang-bot@users.noreply.github.com"
+
+           # Create branch from the specified commit
+           git checkout -b "$BRANCH_NAME" "$COMMIT_SHA"
+
+           echo "branch_name=$BRANCH_NAME" >> $GITHUB_OUTPUT
+           echo "Successfully created branch '$BRANCH_NAME' from commit '$COMMIT_SHA'"
+
+       - name: Update version references in documentation
+         run: |
+           BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+           # Extract version from branch name (e.g., release/v0.5.8 -> v0.5.8)
+           VERSION=$(echo "$BRANCH_NAME" | sed 's/release\///')
+
+           # Update git clone version references in docs
+           sed -i "s/git clone -b v[0-9]\+\.[0-9]\+\.[0-9]\+\.\?post\?[0-9]*/git clone -b $VERSION/" docs/get_started/install.md
+           sed -i "s/git clone -b v[0-9]\+\.[0-9]\+\.[0-9]\+\.\?post\?[0-9]*/git clone -b $VERSION/" docs/platforms/amd_gpu.md
+
+           # Check if any changes were made
+           if git diff --quiet; then
+             echo "No version references needed updating"
+           else
+             git add docs/get_started/install.md docs/platforms/amd_gpu.md
+             git commit -m "docs: update version references to $VERSION"
+             echo "Updated version references to $VERSION"
+           fi
+
+       - name: Push release branch
+         run: |
+           BRANCH_NAME="${{ steps.set_output.outputs.branch_name }}"
+           git push origin "$BRANCH_NAME"
+           echo "Successfully pushed branch '$BRANCH_NAME'"
+
+       - name: Summary
+         run: |
+           COMMIT_SHA="${{ steps.validate.outputs.COMMIT_SHA }}"
+           BRANCH_NAME="${{ github.event.inputs.branch_name }}"
+
+           echo "## Release Branch Cut Summary" >> $GITHUB_STEP_SUMMARY
+           echo "" >> $GITHUB_STEP_SUMMARY
+           echo "| Property | Value |" >> $GITHUB_STEP_SUMMARY
+           echo "|----------|-------|" >> $GITHUB_STEP_SUMMARY
+           echo "| Branch | \`$BRANCH_NAME\` |" >> $GITHUB_STEP_SUMMARY
+           echo "| Commit | \`$COMMIT_SHA\` |" >> $GITHUB_STEP_SUMMARY
+           echo "| Triggered by | @${{ github.actor }} |" >> $GITHUB_STEP_SUMMARY
+           echo "" >> $GITHUB_STEP_SUMMARY
+           echo "### Next Steps" >> $GITHUB_STEP_SUMMARY
+           echo "1. Tests are automatically triggered on the release branch" >> $GITHUB_STEP_SUMMARY
+           echo "2. Apply any hotfixes if needed" >> $GITHUB_STEP_SUMMARY
+           echo "3. Create a tag to trigger release: \`gh workflow run release-tag.yml -f version=X.Y.Z -f ref=$BRANCH_NAME\`" >> $GITHUB_STEP_SUMMARY
+
+   run-pr-tests-nvidia:
+     needs: cut-release-branch
+     uses: ./.github/workflows/pr-test.yml
+     with:
+       ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+       run_all_tests: true
+     secrets: inherit
+
+   run-pr-tests-amd:
+     needs: cut-release-branch
+     uses: ./.github/workflows/pr-test-amd.yml
+     with:
+       ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+       run_all_tests: true
+     secrets: inherit
+
+   run-pr-test-npu:
+     needs: cut-release-branch
+     uses: ./.github/workflows/pr-test-npu.yml
+     with:
+       ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+       run_all_tests: true
+     secrets: inherit
+
+   run-pr-tests-xeon:
+     needs: cut-release-branch
+     uses: ./.github/workflows/pr-test-xeon.yml
+     with:
+       ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+       run_all_tests: true
+     secrets: inherit
+
+   run-pr-tests-xpu:
+     needs: cut-release-branch
+     uses: ./.github/workflows/pr-test-xpu.yml
+     with:
+       ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+       run_all_tests: true
+     secrets: inherit
+
+   run-nightly-tests-nvidia:
+     needs: cut-release-branch
+     uses: ./.github/workflows/nightly-test-nvidia.yml
+     with:
+       ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+     secrets: inherit
+
+   run-nightly-tests-amd:
+     needs: cut-release-branch
+     uses: ./.github/workflows/nightly-test-amd.yml
+     with:
+       ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+     secrets: inherit
+
+   run-nightly-tests-npu:
+     needs: cut-release-branch
+     uses: ./.github/workflows/nightly-test-npu.yml
+     with:
+       ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+     secrets: inherit
+
+   run-nightly-tests-intel:
+     needs: cut-release-branch
+     uses: ./.github/workflows/nightly-test-intel.yml
+     with:
+       ref: ${{ needs.cut-release-branch.outputs.branch_name }}
+     secrets: inherit
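The "Update version references in documentation" step in this workflow rewrites pinned `git clone -b vX.Y.Z` lines in the docs via `sed`. A quick local sanity check of that same substitution pattern (GNU sed assumed; `VERSION` hard-coded to `v0.5.7` for illustration):

```shell
# Run one sample doc line through the workflow's substitution, with the
# replacement version fixed to v0.5.7 instead of being derived from the branch.
echo "git clone -b v0.5.6.post1 https://github.com/sgl-project/sglang.git" \
  | sed "s/git clone -b v[0-9]\+\.[0-9]\+\.[0-9]\+\.\?post\?[0-9]*/git clone -b v0.5.7/"
# prints: git clone -b v0.5.7 https://github.com/sgl-project/sglang.git
```

Note the pattern also swallows optional `.postN` suffixes, so post-release pins like `v0.5.6.post1` are bumped cleanly.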
sglang/.github/workflows/rerun-ut.yml ADDED
@@ -0,0 +1,71 @@
+ name: Rerun UT
+ run-name: ${{ inputs.pr_head_sha && format('[rerun-ut] {0}', inputs.pr_head_sha) || '[rerun-ut]' }}
+
+ on:
+   workflow_dispatch:
+     inputs:
+       test_command:
+         description: "Test command to run (e.g. 'registered/core/test_srt_endpoint.py TestSRTEndpoint.test_simple_decode')"
+         required: true
+         type: string
+       runner_label:
+         description: "Runner label (e.g. '1-gpu-runner', '1-gpu-5090', '4-gpu-h100')"
+         required: true
+         type: string
+       pr_head_sha:
+         description: "PR head SHA to checkout (for /rerun-ut on fork PRs)"
+         required: false
+         type: string
+         default: ""
+       use_deepep:
+         description: "Use ci_install_deepep.sh instead of ci_install_dependency.sh"
+         required: false
+         type: string
+         default: "false"
+
+ env:
+   SGLANG_IS_IN_CI: true
+   SGLANG_CUDA_COREDUMP: "1"
+   SGLANG_JIT_DEEPGEMM_FAST_WARMUP: true
+
+ permissions:
+   actions: write
+   contents: read
+
+ jobs:
+   rerun-ut-cuda:
+     runs-on: ${{ inputs.runner_label }}
+     timeout-minutes: 120
+     env:
+       RUNNER_LABELS: ${{ inputs.runner_label }}
+       IS_BLACKWELL: ${{ (inputs.runner_label == '1-gpu-5090' || contains(inputs.runner_label, 'b200')) && '1' || '' }}
+       SGLANG_CI_RDMA_ALL_DEVICES: ${{ inputs.runner_label == '8-gpu-h20' && 'mlx5_1,mlx5_2,mlx5_3,mlx5_4' || '' }}
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v4
+         with:
+           ref: ${{ inputs.pr_head_sha || github.sha }}
+
+       - name: Install dependencies
+         timeout-minutes: 20
+         run: |
+           if [[ "${{ inputs.runner_label }}" == "1-gpu-5090" ]]; then
+             source /etc/profile.d/sglang-ci.sh
+           fi
+           if [[ "${{ inputs.use_deepep }}" == "true" ]]; then
+             bash scripts/ci/cuda/ci_install_deepep.sh
+           else
+             bash scripts/ci/cuda/ci_install_dependency.sh
+           fi
+
+       - name: Run test
+         timeout-minutes: 60
+         run: |
+           if [[ "${{ inputs.runner_label }}" == "1-gpu-5090" ]]; then
+             source /etc/profile.d/sglang-ci.sh
+           fi
+           cd test/
+           python3 ${{ inputs.test_command }}
+
+       - uses: ./.github/actions/upload-cuda-coredumps
+         if: always()
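The `IS_BLACKWELL` env var above is set by a GitHub Actions expression: it is `'1'` when the runner label is `1-gpu-5090` or contains `b200`, and empty otherwise. A bash mirror of that gating logic, shown only to illustrate how labels map to the flag (in CI it is evaluated by the Actions expression engine, not bash):

```shell
# Mirrors the workflow's IS_BLACKWELL expression in plain bash.
is_blackwell() {
  local label="$1"
  if [[ "$label" == "1-gpu-5090" || "$label" == *b200* ]]; then
    echo "1"
  fi
}

is_blackwell "1-gpu-5090"   # prints 1
is_blackwell "4-gpu-h100"   # prints nothing
```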
sglang/docs/_static/css/custom_log.css ADDED
@@ -0,0 +1,29 @@
+ .output_area {
+   color: #615656;
+ }
+
+ table.autosummary td {
+   width: 50%
+ }
+
+ img.align-center {
+   display: block;
+   margin-left: auto;
+   margin-right: auto;
+ }
+
+ .output_area.stderr {
+   color: #d3d3d3 !important;
+ }
+
+ .output_area.stdout {
+   color: #d3d3d3 !important;
+ }
+
+ div.output_area.stderr {
+   color: #d3d3d3 !important;
+ }
+
+ div.output_area.stdout {
+   color: #d3d3d3 !important;
+ }
sglang/docs/_static/css/readthedocs.css ADDED
@@ -0,0 +1,9 @@
+ table.autosummary td {
+   width: 50%
+ }
+
+ img.align-center {
+   display: block;
+   margin-left: auto;
+   margin-right: auto;
+ }
sglang/docs/_static/image/logo.ico ADDED
sglang/docs/advanced_features/checkpoint_engine.md ADDED
@@ -0,0 +1,254 @@
+ # Checkpoint Engine Integration
+
+ The SGLang checkpoint engine integration provides an efficient way to load model weights using a distributed checkpoint loading system. This feature significantly reduces model loading time, especially for large models and multi-node setups, by parallelizing the weight loading process across multiple processes and nodes.
+
+ ## Overview
+
+ The checkpoint engine integration allows SGLang to:
+ - Load model weights in parallel using multiple processes
+ - Distribute weight loading across multiple nodes to increase effective disk bandwidth
+ - Overlap weight loading with other initialization tasks like CUDA graph capture
+ - Support both single-node and multi-node deployments
+
+ ## Installation
+
+ First, install the checkpoint engine package:
+
+ ```bash
+ pip install 'checkpoint-engine[p2p]'
+ ```
+
+ ## Architecture
+
+ The system consists of two main components:
+
+ 1. **SGLang Server**: Runs with the `--wait-for-initial-weights` flag to wait for weights before becoming ready
+ 2. **Checkpoint Engine Workers**: Separate processes (managed by torchrun) that load and distribute model weights
+
+ The checkpoint engine uses a parameter server architecture with support for:
+ - **Broadcast mode**: Weights are broadcast from loading processes to inference processes
+ - **P2P mode**: Direct peer-to-peer weight transfer between processes
+ - **All mode**: Combination of both broadcast and P2P methods
+
+ ## Usage Examples
+
+ ### Single Node Setup
+
+ **Terminal 1 - Launch SGLang Server:**
+ ```bash
+ python -m sglang.launch_server \
+     --model-path Qwen/Qwen3-8B \
+     --tp 8 \
+     --load-format dummy \
+     --wait-for-initial-weights
+ ```
+
+ **Terminal 2 - Run Checkpoint Engine:**
+
+ Using the sglang entrypoint:
+ ```bash
+ python -m sglang.srt.checkpoint_engine.update \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 8
+ ```
+
+ Using torchrun directly:
+ ```bash
+ torchrun --nproc-per-node 8 \
+     examples/checkpoint_engine/update.py \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 8
+ ```
+
+ ### Multi-Node Setup (2 Nodes)
+
+ **Node 0:**
+
+ Launch SGLang server:
+ ```bash
+ python -m sglang.launch_server \
+     --model-path Qwen/Qwen3-8B \
+     --tp 8 \
+     --load-format dummy \
+     --wait-for-initial-weights \
+     --host [IP]
+ ```
+
+ Run checkpoint engine:
+
+ Using the sglang entrypoint (recommended):
+ ```bash
+ python -m sglang.srt.checkpoint_engine.update \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 8
+ ```
+
+ Using torchrun directly:
+ ```bash
+ torchrun --nproc-per-node 8 \
+     --nnodes 2 \
+     --node-rank 0 \
+     --master-addr [IP] \
+     --master-port 29500 \
+     examples/checkpoint_engine/update.py \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 8
+ ```
+
+ **Node 1:**
+
+ Launch SGLang server:
+ ```bash
+ python -m sglang.launch_server \
+     --model-path Qwen/Qwen3-8B \
+     --tp 8 \
+     --load-format dummy \
+     --wait-for-initial-weights \
+     --host [IP]
+ ```
+
+ Run checkpoint engine:
+
+ Using the sglang entrypoint (recommended):
+ ```bash
+ python -m sglang.srt.checkpoint_engine.update \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 8
+ ```
+
+ Using torchrun directly:
+ ```bash
+ torchrun --nproc-per-node 8 \
+     --nnodes 2 \
+     --node-rank 1 \
+     --master-addr [IP] \
+     --master-port 29500 \
+     examples/checkpoint_engine/update.py \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 8
+ ```
+
+ ### Multi-Node Setup with Tensor Parallelism (TP=16)
+
+ **Node 0:**
+
+ Launch SGLang server:
+ ```bash
+ python -m sglang.launch_server \
+     --model-path Qwen/Qwen3-8B \
+     --tp 8 \
+     --load-format dummy \
+     --wait-for-initial-weights \
+     --host [IP] \
+     --dist-init-addr [IP]:9120 \
+     --nnodes 2 \
+     --node-rank 0
+ ```
+
+ Run checkpoint engine:
+
+ Using the sglang entrypoint (recommended):
+ ```bash
+ python -m sglang.srt.checkpoint_engine.update \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 16
+ ```
+
+ Using torchrun directly:
+ ```bash
+ torchrun --nproc-per-node 8 \
+     --nnodes 2 \
+     --node-rank 0 \
+     --master-addr [IP] \
+     --master-port 29500 \
+     examples/checkpoint_engine/update.py \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 16
+ ```
+
+ **Node 1:**
+
+ Launch SGLang server:
+ ```bash
+ python -m sglang.launch_server \
+     --model-path Qwen/Qwen3-8B \
+     --tp 8 \
+     --load-format dummy \
+     --wait-for-initial-weights \
+     --host [IP] \
+     --dist-init-addr [IP]:9120 \
+     --nnodes 2 \
+     --node-rank 1
+ ```
+
+ Run checkpoint engine:
+
+ Using the sglang entrypoint (recommended):
+ ```bash
+ python -m sglang.srt.checkpoint_engine.update \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 16
+ ```
+
+ Using torchrun directly:
+ ```bash
+ torchrun --nproc-per-node 8 \
+     --nnodes 2 \
+     --node-rank 1 \
+     --master-addr [IP] \
+     --master-port 29500 \
+     examples/checkpoint_engine/update.py \
+     --update-method broadcast \
+     --checkpoint-path /path/to/Qwen/Qwen3-8B/ \
+     --inference-parallel-size 16
+ ```
+
+ ## Configuration Options
+
+ ### SGLang Server Options
+
+ - `--load-format dummy`: Use dummy format for initial loading (allows overlapping with other tasks)
+ - `--wait-for-initial-weights`: Wait for the checkpoint engine to provide weights before becoming ready
+ - `--host`: Host address for multi-node setups
+ - `--dist-init-addr`: Distributed initialization address for tensor parallelism
+
+ ### Checkpoint Engine Options
+
+ - `--update-method`: Weight update method (`broadcast`, `p2p`, or `all`)
+ - `--checkpoint-path`: Path to the model checkpoint directory
+ - `--inference-parallel-size`: Number of inference parallel processes
+ - `--endpoint`: SGLang server endpoint (default: `http://localhost:19730`)
+ - `--checkpoint-name`: Name for the checkpoint (default: `my-checkpoint-iter-0`)
+ - `--save-metas-file`: File to save checkpoint metadata to
+ - `--load-metas-file`: File to load checkpoint metadata from
+ - `--uds`: Unix domain socket path for communication
+ - `--weight-version`: Version identifier for weights
+
+ ## Performance Benefits
+
+ The checkpoint engine provides significant time savings in two main aspects:
+
+ 1. **Multi-node Loading**: Each node only loads a portion of the weights from disk, effectively increasing disk bandwidth. More participating nodes provide greater acceleration. Preliminary tests show a 20-second acceleration when loading DeepSeek-R1 on H20-3e with two nodes.
+
+ 2. **Single Process Optimization**: Using the dummy format allows overlapping the disk-to-CPU transfer with CUDA graph capture and other initialization tasks, providing additional time savings.
+
+ ## Troubleshooting
+
+ - Ensure the checkpoint engine package is installed: `pip install 'checkpoint-engine[p2p]'`
+ - Verify network connectivity between nodes in multi-node setups
+ - Check that the checkpoint path contains valid model files
+ - Monitor logs for connection errors between the SGLang server and the checkpoint engine
+ - Use the `--sleep-time` parameter to add delays if needed for debugging
+
+ ## References
+
+ - [Checkpoint Engine Repository](https://github.com/MoonshotAI/checkpoint-engine)
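In the multi-node examples above, the torchrun topology is fully determined by `--inference-parallel-size` and the per-node GPU count: `--nnodes` is their quotient, and each node passes its own `--node-rank`. A small hypothetical helper (not part of SGLang or checkpoint-engine) that derives those flags:

```python
# Hypothetical helper that computes the torchrun flags used in the doc's
# multi-node examples; torchrun_args and its signature are illustrative only.
def torchrun_args(inference_parallel_size, nproc_per_node, node_rank, master_addr, master_port=29500):
    if inference_parallel_size % nproc_per_node != 0:
        raise ValueError("inference_parallel_size must be a multiple of nproc_per_node")
    nnodes = inference_parallel_size // nproc_per_node
    return [
        "torchrun",
        "--nproc-per-node", str(nproc_per_node),
        "--nnodes", str(nnodes),
        "--node-rank", str(node_rank),
        "--master-addr", master_addr,
        "--master-port", str(master_port),
    ]

# TP=16 spread over 2 nodes of 8 GPUs, node 0 (cf. the last example above):
print(" ".join(torchrun_args(16, 8, 0, "10.0.0.1")))
# prints: torchrun --nproc-per-node 8 --nnodes 2 --node-rank 0 --master-addr 10.0.0.1 --master-port 29500
```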
sglang/docs/advanced_features/structured_outputs.ipynb ADDED
@@ -0,0 +1,997 @@
+ {
+   "cells": [
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "# Structured Outputs"
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "You can specify a JSON schema, [regular expression](https://en.wikipedia.org/wiki/Regular_expression) or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.\n",
+         "\n",
+         "SGLang supports three grammar backends:\n",
+         "\n",
+         "- [XGrammar](https://github.com/mlc-ai/xgrammar) (default): Supports JSON schema, regular expression, and EBNF constraints.\n",
+         "- [Outlines](https://github.com/dottxt-ai/outlines): Supports JSON schema and regular expression constraints.\n",
+         "- [Llguidance](https://github.com/guidance-ai/llguidance): Supports JSON schema, regular expression, and EBNF constraints.\n",
+         "\n",
+         "We suggest using XGrammar for its better performance and utility. XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md). For more details, see the [XGrammar technical overview](https://blog.mlc.ai/2024/11/22/achieving-efficient-flexible-portable-structured-generation-with-xgrammar).\n",
+         "\n",
+         "To use Outlines, simply add `--grammar-backend outlines` when launching the server.\n",
+         "To use llguidance, add `--grammar-backend llguidance` when launching the server.\n",
+         "If no backend is specified, XGrammar will be used as the default.\n",
+         "\n",
+         "For better output quality, **it is advisable to explicitly include instructions in the prompt to guide the model to generate the desired format.** For example, you can specify: 'Please generate the output in the following JSON format: ...'.\n"
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "## OpenAI Compatible API"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "import openai\n",
+         "import os\n",
+         "\n",
+         "from sglang.test.doc_patch import launch_server_cmd\n",
+         "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+         "\n",
+         "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+         "\n",
+         "\n",
+         "server_process, port = launch_server_cmd(\n",
+         "    \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning\"\n",
+         ")\n",
+         "\n",
+         "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n",
+         "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")"
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "### JSON\n",
+         "\n",
+         "You can directly define a JSON schema or use [Pydantic](https://docs.pydantic.dev/latest/) to define and validate the response."
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "**Using Pydantic**"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "from pydantic import BaseModel, Field\n",
+         "\n",
+         "\n",
+         "# Define the schema using Pydantic\n",
+         "class CapitalInfo(BaseModel):\n",
+         "    name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+         "    population: int = Field(..., description=\"Population of the capital city\")\n",
+         "\n",
+         "\n",
+         "response = client.chat.completions.create(\n",
+         "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+         "    messages=[\n",
+         "        {\n",
+         "            \"role\": \"user\",\n",
+         "            \"content\": \"Please generate the information of the capital of France in the JSON format.\",\n",
+         "        },\n",
+         "    ],\n",
+         "    temperature=0,\n",
+         "    max_tokens=128,\n",
+         "    response_format={\n",
+         "        \"type\": \"json_schema\",\n",
+         "        \"json_schema\": {\n",
+         "            \"name\": \"foo\",\n",
+         "            # convert the pydantic model to json schema\n",
+         "            \"schema\": CapitalInfo.model_json_schema(),\n",
+         "        },\n",
+         "    },\n",
+         ")\n",
+         "\n",
+         "response_content = response.choices[0].message.content\n",
+         "# validate the JSON response by the pydantic model\n",
+         "capital_info = CapitalInfo.model_validate_json(response_content)\n",
+         "print_highlight(f\"Validated response: {capital_info.model_dump_json()}\")"
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "**JSON Schema Directly**\n"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "import json\n",
+         "\n",
+         "json_schema = json.dumps(\n",
+         "    {\n",
+         "        \"type\": \"object\",\n",
+         "        \"properties\": {\n",
+         "            \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+         "            \"population\": {\"type\": \"integer\"},\n",
+         "        },\n",
+         "        \"required\": [\"name\", \"population\"],\n",
+         "    }\n",
+         ")\n",
+         "\n",
+         "response = client.chat.completions.create(\n",
+         "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+         "    messages=[\n",
+         "        {\n",
+         "            \"role\": \"user\",\n",
+         "            \"content\": \"Give me the information of the capital of France in the JSON format.\",\n",
+         "        },\n",
+         "    ],\n",
+         "    temperature=0,\n",
+         "    max_tokens=128,\n",
+         "    response_format={\n",
+         "        \"type\": \"json_schema\",\n",
+         "        \"json_schema\": {\"name\": \"foo\", \"schema\": json.loads(json_schema)},\n",
+         "    },\n",
+         ")\n",
+         "\n",
+         "print_highlight(response.choices[0].message.content)"
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "### EBNF"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "ebnf_grammar = \"\"\"\n",
+         "root ::= city | description\n",
+         "city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\n",
+         "description ::= city \" is \" status\n",
+         "status ::= \"the capital of \" country\n",
+         "country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"\n",
+         "\"\"\"\n",
+         "\n",
+         "response = client.chat.completions.create(\n",
+         "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+         "    messages=[\n",
+         "        {\"role\": \"system\", \"content\": \"You are a helpful geography bot.\"},\n",
+         "        {\n",
+         "            \"role\": \"user\",\n",
+         "            \"content\": \"Give me the information of the capital of France.\",\n",
+         "        },\n",
+         "    ],\n",
+         "    temperature=0,\n",
+         "    max_tokens=32,\n",
+         "    extra_body={\"ebnf\": ebnf_grammar},\n",
+         ")\n",
+         "\n",
+         "print_highlight(response.choices[0].message.content)"
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "### Regular expression"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "response = client.chat.completions.create(\n",
+         "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+         "    messages=[\n",
+         "        {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
+         "    ],\n",
+         "    temperature=0,\n",
+         "    max_tokens=128,\n",
+         "    extra_body={\"regex\": \"(Paris|London)\"},\n",
+         ")\n",
+         "\n",
+         "print_highlight(response.choices[0].message.content)"
+       ]
+     },
+     {
+       "cell_type": "markdown",
+       "metadata": {},
+       "source": [
+         "### Structural Tag"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": null,
+       "metadata": {},
+       "outputs": [],
239
+ "source": [
240
+ "tool_get_current_weather = {\n",
241
+ " \"type\": \"function\",\n",
242
+ " \"function\": {\n",
243
+ " \"name\": \"get_current_weather\",\n",
244
+ " \"description\": \"Get the current weather in a given location\",\n",
245
+ " \"parameters\": {\n",
246
+ " \"type\": \"object\",\n",
247
+ " \"properties\": {\n",
248
+ " \"city\": {\n",
249
+ " \"type\": \"string\",\n",
250
+ " \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
251
+ " },\n",
252
+ " \"state\": {\n",
253
+ " \"type\": \"string\",\n",
254
+ "                    \"description\": \"The two-letter abbreviation for the state that the city is\"\n",
255
+ " \" in, e.g. 'CA' which would mean 'California'\",\n",
256
+ " },\n",
257
+ " \"unit\": {\n",
258
+ " \"type\": \"string\",\n",
259
+ " \"description\": \"The unit to fetch the temperature in\",\n",
260
+ " \"enum\": [\"celsius\", \"fahrenheit\"],\n",
261
+ " },\n",
262
+ " },\n",
263
+ " \"required\": [\"city\", \"state\", \"unit\"],\n",
264
+ " },\n",
265
+ " },\n",
266
+ "}\n",
267
+ "\n",
268
+ "tool_get_current_date = {\n",
269
+ " \"type\": \"function\",\n",
270
+ " \"function\": {\n",
271
+ " \"name\": \"get_current_date\",\n",
272
+ " \"description\": \"Get the current date and time for a given timezone\",\n",
273
+ " \"parameters\": {\n",
274
+ " \"type\": \"object\",\n",
275
+ " \"properties\": {\n",
276
+ " \"timezone\": {\n",
277
+ " \"type\": \"string\",\n",
278
+ " \"description\": \"The timezone to fetch the current date and time for, e.g. 'America/New_York'\",\n",
279
+ " }\n",
280
+ " },\n",
281
+ " \"required\": [\"timezone\"],\n",
282
+ " },\n",
283
+ " },\n",
284
+ "}\n",
285
+ "\n",
286
+ "schema_get_current_weather = tool_get_current_weather[\"function\"][\"parameters\"]\n",
287
+ "schema_get_current_date = tool_get_current_date[\"function\"][\"parameters\"]\n",
288
+ "\n",
289
+ "\n",
290
+ "def get_messages():\n",
291
+ " return [\n",
292
+ " {\n",
293
+ " \"role\": \"system\",\n",
294
+ " \"content\": f\"\"\"\n",
295
+ "# Tool Instructions\n",
296
+ "- Always execute python code in messages that you share.\n",
297
+ "- When looking for real-time information, use relevant functions if available; otherwise fall back to brave_search\n",
298
+ "You have access to the following functions:\n",
299
+ "Use the function 'get_current_weather' to: Get the current weather in a given location\n",
300
+ "{tool_get_current_weather[\"function\"]}\n",
301
+ "Use the function 'get_current_date' to: Get the current date and time for a given timezone\n",
302
+ "{tool_get_current_date[\"function\"]}\n",
303
+ "If you choose to call a function, ONLY reply in the following format:\n",
304
+ "<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}\n",
305
+ "where\n",
306
+ "start_tag => `<function`\n",
307
+ "parameters => a JSON dict with the function argument name as key and function argument value as value.\n",
308
+ "end_tag => `</function>`\n",
309
+ "Here is an example:\n",
310
+ "<function=example_function_name>{{\"example_name\": \"example_value\"}}</function>\n",
311
+ "Reminder:\n",
312
+ "- Function calls MUST follow the specified format\n",
313
+ "- Required parameters MUST be specified\n",
314
+ "- Only call one function at a time\n",
315
+ "- Put the entire function call reply on one line\n",
316
+ "- Always add your sources when using search results to answer the user query\n",
317
+ "You are a helpful assistant.\"\"\",\n",
318
+ " },\n",
319
+ " {\n",
320
+ " \"role\": \"user\",\n",
321
+ " \"content\": \"You are in New York. Please get the current date and time, and the weather.\",\n",
322
+ " },\n",
323
+ " ]\n",
324
+ "\n",
325
+ "\n",
326
+ "messages = get_messages()\n",
327
+ "\n",
328
+ "response = client.chat.completions.create(\n",
329
+ " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
330
+ " messages=messages,\n",
331
+ " response_format={\n",
332
+ " \"type\": \"structural_tag\",\n",
333
+ " \"structures\": [\n",
334
+ " {\n",
335
+ " \"begin\": \"<function=get_current_weather>\",\n",
336
+ " \"schema\": schema_get_current_weather,\n",
337
+ " \"end\": \"</function>\",\n",
338
+ " },\n",
339
+ " {\n",
340
+ " \"begin\": \"<function=get_current_date>\",\n",
341
+ " \"schema\": schema_get_current_date,\n",
342
+ " \"end\": \"</function>\",\n",
343
+ " },\n",
344
+ " ],\n",
345
+ " \"triggers\": [\"<function=\"],\n",
346
+ " },\n",
347
+ ")\n",
348
+ "\n",
349
+ "print_highlight(response.choices[0].message.content)"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": null,
355
+ "metadata": {},
356
+ "outputs": [],
357
+ "source": [
358
+ "# Support for XGrammar's latest structural tag format\n",
359
+ "# https://xgrammar.mlc.ai/docs/tutorials/structural_tag.html\n",
360
+ "\n",
361
+ "response = client.chat.completions.create(\n",
362
+ " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
363
+ " messages=messages,\n",
364
+ " response_format={\n",
365
+ " \"type\": \"structural_tag\",\n",
366
+ " \"format\": {\n",
367
+ " \"type\": \"triggered_tags\",\n",
368
+ " \"triggers\": [\"<function=\"],\n",
369
+ " \"tags\": [\n",
370
+ " {\n",
371
+ " \"begin\": \"<function=get_current_weather>\",\n",
372
+ " \"content\": {\n",
373
+ " \"type\": \"json_schema\",\n",
374
+ " \"json_schema\": schema_get_current_weather,\n",
375
+ " },\n",
376
+ " \"end\": \"</function>\",\n",
377
+ " },\n",
378
+ " {\n",
379
+ " \"begin\": \"<function=get_current_date>\",\n",
380
+ " \"content\": {\n",
381
+ " \"type\": \"json_schema\",\n",
382
+ " \"json_schema\": schema_get_current_date,\n",
383
+ " },\n",
384
+ " \"end\": \"</function>\",\n",
385
+ " },\n",
386
+ " ],\n",
387
+ " \"at_least_one\": False,\n",
388
+ " \"stop_after_first\": False,\n",
389
+ " },\n",
390
+ " },\n",
391
+ ")\n",
392
+ "\n",
393
+ "print_highlight(response.choices[0].message.content)"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "markdown",
398
+ "metadata": {},
399
+ "source": [
400
+ "## Native API and SGLang Runtime (SRT)"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "markdown",
405
+ "metadata": {},
406
+ "source": [
407
+ "### JSON"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "markdown",
412
+ "metadata": {},
413
+ "source": [
414
+ "**Using Pydantic**"
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "code",
419
+ "execution_count": null,
420
+ "metadata": {},
421
+ "outputs": [],
422
+ "source": [
423
+ "import requests\n",
424
+ "import json\n",
425
+ "from pydantic import BaseModel, Field\n",
426
+ "\n",
427
+ "from transformers import AutoTokenizer\n",
428
+ "\n",
429
+ "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
430
+ "\n",
431
+ "\n",
432
+ "# Define the schema using Pydantic\n",
433
+ "class CapitalInfo(BaseModel):\n",
434
+ " name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
435
+ " population: int = Field(..., description=\"Population of the capital city\")\n",
436
+ "\n",
437
+ "\n",
438
+ "# Make API request\n",
439
+ "messages = [\n",
440
+ " {\n",
441
+ " \"role\": \"user\",\n",
442
+ " \"content\": \"Here is the information of the capital of France in the JSON format.\\n\",\n",
443
+ " }\n",
444
+ "]\n",
445
+ "text = tokenizer.apply_chat_template(\n",
446
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
447
+ ")\n",
448
+ "response = requests.post(\n",
449
+ " f\"http://localhost:{port}/generate\",\n",
450
+ " json={\n",
451
+ " \"text\": text,\n",
452
+ " \"sampling_params\": {\n",
453
+ " \"temperature\": 0,\n",
454
+ " \"max_new_tokens\": 64,\n",
455
+ " \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
456
+ " },\n",
457
+ " },\n",
458
+ ")\n",
459
+ "print_highlight(response.json())\n",
460
+ "\n",
461
+ "\n",
462
+ "response_data = json.loads(response.json()[\"text\"])\n",
463
+ "# validate the response by the pydantic model\n",
464
+ "capital_info = CapitalInfo.model_validate(response_data)\n",
465
+ "print_highlight(f\"Validated response: {capital_info.model_dump_json()}\")"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "markdown",
470
+ "metadata": {},
471
+ "source": [
472
+ "**JSON Schema Directly**"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": null,
478
+ "metadata": {},
479
+ "outputs": [],
480
+ "source": [
481
+ "json_schema = json.dumps(\n",
482
+ " {\n",
483
+ " \"type\": \"object\",\n",
484
+ " \"properties\": {\n",
485
+ " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
486
+ " \"population\": {\"type\": \"integer\"},\n",
487
+ " },\n",
488
+ " \"required\": [\"name\", \"population\"],\n",
489
+ " }\n",
490
+ ")\n",
491
+ "\n",
492
+ "# JSON\n",
493
+ "response = requests.post(\n",
494
+ " f\"http://localhost:{port}/generate\",\n",
495
+ " json={\n",
496
+ " \"text\": text,\n",
497
+ " \"sampling_params\": {\n",
498
+ " \"temperature\": 0,\n",
499
+ " \"max_new_tokens\": 64,\n",
500
+ " \"json_schema\": json_schema,\n",
501
+ " },\n",
502
+ " },\n",
503
+ ")\n",
504
+ "\n",
505
+ "print_highlight(response.json())"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "markdown",
510
+ "metadata": {},
511
+ "source": [
512
+ "### EBNF"
513
+ ]
514
+ },
515
+ {
516
+ "cell_type": "code",
517
+ "execution_count": null,
518
+ "metadata": {},
519
+ "outputs": [],
520
+ "source": [
521
+ "messages = [\n",
522
+ " {\n",
523
+ " \"role\": \"user\",\n",
524
+ " \"content\": \"Give me the information of the capital of France.\",\n",
525
+ " }\n",
526
+ "]\n",
527
+ "text = tokenizer.apply_chat_template(\n",
528
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
529
+ ")\n",
530
+ "response = requests.post(\n",
531
+ " f\"http://localhost:{port}/generate\",\n",
532
+ " json={\n",
533
+ " \"text\": text,\n",
534
+ " \"sampling_params\": {\n",
535
+ " \"max_new_tokens\": 128,\n",
536
+ " \"temperature\": 0,\n",
537
+ " \"n\": 3,\n",
538
+ " \"ebnf\": (\n",
539
+ " \"root ::= city | description\\n\"\n",
540
+ " 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
541
+ " 'description ::= city \" is \" status\\n'\n",
542
+ " 'status ::= \"the capital of \" country\\n'\n",
543
+ " 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
544
+ " ),\n",
545
+ " },\n",
546
+ " \"stream\": False,\n",
547
+ " \"return_logprob\": False,\n",
548
+ " },\n",
549
+ ")\n",
550
+ "\n",
551
+ "print_highlight(response.json())"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "markdown",
556
+ "metadata": {},
557
+ "source": [
558
+ "### Regular expression"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": null,
564
+ "metadata": {},
565
+ "outputs": [],
566
+ "source": [
567
+ "messages = [\n",
568
+ " {\n",
569
+ " \"role\": \"user\",\n",
570
+ " \"content\": \"Paris is the capital of\",\n",
571
+ " }\n",
572
+ "]\n",
573
+ "text = tokenizer.apply_chat_template(\n",
574
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
575
+ ")\n",
576
+ "response = requests.post(\n",
577
+ " f\"http://localhost:{port}/generate\",\n",
578
+ " json={\n",
579
+ " \"text\": text,\n",
580
+ " \"sampling_params\": {\n",
581
+ " \"temperature\": 0,\n",
582
+ " \"max_new_tokens\": 64,\n",
583
+ " \"regex\": \"(France|England)\",\n",
584
+ " },\n",
585
+ " },\n",
586
+ ")\n",
587
+ "print_highlight(response.json())"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "markdown",
592
+ "metadata": {},
593
+ "source": [
594
+ "### Structural Tag"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": null,
600
+ "metadata": {},
601
+ "outputs": [],
602
+ "source": [
603
+ "from transformers import AutoTokenizer\n",
604
+ "\n",
605
+ "# build the prompt using the chat template\n",
606
+ "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
607
+ "\n",
608
+ "text = tokenizer.apply_chat_template(\n",
609
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
610
+ ")\n",
611
+ "payload = {\n",
612
+ " \"text\": text,\n",
613
+ " \"sampling_params\": {\n",
614
+ " \"structural_tag\": json.dumps(\n",
615
+ " {\n",
616
+ " \"type\": \"structural_tag\",\n",
617
+ " \"structures\": [\n",
618
+ " {\n",
619
+ " \"begin\": \"<function=get_current_weather>\",\n",
620
+ " \"schema\": schema_get_current_weather,\n",
621
+ " \"end\": \"</function>\",\n",
622
+ " },\n",
623
+ " {\n",
624
+ " \"begin\": \"<function=get_current_date>\",\n",
625
+ " \"schema\": schema_get_current_date,\n",
626
+ " \"end\": \"</function>\",\n",
627
+ " },\n",
628
+ " ],\n",
629
+ " \"triggers\": [\"<function=\"],\n",
630
+ " }\n",
631
+ " )\n",
632
+ " },\n",
633
+ "}\n",
634
+ "\n",
635
+ "\n",
636
+ "# Send POST request to the API endpoint\n",
637
+ "response = requests.post(f\"http://localhost:{port}/generate\", json=payload)\n",
638
+ "print_highlight(response.json())"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "code",
643
+ "execution_count": null,
644
+ "metadata": {},
645
+ "outputs": [],
646
+ "source": [
647
+ "# Support for XGrammar's latest structural tag format\n",
648
+ "# https://xgrammar.mlc.ai/docs/tutorials/structural_tag.html\n",
649
+ "\n",
650
+ "payload = {\n",
651
+ " \"text\": text,\n",
652
+ " \"sampling_params\": {\n",
653
+ " \"structural_tag\": json.dumps(\n",
654
+ " {\n",
655
+ " \"type\": \"structural_tag\",\n",
656
+ " \"format\": {\n",
657
+ " \"type\": \"triggered_tags\",\n",
658
+ " \"triggers\": [\"<function=\"],\n",
659
+ " \"tags\": [\n",
660
+ " {\n",
661
+ " \"begin\": \"<function=get_current_weather>\",\n",
662
+ " \"content\": {\n",
663
+ " \"type\": \"json_schema\",\n",
664
+ " \"json_schema\": schema_get_current_weather,\n",
665
+ " },\n",
666
+ " \"end\": \"</function>\",\n",
667
+ " },\n",
668
+ " {\n",
669
+ " \"begin\": \"<function=get_current_date>\",\n",
670
+ " \"content\": {\n",
671
+ " \"type\": \"json_schema\",\n",
672
+ " \"json_schema\": schema_get_current_date,\n",
673
+ " },\n",
674
+ " \"end\": \"</function>\",\n",
675
+ " },\n",
676
+ " ],\n",
677
+ " \"at_least_one\": False,\n",
678
+ " \"stop_after_first\": False,\n",
679
+ " },\n",
680
+ " }\n",
681
+ " )\n",
682
+ " },\n",
683
+ "}\n",
684
+ "\n",
685
+ "\n",
686
+ "# Send POST request to the API endpoint\n",
687
+ "response = requests.post(f\"http://localhost:{port}/generate\", json=payload)\n",
688
+ "print_highlight(response.json())"
689
+ ]
690
+ },
691
+ {
692
+ "cell_type": "code",
693
+ "execution_count": null,
694
+ "metadata": {},
695
+ "outputs": [],
696
+ "source": [
697
+ "terminate_process(server_process)"
698
+ ]
699
+ },
700
+ {
701
+ "cell_type": "markdown",
702
+ "metadata": {},
703
+ "source": [
704
+ "## Offline Engine API"
705
+ ]
706
+ },
707
+ {
708
+ "cell_type": "code",
709
+ "execution_count": null,
710
+ "metadata": {},
711
+ "outputs": [],
712
+ "source": [
713
+ "import sglang as sgl\n",
714
+ "\n",
715
+ "llm = sgl.Engine(\n",
716
+ " model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", grammar_backend=\"xgrammar\"\n",
717
+ ")"
718
+ ]
719
+ },
720
+ {
721
+ "cell_type": "markdown",
722
+ "metadata": {},
723
+ "source": [
724
+ "### JSON"
725
+ ]
726
+ },
727
+ {
728
+ "cell_type": "markdown",
729
+ "metadata": {},
730
+ "source": [
731
+ "**Using Pydantic**"
732
+ ]
733
+ },
734
+ {
735
+ "cell_type": "code",
736
+ "execution_count": null,
737
+ "metadata": {},
738
+ "outputs": [],
739
+ "source": [
740
+ "import json\n",
741
+ "from pydantic import BaseModel, Field\n",
742
+ "\n",
743
+ "prompts = [\n",
744
+ " \"Give me the information of the capital of China in the JSON format.\",\n",
745
+ " \"Give me the information of the capital of France in the JSON format.\",\n",
746
+ " \"Give me the information of the capital of Ireland in the JSON format.\",\n",
747
+ "]\n",
748
+ "\n",
749
+ "\n",
750
+ "# Define the schema using Pydantic\n",
751
+ "class CapitalInfo(BaseModel):\n",
752
+ " name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
753
+ " population: int = Field(..., description=\"Population of the capital city\")\n",
754
+ "\n",
755
+ "\n",
756
+ "sampling_params = {\n",
757
+ " \"temperature\": 0.1,\n",
758
+ " \"top_p\": 0.95,\n",
759
+ " \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
760
+ "}\n",
761
+ "\n",
762
+ "outputs = llm.generate(prompts, sampling_params)\n",
763
+ "for prompt, output in zip(prompts, outputs):\n",
764
+ " print_highlight(\"===============================\")\n",
765
+ "    print_highlight(f\"Prompt: {prompt}\")\n",
+ "    # validate the output by the pydantic model\n",
766
+ " capital_info = CapitalInfo.model_validate_json(output[\"text\"])\n",
767
+ " print_highlight(f\"Validated output: {capital_info.model_dump_json()}\")"
768
+ ]
769
+ },
770
+ {
771
+ "cell_type": "markdown",
772
+ "metadata": {},
773
+ "source": [
774
+ "**JSON Schema Directly**"
775
+ ]
776
+ },
777
+ {
778
+ "cell_type": "code",
779
+ "execution_count": null,
780
+ "metadata": {},
781
+ "outputs": [],
782
+ "source": [
783
+ "prompts = [\n",
784
+ " \"Give me the information of the capital of China in the JSON format.\",\n",
785
+ " \"Give me the information of the capital of France in the JSON format.\",\n",
786
+ " \"Give me the information of the capital of Ireland in the JSON format.\",\n",
787
+ "]\n",
788
+ "\n",
789
+ "json_schema = json.dumps(\n",
790
+ " {\n",
791
+ " \"type\": \"object\",\n",
792
+ " \"properties\": {\n",
793
+ " \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
794
+ " \"population\": {\"type\": \"integer\"},\n",
795
+ " },\n",
796
+ " \"required\": [\"name\", \"population\"],\n",
797
+ " }\n",
798
+ ")\n",
799
+ "\n",
800
+ "sampling_params = {\"temperature\": 0.1, \"top_p\": 0.95, \"json_schema\": json_schema}\n",
801
+ "\n",
802
+ "outputs = llm.generate(prompts, sampling_params)\n",
803
+ "for prompt, output in zip(prompts, outputs):\n",
804
+ " print_highlight(\"===============================\")\n",
805
+ " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "markdown",
810
+ "metadata": {},
811
+ "source": [
812
+ "### EBNF\n"
813
+ ]
814
+ },
815
+ {
816
+ "cell_type": "code",
817
+ "execution_count": null,
818
+ "metadata": {},
819
+ "outputs": [],
820
+ "source": [
821
+ "prompts = [\n",
822
+ " \"Give me the information of the capital of France.\",\n",
823
+ " \"Give me the information of the capital of Germany.\",\n",
824
+ " \"Give me the information of the capital of Italy.\",\n",
825
+ "]\n",
826
+ "\n",
827
+ "sampling_params = {\n",
828
+ " \"temperature\": 0.8,\n",
829
+ " \"top_p\": 0.95,\n",
830
+ " \"ebnf\": (\n",
831
+ " \"root ::= city | description\\n\"\n",
832
+ " 'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
833
+ " 'description ::= city \" is \" status\\n'\n",
834
+ " 'status ::= \"the capital of \" country\\n'\n",
835
+ " 'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
836
+ " ),\n",
837
+ "}\n",
838
+ "\n",
839
+ "outputs = llm.generate(prompts, sampling_params)\n",
840
+ "for prompt, output in zip(prompts, outputs):\n",
841
+ " print_highlight(\"===============================\")\n",
842
+ " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
843
+ ]
844
+ },
845
+ {
846
+ "cell_type": "markdown",
847
+ "metadata": {},
848
+ "source": [
849
+ "### Regular expression"
850
+ ]
851
+ },
852
+ {
853
+ "cell_type": "code",
854
+ "execution_count": null,
855
+ "metadata": {},
856
+ "outputs": [],
857
+ "source": [
858
+ "prompts = [\n",
859
+ " \"Please provide information about London as a major global city:\",\n",
860
+ " \"Please provide information about Paris as a major global city:\",\n",
861
+ "]\n",
862
+ "\n",
863
+ "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95, \"regex\": \"(France|England)\"}\n",
864
+ "\n",
865
+ "outputs = llm.generate(prompts, sampling_params)\n",
866
+ "for prompt, output in zip(prompts, outputs):\n",
867
+ " print_highlight(\"===============================\")\n",
868
+ " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
869
+ ]
870
+ },
871
+ {
872
+ "cell_type": "markdown",
873
+ "metadata": {},
874
+ "source": [
875
+ "### Structural Tag"
876
+ ]
877
+ },
878
+ {
879
+ "cell_type": "code",
880
+ "execution_count": null,
881
+ "metadata": {},
882
+ "outputs": [],
883
+ "source": [
884
+ "text = tokenizer.apply_chat_template(\n",
885
+ " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
886
+ ")\n",
887
+ "prompts = [text]\n",
888
+ "\n",
889
+ "\n",
890
+ "sampling_params = {\n",
891
+ " \"temperature\": 0.8,\n",
892
+ " \"top_p\": 0.95,\n",
893
+ " \"structural_tag\": json.dumps(\n",
894
+ " {\n",
895
+ " \"type\": \"structural_tag\",\n",
896
+ " \"structures\": [\n",
897
+ " {\n",
898
+ " \"begin\": \"<function=get_current_weather>\",\n",
899
+ " \"schema\": schema_get_current_weather,\n",
900
+ " \"end\": \"</function>\",\n",
901
+ " },\n",
902
+ " {\n",
903
+ " \"begin\": \"<function=get_current_date>\",\n",
904
+ " \"schema\": schema_get_current_date,\n",
905
+ " \"end\": \"</function>\",\n",
906
+ " },\n",
907
+ " ],\n",
908
+ " \"triggers\": [\"<function=\"],\n",
909
+ " }\n",
910
+ " ),\n",
911
+ "}\n",
912
+ "\n",
913
+ "\n",
914
+ "# Generate with the offline engine\n",
915
+ "outputs = llm.generate(prompts, sampling_params)\n",
916
+ "for prompt, output in zip(prompts, outputs):\n",
917
+ " print_highlight(\"===============================\")\n",
918
+ " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
919
+ ]
920
+ },
921
+ {
922
+ "cell_type": "code",
923
+ "execution_count": null,
924
+ "metadata": {},
925
+ "outputs": [],
926
+ "source": [
927
+ "# Support for XGrammar's latest structural tag format\n",
928
+ "# https://xgrammar.mlc.ai/docs/tutorials/structural_tag.html\n",
929
+ "\n",
930
+ "sampling_params = {\n",
931
+ " \"temperature\": 0.8,\n",
932
+ " \"top_p\": 0.95,\n",
933
+ " \"structural_tag\": json.dumps(\n",
934
+ " {\n",
935
+ " \"type\": \"structural_tag\",\n",
936
+ " \"format\": {\n",
937
+ " \"type\": \"triggered_tags\",\n",
938
+ " \"triggers\": [\"<function=\"],\n",
939
+ " \"tags\": [\n",
940
+ " {\n",
941
+ " \"begin\": \"<function=get_current_weather>\",\n",
942
+ " \"content\": {\n",
943
+ " \"type\": \"json_schema\",\n",
944
+ " \"json_schema\": schema_get_current_weather,\n",
945
+ " },\n",
946
+ " \"end\": \"</function>\",\n",
947
+ " },\n",
948
+ " {\n",
949
+ " \"begin\": \"<function=get_current_date>\",\n",
950
+ " \"content\": {\n",
951
+ " \"type\": \"json_schema\",\n",
952
+ " \"json_schema\": schema_get_current_date,\n",
953
+ " },\n",
954
+ " \"end\": \"</function>\",\n",
955
+ " },\n",
956
+ " ],\n",
957
+ " \"at_least_one\": False,\n",
958
+ " \"stop_after_first\": False,\n",
959
+ " },\n",
960
+ " }\n",
961
+ " ),\n",
962
+ "}\n",
963
+ "\n",
964
+ "\n",
965
+ "# Generate with the offline engine\n",
966
+ "outputs = llm.generate(prompts, sampling_params)\n",
967
+ "for prompt, output in zip(prompts, outputs):\n",
968
+ " print_highlight(\"===============================\")\n",
969
+ " print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
970
+ ]
971
+ },
972
+ {
973
+ "cell_type": "code",
974
+ "execution_count": null,
975
+ "metadata": {},
976
+ "outputs": [],
977
+ "source": [
978
+ "llm.shutdown()"
979
+ ]
980
+ }
981
+ ],
982
+ "metadata": {
983
+ "language_info": {
984
+ "codemirror_mode": {
985
+ "name": "ipython",
986
+ "version": 3
987
+ },
988
+ "file_extension": ".py",
989
+ "mimetype": "text/x-python",
990
+ "name": "python",
991
+ "nbconvert_exporter": "python",
992
+ "pygments_lexer": "ipython3"
993
+ }
994
+ },
995
+ "nbformat": 4,
996
+ "nbformat_minor": 2
997
+ }
sglang/docs/advanced_features/tool_parser.ipynb ADDED
@@ -0,0 +1,856 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Tool Parser\n",
8
+ "\n",
9
+ "This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {},
15
+ "source": [
16
+ "## Currently supported parsers:\n",
17
+ "\n",
18
+ "| Parser | Supported Models | Notes |\n",
19
+ "|---|---|---|\n",
20
+ "| `deepseekv3` | DeepSeek-v3 (e.g., `deepseek-ai/DeepSeek-V3-0324`) | Recommended: add `--chat-template ./examples/chat_template/tool_chat_template_deepseekv3.jinja` to the launch command. |\n",
21
+ "| `deepseekv31` | DeepSeek-V3.1 and DeepSeek-V3.2-Exp (e.g. `deepseek-ai/DeepSeek-V3.1`, `deepseek-ai/DeepSeek-V3.2-Exp`) | Recommended: add `--chat-template ./examples/chat_template/tool_chat_template_deepseekv31.jinja` (or ..deepseekv32.jinja for DeepSeek-V3.2) to the launch command. |\n",
22
+ "| `deepseekv32` | DeepSeek-V3.2 (`deepseek-ai/DeepSeek-V3.2`) | |\n",
23
+ "| `glm` | GLM series (e.g. `zai-org/GLM-4.6`) | |\n",
24
+ "| `gpt-oss` | GPT-OSS (e.g., `openai/gpt-oss-120b`, `openai/gpt-oss-20b`, `lmsys/gpt-oss-120b-bf16`, `lmsys/gpt-oss-20b-bf16`) | The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as `role=\"tool\"` messages, which enables the model to generate the final content. |\n",
25
+ "| `kimi_k2` | `moonshotai/Kimi-K2-Instruct` | |\n",
26
+ "| `llama3` | Llama 3.1 / 3.2 / 3.3 (e.g. `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`, `meta-llama/Llama-3.3-70B-Instruct`) | |\n",
27
+ "| `llama4` | Llama 4 (e.g. `meta-llama/Llama-4-Scout-17B-16E-Instruct`) | |\n",
28
+ "| `mistral` | Mistral (e.g. `mistralai/Mistral-7B-Instruct-v0.3`, `mistralai/Mistral-Nemo-Instruct-2407`, `mistralai/Mistral-7B-v0.3`) | |\n",
29
+ "| `pythonic` | Llama-3.2 / Llama-3.3 / Llama-4 | Model outputs function calls as Python code. Requires `--tool-call-parser pythonic` and is recommended to use with a specific chat template. |\n",
30
+ "| `qwen` | Qwen series (e.g. `Qwen/Qwen3-Next-80B-A3B-Instruct`, `Qwen/Qwen3-VL-30B-A3B-Thinking`) except Qwen3-Coder | |\n",
31
+ "| `qwen3_coder` | Qwen3-Coder (e.g. `Qwen/Qwen3-Coder-30B-A3B-Instruct`) | |\n",
32
+ "| `step3` | Step-3 | |\n"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "markdown",
37
+ "metadata": {},
38
+ "source": [
39
+ "## OpenAI Compatible API"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "markdown",
44
+ "metadata": {},
45
+ "source": [
46
+ "### Launching the Server"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "import json\n",
56
+ "from sglang.test.doc_patch import launch_server_cmd\n",
57
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
58
+ "from openai import OpenAI\n",
59
+ "\n",
60
+ "server_process, port = launch_server_cmd(\n",
61
+ " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\" # qwen25\n",
62
+ ")\n",
63
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "markdown",
68
+ "metadata": {},
69
+ "source": [
70
+ "Note that `--tool-call-parser` defines the parser used to interpret responses."
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "markdown",
75
+ "metadata": {},
76
+ "source": [
77
+ "### Define Tools for Function Call\n",
78
+ "Below is a Python snippet that shows how to define a tool as a dictionary. The dictionary includes the tool's name, a description, and its parameters, defined as JSON Schema properties."
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "# Define tools\n",
88
+ "tools = [\n",
89
+ " {\n",
90
+ " \"type\": \"function\",\n",
91
+ " \"function\": {\n",
92
+ " \"name\": \"get_current_weather\",\n",
93
+ " \"description\": \"Get the current weather in a given location\",\n",
94
+ " \"parameters\": {\n",
95
+ " \"type\": \"object\",\n",
96
+ " \"properties\": {\n",
97
+ " \"city\": {\n",
98
+ " \"type\": \"string\",\n",
99
+ " \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
100
+ " },\n",
101
+ " \"state\": {\n",
102
+ " \"type\": \"string\",\n",
103
+ " \"description\": \"the two-letter abbreviation for the state that the city is\"\n",
104
+ " \" in, e.g. 'CA' which would mean 'California'\",\n",
105
+ " },\n",
106
+ " \"unit\": {\n",
107
+ " \"type\": \"string\",\n",
108
+ " \"description\": \"The unit to fetch the temperature in\",\n",
109
+ " \"enum\": [\"celsius\", \"fahrenheit\"],\n",
110
+ " },\n",
111
+ " },\n",
112
+ " \"required\": [\"city\", \"state\", \"unit\"],\n",
113
+ " },\n",
114
+ " },\n",
115
+ " }\n",
116
+ "]"
117
+ ]
118
+ },
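Once the model returns a tool call, its `arguments` string should match this schema, but models occasionally drop a required key or invent an enum value. A minimal pure-Python check, sketched here with a hypothetical `validate_arguments` helper (not an SGLang API), can catch that before the tool is invoked:

```python
import json

# Hypothetical helper (not part of SGLang): minimal validation of the
# arguments a model returns for the get_current_weather tool above,
# checking required keys and enum constraints before invoking the tool.
def validate_arguments(arguments_json: str, parameters_schema: dict) -> dict:
    args = json.loads(arguments_json)
    props = parameters_schema.get("properties", {})
    for key in parameters_schema.get("required", []):
        if key not in args:
            raise ValueError(f"missing required argument: {key}")
    for key, value in args.items():
        enum = props.get(key, {}).get("enum")
        if enum is not None and value not in enum:
            raise ValueError(f"{key}={value!r} not in allowed values {enum}")
    return args


schema = {
    "type": "object",
    "properties": {
        "city": {"type": "string"},
        "state": {"type": "string"},
        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
    },
    "required": ["city", "state", "unit"],
}
args = validate_arguments(
    '{"city": "Boston", "state": "MA", "unit": "fahrenheit"}', schema
)
print(args["city"])  # Boston
```

For production use, a full JSON Schema validator such as the `jsonschema` package covers type checking as well.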
119
+ {
120
+ "cell_type": "markdown",
121
+ "metadata": {},
122
+ "source": [
123
+ "### Define Messages"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "def get_messages():\n",
133
+ " return [\n",
134
+ " {\n",
135
+ " \"role\": \"user\",\n",
136
+ " \"content\": \"What's the weather like in Boston today? Output a reasoning before act, then use the tools to help you.\",\n",
137
+ " }\n",
138
+ " ]\n",
139
+ "\n",
140
+ "\n",
141
+ "messages = get_messages()"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "markdown",
146
+ "metadata": {},
147
+ "source": [
148
+ "### Initialize the Client"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": [
157
+ "# Initialize OpenAI-like client\n",
158
+ "client = OpenAI(api_key=\"None\", base_url=f\"http://0.0.0.0:{port}/v1\")\n",
159
+ "model_name = client.models.list().data[0].id"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "markdown",
164
+ "metadata": {},
165
+ "source": [
166
+ "### Non-Streaming Request"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "# Non-streaming mode test\n",
176
+ "response_non_stream = client.chat.completions.create(\n",
177
+ " model=model_name,\n",
178
+ " messages=messages,\n",
179
+ " temperature=0,\n",
180
+ " top_p=0.95,\n",
181
+ " max_tokens=1024,\n",
182
+ " stream=False, # Non-streaming\n",
183
+ " tools=tools,\n",
184
+ ")\n",
185
+ "print_highlight(\"Non-stream response:\")\n",
186
+ "print_highlight(response_non_stream)\n",
187
+ "print_highlight(\"==== content ====\")\n",
188
+ "print_highlight(response_non_stream.choices[0].message.content)\n",
189
+ "print_highlight(\"==== tool_calls ====\")\n",
190
+ "print_highlight(response_non_stream.choices[0].message.tool_calls)"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "markdown",
195
+ "metadata": {},
196
+ "source": [
197
+ "#### Handle Tools\n",
198
+ "When the engine determines it should call a particular tool, it will return arguments or partial arguments through the response. You can parse these arguments and later invoke the tool accordingly."
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "metadata": {},
205
+ "outputs": [],
206
+ "source": [
207
+ "name_non_stream = response_non_stream.choices[0].message.tool_calls[0].function.name\n",
208
+ "arguments_non_stream = (\n",
209
+ " response_non_stream.choices[0].message.tool_calls[0].function.arguments\n",
210
+ ")\n",
211
+ "\n",
212
+ "print_highlight(f\"Final streamed function call name: {name_non_stream}\")\n",
213
+ "print_highlight(f\"Final streamed function call arguments: {arguments_non_stream}\")"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "markdown",
218
+ "metadata": {},
219
+ "source": [
220
+ "### Streaming Request"
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "execution_count": null,
226
+ "metadata": {},
227
+ "outputs": [],
228
+ "source": [
229
+ "# Streaming mode test\n",
230
+ "print_highlight(\"Streaming response:\")\n",
231
+ "response_stream = client.chat.completions.create(\n",
232
+ " model=model_name,\n",
233
+ " messages=messages,\n",
234
+ " temperature=0,\n",
235
+ " top_p=0.95,\n",
236
+ " max_tokens=1024,\n",
237
+ " stream=True, # Enable streaming\n",
238
+ " tools=tools,\n",
239
+ ")\n",
240
+ "\n",
241
+ "texts = \"\"\n",
242
+ "tool_calls = []\n",
243
+ "name = \"\"\n",
244
+ "arguments = \"\"\n",
245
+ "for chunk in response_stream:\n",
246
+ " if chunk.choices[0].delta.content:\n",
247
+ " texts += chunk.choices[0].delta.content\n",
248
+ " if chunk.choices[0].delta.tool_calls:\n",
249
+ " tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
250
+ "print_highlight(\"==== Text ====\")\n",
251
+ "print_highlight(texts)\n",
252
+ "\n",
253
+ "print_highlight(\"==== Tool Call ====\")\n",
254
+ "for tool_call in tool_calls:\n",
255
+ " print_highlight(tool_call)"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "markdown",
260
+ "metadata": {},
261
+ "source": [
262
+ "#### Handle Tools\n",
263
+ "When the engine determines it should call a particular tool, it will return arguments or partial arguments through the response. You can parse these arguments and later invoke the tool accordingly."
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "metadata": {},
270
+ "outputs": [],
271
+ "source": [
272
+ "# Parse and combine function call arguments\n",
273
+ "arguments = []\n",
274
+ "for tool_call in tool_calls:\n",
275
+ " if tool_call.function.name:\n",
276
+ " print_highlight(f\"Streamed function call name: {tool_call.function.name}\")\n",
277
+ "\n",
278
+ " if tool_call.function.arguments:\n",
279
+ " arguments.append(tool_call.function.arguments)\n",
280
+ "\n",
281
+ "# Combine all fragments into a single JSON string\n",
282
+ "full_arguments = \"\".join(arguments)\n",
283
+ "print_highlight(f\"streamed function call arguments: {full_arguments}\")"
284
+ ]
285
+ },
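The deltas collected above arrive as fragments: the first chunk for a call usually carries the function name, and later chunks carry slices of the argument string. When the model emits parallel tool calls, each delta also has an `index` identifying which call it belongs to. A sketch of reassembly (plain dicts stand in for the OpenAI delta objects; this is illustrative, not an SGLang API):

```python
# Merge streamed tool-call deltas into complete calls, grouping by `index`
# so parallel calls are reassembled separately. The first delta for an index
# carries the function name; later deltas append argument fragments.
def merge_tool_call_deltas(deltas):
    calls = {}
    for delta in deltas:
        call = calls.setdefault(delta["index"], {"name": "", "arguments": ""})
        if delta.get("name"):
            call["name"] = delta["name"]
        if delta.get("arguments"):
            call["arguments"] += delta["arguments"]
    return [calls[i] for i in sorted(calls)]


deltas = [
    {"index": 0, "name": "get_current_weather", "arguments": ""},
    {"index": 0, "arguments": '{"city": "Bos'},
    {"index": 0, "arguments": 'ton", "state": "MA", "unit": "celsius"}'},
]
merged = merge_tool_call_deltas(deltas)
print(merged[0]["name"])  # get_current_weather
```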
286
+ {
287
+ "cell_type": "markdown",
288
+ "metadata": {},
289
+ "source": [
290
+ "### Define a Tool Function"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": null,
296
+ "metadata": {},
297
+ "outputs": [],
298
+ "source": [
299
+ "# This is a demonstration, define real function according to your usage.\n",
300
+ "def get_current_weather(city: str, state: str, unit: \"str\"):\n",
301
+ " return (\n",
302
+ " f\"The weather in {city}, {state} is 85 degrees {unit}. It is \"\n",
303
+ " \"partly cloudly, with highs in the 90's.\"\n",
304
+ " )\n",
305
+ "\n",
306
+ "\n",
307
+ "available_tools = {\"get_current_weather\": get_current_weather}"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "markdown",
312
+ "metadata": {},
313
+ "source": [
314
+ "\n",
315
+ "### Execute the Tool"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": null,
321
+ "metadata": {},
322
+ "outputs": [],
323
+ "source": [
324
+ "messages.append(response_non_stream.choices[0].message)\n",
325
+ "\n",
326
+ "# Call the corresponding tool function\n",
327
+ "tool_call = messages[-1].tool_calls[0]\n",
328
+ "tool_name = tool_call.function.name\n",
329
+ "tool_to_call = available_tools[tool_name]\n",
330
+ "result = tool_to_call(**(json.loads(tool_call.function.arguments)))\n",
331
+ "print_highlight(f\"Function call result: {result}\")\n",
332
+ "# messages.append({\"role\": \"tool\", \"content\": result, \"name\": tool_name})\n",
333
+ "messages.append(\n",
334
+ " {\n",
335
+ " \"role\": \"tool\",\n",
336
+ " \"tool_call_id\": tool_call.id,\n",
337
+ " \"content\": str(result),\n",
338
+ " \"name\": tool_name,\n",
339
+ " }\n",
340
+ ")\n",
341
+ "\n",
342
+ "print_highlight(f\"Updated message history: {messages}\")"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "markdown",
347
+ "metadata": {},
348
+ "source": [
349
+ "### Send Results Back to Model"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": null,
355
+ "metadata": {},
356
+ "outputs": [],
357
+ "source": [
358
+ "final_response = client.chat.completions.create(\n",
359
+ " model=model_name,\n",
360
+ " messages=messages,\n",
361
+ " temperature=0,\n",
362
+ " top_p=0.95,\n",
363
+ " stream=False,\n",
364
+ " tools=tools,\n",
365
+ ")\n",
366
+ "print_highlight(\"Non-stream response:\")\n",
367
+ "print_highlight(final_response)\n",
368
+ "\n",
369
+ "print_highlight(\"==== Text ====\")\n",
370
+ "print_highlight(final_response.choices[0].message.content)"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "markdown",
375
+ "metadata": {},
376
+ "source": [
377
+ "## Native API and SGLang Runtime (SRT)"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": null,
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": [
386
+ "from transformers import AutoTokenizer\n",
387
+ "import requests\n",
388
+ "\n",
389
+ "# generate an answer\n",
390
+ "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-7B-Instruct\")\n",
391
+ "\n",
392
+ "messages = get_messages()\n",
393
+ "\n",
394
+ "input = tokenizer.apply_chat_template(\n",
395
+ " messages, tokenize=False, add_generation_prompt=True, tools=tools, return_dict=False\n",
396
+ ")\n",
397
+ "\n",
398
+ "gen_url = f\"http://localhost:{port}/generate\"\n",
399
+ "gen_data = {\n",
400
+ " \"text\": input,\n",
401
+ " \"sampling_params\": {\n",
402
+ " \"skip_special_tokens\": False,\n",
403
+ " \"max_new_tokens\": 1024,\n",
404
+ " \"temperature\": 0,\n",
405
+ " \"top_p\": 0.95,\n",
406
+ " },\n",
407
+ "}\n",
408
+ "gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
409
+ "print_highlight(\"==== Response ====\")\n",
410
+ "print_highlight(gen_response)\n",
411
+ "\n",
412
+ "# parse the response\n",
413
+ "parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
414
+ "\n",
415
+ "function_call_input = {\n",
416
+ " \"text\": gen_response,\n",
417
+ " \"tool_call_parser\": \"qwen25\",\n",
418
+ " \"tools\": tools,\n",
419
+ "}\n",
420
+ "\n",
421
+ "function_call_response = requests.post(parse_url, json=function_call_input)\n",
422
+ "function_call_response_json = function_call_response.json()\n",
423
+ "\n",
424
+ "print_highlight(\"==== Text ====\")\n",
425
+ "print(function_call_response_json[\"normal_text\"])\n",
426
+ "print_highlight(\"==== Calls ====\")\n",
427
+ "print(\"function name: \", function_call_response_json[\"calls\"][0][\"name\"])\n",
428
+ "print(\"function arguments: \", function_call_response_json[\"calls\"][0][\"parameters\"])"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "code",
433
+ "execution_count": null,
434
+ "metadata": {},
435
+ "outputs": [],
436
+ "source": [
437
+ "terminate_process(server_process)"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "markdown",
442
+ "metadata": {},
443
+ "source": [
444
+ "## Offline Engine API"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": null,
450
+ "metadata": {},
451
+ "outputs": [],
452
+ "source": [
453
+ "import sglang as sgl\n",
454
+ "from sglang.srt.function_call.function_call_parser import FunctionCallParser\n",
455
+ "from sglang.srt.managers.io_struct import Tool, Function\n",
456
+ "\n",
457
+ "llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n",
458
+ "tokenizer = llm.tokenizer_manager.tokenizer\n",
459
+ "input_ids = tokenizer.apply_chat_template(\n",
460
+ " messages, tokenize=True, add_generation_prompt=True, tools=tools, return_dict=False\n",
461
+ ")\n",
462
+ "\n",
463
+ "# Note that for gpt-oss tool parser, adding \"no_stop_trim\": True\n",
464
+ "# to make sure the tool call token <call> is not trimmed.\n",
465
+ "\n",
466
+ "sampling_params = {\n",
467
+ " \"max_new_tokens\": 1024,\n",
468
+ " \"temperature\": 0,\n",
469
+ " \"top_p\": 0.95,\n",
470
+ " \"skip_special_tokens\": False,\n",
471
+ "}\n",
472
+ "\n",
473
+ "# 1) Offline generation\n",
474
+ "result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n",
475
+ "generated_text = result[\"text\"] # Assume there is only one prompt\n",
476
+ "\n",
477
+ "print_highlight(\"=== Offline Engine Output Text ===\")\n",
478
+ "print_highlight(generated_text)\n",
479
+ "\n",
480
+ "\n",
481
+ "# 2) Parse using FunctionCallParser\n",
482
+ "def convert_dict_to_tool(tool_dict: dict) -> Tool:\n",
483
+ " function_dict = tool_dict.get(\"function\", {})\n",
484
+ " return Tool(\n",
485
+ " type=tool_dict.get(\"type\", \"function\"),\n",
486
+ " function=Function(\n",
487
+ " name=function_dict.get(\"name\"),\n",
488
+ " description=function_dict.get(\"description\"),\n",
489
+ " parameters=function_dict.get(\"parameters\"),\n",
490
+ " ),\n",
491
+ " )\n",
492
+ "\n",
493
+ "\n",
494
+ "tools = [convert_dict_to_tool(raw_tool) for raw_tool in tools]\n",
495
+ "\n",
496
+ "parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n",
497
+ "normal_text, calls = parser.parse_non_stream(generated_text)\n",
498
+ "\n",
499
+ "print_highlight(\"=== Parsing Result ===\")\n",
500
+ "print(\"Normal text portion:\", normal_text)\n",
501
+ "print_highlight(\"Function call portion:\")\n",
502
+ "for call in calls:\n",
503
+ " # call: ToolCallItem\n",
504
+ " print_highlight(f\" - tool name: {call.name}\")\n",
505
+ " print_highlight(f\" parameters: {call.parameters}\")\n",
506
+ "\n",
507
+ "# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc."
508
+ ]
509
+ },
510
+ {
511
+ "cell_type": "code",
512
+ "execution_count": null,
513
+ "metadata": {},
514
+ "outputs": [],
515
+ "source": [
516
+ "llm.shutdown()"
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "markdown",
521
+ "metadata": {},
522
+ "source": [
523
+ "## Tool Choice Mode\n",
524
+ "\n",
525
+ "SGLang supports OpenAI's `tool_choice` parameter to control when and which tools the model should call. This feature is implemented using EBNF (Extended Backus-Naur Form) grammar to ensure reliable tool calling behavior.\n",
526
+ "\n",
527
+ "### Supported Tool Choice Options\n",
528
+ "\n",
529
+ "- **`tool_choice=\"required\"`**: Forces the model to call at least one tool\n",
530
+ "- **`tool_choice={\"type\": \"function\", \"function\": {\"name\": \"specific_function\"}}`**: Forces the model to call a specific function\n",
531
+ "\n",
532
+ "### Backend Compatibility\n",
533
+ "\n",
534
+ "Tool choice is fully supported with the **Xgrammar backend**, which is the default grammar backend (`--grammar-backend xgrammar`). However, it may not be fully supported with other backends such as `outlines`.\n",
535
+ "\n",
536
+ "### Example: Required Tool Choice"
537
+ ]
538
+ },
539
+ {
540
+ "cell_type": "code",
541
+ "execution_count": null,
542
+ "metadata": {},
543
+ "outputs": [],
544
+ "source": [
545
+ "from openai import OpenAI\n",
546
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
547
+ "from sglang.test.doc_patch import launch_server_cmd\n",
548
+ "\n",
549
+ "# Start a new server session for tool choice examples\n",
550
+ "server_process_tool_choice, port_tool_choice = launch_server_cmd(\n",
551
+ " \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\"\n",
552
+ ")\n",
553
+ "wait_for_server(\n",
554
+ " f\"http://localhost:{port_tool_choice}\", process=server_process_tool_choice\n",
555
+ ")\n",
556
+ "\n",
557
+ "# Initialize client for tool choice examples\n",
558
+ "client_tool_choice = OpenAI(\n",
559
+ " api_key=\"None\", base_url=f\"http://0.0.0.0:{port_tool_choice}/v1\"\n",
560
+ ")\n",
561
+ "model_name_tool_choice = client_tool_choice.models.list().data[0].id\n",
562
+ "\n",
563
+ "# Example with tool_choice=\"required\" - forces the model to call a tool\n",
564
+ "messages_required = [\n",
565
+ " {\"role\": \"user\", \"content\": \"Hello, what is the capital of France?\"}\n",
566
+ "]\n",
567
+ "\n",
568
+ "# Define tools\n",
569
+ "tools = [\n",
570
+ " {\n",
571
+ " \"type\": \"function\",\n",
572
+ " \"function\": {\n",
573
+ " \"name\": \"get_current_weather\",\n",
574
+ " \"description\": \"Get the current weather in a given location\",\n",
575
+ " \"parameters\": {\n",
576
+ " \"type\": \"object\",\n",
577
+ " \"properties\": {\n",
578
+ " \"city\": {\n",
579
+ " \"type\": \"string\",\n",
580
+ " \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
581
+ " },\n",
582
+ " \"unit\": {\n",
583
+ " \"type\": \"string\",\n",
584
+ " \"description\": \"The unit to fetch the temperature in\",\n",
585
+ " \"enum\": [\"celsius\", \"fahrenheit\"],\n",
586
+ " },\n",
587
+ " },\n",
588
+ " \"required\": [\"city\", \"unit\"],\n",
589
+ " },\n",
590
+ " },\n",
591
+ " }\n",
592
+ "]\n",
593
+ "\n",
594
+ "response_required = client_tool_choice.chat.completions.create(\n",
595
+ " model=model_name_tool_choice,\n",
596
+ " messages=messages_required,\n",
597
+ " temperature=0,\n",
598
+ " max_tokens=1024,\n",
599
+ " tools=tools,\n",
600
+ " tool_choice=\"required\", # Force the model to call a tool\n",
601
+ ")\n",
602
+ "\n",
603
+ "print_highlight(\"Response with tool_choice='required':\")\n",
604
+ "print(\"Content:\", response_required.choices[0].message.content)\n",
605
+ "print(\"Tool calls:\", response_required.choices[0].message.tool_calls)"
606
+ ]
607
+ },
608
+ {
609
+ "cell_type": "markdown",
610
+ "metadata": {},
611
+ "source": [
612
+ "### Example: Specific Function Choice\n"
613
+ ]
614
+ },
615
+ {
616
+ "cell_type": "code",
617
+ "execution_count": null,
618
+ "metadata": {},
619
+ "outputs": [],
620
+ "source": [
621
+ "# Example with specific function choice - forces the model to call a specific function\n",
622
+ "messages_specific = [\n",
623
+ " {\"role\": \"user\", \"content\": \"What are the most attactive places in France?\"}\n",
624
+ "]\n",
625
+ "\n",
626
+ "response_specific = client_tool_choice.chat.completions.create(\n",
627
+ " model=model_name_tool_choice,\n",
628
+ " messages=messages_specific,\n",
629
+ " temperature=0,\n",
630
+ " max_tokens=1024,\n",
631
+ " tools=tools,\n",
632
+ " tool_choice={\n",
633
+ " \"type\": \"function\",\n",
634
+ " \"function\": {\"name\": \"get_current_weather\"},\n",
635
+ " }, # Force the model to call the specific get_current_weather function\n",
636
+ ")\n",
637
+ "\n",
638
+ "print_highlight(\"Response with specific function choice:\")\n",
639
+ "print(\"Content:\", response_specific.choices[0].message.content)\n",
640
+ "print(\"Tool calls:\", response_specific.choices[0].message.tool_calls)\n",
641
+ "\n",
642
+ "if response_specific.choices[0].message.tool_calls:\n",
643
+ " tool_call = response_specific.choices[0].message.tool_calls[0]\n",
644
+ " print_highlight(f\"Called function: {tool_call.function.name}\")\n",
645
+ " print_highlight(f\"Arguments: {tool_call.function.arguments}\")"
646
+ ]
647
+ },
648
+ {
649
+ "cell_type": "code",
650
+ "execution_count": null,
651
+ "metadata": {},
652
+ "outputs": [],
653
+ "source": [
654
+ "terminate_process(server_process_tool_choice)"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "markdown",
659
+ "metadata": {},
660
+ "source": [
661
+ "## Pythonic Tool Call Format (Llama-3.2 / Llama-3.3 / Llama-4)\n",
662
+ "\n",
663
+ "Some Llama models (such as Llama-3.2-1B, Llama-3.2-3B, Llama-3.3-70B, and Llama-4) support a \"pythonic\" tool call format, where the model outputs function calls as Python code, e.g.:\n",
664
+ "\n",
665
+ "```python\n",
666
+ "[get_current_weather(city=\"San Francisco\", state=\"CA\", unit=\"celsius\")]\n",
667
+ "```\n",
668
+ "\n",
669
+ "- The output is a Python list of function calls, with arguments as Python literals (not JSON).\n",
670
+ "- Multiple tool calls can be returned in the same list:\n",
671
+ "```python\n",
672
+ "[get_current_weather(city=\"San Francisco\", state=\"CA\", unit=\"celsius\"),\n",
673
+ " get_current_weather(city=\"New York\", state=\"NY\", unit=\"fahrenheit\")]\n",
674
+ "```\n",
675
+ "\n",
676
+ "For more information, refer to Meta’s documentation on [Zero shot function calling](https://github.com/meta-llama/llama-models/blob/main/models/llama4/prompt_format.md#zero-shot-function-calling---system-message).\n",
677
+ "\n",
678
+ "Note that this feature is still under development on Blackwell.\n",
679
+ "\n",
680
+ "### How to enable\n",
681
+ "- Launch the server with `--tool-call-parser pythonic`\n",
682
+ "- You may also specify --chat-template with the improved template for the model (e.g., `--chat-template=examples/chat_template/tool_chat_template_llama4_pythonic.jinja`).\n",
683
+ "This is recommended because the model expects a special prompt format to reliably produce valid pythonic tool call outputs. The template ensures that the prompt structure (e.g., special tokens, message boundaries like `<|eom|>`, and function call delimiters) matches what the model was trained or fine-tuned on. If you do not use the correct chat template, tool calling may fail or produce inconsistent results.\n",
684
+ "\n",
685
+ "#### Forcing Pythonic Tool Call Output Without a Chat Template\n",
686
+ "If you don't want to specify a chat template, you must give the model extremely explicit instructions in your messages to enforce pythonic output. For example, for `Llama-3.2-1B-Instruct`, you need:"
687
+ ]
688
+ },
689
+ {
690
+ "cell_type": "code",
691
+ "execution_count": null,
692
+ "metadata": {},
693
+ "outputs": [],
694
+ "source": [
695
+ "import openai\n",
696
+ "\n",
697
+ "server_process, port = launch_server_cmd(\n",
698
+ " \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1 --log-level warning\" # llama-3.2-1b-instruct\n",
699
+ ")\n",
700
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n",
701
+ "\n",
702
+ "tools = [\n",
703
+ " {\n",
704
+ " \"type\": \"function\",\n",
705
+ " \"function\": {\n",
706
+ " \"name\": \"get_weather\",\n",
707
+ " \"description\": \"Get the current weather for a given location.\",\n",
708
+ " \"parameters\": {\n",
709
+ " \"type\": \"object\",\n",
710
+ " \"properties\": {\n",
711
+ " \"location\": {\n",
712
+ " \"type\": \"string\",\n",
713
+ " \"description\": \"The name of the city or location.\",\n",
714
+ " }\n",
715
+ " },\n",
716
+ " \"required\": [\"location\"],\n",
717
+ " },\n",
718
+ " },\n",
719
+ " },\n",
720
+ " {\n",
721
+ " \"type\": \"function\",\n",
722
+ " \"function\": {\n",
723
+ " \"name\": \"get_tourist_attractions\",\n",
724
+ " \"description\": \"Get a list of top tourist attractions for a given city.\",\n",
725
+ " \"parameters\": {\n",
726
+ " \"type\": \"object\",\n",
727
+ " \"properties\": {\n",
728
+ " \"city\": {\n",
729
+ " \"type\": \"string\",\n",
730
+ " \"description\": \"The name of the city to find attractions for.\",\n",
731
+ " }\n",
732
+ " },\n",
733
+ " \"required\": [\"city\"],\n",
734
+ " },\n",
735
+ " },\n",
736
+ " },\n",
737
+ "]\n",
738
+ "\n",
739
+ "\n",
740
+ "def get_messages():\n",
741
+ " return [\n",
742
+ " {\n",
743
+ " \"role\": \"system\",\n",
744
+ " \"content\": (\n",
745
+ " \"You are a travel assistant. \"\n",
746
+ " \"When asked to call functions, ALWAYS respond ONLY with a python list of function calls, \"\n",
747
+ " \"using this format: [func_name1(param1=value1, param2=value2), func_name2(param=value)]. \"\n",
748
+ " \"Do NOT use JSON, do NOT use variables, do NOT use any other format. \"\n",
749
+ " \"Here is an example:\\n\"\n",
750
+ " '[get_weather(location=\"Paris\"), get_tourist_attractions(city=\"Paris\")]'\n",
751
+ " ),\n",
752
+ " },\n",
753
+ " {\n",
754
+ " \"role\": \"user\",\n",
755
+ " \"content\": (\n",
756
+ " \"I'm planning a trip to Tokyo next week. What's the weather like and what are some top tourist attractions? \"\n",
757
+ " \"Propose parallel tool calls at once, using the python list of function calls format as shown above.\"\n",
758
+ " ),\n",
759
+ " },\n",
760
+ " ]\n",
761
+ "\n",
762
+ "\n",
763
+ "messages = get_messages()\n",
764
+ "\n",
765
+ "client = openai.Client(base_url=f\"http://localhost:{port}/v1\", api_key=\"xxxxxx\")\n",
766
+ "model_name = client.models.list().data[0].id\n",
767
+ "\n",
768
+ "\n",
769
+ "response_non_stream = client.chat.completions.create(\n",
770
+ " model=model_name,\n",
771
+ " messages=messages,\n",
772
+ " temperature=0,\n",
773
+ " top_p=0.9,\n",
774
+ " stream=False, # Non-streaming\n",
775
+ " tools=tools,\n",
776
+ ")\n",
777
+ "print_highlight(\"Non-stream response:\")\n",
778
+ "print_highlight(response_non_stream)\n",
779
+ "\n",
780
+ "response_stream = client.chat.completions.create(\n",
781
+ " model=model_name,\n",
782
+ " messages=messages,\n",
783
+ " temperature=0,\n",
784
+ " top_p=0.9,\n",
785
+ " stream=True,\n",
786
+ " tools=tools,\n",
787
+ ")\n",
788
+ "texts = \"\"\n",
789
+ "tool_calls = []\n",
790
+ "name = \"\"\n",
791
+ "arguments = \"\"\n",
792
+ "\n",
793
+ "for chunk in response_stream:\n",
794
+ " if chunk.choices[0].delta.content:\n",
795
+ " texts += chunk.choices[0].delta.content\n",
796
+ " if chunk.choices[0].delta.tool_calls:\n",
797
+ " tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
798
+ "\n",
799
+ "print_highlight(\"Streaming Response:\")\n",
800
+ "print_highlight(\"==== Text ====\")\n",
801
+ "print_highlight(texts)\n",
802
+ "\n",
803
+ "print_highlight(\"==== Tool Call ====\")\n",
804
+ "for tool_call in tool_calls:\n",
805
+ " print_highlight(tool_call)\n",
806
+ "\n",
807
+ "terminate_process(server_process)"
808
+ ]
809
+ },
810
+ {
811
+ "cell_type": "markdown",
812
+ "metadata": {},
813
+ "source": [
814
+ "> **Note:** \n",
815
+ "> The model may still default to JSON if it was heavily finetuned on that format. Prompt engineering (including examples) is the only way to increase the chance of pythonic output if you are not using a chat template."
816
+ ]
817
+ },
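Because the pythonic output is literally a Python list of calls, it can be parsed with the standard `ast` module. The sketch below is a simplified stand-in for SGLang's pythonic parser and only handles keyword arguments with literal values:

```python
import ast

# Parse a pythonic tool-call string such as
#   '[get_weather(location="Tokyo"), get_tourist_attractions(city="Tokyo")]'
# into (name, kwargs) pairs. Only keyword arguments with literal values
# are handled; anything else raises.
def parse_pythonic_calls(text: str):
    tree = ast.parse(text.strip(), mode="eval")
    if not isinstance(tree.body, ast.List):
        raise ValueError("expected a Python list of function calls")
    calls = []
    for node in tree.body.elts:
        if not isinstance(node, ast.Call):
            raise ValueError("expected a function call")
        kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in node.keywords}
        calls.append((node.func.id, kwargs))
    return calls


calls = parse_pythonic_calls(
    '[get_weather(location="Tokyo"), get_tourist_attractions(city="Tokyo")]'
)
print(calls[0])  # ('get_weather', {'location': 'Tokyo'})
```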
818
+ {
819
+ "cell_type": "markdown",
820
+ "metadata": {},
821
+ "source": [
822
+ "## How to support a new model?\n",
823
+ "1. Update the TOOLS_TAG_LIST in sglang/srt/function_call_parser.py with the model’s tool tags. Currently supported tags include:\n",
824
+ "```\n",
825
+ "\tTOOLS_TAG_LIST = [\n",
826
+ "\t “<|plugin|>“,\n",
827
+ "\t “<function=“,\n",
828
+ "\t “<tool_call>“,\n",
829
+ "\t “<|python_tag|>“,\n",
830
+ "\t “[TOOL_CALLS]”\n",
831
+ "\t]\n",
832
+ "```\n",
833
+ "2. Create a new detector class in sglang/srt/function_call_parser.py that inherits from BaseFormatDetector. The detector should handle the model’s specific function call format. For example:\n",
834
+ "```\n",
835
+ " class NewModelDetector(BaseFormatDetector):\n",
836
+ "```\n",
837
+ "3. Add the new detector to the MultiFormatParser class that manages all the format detectors."
838
+ ]
839
+ }
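To make steps 2 and 3 concrete, here is a skeleton detector. The real `BaseFormatDetector` interface in `sglang/srt/function_call_parser.py` may differ, so the method names and the `<new_tool>` tag are illustrative assumptions; a stub base class keeps the example self-contained:

```python
import json


class BaseFormatDetector:  # stand-in for the SGLang base class
    def detect_and_parse(self, text, tools):
        raise NotImplementedError


class NewModelDetector(BaseFormatDetector):
    # Suppose the new model wraps each call as
    # <new_tool>{"name": ..., "arguments": {...}}</new_tool>
    bot_token = "<new_tool>"
    eot_token = "</new_tool>"

    def has_tool_call(self, text: str) -> bool:
        return self.bot_token in text

    def detect_and_parse(self, text: str, tools):
        calls = []
        start = 0
        # Scan for every <new_tool>...</new_tool> span and decode its JSON payload.
        while (begin := text.find(self.bot_token, start)) != -1:
            end = text.find(self.eot_token, begin)
            payload = text[begin + len(self.bot_token) : end]
            calls.append(json.loads(payload))
            start = end + len(self.eot_token)
        return calls


det = NewModelDetector()
out = det.detect_and_parse(
    'Reasoning first. <new_tool>{"name": "f", "arguments": {"x": 1}}</new_tool>', []
)
print(out[0]["name"])  # f
```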
840
+ ],
841
+ "metadata": {
842
+ "language_info": {
843
+ "codemirror_mode": {
844
+ "name": "ipython",
845
+ "version": 3
846
+ },
847
+ "file_extension": ".py",
848
+ "mimetype": "text/x-python",
849
+ "name": "python",
850
+ "nbconvert_exporter": "python",
851
+ "pygments_lexer": "ipython3"
852
+ }
853
+ },
854
+ "nbformat": 4,
855
+ "nbformat_minor": 4
856
+ }
sglang/docs/advanced_features/vlm_query.ipynb ADDED
@@ -0,0 +1,388 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "0",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Query VLM with Offline Engine\n",
9
+ "\n",
10
+ "This tutorial demonstrates how to use SGLang's **offline Engine API** to query VLMs. We will demonstrate usage with Qwen2.5-VL and Llama 4. This section demonstrates three different calling approaches:\n",
11
+ "\n",
12
+ "1. **Basic Call**: Directly pass images and text.\n",
13
+ "2. **Processor Output**: Use HuggingFace processor for data preprocessing.\n",
14
+ "3. **Precomputed Embeddings**: Pre-calculate image features to improve inference efficiency."
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "id": "1",
20
+ "metadata": {},
21
+ "source": [
22
+ "## Understanding the Three Input Formats\n",
23
+ "\n",
24
+ "SGLang supports three ways to pass visual data, each optimized for different scenarios:\n",
25
+ "\n",
26
+ "### 1. **Raw Images** - Simplest approach\n",
27
+ "- Pass PIL Images, file paths, URLs, or base64 strings directly\n",
28
+ "- SGLang handles all preprocessing automatically\n",
29
+ "- Best for: Quick prototyping, simple applications\n",
30
+ "\n",
31
+ "### 2. **Processor Output** - For custom preprocessing\n",
32
+ "- Pre-process images with HuggingFace processor\n",
33
+ "- Pass the complete processor output dict with `format: \"processor_output\"`\n",
34
+ "- Best for: Custom image transformations, integration with existing pipelines\n",
35
+ "- Requirement: Must use `input_ids` instead of text prompt\n",
36
+ "\n",
37
+ "### 3. **Precomputed Embeddings** - For maximum performance\n",
38
+ "- Pre-calculate visual embeddings using the vision encoder\n",
39
+ "- Pass embeddings with `format: \"precomputed_embedding\"`\n",
40
+ "- Best for: Repeated queries on same images, caching, high-throughput serving\n",
41
+ "- Performance gain: Avoids redundant vision encoder computation (30-50% speedup)\n",
42
+ "\n",
43
+ "**Key Rule**: Within a single request, use only one format for all images. Don't mix formats.\n",
44
+ "\n",
45
+ "The examples below demonstrate all three approaches with both Qwen2.5-VL and Llama 4 models."
46
+ ]
47
+ },
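As a quick reference, the three `image_data` shapes look roughly like this. Only the `format` marker values come from this notebook; the other field names are placeholders that depend on the model's processor:

```python
# Illustrative shapes only; the "format" values are the markers used in this
# notebook, while the remaining keys sketch what each entry carries.
raw_image_data = ["https://example.com/cat.png"]  # or a PIL Image, path, or base64

processor_image_data = [
    # the full HuggingFace processor output dict, plus the format marker
    {"input_ids": [101, 102], "pixel_values": "<tensor>", "format": "processor_output"}
]

precomputed_image_data = [
    # "embedding" is a placeholder key for the precomputed visual features
    {"format": "precomputed_embedding", "embedding": "<tensor>"}
]

print(precomputed_image_data[0]["format"])  # precomputed_embedding
```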
48
+ {
49
+ "cell_type": "markdown",
50
+ "id": "2",
51
+ "metadata": {},
52
+ "source": [
53
+ "## Querying Qwen2.5-VL Model"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "id": "3",
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "import nest_asyncio\n",
64
+ "\n",
65
+ "nest_asyncio.apply()\n",
66
+ "\n",
67
+ "model_path = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
68
+ "chat_template = \"qwen2-vl\""
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "id": "4",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "from io import BytesIO\n",
79
+ "import requests\n",
80
+ "from PIL import Image\n",
81
+ "\n",
82
+ "from sglang.srt.parser.conversation import chat_templates\n",
83
+ "\n",
84
+ "image = Image.open(\n",
85
+ " BytesIO(\n",
86
+ " requests.get(\n",
87
+ " \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
88
+ " ).content\n",
89
+ " )\n",
90
+ ")\n",
91
+ "\n",
92
+ "conv = chat_templates[chat_template].copy()\n",
93
+ "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
94
+ "conv.append_message(conv.roles[1], \"\")\n",
95
+ "conv.image_data = [image]\n",
96
+ "\n",
97
+ "print(\"Generated prompt text:\")\n",
98
+ "print(conv.get_prompt())\n",
99
+ "print(f\"\\nImage size: {image.size}\")\n",
100
+ "image"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "markdown",
105
+ "id": "5",
106
+ "metadata": {},
107
+ "source": [
108
+ "### Basic Offline Engine API Call"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "id": "6",
115
+ "metadata": {},
116
+ "outputs": [],
117
+ "source": [
118
+ "from sglang import Engine\n",
119
+ "\n",
120
+ "llm = Engine(model_path=model_path, chat_template=chat_template, log_level=\"warning\")"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "7",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
131
+ "print(\"Model response:\")\n",
132
+ "print(out[\"text\"])"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "markdown",
137
+ "id": "8",
138
+ "metadata": {},
139
+ "source": [
140
+ "### Call with Processor Output\n",
141
+ "\n",
142
+ "Use a HuggingFace processor to preprocess text and images, then pass the `processor_output` directly into `Engine.generate`."
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": null,
148
+ "id": "9",
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "from transformers import AutoProcessor\n",
153
+ "\n",
154
+ "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
155
+ "processor_output = processor(\n",
156
+ " images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
157
+ ")\n",
158
+ "\n",
159
+ "out = llm.generate(\n",
160
+ " input_ids=processor_output[\"input_ids\"][0].detach().cpu().tolist(),\n",
161
+ " image_data=[dict(processor_output, format=\"processor_output\")],\n",
162
+ ")\n",
163
+ "print(\"Response using processor output:\")\n",
164
+ "print(out[\"text\"])"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "markdown",
169
+ "id": "10",
170
+ "metadata": {},
171
+ "source": [
172
+ "### Call with Precomputed Embeddings\n",
173
+ "\n",
174
+ "You can pre-calculate image features to avoid repeated visual encoding processes."
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": null,
180
+ "id": "11",
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "from transformers import AutoProcessor\n",
185
+ "from transformers import Qwen2_5_VLForConditionalGeneration\n",
186
+ "\n",
187
+ "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
188
+ "vision = (\n",
189
+ " Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path).eval().visual.cuda()\n",
190
+ ")"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "id": "12",
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "processor_output = processor(\n",
201
+ " images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
202
+ ")\n",
203
+ "\n",
204
+ "input_ids = processor_output[\"input_ids\"][0].detach().cpu().tolist()\n",
205
+ "\n",
206
+ "precomputed_embeddings = vision(\n",
207
+ " processor_output[\"pixel_values\"].cuda(), processor_output[\"image_grid_thw\"].cuda()\n",
208
+ ")\n",
209
+ "\n",
210
+ "multi_modal_item = dict(\n",
211
+ " processor_output,\n",
212
+ " format=\"precomputed_embedding\",\n",
213
+ " feature=precomputed_embeddings,\n",
214
+ ")\n",
215
+ "\n",
216
+ "out = llm.generate(input_ids=input_ids, image_data=[multi_modal_item])\n",
217
+ "print(\"Response using precomputed embeddings:\")\n",
218
+ "print(out[\"text\"])\n",
219
+ "\n",
220
+ "llm.shutdown()"
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "markdown",
225
+ "id": "13",
226
+ "metadata": {},
227
+ "source": [
228
+ "## Querying Llama 4 Vision Model\n",
229
+ "\n",
230
+ "```python\n",
231
+ "model_path = \"meta-llama/Llama-4-Scout-17B-16E-Instruct\"\n",
232
+ "chat_template = \"llama-4\"\n",
233
+ "\n",
234
+ "from io import BytesIO\n",
235
+ "import requests\n",
236
+ "from PIL import Image\n",
237
+ "\n",
238
+ "from sglang.srt.parser.conversation import chat_templates\n",
239
+ "\n",
240
+ "# Download the same example image\n",
241
+ "image = Image.open(\n",
242
+ " BytesIO(\n",
243
+ " requests.get(\n",
244
+ " \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
245
+ " ).content\n",
246
+ " )\n",
247
+ ")\n",
248
+ "\n",
249
+ "conv = chat_templates[chat_template].copy()\n",
250
+ "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
251
+ "conv.append_message(conv.roles[1], \"\")\n",
252
+ "conv.image_data = [image]\n",
253
+ "\n",
254
+ "print(\"Llama 4 generated prompt text:\")\n",
255
+ "print(conv.get_prompt())\n",
256
+ "print(f\"Image size: {image.size}\")\n",
257
+ "\n",
258
+ "image\n",
259
+ "```"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "markdown",
264
+ "id": "14",
265
+ "metadata": {},
266
+ "source": [
267
+ "### Llama 4 Basic Call\n",
268
+ "\n",
269
+ "Llama 4 requires more computational resources, so it's configured with multi-GPU parallelism (tp_size=4) and a larger context length.\n",
270
+ "\n",
271
+ "```python\n",
272
+ "llm = Engine(\n",
273
+ " model_path=model_path,\n",
274
+ " enable_multimodal=True,\n",
275
+ " attention_backend=\"fa3\",\n",
276
+ " tp_size=4,\n",
277
+ " context_length=65536,\n",
278
+ ")\n",
279
+ "\n",
280
+ "out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
281
+ "print(\"Llama 4 response:\")\n",
282
+ "print(out[\"text\"])\n",
283
+ "```"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "markdown",
288
+ "id": "15",
289
+ "metadata": {},
290
+ "source": [
291
+ "### Call with Processor Output\n",
292
+ "\n",
293
+ "Using a HuggingFace processor to preprocess data can reduce computational overhead during inference.\n",
294
+ "\n",
295
+ "```python\n",
296
+ "from transformers import AutoProcessor\n",
297
+ "\n",
298
+ "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
299
+ "processor_output = processor(\n",
300
+ " images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
301
+ ")\n",
302
+ "\n",
303
+ "out = llm.generate(\n",
304
+ " input_ids=processor_output[\"input_ids\"][0].detach().cpu().tolist(),\n",
305
+ " image_data=[dict(processor_output, format=\"processor_output\")],\n",
306
+ ")\n",
307
+ "print(\"Response using processor output:\")\n",
308
+ "print(out)\n",
309
+ "```"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "markdown",
314
+ "id": "16",
315
+ "metadata": {},
316
+ "source": [
317
+ "### Call with Precomputed Embeddings\n",
318
+ "\n",
319
+ "```python\n",
320
+ "from transformers import AutoProcessor\n",
321
+ "from transformers import Llama4ForConditionalGeneration\n",
322
+ "\n",
323
+ "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
324
+ "model = Llama4ForConditionalGeneration.from_pretrained(\n",
325
+ " model_path, torch_dtype=\"auto\"\n",
326
+ ").eval()\n",
327
+ "\n",
328
+ "vision = model.vision_model.cuda()\n",
329
+ "multi_modal_projector = model.multi_modal_projector.cuda()\n",
330
+ "\n",
331
+ "print(f'Image pixel values shape: {processor_output[\"pixel_values\"].shape}')\n",
332
+ "input_ids = processor_output[\"input_ids\"][0].detach().cpu().tolist()\n",
333
+ "\n",
334
+ "# Process image through vision encoder\n",
335
+ "image_outputs = vision(\n",
336
+ " processor_output[\"pixel_values\"].to(\"cuda\"), \n",
337
+ " aspect_ratio_ids=processor_output[\"aspect_ratio_ids\"].to(\"cuda\"),\n",
338
+ " aspect_ratio_mask=processor_output[\"aspect_ratio_mask\"].to(\"cuda\"),\n",
339
+ " output_hidden_states=False\n",
340
+ ")\n",
341
+ "image_features = image_outputs.last_hidden_state\n",
342
+ "\n",
343
+ "# Flatten image features and pass through multimodal projector\n",
344
+ "vision_flat = image_features.view(-1, image_features.size(-1))\n",
345
+ "precomputed_embeddings = multi_modal_projector(vision_flat)\n",
346
+ "\n",
347
+ "# Build precomputed embedding data item\n",
348
+ "mm_item = dict(\n",
349
+ " processor_output, \n",
350
+ " format=\"precomputed_embedding\", \n",
351
+ " feature=precomputed_embeddings\n",
352
+ ")\n",
353
+ "\n",
354
+ "# Use precomputed embeddings for efficient inference\n",
355
+ "out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
356
+ "print(\"Llama 4 precomputed embedding response:\")\n",
357
+ "print(out[\"text\"])\n",
358
+ "```"
359
+ ]
360
+ }
361
+ ],
362
+ "metadata": {
363
+ "jupytext": {
364
+ "cell_metadata_filter": "-all",
365
+ "custom_cell_magics": "kql",
366
+ "encoding": "# -*- coding: utf-8 -*-",
367
+ "text_representation": {
368
+ "extension": ".py",
369
+ "format_name": "light",
370
+ "format_version": "1.5",
371
+ "jupytext_version": "1.16.1"
372
+ }
373
+ },
374
+ "language_info": {
375
+ "codemirror_mode": {
376
+ "name": "ipython",
377
+ "version": 3
378
+ },
379
+ "file_extension": ".py",
380
+ "mimetype": "text/x-python",
381
+ "name": "python",
382
+ "nbconvert_exporter": "python",
383
+ "pygments_lexer": "ipython3"
384
+ }
385
+ },
386
+ "nbformat": 4,
387
+ "nbformat_minor": 5
388
+ }
sglang/docs/basic_usage/deepseek_ocr.md ADDED
@@ -0,0 +1,54 @@
1
+ # DeepSeek OCR (OCR-1 / OCR-2)
2
+
3
+ DeepSeek OCR models are multimodal (image + text) models for OCR and document understanding.
4
+
5
+ ## Launch server
6
+
7
+ ```shell
8
+ python -m sglang.launch_server \
9
+ --model-path deepseek-ai/DeepSeek-OCR-2 \
10
+ --trust-remote-code \
11
+ --host 0.0.0.0 \
12
+ --port 30000
13
+ ```
14
+
15
+ > You can replace `deepseek-ai/DeepSeek-OCR-2` with `deepseek-ai/DeepSeek-OCR`.
16
+
17
+ ## Prompt examples
18
+
19
+ Recommended prompts from the model card:
20
+
21
+ ```
22
+ <image>
23
+ <|grounding|>Convert the document to markdown.
24
+ ```
25
+
26
+ ```
27
+ <image>
28
+ Free OCR.
29
+ ```
30
+
31
+ ## OpenAI-compatible request example
32
+
33
+ ```python
34
+ import requests
35
+
36
+ url = "http://localhost:30000/v1/chat/completions"
37
+
38
+ data = {
39
+ "model": "deepseek-ai/DeepSeek-OCR-2",
40
+ "messages": [
41
+ {
42
+ "role": "user",
43
+ "content": [
44
+ {"type": "text", "text": "<image>\n<|grounding|>Convert the document to markdown."},
45
+ {"type": "image_url", "image_url": {"url": "https://example.com/your_image.jpg"}},
46
+ ],
47
+ }
48
+ ],
49
+ "max_tokens": 512,
50
+ }
51
+
52
+ response = requests.post(url, json=data)
53
+ print(response.text)
54
+ ```
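
The request above uses a remote image URL. The same endpoint also accepts images embedded as base64 data URLs, which is convenient for local files. The helper below is a sketch that only builds such a payload (the file path and prompt are placeholders); POST it to `/v1/chat/completions` as in the example above.

```python
import base64


def build_ocr_payload(image_path: str, prompt: str = "<image>\nFree OCR.") -> dict:
    """Build an OpenAI-compatible chat payload with a local image as a base64 data URL."""
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    return {
        "model": "deepseek-ai/DeepSeek-OCR-2",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                    },
                ],
            }
        ],
        "max_tokens": 512,
    }


# Send with, e.g.:
# requests.post("http://localhost:30000/v1/chat/completions", json=build_ocr_payload("page.jpg"))
```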
sglang/docs/basic_usage/deepseek_v32.md ADDED
@@ -0,0 +1,459 @@
1
+ # DeepSeek V3.2 Usage
2
+
3
+ The DeepSeek-V3.2 model family equips DeepSeek-V3.1-Terminus with DeepSeek Sparse Attention (DSA) through continued training. With DSA, a fine-grained sparse attention mechanism powered by a lightning indexer, DeepSeek-V3.2 achieves efficiency improvements in long-context scenarios.
4
+
5
+ For reporting issues or tracking upcoming features, please refer to this [Roadmap](https://github.com/sgl-project/sglang/issues/11060).
6
+
7
+ Note: This document is originally written for the usage of [DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp) model. The usage of [DeepSeek-V3.2](https://huggingface.co/deepseek-ai/DeepSeek-V3.2) or [DeepSeek-V3.2-Speciale](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Speciale) is the same as DeepSeek-V3.2-Exp except for the tool call parser.
8
+
9
+
10
+ ## Installation
11
+
12
+ ### Docker
13
+
14
+ ```bash
15
+ # H200/B200
16
+ docker pull lmsysorg/sglang:latest
17
+
18
+ # MI350/MI355
19
+ docker pull lmsysorg/sglang:v0.5.8-rocm700-mi35x
20
+
21
+ # MI300
22
+ # v0.5.8-rocm700-mi30x does not include PR #17504. Prefer the newest MI30x ROCm
23
+ # image tag from Docker Hub when available, or build from source (below).
24
+ docker pull lmsysorg/sglang:v0.5.8-rocm700-mi30x
25
+
26
+
27
+ # NPUs
28
+ docker pull lmsysorg/sglang:dsv32-a2
29
+ docker pull lmsysorg/sglang:dsv32-a3
30
+ ```
31
+
32
+ ### Build From Source
33
+
34
+ ```bash
35
+ # Install SGLang
36
+ git clone https://github.com/sgl-project/sglang
37
+ cd sglang
38
+ pip3 install pip --upgrade
39
+ pip3 install -e "python"
40
+ ```
41
+ ## Launch DeepSeek V3.2 with SGLang
42
+
43
+ To serve [DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp) on 8xH200/B200 GPUs:
44
+
45
+ ```bash
46
+ # Launch with TP + DP (Recommended)
47
+ python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --dp 8 --enable-dp-attention
48
+
49
+ # Launch with EP + DP
50
+ python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --ep 8 --dp 8 --enable-dp-attention
51
+
52
+ # Launch with Pure TP
53
+ python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8
54
+
55
+ # Launch with TP on MI30x/MI35x
56
+ python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --nsa-prefill-backend tilelang --nsa-decode-backend tilelang
57
+ ```
58
+
59
+ ### Configuration Tips
60
+ - **DP Attention (Recommended)**: For the DeepSeek V3.2 model, the kernels are customized for the `dp_size=8` use case, so DP attention (`--dp 8 --enable-dp-attention`) is the recommended configuration for better stability and performance. All test cases use this configuration by default.
61
+ - **Pure TP Mode**: Launching with pure TP (without `--dp` and `--enable-dp-attention`) is also supported. Note that this mode has not been fully validated in PD disaggregation scenarios.
62
+ - **Short-sequence MHA prefill (adaptive)**: For short prefill sequences (default threshold: **2048 tokens**), the NSA backend uses standard MHA automatically (no extra flags). On H200 (SM90) this path uses the FlashAttention variable-length kernel; on B200 (SM100) it uses TRT-LLM ragged MHA. MHA uses `MHA_ONE_SHOT` for best performance. `MHA_ONE_SHOT` computes multi-head attention over all tokens (both cached prefix and newly extended tokens) in a single kernel invocation, avoiding the overhead of chunked KV cache processing. This achieves optimal throughput for short sequences where total sequence length fits within the chunk capacity limit.
63
+ - **Choices of Attention Kernels**: The attention backend is automatically set to `nsa` for the DeepSeek V3.2 model. This backend implements different kernels for sparse prefilling/decoding, which can be selected via the `--nsa-prefill-backend` and `--nsa-decode-backend` server arguments. The available NSA prefill/decode attention kernels are:
64
+ - `flashmla_sparse`: `flash_mla_sparse_fwd` kernel from `flash_mla` library. Can run on both Hopper and Blackwell GPUs. It requires bf16 q, kv inputs.
65
+ - `flashmla_kv`: `flash_mla_with_kvcache` kernel from `flash_mla` library. Can run on both Hopper and Blackwell GPUs. It requires bf16 q, fp8 k_cache inputs.
66
+ - `fa3`: `flash_attn_with_kvcache` kernel from `flash_attn` library. Can only run on Hopper GPUs. It requires bf16 q, kv inputs.
67
+ - `tilelang`: `tilelang` implementation that can run on GPU, HPU and NPU.
68
+ - `aiter`: Aiter kernel on AMD GPUs. Can only be used as a decode kernel.
69
+ - `trtllm`: `trtllm-mla` sparse kernel from the flashinfer library. Can only run on Blackwell GPUs. It requires QKV in bf16 or fp8.
70
+ - Based on performance benchmarks, the default configurations on H200 and B200 are set as follows:
71
+ - H200: `flashmla_sparse` prefill attention (short-seq prefill uses MHA via FlashAttention varlen), `fa3` decode attention, `bf16` kv cache dtype.
72
+ - B200: `flashmla_auto` prefill attention (short-seq prefill uses MHA via TRT-LLM ragged), `flashmla_kv` decode attention, `fp8_e4m3` kv cache dtype. `flashmla_auto` enables automatic selection of either `flashmla_sparse` or `flashmla_kv` kernel for prefill based on KV cache dtype, hardware, and heuristics. When FP8 KV cache is enabled and `total_kv_tokens < total_q_tokens * 512`, it uses the `flashmla_sparse` kernel; otherwise, it falls back to the `flashmla_kv` kernel. The heuristics may need to be tuned if the performance of either the `flashmla_sparse` or `flashmla_kv` kernel changes significantly.
73
+ - On the Blackwell platform, the following setting can boost performance by up to 3x-5x at the cost of a slight accuracy drop:
74
+ - B200: choosing `trtllm` for both `--nsa-prefill-backend` and `--nsa-decode-backend` makes prefill attention use MHA via TRT-LLM ragged for both short and long sequences (**accuracy impact**). Combining `trtllm` with an `fp8_e4m3` KV cache gives a KV cache dim of `576` (kv_lora_rank + qk_rope_head_dim) (**accuracy impact**), compared to `656` (kv_lora_rank + scale storage (kv_lora_rank // quant_block_size * 4 bytes) + rope dimension storage) for the combination of `flashmla_auto` and an `fp8_e4m3` KV cache.
75
+
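Putting the B200 `trtllm` bullet above into a concrete command, a launch that trades some accuracy for prefill/decode throughput might look like the sketch below (the `--kv-cache-dtype` value follows the FP8 KV cache discussion above; verify the flags against your SGLang version):

```shell
python -m sglang.launch_server \
  --model deepseek-ai/DeepSeek-V3.2-Exp \
  --tp 8 --dp 8 --enable-dp-attention \
  --nsa-prefill-backend trtllm \
  --nsa-decode-backend trtllm \
  --kv-cache-dtype fp8_e4m3
```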
76
+
77
+ ## Multi-token Prediction
78
+ SGLang implements Multi-Token Prediction (MTP) for DeepSeek V3.2 based on [EAGLE speculative decoding](https://docs.sglang.io/advanced_features/speculative_decoding.html#EAGLE-Decoding). With this optimization, the decoding speed can be improved significantly on small batch sizes. Please look at [this PR](https://github.com/sgl-project/sglang/pull/11652) for more information.
79
+
80
+ Example usage with DP Attention:
81
+ ```bash
82
+ python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --dp 8 --enable-dp-attention --speculative-algorithm EAGLE --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4
83
+ ```
84
+
85
+ Example usage with Pure TP:
86
+ ```bash
87
+ python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --speculative-algorithm EAGLE --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4
88
+ ```
89
+
90
+ - The best configuration for `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` can be searched with [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py) script for given batch size. The minimum configuration is `--speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2`, which can achieve speedup for larger batch sizes.
91
+ - The default value of `--max-running-requests` is set to `48` for MTP. For larger batch sizes, this value should be increased beyond the default value.
92
+
93
+ ```{tip}
94
+ To enable the experimental overlap scheduler for EAGLE speculative decoding, set the environment variable `SGLANG_ENABLE_SPEC_V2=1`. This can improve performance by enabling overlap scheduling between draft and verification stages.
95
+ ```
96
+
97
+
98
+ ## Function Calling and Reasoning Parser
99
+ The usage of function calling and reasoning parser is the same as DeepSeek V3.1. Please refer to [Reasoning Parser](https://docs.sglang.io/advanced_features/separate_reasoning.html) and [Tool Parser](https://docs.sglang.io/advanced_features/tool_parser.html) documents.
100
+
101
+ To launch `DeepSeek-V3.2-Exp` with function calling and reasoning parser:
102
+ > Note: It is recommended to specify the chat-template, ensuring that you are within the sglang's root directory.
103
+ ```bash
104
+ python3 -m sglang.launch_server \
105
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
106
+ --trust-remote-code \
107
+ --tp-size 8 --dp-size 8 --enable-dp-attention \
108
+ --tool-call-parser deepseekv31 \
109
+ --reasoning-parser deepseek-v3 \
110
+ --chat-template ./examples/chat_template/tool_chat_template_deepseekv32.jinja
111
+ ```
112
+
113
+ To launch `DeepSeek-V3.2` with function calling and reasoning parser:
114
+ ```bash
115
+ python3 -m sglang.launch_server \
116
+ --model-path deepseek-ai/DeepSeek-V3.2 \
117
+ --trust-remote-code \
118
+ --tp-size 8 --dp-size 8 --enable-dp-attention \
119
+ --tool-call-parser deepseekv32 \
120
+ --reasoning-parser deepseek-v3
121
+ ```
122
+
123
+ `DeepSeek-V3.2-Speciale` doesn't support tool calling, so can only be launched with reasoning parser:
124
+ ```bash
125
+ python3 -m sglang.launch_server \
126
+ --model-path deepseek-ai/DeepSeek-V3.2-Speciale \
127
+ --trust-remote-code \
128
+ --tp-size 8 --dp-size 8 --enable-dp-attention \
129
+ --reasoning-parser deepseek-v3
130
+ ```
131
+
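With one of the servers above running, function-calling requests follow the standard OpenAI chat-completions schema. The sketch below only builds the request payload (the `get_weather` tool is a made-up example); POST it to `http://localhost:30000/v1/chat/completions` to exercise the configured tool call parser.

```python
def build_tool_call_request(model: str = "deepseek-ai/DeepSeek-V3.2") -> dict:
    """Build an OpenAI-compatible chat payload with a hypothetical example tool."""
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a city.",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ]
    return {
        "model": model,
        "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
        "tools": tools,
        "tool_choice": "auto",
    }


# Send with, e.g.:
# requests.post("http://localhost:30000/v1/chat/completions", json=build_tool_call_request())
# Tool calls appear under choices[0].message.tool_calls in the response.
```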
132
+ ## NVFP4 Checkpoint
133
+
134
+ To launch the DeepSeek V3.2 [NVFP4 checkpoint](https://huggingface.co/nvidia/DeepSeek-V3.2-NVFP4) on Blackwell devices, specify the quantization method as `modelopt_fp4` and the MoE runner backend as one of `flashinfer_trtllm` (recommended), `flashinfer_cutlass`, and `flashinfer_cutedsl`. All other usage (parallelism, reasoning parser, ...) is the same as for the FP8 checkpoint.
135
+
136
+ An example launching command can be:
137
+ ```bash
138
+ python -m sglang.launch_server --model nvidia/DeepSeek-V3.2-NVFP4 --tp 4 --quantization modelopt_fp4 --moe-runner-backend flashinfer_trtllm --tool-call-parser deepseekv32 --reasoning-parser deepseek-v3
139
+ ```
140
+
141
+ ## PD Disaggregation
142
+
143
+ Prefill Command:
144
+ ```bash
145
+ python -m sglang.launch_server \
146
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
147
+ --disaggregation-mode prefill \
148
+ --host $LOCAL_IP \
149
+ --port $PORT \
150
+ --tp 8 \
151
+ --dp 8 \
152
+ --enable-dp-attention \
153
+ --dist-init-addr ${HOST}:${DIST_PORT} \
154
+ --trust-remote-code \
155
+ --disaggregation-bootstrap-port 8998 \
156
+ --mem-fraction-static 0.9
157
+ ```
158
+
159
+ Decode command:
160
+ ```bash
161
+ python -m sglang.launch_server \
162
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
163
+ --disaggregation-mode decode \
164
+ --host $LOCAL_IP \
165
+ --port $PORT \
166
+ --tp 8 \
167
+ --dp 8 \
168
+ --enable-dp-attention \
169
+ --dist-init-addr ${HOST}:${DIST_PORT} \
170
+ --trust-remote-code \
171
+ --mem-fraction-static 0.9
172
+ ```
173
+
174
+ Router command:
175
+ ```bash
176
+ python -m sglang_router.launch_router --pd-disaggregation \
177
+ --prefill $PREFILL_ADDR 8998 \
178
+ --decode $DECODE_ADDR \
179
+ --host 127.0.0.1 \
180
+ --port 8000
181
+ ```
182
+
183
+ If you need more advanced deployment methods or production-ready deployment methods, such as RBG or LWS-based deployment, please refer to [references/multi_node_deployment/rbg_pd/deepseekv32_pd.md](../references/multi_node_deployment/rbg_pd/deepseekv32_pd.md). Additionally, you can also find startup commands for DeepEP-based EP parallelism in the aforementioned documentation.
184
+
185
+
186
+ ## Benchmarking Results
187
+
188
+ ### Accuracy Test with `gsm8k`
189
+ A simple accuracy benchmark can be tested with `gsm8k` dataset:
190
+ ```bash
191
+ python3 benchmark/gsm8k/bench_sglang.py --num-shots 8 --num-questions 1319 --parallel 1319
192
+ ```
193
+
194
+ The result is 0.956, which matches our expectation:
195
+ ```bash
196
+ Accuracy: 0.956
197
+ Invalid: 0.000
198
+ Latency: 25.109 s
199
+ Output throughput: 5226.235 token/s
200
+ ```
201
+
202
+ To test long-context accuracy, run gsm8k with `--num-shots 20`. The results are very close to the 8-shot results:
203
+ ```
204
+ Accuracy: 0.956
205
+ Invalid: 0.000
206
+ Latency: 29.545 s
207
+ Output throughput: 4418.617 token/s
208
+ ```
209
+
210
+
211
+ ### Accuracy Test with `gpqa-diamond`
212
+
213
+ A long-context accuracy benchmark can be run on the GPQA-Diamond dataset with long output tokens and thinking enabled:
214
+ ```bash
215
+ python3 -m sglang.test.run_eval --port 30000 --eval-name gpqa --num-examples 198 --max-tokens 128000 --repeat 8 --thinking-mode deepseek-v3
216
+ ```
217
+
218
+ The mean accuracy over 8 runs is 0.797, which matches the 0.799 reported in the official tech report.
219
+ ```bash
220
+ Repeat: 8, mean: 0.797
221
+ Scores: ['0.808', '0.798', '0.808', '0.798', '0.783', '0.788', '0.803', '0.793']
222
+ ```
223
+
224
+ For DeepSeek V3.2, DeepSeek recommends setting the sampling parameters to temperature = 1.0, top_p = 0.95:
225
+
226
+ ```bash
227
+ python3 -m sglang.test.run_eval --port 30000 --eval-name gpqa --num-examples 198 --max-tokens 128000 --repeat 8 --top-p 0.95 --temperature 1.0 --thinking-mode deepseek-v3
228
+
229
+ Repeat: 8, mean: 0.840
230
+ Scores: ['0.848', '0.808', '0.848', '0.838', '0.879', '0.813', '0.838', '0.848']
231
+ ```
232
+ which matches the official score, 0.824, as reported in the [Deepseek-V3.2 technical report](https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/assets/paper.pdf).
233
+
234
+ ### Accuracy Test with `aime 2025`
235
+
236
+ Prepare the environment by installing NeMo-Skills in the docker or your own virtual environment:
237
+
238
+ ```
239
+ pip install git+https://github.com/NVIDIA/NeMo-Skills.git --ignore-installed blinker
240
+ ```
241
+
242
+ Then launch the SGLang server:
243
+ ```
244
+ python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --dp 8 --enable-dp-attention
245
+ ```
246
+
247
+ **For `DeepSeek-V3.2` and `DeepSeek-V3.2-Speciale`**:
248
+
249
+ ```
250
+ python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3.2 --trust-remote-code --tp-size 8 --dp-size 8 --enable-dp-attention --tool-call-parser deepseekv32 --reasoning-parser deepseek-v3
251
+ ```
252
+
253
+ Run the following script to evaluate AIME 2025:
254
+ ```
255
+ #! /bin/bash
256
+ export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1
257
+
258
+ ns prepare_data aime25
259
+
260
+ PORT=30000
261
+ BACKEND=sglang
262
+ MODEL="deepseek-ai/DeepSeek-V3.2-Exp" # Should be changed to the model name
263
+ MODEL_NAME="dsv32-fp8"
264
+
265
+ echo "Starting AIME25 evaluation with model $MODEL on port $PORT using backend $BACKEND..."
266
+ ns eval \
267
+ --benchmarks=aime25:4 \
268
+ --server_type=$BACKEND \
269
+ --model=$MODEL \
270
+ --server_address=http://localhost:${PORT}/v1 \
271
+ --output_dir=nemo_skills_aime25_${MODEL_NAME}_output_${BACKEND}_$(date +%Y%m%d_%H%M%S) \
272
+ ++chat_template_kwargs.thinking=true \
273
+ ++inference.temperature=1.0 \
274
+ ++inference.top_p=0.95 \
275
+ ++inference.tokens_to_generate=64000
276
+ # ++inference.tokens_to_generate=120000 for Speciale model
277
+ ```
278
+
279
+ Test results (8*B200):
280
+
281
+ DeepSeek-V3.2-Exp:
282
+
283
+ | evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer |
284
+ |--------------------|-------------|------------|-------------|-----------------------|-----------|
285
+ | pass@1[avg-of-4] | 30 | 15040 | 1673 | 87.50% ± 1.67% | 0.00% |
286
+ | majority@4 | 30 | 15040 | 1673 | 90.00% | 0.00% |
287
+ | pass@4 | 30 | 15040 | 1673 | 90.00% | 0.00% |
288
+
289
+
290
+ DeepSeek-V3.2:
291
+ | evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer |
292
+ |--------------------|-------------|------------|-------------|-----------------------|-----------|
293
+ | pass@1[avg-of-4] | 30 | 13550 | 1632 | 92.50% ± 1.67% | 0.00% |
294
+ | majority@4 | 30 | 13550 | 1632 | 94.71% | 0.00% |
295
+ | pass@4 | 30 | 13550 | 1632 | 96.67% | 0.00% |
296
+
297
+
298
+ DeepSeek-V3.2-Speciale:
299
+ | evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer |
300
+ |--------------------|-------------|------------|-------------|-----------------------|-----------|
301
+ | pass@1[avg-of-4] | 30 | 24155 | 3583 | 95.00% ± 1.92% | 0.00% |
302
+ | majority@4 | 30 | 24155 | 3583 | 95.83% | 0.00% |
303
+ | pass@4 | 30 | 24155 | 3583 | 100.00% | 0.00% |
304
+
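For reference, the evaluation modes reported in these tables (pass@1 averaged over runs, majority@k, pass@k) can be sketched as follows. This is an illustrative reimplementation, not the NeMo-Skills code:

```python
from collections import Counter


def pass_at_1_avg(correct_matrix):
    """correct_matrix[i][j]: whether run j solved problem i.
    Per-run accuracy, averaged over runs."""
    runs = len(correct_matrix[0])
    per_run = [
        sum(row[j] for row in correct_matrix) / len(correct_matrix)
        for j in range(runs)
    ]
    return sum(per_run) / runs


def pass_at_k(correct_matrix):
    """Fraction of problems solved by at least one of the k runs."""
    return sum(any(row) for row in correct_matrix) / len(correct_matrix)


def majority_at_k(answer_matrix, reference):
    """Fraction of problems where the most common answer across runs is correct."""
    ok = 0
    for answers, ref in zip(answer_matrix, reference):
        if Counter(answers).most_common(1)[0][0] == ref:
            ok += 1
    return ok / len(reference)
```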
305
+
306
+
307
+ ## DSA long sequence context parallel optimization (experimental)
308
+
309
+ **Note: This feature is only verified on Hopper machines**
310
+
311
+ For context parallel in DeepSeek V3.2 model, we provide two different modes of splitting tokens, which can be controlled with argument `--nsa-prefill-cp-mode`.
312
+
313
+ ### In sequence splitting
314
+
315
+ The first mode can be enabled with `--nsa-prefill-cp-mode in-seq-split`. It implements context parallelism for DSA by splitting the sequence uniformly across context-parallel ranks. At the attention stage, each CP rank computes the indexer results for its sequence shard and collects the whole KV cache through an all-gather operator. Use `--attn-cp-size` to set the size of the context-parallel communication group.
316
+
317
+ Note that the in-sequence splitting mode has the following restrictions:
318
+ - The batch size is restricted to 1 for prefill batches
319
+ - `moe_dense_tp_size=1`, `moe_a2a_backend = "deepep"`
320
+ - To ensure `cp_size > 1`, the passed in `tp_size` must be larger than `dp_size`
321
+
322
+ For more details, please refer to PR https://github.com/sgl-project/sglang/pull/12065.
323
+
324
+ Example:
325
+ ```bash
326
+ # In-seq splitting mode launched with EP + DP
327
+ python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --ep 8 --dp 2 --enable-dp-attention --enable-nsa-prefill-context-parallel --attn-cp-size 4 --nsa-prefill-cp-mode in-seq-split --max-running-requests 32
328
+ ```
329
+
330
+ ### Round robin splitting (default setting)
331
+
332
+ This mode can be enabled by specifying the parameter `--nsa-prefill-cp-mode round-robin-split`, which distributes tokens across ranks based on `token_idx % cp_size`.
333
+
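The difference between the two token-splitting schemes can be illustrated with a small standalone sketch (illustrative only; the real sharding happens inside the NSA attention backend):

```python
def in_seq_split(num_tokens: int, cp_size: int) -> list[list[int]]:
    # in-seq-split: contiguous, uniform chunks, one per CP rank
    chunk = (num_tokens + cp_size - 1) // cp_size
    return [
        list(range(r * chunk, min((r + 1) * chunk, num_tokens)))
        for r in range(cp_size)
    ]


def round_robin_split(num_tokens: int, cp_size: int) -> list[list[int]]:
    # round-robin-split: token_idx % cp_size decides the owning rank
    return [[t for t in range(num_tokens) if t % cp_size == r] for r in range(cp_size)]


print(in_seq_split(8, 4))       # [[0, 1], [2, 3], [4, 5], [6, 7]]
print(round_robin_split(8, 4))  # [[0, 4], [1, 5], [2, 6], [3, 7]]
```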
334
+ Compared with the in-sequence splitting mode, this mode additionally supports the fused MoE backend (which may deliver better performance than DeepEP in single-machine scenarios), FP8 KV cache, and multi-batch prefill inference. However, it cannot be enabled together with DP attention.
335
+
336
+ For more details, please refer to PR https://github.com/sgl-project/sglang/pull/13959.
337
+
338
+ Example usage:
339
+ ```bash
340
+ # Launch with FusedMoe + CP8
341
+ python -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.2-Exp --tp 8 --enable-nsa-prefill-context-parallel --attn-cp-size 8 --nsa-prefill-cp-mode round-robin-split --max-running-requests 32
342
+ ```
343
+ ### Pipeline Parallel + Context Parallel (PP + CP)
344
+
345
+ This mode combines Pipeline Parallelism (PP) and Context Parallelism (CP) to scale across multiple nodes, which can achieve better throughput and Time To First Token (TTFT). Note that this method has only been tested on H20 96G.
346
+
347
+ #### Standard Usage
348
+
349
+ To launch with PP=2 and CP (via `round-robin-split` mode) on 2 nodes, use the commands below. This configuration uses the fused MoE kernel by default, which generally provides better performance.
350
+
351
+ For related development details, please refer to:
352
+ - Fused MoE + CP support: [PR #13959](https://github.com/sgl-project/sglang/pull/13959)
353
+ - PP + CP support: [Issue #15358](https://github.com/sgl-project/sglang/issues/15358) and [PR #16380](https://github.com/sgl-project/sglang/pull/16380)
354
+
355
+ Node 0:
356
+ ```bash
357
+ export SGLANG_PP_LAYER_PARTITION=30,31
358
+ python3 -m sglang.launch_server \
359
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
360
+ --nnodes 2 --node-rank 0 \
361
+ --dist-init-addr <HEAD_NODE_IP>:62001 \
362
+ --tp 8 --pp-size 2 \
363
+ --dp-size 1 --moe-dense-tp-size 1 \
364
+ --enable-nsa-prefill-context-parallel \
365
+ --attn-cp-size 8 \
366
+ --nsa-prefill-cp-mode round-robin-split \
367
+ --trust-remote-code \
368
+ --disable-radix-cache \
369
+ --mem-fraction-static 0.8 \
370
+ --max-running-requests 128 \
371
+ --chunked-prefill-size 16384 \
372
+ --cuda-graph-max-bs 8 \
373
+ --page-size 64 \
374
+ --watchdog-timeout 3600 \
375
+ --host 0.0.0.0 --port 8000 \
376
+ --tool-call-parser deepseekv32
377
+ ```
378
+
379
+ Node 1:
380
+ ```bash
381
+ export SGLANG_PP_LAYER_PARTITION=30,31
382
+ python3 -m sglang.launch_server \
383
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
384
+ --nnodes 2 --node-rank 1 \
385
+ --dist-init-addr <HEAD_NODE_IP>:62001 \
386
+ --tp 8 --pp-size 2 \
387
+ --dp-size 1 --moe-dense-tp-size 1 \
388
+ --enable-nsa-prefill-context-parallel \
389
+ --attn-cp-size 8 \
390
+ --nsa-prefill-cp-mode round-robin-split \
391
+ --trust-remote-code \
392
+ --disable-radix-cache \
393
+ --mem-fraction-static 0.8 \
394
+ --max-running-requests 128 \
395
+ --chunked-prefill-size 16384 \
396
+ --cuda-graph-max-bs 8 \
397
+ --page-size 64 \
398
+ --watchdog-timeout 3600 \
399
+ --host 0.0.0.0 --port 8000 \
400
+ --tool-call-parser deepseekv32
401
+ ```
402
+
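The `SGLANG_PP_LAYER_PARTITION=30,31` variable above assigns the model's 61 decoder layers unevenly to the two pipeline stages (30 on stage 0, 31 on stage 1). A minimal sketch of how such a partition string maps to per-stage layer ranges (the helper below is illustrative, not SGLang's internal code):

```python
def partition_to_ranges(partition: str):
    """Map a comma-separated layer partition, e.g. "30,31",
    to half-open (start, end) layer-index ranges per pipeline stage."""
    sizes = [int(s) for s in partition.split(",")]
    ranges, start = [], 0
    for size in sizes:
        ranges.append((start, start + size))
        start += size
    return ranges

# "30,31" puts layers 0-29 on stage 0 and layers 30-60 on stage 1.
print(partition_to_ranges("30,31"))  # [(0, 30), (30, 61)]
```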
403
+ #### PD Disaggregation with PP + CP
404
+
405
+ If using PD (Prefill-Decode) Disaggregation, the Prefill nodes can be configured with PP + CP as follows.
406
+
407
+ Prefill Node 0:
408
+ ```bash
409
+ python -m sglang.launch_server \
410
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
411
+ --served-model-name deepseek-v32 \
412
+ --nnodes 2 --node-rank 0 \
413
+ --dist-init-addr <PREFILL_HEAD_IP>:20102 \
414
+ --tp 8 --pp-size 2 \
415
+ --dp-size 1 --moe-dense-tp-size 1 \
416
+ --enable-nsa-prefill-context-parallel \
417
+ --attn-cp-size 8 \
418
+ --nsa-prefill-cp-mode round-robin-split \
419
+ --disaggregation-ib-device mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 \
420
+ --trust-remote-code \
421
+ --disable-radix-cache \
422
+ --max-running-requests 512 \
423
+ --chunked-prefill-size 4096 \
424
+ --context-length 131072 \
425
+ --mem-fraction-static 0.9 \
426
+ --page-size 64 \
427
+ --enable-metrics \
428
+ --collect-tokens-histogram \
429
+ --tokenizer-worker-num 8 \
430
+ --host 0.0.0.0 --port 30000
431
+ ```
432
+
433
+ Prefill Node 1:
434
+ ```bash
435
+ python -m sglang.launch_server \
436
+ --model-path deepseek-ai/DeepSeek-V3.2-Exp \
437
+ --served-model-name deepseek-v32-prefill \
438
+ --nnodes 2 --node-rank 1 \
439
+ --dist-init-addr <PREFILL_HEAD_IP>:20102 \
440
+ --tp 8 --pp-size 2 \
441
+ --dp-size 1 --moe-dense-tp-size 1 \
442
+ --enable-nsa-prefill-context-parallel \
443
+ --attn-cp-size 8 \
444
+ --nsa-prefill-cp-mode round-robin-split \
445
+ --disaggregation-ib-device mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3 \
446
+ --trust-remote-code \
447
+ --disable-radix-cache \
448
+ --max-running-requests 512 \
449
+ --chunked-prefill-size 4096 \
450
+ --context-length 131072 \
451
+ --mem-fraction-static 0.9 \
452
+ --page-size 64 \
453
+ --enable-metrics \
454
+ --collect-tokens-histogram \
455
+ --tokenizer-worker-num 8 \
456
+ --host 0.0.0.0 --port 30000
457
+ ```
458
+
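As a toy illustration of the `round-robin-split` idea used above, a long prefill sequence can be dealt out to the `--attn-cp-size` context-parallel ranks in round-robin order so each rank holds a near-even share (sketch only; SGLang's actual splitting granularity and load balancing may differ):

```python
def round_robin_split(tokens, cp_size):
    """Assign token i to rank i % cp_size, so each rank gets a strided shard."""
    return [tokens[rank::cp_size] for rank in range(cp_size)]

shards = round_robin_split(list(range(10)), cp_size=4)
print(shards)  # [[0, 4, 8], [1, 5, 9], [2, 6], [3, 7]]
```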
459
+ For the Decode nodes, it is recommended to use the **EP mode**.
sglang/docs/basic_usage/glm45.md ADDED
@@ -0,0 +1,70 @@
1
+ ## Launch GLM-4.5 / GLM-4.6 / GLM-4.7 with SGLang
2
+
3
+ To serve GLM-4.5 / GLM-4.6 FP8 models on 8xH100/H200 GPUs:
4
+
5
+ ```bash
6
+ python3 -m sglang.launch_server --model zai-org/GLM-4.6-FP8 --tp 8
7
+ ```
8
+
9
+ ### EAGLE Speculative Decoding
10
+
11
+ **Description**: SGLang supports GLM-4.5 / GLM-4.6 models
12
+ with [EAGLE speculative decoding](https://docs.sglang.io/advanced_features/speculative_decoding.html#EAGLE-Decoding).
13
+
14
+ **Usage**:
15
+ Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and
16
+ `--speculative-num-draft-tokens` to enable this feature. For example:
17
+
18
+ ``` bash
19
+ python3 -m sglang.launch_server \
20
+ --model-path zai-org/GLM-4.6-FP8 \
21
+ --tp-size 8 \
22
+ --tool-call-parser glm45 \
23
+ --reasoning-parser glm45 \
24
+ --speculative-algorithm EAGLE \
25
+ --speculative-num-steps 3 \
26
+ --speculative-eagle-topk 1 \
27
+ --speculative-num-draft-tokens 4 \
28
+ --mem-fraction-static 0.9 \
29
+ --served-model-name glm-4.6-fp8 \
30
+ --enable-custom-logit-processor
31
+ ```
32
+
33
+ ```{tip}
34
+ To enable the experimental overlap scheduler for EAGLE speculative decoding, set the environment variable `SGLANG_ENABLE_SPEC_V2=1`. This can improve performance by enabling overlap scheduling between draft and verification stages.
35
+ ```
36
+
37
+ ### Thinking Budget for GLM-4.5 / GLM-4.6
38
+ **Note**: For GLM-4.7, `--tool-call-parser` should be set to `glm47`; for GLM-4.5 and GLM-4.6, it should be set to `glm45`.
39
+
40
+ In SGLang, we can implement a thinking budget with a `CustomLogitProcessor`.
41
+
42
+ Launch a server with the `--enable-custom-logit-processor` flag enabled.
43
+
44
+ Sample Request:
45
+
46
+ ```python
47
+ import openai
48
+ from rich.pretty import pprint
49
+ from sglang.srt.sampling.custom_logit_processor import Glm4MoeThinkingBudgetLogitProcessor
50
+
51
+
52
+ client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="*")
53
+ response = client.chat.completions.create(
54
+ model="zai-org/GLM-4.6",
55
+ messages=[
56
+ {
57
+ "role": "user",
58
+ "content": "Question: Is Paris the Capital of France?",
59
+ }
60
+ ],
61
+ max_tokens=1024,
62
+ extra_body={
63
+ "custom_logit_processor": Glm4MoeThinkingBudgetLogitProcessor().to_str(),
64
+ "custom_params": {
65
+ "thinking_budget": 512,
66
+ },
67
+ },
68
+ )
69
+ pprint(response)
70
+ ```
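Conceptually, a thinking-budget logit processor counts the tokens generated in the thinking phase and, once the budget is spent, masks every logit except the end-of-thinking token so the model is forced to close its reasoning block. A framework-free toy sketch of that mechanism (the token id and helper below are illustrative; the real `Glm4MoeThinkingBudgetLogitProcessor` in SGLang differs in detail):

```python
END_THINK = 3  # hypothetical token id for the end-of-thinking marker

def apply_thinking_budget(logits, tokens_in_thinking, budget):
    """Leave logits untouched while under budget; afterwards force END_THINK
    by masking every other logit to -inf."""
    if tokens_in_thinking < budget:
        return logits
    return [0.0 if i == END_THINK else float("-inf") for i in range(len(logits))]

logits = [1.0, 2.0, 0.5, 0.1]
print(apply_thinking_budget(logits, tokens_in_thinking=5, budget=512))
# [1.0, 2.0, 0.5, 0.1]  (budget not yet reached)
print(apply_thinking_budget(logits, tokens_in_thinking=512, budget=512))
# [-inf, -inf, -inf, 0.0]  (only END_THINK can be sampled)
```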
sglang/docs/basic_usage/glmv.md ADDED
@@ -0,0 +1,136 @@
1
+ # GLM-4.6V / GLM-4.5V Usage
2
+
3
+ ## Launch commands for SGLang
4
+
5
+ Below are suggested launch commands tailored for different hardware and precision modes:
6
+
7
+ ### FP8 (quantized) mode
8
+
9
+ For memory-efficient, latency-optimized deployments (e.g., on H100 or H200) where the FP8 checkpoint is supported:
10
+
11
+ ```bash
12
+ python3 -m sglang.launch_server \
13
+ --model-path zai-org/GLM-4.6V-FP8 \
14
+ --tp 2 \
15
+ --ep 2 \
16
+ --host 0.0.0.0 \
17
+ --port 30000 \
18
+ --keep-mm-feature-on-device
19
+ ```
20
+
21
+ ### Non-FP8 (BF16 / full precision) mode
22
+ For deployments on A100/H100 where BF16 is used (or the FP8 checkpoint is not used):
23
+ ```bash
24
+ python3 -m sglang.launch_server \
25
+ --model-path zai-org/GLM-4.6V \
26
+ --tp 4 \
27
+ --ep 4 \
28
+ --host 0.0.0.0 \
29
+ --port 30000
30
+ ```
31
+
32
+ ## Hardware-specific notes / recommendations
33
+
34
+ - On H100 with FP8: Use the FP8 checkpoint for best memory efficiency.
35
+ - On A100 / H100 with BF16 (non-FP8): It’s recommended to use `--mm-max-concurrent-calls` to control parallel throughput and GPU memory usage during image/video inference.
36
+ - On H200 & B200: The model can be run “out of the box”, supporting full context length plus concurrent image + video processing.
37
+
38
+ ## Sending Image/Video Requests
39
+
40
+ ### Image input:
41
+
42
+ ```python
43
+ import requests
44
+
45
+ url = f"http://localhost:30000/v1/chat/completions"
46
+
47
+ data = {
48
+ "model": "zai-org/GLM-4.6V",
49
+ "messages": [
50
+ {
51
+ "role": "user",
52
+ "content": [
53
+ {"type": "text", "text": "What’s in this image?"},
54
+ {
55
+ "type": "image_url",
56
+ "image_url": {
57
+ "url": "https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true"
58
+ },
59
+ },
60
+ ],
61
+ }
62
+ ],
63
+ "max_tokens": 300,
64
+ }
65
+
66
+ response = requests.post(url, json=data)
67
+ print(response.text)
68
+ ```
69
+
70
+ ### Video Input:
71
+
72
+ ```python
73
+ import requests
74
+
75
+ url = f"http://localhost:30000/v1/chat/completions"
76
+
77
+ data = {
78
+ "model": "zai-org/GLM-4.6V",
79
+ "messages": [
80
+ {
81
+ "role": "user",
82
+ "content": [
83
+ {"type": "text", "text": "What’s happening in this video?"},
84
+ {
85
+ "type": "video_url",
86
+ "video_url": {
87
+ "url": "https://github.com/sgl-project/sgl-test-files/raw/refs/heads/main/videos/jobs_presenting_ipod.mp4"
88
+ },
89
+ },
90
+ ],
91
+ }
92
+ ],
93
+ "max_tokens": 300,
94
+ }
95
+
96
+ response = requests.post(url, json=data)
97
+ print(response.text)
98
+ ```
99
+
100
+ ## Important Server Parameters and Flags
101
+
102
+ When launching the model server for **multimodal support**, you can use the following command-line arguments to fine-tune performance and behavior:
103
+
104
+ - `--mm-attention-backend`: Specifies the multimodal attention backend, e.g. `fa3` (FlashAttention 3).
105
+ - `--mm-max-concurrent-calls <value>`: Specifies the **maximum number of concurrent asynchronous multimodal data processing calls** allowed on the server. Use this to control parallel throughput and GPU memory usage during image/video inference.
106
+ - `--mm-per-request-timeout <seconds>`: Defines the **timeout duration (in seconds)** for each multimodal request. If a request exceeds this time limit (e.g., for very large video inputs), it will be automatically terminated.
107
+ - `--keep-mm-feature-on-device`: Instructs the server to **retain multimodal feature tensors on the GPU** after processing. This avoids device-to-host (D2H) memory copies and improves performance for repeated or high-frequency inference workloads.
108
+ - `--mm-enable-dp-encoder`: Places the ViT encoder in data parallel while keeping the LLM in tensor parallel, which consistently lowers TTFT and boosts end-to-end throughput.
109
+ - `SGLANG_USE_CUDA_IPC_TRANSPORT=1`: Enables a shared-memory-pool-based CUDA IPC transport for multimodal data, which can significantly improve end-to-end latency.
110
+
111
+ ### Example usage with the above optimizations:
112
+ ```bash
113
+ SGLANG_USE_CUDA_IPC_TRANSPORT=1 \
114
+ SGLANG_VLM_CACHE_SIZE_MB=0 \
115
+ python -m sglang.launch_server \
116
+ --model-path zai-org/GLM-4.6V \
117
+ --host 0.0.0.0 \
118
+ --port 30000 \
119
+ --trust-remote-code \
120
+ --tp-size 8 \
121
+ --enable-cache-report \
122
+ --log-level info \
123
+ --max-running-requests 64 \
124
+ --mem-fraction-static 0.65 \
125
+ --chunked-prefill-size 8192 \
126
+ --attention-backend fa3 \
127
+ --mm-attention-backend fa3 \
128
+ --mm-enable-dp-encoder \
129
+ --enable-metrics
130
+ ```
131
+
132
+ ### Thinking Budget for GLM-4.5V / GLM-4.6V
133
+
134
+ In SGLang, we can implement a thinking budget with a `CustomLogitProcessor`.
135
+
136
+ Launch a server with the `--enable-custom-logit-processor` flag. Then, use `Glm4MoeThinkingBudgetLogitProcessor` in the request, similar to the `GLM-4.6` example in [glm45.md](./glm45.md).
sglang/docs/basic_usage/gpt_oss.md ADDED
@@ -0,0 +1,147 @@
1
+ # GPT OSS Usage
2
+
3
+ Please refer to [https://github.com/sgl-project/sglang/issues/8833](https://github.com/sgl-project/sglang/issues/8833).
4
+
5
+ ## Responses API & Built-in Tools
6
+
7
+ ### Responses API
8
+
9
+ GPT‑OSS is compatible with the OpenAI Responses API. Use `client.responses.create(...)` with `model`, `instructions`, `input`, and optional `tools` to enable built‑in tool use. You can set the reasoning level via `instructions`, e.g., "Reasoning: high"; the supported levels are low (fast), medium (balanced), and high (deep).
10
+
11
+ ### Built-in Tools
12
+
13
+ GPT‑OSS can call built‑in tools for web search and Python execution. You can use the demo tool server or connect to external MCP tool servers.
14
+
15
+ #### Python Tool
16
+
17
+ - Executes short Python snippets for calculations, parsing, and quick scripts.
18
+ - By default runs in a Docker-based sandbox. To run on the host, set `PYTHON_EXECUTION_BACKEND=UV` (this executes model-generated code locally; use with care).
19
+ - Ensure Docker is available if you are not using the UV backend. It is recommended to run `docker pull python:3.11` in advance.
20
+
21
+ #### Web Search Tool
22
+
23
+ - Uses the Exa backend for web search.
24
+ - Requires an Exa API key; set `EXA_API_KEY` in your environment. Create a key at `https://exa.ai`.
25
+
26
+ ### Tool & Reasoning Parser
27
+
28
+ - We support the OpenAI reasoning and tool-call parsers, as well as the SGLang native API for tool calls and reasoning. Refer to [reasoning parser](../advanced_features/separate_reasoning.ipynb) and [tool call parser](../advanced_features/function_calling.ipynb) for more details.
29
+
30
+
31
+ ## Notes
32
+
33
+ - Use **Python 3.12** for the demo tools, and install the required `gpt-oss` packages.
34
+ - The default demo integrates the web search tool (Exa backend) and a demo Python interpreter via Docker.
35
+ - For search, set `EXA_API_KEY`. For Python execution, either have Docker available or set `PYTHON_EXECUTION_BACKEND=UV`.
36
+
37
+ Examples:
38
+ ```bash
39
+ export EXA_API_KEY=YOUR_EXA_KEY
40
+ # Optional: run Python tool locally instead of Docker (use with care)
41
+ export PYTHON_EXECUTION_BACKEND=UV
42
+ ```
43
+
44
+ Launch the server with the demo tool server:
45
+
46
+ ```bash
47
+ python3 -m sglang.launch_server \
48
+ --model-path openai/gpt-oss-120b \
49
+ --tool-server demo \
50
+ --tp 2
51
+ ```
52
+
53
+ For production usage, SGLang can act as an MCP client for multiple services. An [example tool server](https://github.com/openai/gpt-oss/tree/main/gpt-oss-mcp-server) is provided. Start the servers and point SGLang to them:
54
+ ```bash
55
+ mcp run -t sse browser_server.py:mcp
56
+ mcp run -t sse python_server.py:mcp
57
+
58
+ python -m sglang.launch_server ... --tool-server ip-1:port-1,ip-2:port-2
59
+ ```
60
+ The URLs should be MCP SSE servers that expose server information and well-documented tools. These tools are added to the system prompt so the model can use them.
61
+
62
+ ## Speculative Decoding
63
+
64
+ SGLang supports speculative decoding for GPT-OSS models using the EAGLE3 algorithm. This can significantly improve decoding speed, especially for small batch sizes.
65
+
66
+ **Usage**:
67
+ Add `--speculative-algorithm EAGLE3` along with the draft model path.
68
+ ```bash
69
+ python3 -m sglang.launch_server \
70
+ --model-path openai/gpt-oss-120b \
71
+ --speculative-algorithm EAGLE3 \
72
+ --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 \
73
+ --tp 2
74
+ ```
75
+
76
+ ```{tip}
77
+ To enable the experimental overlap scheduler for EAGLE3 speculative decoding, set the environment variable `SGLANG_ENABLE_SPEC_V2=1`. This can improve performance by enabling overlap scheduling between draft and verification stages.
78
+ ```
79
+
80
+ ### Quick Demo
81
+
82
+ ```python
83
+ from openai import OpenAI
84
+
85
+ client = OpenAI(
86
+ base_url="http://localhost:30000/v1",
87
+ api_key="sk-123456"
88
+ )
89
+
90
+ tools = [
91
+ {"type": "code_interpreter"},
92
+ {"type": "web_search_preview"},
93
+ ]
94
+
95
+ # Reasoning level example
96
+ response = client.responses.create(
97
+ model="openai/gpt-oss-120b",
98
+ instructions="You are a helpful assistant."
99
+ reasoning_effort="high" # Supports high, medium, or low
100
+ input="In one sentence, explain the transformer architecture.",
101
+ )
102
+ print("====== reasoning: high ======")
103
+ print(response.output_text)
104
+
105
+ # Test python tool
106
+ response = client.responses.create(
107
+ model="openai/gpt-oss-120b",
108
+ instructions="You are a helpful assistant, you could use python tool to execute code.",
109
+ input="Use python tool to calculate the sum of 29138749187 and 29138749187", # 58,277,498,374
110
+ tools=tools
111
+ )
112
+ print("====== test python tool ======")
113
+ print(response.output_text)
114
+
115
+ # Test browser tool
116
+ response = client.responses.create(
117
+ model="openai/gpt-oss-120b",
118
+ instructions="You are a helpful assistant, you could use browser to search the web",
119
+ input="Search the web for the latest news about Nvidia stock price",
120
+ tools=tools
121
+ )
122
+ print("====== test browser tool ======")
123
+ print(response.output_text)
124
+ ```
125
+
126
+ Example output:
127
+ ```
128
+ ====== test python tool ======
129
+ The sum of 29,138,749,187 and 29,138,749,187 is **58,277,498,374**.
130
+ ====== test browser tool ======
131
+ **Recent headlines on Nvidia (NVDA) stock**
132
+
133
+ | Date (2025) | Source | Key news points | Stock‑price detail |
134
+ |-------------|--------|----------------|--------------------|
135
+ | **May 13** | Reuters | The market data page shows Nvidia trading “higher” at **$116.61** with no change from the previous close. | **$116.61** – latest trade (delayed ≈ 15 min)【14†L34-L38】 |
136
+ | **Aug 18** | CNBC | Morgan Stanley kept an **overweight** rating and lifted its price target to **$206** (up from $200), implying a 14 % upside from the Friday close. The firm notes Nvidia shares have already **jumped 34 % this year**. | No exact price quoted, but the article signals strong upside expectations【9†L27-L31】 |
137
+ | **Aug 20** | The Motley Fool | Nvidia is set to release its Q2 earnings on Aug 27. The article lists the **current price of $175.36**, down 0.16 % on the day (as of 3:58 p.m. ET). | **$175.36** – current price on Aug 20【10†L12-L15】【10†L53-L57】 |
138
+
139
+ **What the news tells us**
140
+
141
+ * Nvidia’s share price has risen sharply this year – up roughly a third according to Morgan Stanley – and analysts are still raising targets (now $206).
142
+ * The most recent market quote (Reuters, May 13) was **$116.61**, but the stock has surged since then, reaching **$175.36** by mid‑August.
143
+ * Upcoming earnings on **Aug 27** are a focal point; both the Motley Fool and Morgan Stanley expect the results could keep the rally going.
144
+
145
+ **Bottom line:** Nvidia’s stock is on a strong upward trajectory in 2025, with price targets climbing toward $200‑$210 and the market price already near $175 as of late August.
146
+
147
+ ```
sglang/docs/basic_usage/llama4.md ADDED
@@ -0,0 +1,92 @@
1
+ # Llama4 Usage
2
+
3
+ [Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/MODEL_CARD.md) is Meta's latest generation of open-source LLMs with industry-leading performance.
4
+
5
+ SGLang has supported Llama 4 Scout (109B) and Llama 4 Maverick (400B) since [v0.4.5](https://github.com/sgl-project/sglang/releases/tag/v0.4.5).
6
+
7
+ Ongoing optimizations are tracked in the [Roadmap](https://github.com/sgl-project/sglang/issues/5118).
8
+
9
+ ## Launch Llama 4 with SGLang
10
+
11
+ To serve Llama 4 models on 8xH100/H200 GPUs:
12
+
13
+ ```bash
14
+ python3 -m sglang.launch_server \
15
+ --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \
16
+ --tp 8 \
17
+ --context-length 1000000
18
+ ```
19
+
20
+ ### Configuration Tips
21
+
22
+ - **OOM Mitigation**: Adjust `--context-length` to avoid GPU out-of-memory errors. For the Scout model, we recommend setting this value up to 1M on 8\*H100 and up to 2.5M on 8\*H200. For the Maverick model, no explicit context length is needed on 8\*H200. When the hybrid KV cache is enabled, `--context-length` can be set up to 5M on 8\*H100 and up to 10M on 8\*H200 for the Scout model.
23
+
24
+ - **Attention Backend Auto-Selection**: SGLang automatically selects the optimal attention backend for Llama 4 based on your hardware. You typically don't need to specify `--attention-backend` manually:
25
+ - **Blackwell GPUs (B200/GB200)**: `trtllm_mha`
26
+ - **Hopper GPUs (H100/H200)**: `fa3`
27
+ - **AMD GPUs**: `aiter`
28
+ - **Intel XPU**: `intel_xpu`
29
+ - **Other platforms**: `triton` (fallback)
30
+
31
+ To override the auto-selection, explicitly specify `--attention-backend` with one of the supported backends: `fa3`, `aiter`, `triton`, `trtllm_mha`, or `intel_xpu`.
32
+
33
+ - **Chat Template**: Add `--chat-template llama-4` for chat completion tasks.
34
+ - **Enable Multi-Modal**: Add `--enable-multimodal` for multi-modal capabilities.
35
+ - **Enable Hybrid KV Cache**: Set `--swa-full-tokens-ratio` to adjust the ratio of SWA-layer KV tokens (for Llama 4, the local attention layers) to full-layer KV tokens (default: 0.8, range: 0-1).
36
+
37
+
38
+ ### EAGLE Speculative Decoding
39
+ **Description**: SGLang has supported Llama 4 Maverick (400B) with [EAGLE speculative decoding](https://docs.sglang.io/advanced_features/speculative_decoding.html#EAGLE-Decoding).
40
+
41
+ **Usage**:
42
+ Add arguments `--speculative-draft-model-path`, `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
43
+ ```
44
+ python3 -m sglang.launch_server \
45
+ --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct \
46
+ --speculative-algorithm EAGLE3 \
47
+ --speculative-draft-model-path nvidia/Llama-4-Maverick-17B-128E-Eagle3 \
48
+ --speculative-num-steps 3 \
49
+ --speculative-eagle-topk 1 \
50
+ --speculative-num-draft-tokens 4 \
51
+ --trust-remote-code \
52
+ --tp 8 \
53
+ --context-length 1000000
54
+ ```
55
+
56
+ - **Note**: The Llama 4 draft model *nvidia/Llama-4-Maverick-17B-128E-Eagle3* only recognizes conversations in chat mode.
57
+
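With `--speculative-eagle-topk 1`, the draft model proposes a chain of `--speculative-num-steps` tokens, and together with the bonus token this yields `--speculative-num-draft-tokens` (3 + 1 = 4 above). A toy sketch of the verify-and-accept step (illustrative only, not SGLang's implementation):

```python
def verify_draft(draft_tokens, target_tokens):
    """Accept the longest draft prefix the target model agrees with,
    then append the target's token at the first mismatch (or the
    bonus token after a full match)."""
    accepted = []
    for i, d in enumerate(draft_tokens):
        if d != target_tokens[i]:
            accepted.append(target_tokens[i])  # target's correction
            return accepted
        accepted.append(d)
    accepted.append(target_tokens[len(draft_tokens)])  # bonus token
    return accepted

# Mismatch at position 2: accept [5, 7], then take the correction 2.
print(verify_draft([5, 7, 9], [5, 7, 2, 4]))  # [5, 7, 2]
# Full match: accept all 3 draft tokens plus the bonus token 4.
print(verify_draft([5, 7, 9], [5, 7, 9, 4]))  # [5, 7, 9, 4]
```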
58
+ ## Benchmarking Results
59
+
60
+ ### Accuracy Test with `lm_eval`
61
+
62
+ The accuracy on SGLang for both Llama4 Scout and Llama4 Maverick can match the [official benchmark numbers](https://ai.meta.com/blog/llama-4-multimodal-intelligence/).
63
+
64
+ Benchmark results on MMLU Pro dataset with 8*H100:
65
+ | | Llama-4-Scout-17B-16E-Instruct | Llama-4-Maverick-17B-128E-Instruct |
66
+ |--------------------|--------------------------------|-------------------------------------|
67
+ | Official Benchmark | 74.3 | 80.5 |
68
+ | SGLang | 75.2 | 80.7 |
69
+
70
+ Commands:
71
+
72
+ ```bash
73
+ # Llama-4-Scout-17B-16E-Instruct model
74
+ python -m sglang.launch_server \
75
+ --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \
76
+ --port 30000 \
77
+ --tp 8 \
78
+ --mem-fraction-static 0.8 \
79
+ --context-length 65536
80
+ lm_eval --model local-chat-completions --model_args model=meta-llama/Llama-4-Scout-17B-16E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 --tasks mmlu_pro --batch_size 128 --apply_chat_template --num_fewshot 0
81
+
82
+ # Llama-4-Maverick-17B-128E-Instruct
83
+ python -m sglang.launch_server \
84
+ --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct \
85
+ --port 30000 \
86
+ --tp 8 \
87
+ --mem-fraction-static 0.8 \
88
+ --context-length 65536
89
+ lm_eval --model local-chat-completions --model_args model=meta-llama/Llama-4-Maverick-17B-128E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 --tasks mmlu_pro --batch_size 128 --apply_chat_template --num_fewshot 0
90
+ ```
91
+
92
+ Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/5092).
sglang/docs/basic_usage/minimax_m2.md ADDED
@@ -0,0 +1,85 @@
1
+ # MiniMax M2.5/M2.1/M2 Usage
2
+
3
+ [MiniMax-M2.5](https://huggingface.co/MiniMaxAI/MiniMax-M2.5), [MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1), and [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) are advanced large language models created by [MiniMax](https://www.minimax.io/).
4
+
5
+ The MiniMax-M2 series redefines efficiency for agents. These compact, fast, and cost-effective MoE models (230 billion total parameters with 10 billion active parameters) are built for elite performance in coding and agentic tasks, all while maintaining powerful general intelligence. With just 10 billion activated parameters, the MiniMax-M2 series provides sophisticated, end-to-end tool use performance expected from today's leading models, but in a streamlined form factor that makes deployment and scaling easier than ever.
6
+
7
+ ## Supported Models
8
+
9
+ This guide applies to the following models. You only need to update the model name during deployment. The following examples use **MiniMax-M2**:
10
+
11
+ - [MiniMaxAI/MiniMax-M2.5](https://huggingface.co/MiniMaxAI/MiniMax-M2.5)
12
+ - [MiniMaxAI/MiniMax-M2.1](https://huggingface.co/MiniMaxAI/MiniMax-M2.1)
13
+ - [MiniMaxAI/MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2)
14
+
15
+ ## System Requirements
16
+
17
+ The following are recommended configurations; actual requirements should be adjusted based on your use case:
18
+
19
+ - 4x 96GB GPUs: supports context lengths of up to 400K tokens.
20
+ - 8x 144GB GPUs: supports context lengths of up to 3M tokens.
21
+
22
+ ## Deployment with Python
23
+
24
+ 4-GPU deployment command:
25
+
26
+ ```bash
27
+ python -m sglang.launch_server \
28
+ --model-path MiniMaxAI/MiniMax-M2 \
29
+ --tp-size 4 \
30
+ --tool-call-parser minimax-m2 \
31
+ --reasoning-parser minimax-append-think \
32
+ --host 0.0.0.0 \
33
+ --trust-remote-code \
34
+ --port 8000 \
35
+ --mem-fraction-static 0.85
36
+ ```
37
+
38
+ 8-GPU deployment command:
39
+
40
+ ```bash
41
+ python -m sglang.launch_server \
42
+ --model-path MiniMaxAI/MiniMax-M2 \
43
+ --tp-size 8 \
44
+ --ep-size 8 \
45
+ --tool-call-parser minimax-m2 \
46
+ --reasoning-parser minimax-append-think \
47
+ --host 0.0.0.0 \
48
+ --trust-remote-code \
49
+ --port 8000 \
50
+ --mem-fraction-static 0.85
51
+ ```
52
+
53
+ ### AMD GPUs (MI300X/MI325X/MI355X)
54
+
55
+ 8-GPU deployment command:
56
+
57
+ ```bash
58
+ SGLANG_USE_AITER=1 python -m sglang.launch_server \
59
+ --model-path MiniMaxAI/MiniMax-M2.5 \
60
+ --tp-size 8 \
61
+ --ep-size 8 \
62
+ --attention-backend aiter \
63
+ --tool-call-parser minimax-m2 \
64
+ --reasoning-parser minimax-append-think \
65
+ --host 0.0.0.0 \
66
+ --trust-remote-code \
67
+ --port 8000 \
68
+ --mem-fraction-static 0.85
69
+ ```
70
+
71
+ ## Testing Deployment
72
+
73
+ After startup, you can test the SGLang OpenAI-compatible API with the following command:
74
+
75
+ ```bash
76
+ curl http://localhost:8000/v1/chat/completions \
77
+ -H "Content-Type: application/json" \
78
+ -d '{
79
+ "model": "MiniMaxAI/MiniMax-M2",
80
+ "messages": [
81
+ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
82
+ {"role": "user", "content": [{"type": "text", "text": "Who won the world series in 2020?"}]}
83
+ ]
84
+ }'
85
+ ```
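Since the endpoint is OpenAI-compatible, the same request can be built and sent from Python. The sketch below only constructs the payload; the actual POST (which assumes the server launched above) is left commented out:

```python
import json

payload = {
    "model": "MiniMaxAI/MiniMax-M2",
    "messages": [
        {"role": "system",
         "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user",
         "content": [{"type": "text", "text": "Who won the world series in 2020?"}]},
    ],
}
body = json.dumps(payload)
print(json.loads(body)["model"])  # MiniMaxAI/MiniMax-M2

# To actually send it against a running server:
# import requests
# r = requests.post("http://localhost:8000/v1/chat/completions",
#                   headers={"Content-Type": "application/json"}, data=body)
# print(r.json()["choices"][0]["message"]["content"])
```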
sglang/docs/basic_usage/native_api.ipynb ADDED
@@ -0,0 +1,667 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# SGLang Native APIs\n",
8
+ "\n",
9
+ "Apart from the OpenAI-compatible APIs, the SGLang Runtime also provides its native server APIs. We introduce the following APIs:\n",
10
+ "\n",
11
+ "- `/generate` (text generation model)\n",
12
+ "- `/get_model_info`\n",
13
+ "- `/get_server_info`\n",
14
+ "- `/health`\n",
15
+ "- `/health_generate`\n",
16
+ "- `/flush_cache`\n",
17
+ "- `/update_weights`\n",
18
+ "- `/encode` (embedding model)\n",
19
+ "- `/v1/rerank` (cross-encoder rerank model)\n",
20
+ "- `/v1/score` (decoder-only scoring)\n",
21
+ "- `/classify` (reward model)\n",
22
+ "- `/start_expert_distribution_record`\n",
23
+ "- `/stop_expert_distribution_record`\n",
24
+ "- `/dump_expert_distribution_record`\n",
25
+ "- `/tokenize`\n",
26
+ "- `/detokenize`\n",
27
+ "- A full list of these APIs can be found at [http_server.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server.py)\n",
28
+ "\n",
29
+ "We mainly use `requests` to test these APIs in the following examples. You can also use `curl`.\n"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "markdown",
34
+ "metadata": {},
35
+ "source": [
36
+ "## Launch A Server"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": null,
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "from sglang.test.doc_patch import launch_server_cmd\n",
46
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
47
+ "\n",
48
+ "server_process, port = launch_server_cmd(\n",
49
+ " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
50
+ ")\n",
51
+ "\n",
52
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "markdown",
57
+ "metadata": {},
58
+ "source": [
59
+ "## Generate (text generation model)\n",
60
+ "Generate completions. This is similar to `/v1/completions` in the OpenAI API. Detailed parameters can be found in the [sampling parameters](sampling_params.md)."
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "import requests\n",
70
+ "\n",
71
+ "url = f\"http://localhost:{port}/generate\"\n",
72
+ "data = {\"text\": \"What is the capital of France?\"}\n",
73
+ "\n",
74
+ "response = requests.post(url, json=data)\n",
75
+ "print_highlight(response.json())"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "markdown",
80
+ "metadata": {},
81
+ "source": [
82
+ "## Get Model Info\n",
83
+ "\n",
84
+ "Get the information of the model.\n",
85
+ "\n",
86
+ "- `model_path`: The path/name of the model.\n",
87
+ "- `is_generation`: Whether the model is used as generation model or embedding model.\n",
88
+ "- `tokenizer_path`: The path/name of the tokenizer.\n",
89
+ "- `preferred_sampling_params`: The default sampling params specified via `--preferred-sampling-params`. `None` is returned in this example as we did not explicitly configure it in server args.\n",
90
+ "- `weight_version`: This field contains the version of the model weights. This is often used to track changes or updates to the model’s trained parameters.\n",
91
+ "- `has_image_understanding`: Whether the model has image-understanding capability.\n",
92
+ "- `has_audio_understanding`: Whether the model has audio-understanding capability.\n",
93
+ "- `model_type`: The model type from the HuggingFace config (e.g., \"qwen2\", \"llama\").\n",
94
+ "- `architectures`: The model architectures from the HuggingFace config (e.g., [\"Qwen2ForCausalLM\"])."
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "url = f\"http://localhost:{port}/get_model_info\"\n",
104
+ "\n",
105
+ "response = requests.get(url)\n",
106
+ "response_json = response.json()\n",
107
+ "print_highlight(response_json)\n",
108
+ "assert response_json[\"model_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
109
+ "assert response_json[\"is_generation\"] is True\n",
110
+ "assert response_json[\"tokenizer_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
111
+ "assert response_json[\"preferred_sampling_params\"] is None\n",
112
+ "assert response_json.keys() == {\n",
113
+ " \"model_path\",\n",
114
+ " \"is_generation\",\n",
115
+ " \"tokenizer_path\",\n",
116
+ " \"preferred_sampling_params\",\n",
117
+ " \"weight_version\",\n",
118
+ " \"has_image_understanding\",\n",
119
+ " \"has_audio_understanding\",\n",
120
+ " \"model_type\",\n",
121
+ " \"architectures\",\n",
122
+ "}"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "markdown",
127
+ "metadata": {},
128
+ "source": [
129
+ "## Get Server Info\n",
130
+ "Get the server information, including CLI arguments, token limits, and memory pool sizes.\n",
131
+ "- Note: `get_server_info` merges the following deprecated endpoints:\n",
132
+ " - `get_server_args`\n",
133
+ " - `get_memory_pool_size`\n",
134
+ " - `get_max_total_num_tokens`"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "url = f\"http://localhost:{port}/get_server_info\"\n",
144
+ "\n",
145
+ "response = requests.get(url)\n",
146
+ "print_highlight(response.text)"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "markdown",
151
+ "metadata": {},
152
+ "source": [
153
+ "## Health Check\n",
154
+ "- `/health`: Check the health of the server.\n",
155
+ "- `/health_generate`: Check the health of the server by generating one token."
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": null,
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "url = f\"http://localhost:{port}/health_generate\"\n",
165
+ "\n",
166
+ "response = requests.get(url)\n",
167
+ "print_highlight(response.text)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "url = f\"http://localhost:{port}/health\"\n",
177
+ "\n",
178
+ "response = requests.get(url)\n",
179
+ "print_highlight(response.text)"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "markdown",
184
+ "metadata": {},
185
+ "source": [
186
+ "## Flush Cache\n",
187
+ "\n",
188
+ "Flush the radix cache. It will be automatically triggered when the model weights are updated by the `/update_weights` API."
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "url = f\"http://localhost:{port}/flush_cache\"\n",
198
+ "\n",
199
+ "response = requests.post(url)\n",
200
+ "print_highlight(response.text)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "markdown",
205
+ "metadata": {},
206
+ "source": [
207
+ "## Update Weights From Disk\n",
208
+ "\n",
209
+ "Update model weights from disk without restarting the server. Only applicable for models with the same architecture and parameter size.\n",
210
+ "\n",
211
+ "SGLang supports the `update_weights_from_disk` API for continuous evaluation during training: save a checkpoint to disk, then update the weights from disk.\n"
212
+ ]
213
+ },
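The save-then-reload loop this API enables can be sketched as below. Note that `save_checkpoint` and `post_update` are hypothetical stand-ins for illustration, not SGLang functions; in real code `post_update` would POST to `/update_weights_from_disk` with `requests`.

```python
def save_checkpoint(step: int) -> str:
    # In real training code this would write model weights to disk.
    return f"/checkpoints/step_{step}"


def post_update(model_path: str) -> dict:
    # In real code: requests.post(f"{base_url}/update_weights_from_disk",
    #                             json={"model_path": model_path}).json()
    return {"success": True, "message": "Succeeded to update model weights."}


updated = []
for step in (100, 200):
    path = save_checkpoint(step)
    result = post_update(path)
    assert result["success"]
    updated.append(path)

print(updated)
```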
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "# successful update with same architecture and size\n",
221
+ "\n",
222
+ "url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
223
+ "data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct\"}\n",
224
+ "\n",
225
+ "response = requests.post(url, json=data)\n",
226
+ "print_highlight(response.text)\n",
227
+ "assert response.json()[\"success\"] is True\n",
228
+ "assert response.json()[\"message\"] == \"Succeeded to update model weights.\""
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": null,
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "# failed update with different parameter size or wrong name\n",
238
+ "\n",
239
+ "url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
240
+ "data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct-wrong\"}\n",
241
+ "\n",
242
+ "response = requests.post(url, json=data)\n",
243
+ "response_json = response.json()\n",
244
+ "print_highlight(response_json)\n",
245
+ "assert response_json[\"success\"] is False\n",
246
+ "assert response_json[\"message\"] == (\n",
247
+ " \"Failed to get weights iterator: \"\n",
248
+ " \"qwen/qwen2.5-0.5b-instruct-wrong\"\n",
249
+ " \" (repository not found).\"\n",
250
+ ")"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": null,
256
+ "metadata": {},
257
+ "outputs": [],
258
+ "source": [
259
+ "terminate_process(server_process)"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "markdown",
264
+ "metadata": {},
265
+ "source": [
266
+ "## Encode (embedding model)\n",
267
+ "\n",
268
+ "Encode text into embeddings. Note that this API is only available for [embedding models](openai_api_embeddings.ipynb) and will raise an error for generation models.\n",
269
+ "Therefore, we launch a new server to serve an embedding model."
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": null,
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "embedding_process, port = launch_server_cmd(\"\"\"\n",
279
+ "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
280
+ " --host 0.0.0.0 --is-embedding --log-level warning\n",
281
+ "\"\"\")\n",
282
+ "\n",
283
+ "wait_for_server(f\"http://localhost:{port}\", process=embedding_process)"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": null,
289
+ "metadata": {},
290
+ "outputs": [],
291
+ "source": [
292
+ "# successful encode for embedding model\n",
293
+ "\n",
294
+ "url = f\"http://localhost:{port}/encode\"\n",
295
+ "data = {\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"text\": \"Once upon a time\"}\n",
296
+ "\n",
297
+ "response = requests.post(url, json=data)\n",
298
+ "response_json = response.json()\n",
299
+ "print_highlight(f\"Text embedding (first 10): {response_json['embedding'][:10]}\")"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": null,
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "terminate_process(embedding_process)"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "markdown",
313
+ "metadata": {},
314
+ "source": [
315
+ "## v1/rerank (cross encoder rerank model)\n",
316
+ "Rerank a list of documents given a query using a cross-encoder model. Note that this API is only available for cross-encoder models such as [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) with `attention-backend` set to `triton` or `torch_native`.\n"
317
+ ]
318
+ },
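Cross-encoder relevance scores are typically unbounded logits; if you want values in (0, 1) for thresholding, a common post-processing step is to map them through a sigmoid yourself. This is a sketch of client-side post-processing with made-up scores, not something the API does for you, and whether the returned score is a raw logit may depend on the model.

```python
import math


def sigmoid(x: float) -> float:
    """Map an unbounded relevance logit to (0, 1)."""
    return 1.0 / (1.0 + math.exp(-x))


# Hypothetical raw scores for two documents (higher = more relevant).
raw_scores = [-4.0, 6.5]
probs = [sigmoid(s) for s in raw_scores]

# Rank documents by their mapped relevance.
ranked = sorted(zip(probs, ["doc_a", "doc_b"]), reverse=True)
print(ranked)
```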
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "metadata": {},
323
+ "outputs": [],
324
+ "source": [
325
+ "reranker_process, port = launch_server_cmd(\"\"\"\n",
326
+ "python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n",
327
+ " --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding --log-level warning\n",
328
+ "\"\"\")\n",
329
+ "\n",
330
+ "wait_for_server(f\"http://localhost:{port}\", process=reranker_process)"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": null,
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": [
339
+ "# compute rerank scores for query and documents\n",
340
+ "\n",
341
+ "url = f\"http://localhost:{port}/v1/rerank\"\n",
342
+ "data = {\n",
343
+ " \"model\": \"BAAI/bge-reranker-v2-m3\",\n",
344
+ " \"query\": \"what is panda?\",\n",
345
+ " \"documents\": [\n",
346
+ " \"hi\",\n",
347
+ " \"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.\",\n",
348
+ " ],\n",
349
+ "}\n",
350
+ "\n",
351
+ "response = requests.post(url, json=data)\n",
352
+ "response_json = response.json()\n",
353
+ "for item in response_json:\n",
354
+ " print_highlight(f\"Score: {item['score']:.2f} - Document: '{item['document']}'\")"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": null,
360
+ "metadata": {},
361
+ "outputs": [],
362
+ "source": [
363
+ "terminate_process(reranker_process)"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "markdown",
368
+ "metadata": {},
369
+ "source": [
370
+ "## v1/score (decoder-only scoring)\n",
371
+ "\n",
372
+ "Compute token probabilities for specified tokens given a query and items. This is useful for classification tasks, scoring responses, or computing log-probabilities.\n",
373
+ "\n",
374
+ "Parameters:\n",
375
+ "- `query`: Query text\n",
376
+ "- `items`: Item text(s) to score\n",
377
+ "- `label_token_ids`: Token IDs to compute probabilities for\n",
378
+ "- `apply_softmax`: Whether to apply softmax to get normalized probabilities (default: False)\n",
379
+ "- `item_first`: Whether items come first in concatenation order (default: False)\n",
380
+ "- `model`: Model name\n",
381
+ "\n",
382
+ "The response contains `scores` - a list of probability lists, one per item, each in the order of `label_token_ids`."
383
+ ]
384
+ },
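To make `apply_softmax` concrete: the probabilities are renormalized over just the requested `label_token_ids`, not the full vocabulary. A self-contained sketch with made-up logits (the logit values and the helper below are for illustration only, not server output):

```python
import math


def softmax_over_labels(logits: dict, label_token_ids: list) -> list:
    """Renormalize over only the requested label tokens."""
    selected = [logits[t] for t in label_token_ids]
    m = max(selected)  # subtract max for numerical stability
    exps = [math.exp(x - m) for x in selected]
    total = sum(exps)
    return [e / total for e in exps]


# Made-up logits for token ids 9454 ("Yes") and 2753 ("No").
logits = {9454: 3.2, 2753: 1.1}
scores = softmax_over_labels(logits, [9454, 2753])
print(scores)  # two probabilities summing to 1
```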
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": null,
388
+ "metadata": {},
389
+ "outputs": [],
390
+ "source": [
391
+ "score_process, port = launch_server_cmd(\"\"\"\n",
392
+ "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
393
+ " --host 0.0.0.0 --log-level warning\n",
394
+ "\"\"\")\n",
395
+ "\n",
396
+ "wait_for_server(f\"http://localhost:{port}\", process=score_process)"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "code",
401
+ "execution_count": null,
402
+ "metadata": {},
403
+ "outputs": [],
404
+ "source": [
405
+ "# Score the probability of different completions given a query\n",
406
+ "query = \"The capital of France is\"\n",
407
+ "items = [\"Paris\", \"London\", \"Berlin\"]\n",
408
+ "\n",
409
+ "url = f\"http://localhost:{port}/v1/score\"\n",
410
+ "data = {\n",
411
+ " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
412
+ " \"query\": query,\n",
413
+ " \"items\": items,\n",
414
+ " \"label_token_ids\": [9454, 2753], # e.g. \"Yes\" and \"No\" token ids\n",
415
+ " \"apply_softmax\": True, # Normalize probabilities to sum to 1\n",
416
+ "}\n",
417
+ "\n",
418
+ "response = requests.post(url, json=data)\n",
419
+ "response_json = response.json()\n",
420
+ "\n",
421
+ "# Display scores for each item\n",
422
+ "for item, scores in zip(items, response_json[\"scores\"]):\n",
423
+ " print_highlight(f\"Item '{item}': probabilities = {[f'{s:.4f}' for s in scores]}\")"
424
+ ]
425
+ },
426
+ {
427
+ "cell_type": "code",
428
+ "execution_count": null,
429
+ "metadata": {},
430
+ "outputs": [],
431
+ "source": [
432
+ "terminate_process(score_process)"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "markdown",
437
+ "metadata": {},
438
+ "source": [
439
+ "## Classify (reward model)\n",
440
+ "\n",
441
+ "SGLang Runtime also supports reward models. Here we use a reward model to classify the quality of pairwise generations."
442
+ ]
443
+ },
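The scalar rewards returned for each conversation can be compared directly; under a Bradley-Terry preference model, the probability that one response is preferred over another is the sigmoid of the reward difference. A sketch with made-up reward values:

```python
import math


def preference_prob(reward_1: float, reward_2: float) -> float:
    """Bradley-Terry probability that response 1 is preferred over response 2."""
    return 1.0 / (1.0 + math.exp(-(reward_1 - reward_2)))


# Made-up rewards: the factually correct response should score higher.
r_wrong, r_right = -7.5, 3.1
p = preference_prob(r_right, r_wrong)
print(f"P(correct preferred) = {p:.4f}")
```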
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": null,
447
+ "metadata": {},
448
+ "outputs": [],
449
+ "source": [
450
+ "# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
451
+ "# This will be updated in the future.\n",
452
+ "\n",
453
+ "reward_process, port = launch_server_cmd(\"\"\"\n",
454
+ "python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding --log-level warning\n",
455
+ "\"\"\")\n",
456
+ "\n",
457
+ "wait_for_server(f\"http://localhost:{port}\", process=reward_process)"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": null,
463
+ "metadata": {},
464
+ "outputs": [],
465
+ "source": [
466
+ "from transformers import AutoTokenizer\n",
467
+ "\n",
468
+ "PROMPT = (\n",
469
+ " \"What is the range of the numeric output of a sigmoid node in a neural network?\"\n",
470
+ ")\n",
471
+ "\n",
472
+ "RESPONSE1 = \"The output of a sigmoid node is bounded between -1 and 1.\"\n",
473
+ "RESPONSE2 = \"The output of a sigmoid node is bounded between 0 and 1.\"\n",
474
+ "\n",
475
+ "CONVS = [\n",
476
+ " [{\"role\": \"user\", \"content\": PROMPT}, {\"role\": \"assistant\", \"content\": RESPONSE1}],\n",
477
+ " [{\"role\": \"user\", \"content\": PROMPT}, {\"role\": \"assistant\", \"content\": RESPONSE2}],\n",
478
+ "]\n",
479
+ "\n",
480
+ "tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n",
481
+ "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False, return_dict=False)\n",
482
+ "\n",
483
+ "url = f\"http://localhost:{port}/classify\"\n",
484
+ "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
485
+ "\n",
486
+ "responses = requests.post(url, json=data).json()\n",
487
+ "for response in responses:\n",
488
+ " print_highlight(f\"reward: {response['embedding'][0]}\")"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": null,
494
+ "metadata": {},
495
+ "outputs": [],
496
+ "source": [
497
+ "terminate_process(reward_process)"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "markdown",
502
+ "metadata": {},
503
+ "source": [
504
+ "## Capture expert selection distribution in MoE models\n",
505
+ "\n",
506
+ "SGLang Runtime supports recording how many times each expert in a MoE model is selected during a run. This is useful for analyzing the throughput of the model and planning optimizations.\n",
507
+ "\n",
508
+ "*Note: We only print the first 10 lines of the CSV below for readability. Please adjust accordingly if you want to analyze the results more deeply.*"
509
+ ]
510
+ },
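Once the dump completes, analysis is ordinary CSV work. The exact column layout may vary by SGLang version, so the column names below are assumptions; the sketch aggregates selection counts per expert from an inline sample rather than a real dump file.

```python
import csv
import io
from collections import Counter

# Hypothetical dump excerpt; real column names may differ by version.
sample = """layer_id,expert_id,count
0,0,12
0,1,3
1,0,5
1,1,20
"""

# Sum selection counts per expert across all layers.
totals = Counter()
for row in csv.DictReader(io.StringIO(sample)):
    totals[int(row["expert_id"])] += int(row["count"])

print(totals.most_common())
```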
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": null,
514
+ "metadata": {},
515
+ "outputs": [],
516
+ "source": [
517
+ "expert_record_server_process, port = launch_server_cmd(\n",
518
+ " \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat --log-level warning\"\n",
519
+ ")\n",
520
+ "\n",
521
+ "wait_for_server(f\"http://localhost:{port}\", process=expert_record_server_process)"
522
+ ]
523
+ },
524
+ {
525
+ "cell_type": "code",
526
+ "execution_count": null,
527
+ "metadata": {},
528
+ "outputs": [],
529
+ "source": [
530
+ "response = requests.post(f\"http://localhost:{port}/start_expert_distribution_record\")\n",
531
+ "print_highlight(response)\n",
532
+ "\n",
533
+ "url = f\"http://localhost:{port}/generate\"\n",
534
+ "data = {\"text\": \"What is the capital of France?\"}\n",
535
+ "\n",
536
+ "response = requests.post(url, json=data)\n",
537
+ "print_highlight(response.json())\n",
538
+ "\n",
539
+ "response = requests.post(f\"http://localhost:{port}/stop_expert_distribution_record\")\n",
540
+ "print_highlight(response)\n",
541
+ "\n",
542
+ "response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
543
+ "print_highlight(response)"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "execution_count": null,
549
+ "metadata": {},
550
+ "outputs": [],
551
+ "source": [
552
+ "terminate_process(expert_record_server_process)"
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "markdown",
557
+ "metadata": {},
558
+ "source": [
559
+ "## Tokenize/Detokenize Example (Round Trip)\n",
560
+ "\n",
561
+ "This example demonstrates how to use the `/tokenize` and `/detokenize` endpoints together. We first tokenize a string, then detokenize the resulting IDs to reconstruct the original text. This workflow is useful when you need to handle tokenization externally but still leverage the server for detokenization."
562
+ ]
563
+ },
564
+ {
565
+ "cell_type": "code",
566
+ "execution_count": null,
567
+ "metadata": {},
568
+ "outputs": [],
569
+ "source": [
570
+ "tokenizer_free_server_process, port = launch_server_cmd(\"\"\"\n",
571
+ "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct\n",
572
+ "\"\"\")\n",
573
+ "\n",
574
+ "wait_for_server(f\"http://localhost:{port}\", process=tokenizer_free_server_process)"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": null,
580
+ "metadata": {},
581
+ "outputs": [],
582
+ "source": [
583
+ "import requests\n",
584
+ "from sglang.utils import print_highlight\n",
585
+ "\n",
586
+ "base_url = f\"http://localhost:{port}\"\n",
587
+ "tokenize_url = f\"{base_url}/tokenize\"\n",
588
+ "detokenize_url = f\"{base_url}/detokenize\"\n",
589
+ "\n",
590
+ "model_name = \"qwen/qwen2.5-0.5b-instruct\"\n",
591
+ "input_text = \"SGLang provides efficient tokenization endpoints.\"\n",
592
+ "print_highlight(f\"Original Input Text:\\n'{input_text}'\")\n",
593
+ "\n",
594
+ "# --- tokenize the input text ---\n",
595
+ "tokenize_payload = {\n",
596
+ " \"model\": model_name,\n",
597
+ " \"prompt\": input_text,\n",
598
+ " \"add_special_tokens\": False,\n",
599
+ "}\n",
600
+ "try:\n",
601
+ " tokenize_response = requests.post(tokenize_url, json=tokenize_payload)\n",
602
+ " tokenize_response.raise_for_status()\n",
603
+ " tokenization_result = tokenize_response.json()\n",
604
+ " token_ids = tokenization_result.get(\"tokens\")\n",
605
+ "\n",
606
+ " if not token_ids:\n",
607
+ " raise ValueError(\"Tokenization returned empty tokens.\")\n",
608
+ "\n",
609
+ " print_highlight(f\"\\nTokenized Output (IDs):\\n{token_ids}\")\n",
610
+ " print_highlight(f\"Token Count: {tokenization_result.get('count')}\")\n",
611
+ " print_highlight(f\"Max Model Length: {tokenization_result.get('max_model_len')}\")\n",
612
+ "\n",
613
+ " # --- detokenize the obtained token IDs ---\n",
614
+ " detokenize_payload = {\n",
615
+ " \"model\": model_name,\n",
616
+ " \"tokens\": token_ids,\n",
617
+ " \"skip_special_tokens\": True,\n",
618
+ " }\n",
619
+ "\n",
620
+ " detokenize_response = requests.post(detokenize_url, json=detokenize_payload)\n",
621
+ " detokenize_response.raise_for_status()\n",
622
+ " detokenization_result = detokenize_response.json()\n",
623
+ " reconstructed_text = detokenization_result.get(\"text\")\n",
624
+ "\n",
625
+ " print_highlight(f\"\\nDetokenized Output (Text):\\n'{reconstructed_text}'\")\n",
626
+ "\n",
627
+ " if input_text == reconstructed_text:\n",
628
+ " print_highlight(\n",
629
+ " \"\\nRound Trip Successful: Original and reconstructed text match.\"\n",
630
+ " )\n",
631
+ " else:\n",
632
+ " print_highlight(\n",
633
+ " \"\\nRound Trip Mismatch: Original and reconstructed text differ.\"\n",
634
+ " )\n",
635
+ "\n",
636
+ "except requests.exceptions.RequestException as e:\n",
637
+ " print_highlight(f\"\\nHTTP Request Error: {e}\")\n",
638
+ "except Exception as e:\n",
639
+ " print_highlight(f\"\\nAn error occurred: {e}\")"
640
+ ]
641
+ },
642
+ {
643
+ "cell_type": "code",
644
+ "execution_count": null,
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": [
648
+ "terminate_process(tokenizer_free_server_process)"
649
+ ]
650
+ }
651
+ ],
652
+ "metadata": {
653
+ "language_info": {
654
+ "codemirror_mode": {
655
+ "name": "ipython",
656
+ "version": 3
657
+ },
658
+ "file_extension": ".py",
659
+ "mimetype": "text/x-python",
660
+ "name": "python",
661
+ "nbconvert_exporter": "python",
662
+ "pygments_lexer": "ipython3"
663
+ }
664
+ },
665
+ "nbformat": 4,
666
+ "nbformat_minor": 4
667
+ }
sglang/docs/basic_usage/offline_engine_api.ipynb ADDED
@@ -0,0 +1,235 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Offline Engine API\n",
8
+ "\n",
9
+ "SGLang provides a direct inference engine without the need for an HTTP server, especially for use cases where an additional HTTP server adds unnecessary complexity or overhead. Here are two general use cases:\n",
10
+ "\n",
11
+ "- Offline Batch Inference\n",
12
+ "- Custom Server on Top of the Engine\n",
13
+ "\n",
14
+ "This document focuses on offline batch inference, demonstrating four inference modes:\n",
15
+ "\n",
16
+ "- Non-streaming synchronous generation\n",
17
+ "- Streaming synchronous generation\n",
18
+ "- Non-streaming asynchronous generation\n",
19
+ "- Streaming asynchronous generation\n",
20
+ "\n",
21
+ "Additionally, you can easily build a custom server on top of the SGLang offline engine. A detailed example as a standalone Python script can be found in [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py).\n",
22
+ "\n"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "markdown",
27
+ "metadata": {},
28
+ "source": [
29
+ "## Nest Asyncio\n",
30
+ "Note that if you want to use the **Offline Engine** in IPython or other code with a nested event loop, you need to add the following code:\n",
31
+ "```python\n",
32
+ "import nest_asyncio\n",
33
+ "\n",
34
+ "nest_asyncio.apply()\n",
35
+ "\n",
36
+ "```"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "markdown",
41
+ "metadata": {},
42
+ "source": [
43
+ "## Advanced Usage\n",
44
+ "\n",
45
+ "The engine supports [VLM inference](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py) as well as [extracting hidden states](https://github.com/sgl-project/sglang/blob/main/examples/runtime/hidden_states).\n",
46
+ "\n",
47
+ "Please see [the examples](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine) for further use cases."
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {},
53
+ "source": [
54
+ "## Offline Batch Inference\n",
55
+ "\n",
56
+ "SGLang offline engine supports batch inference with efficient scheduling."
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "# launch the offline engine\n",
66
+ "import asyncio\n",
67
+ "\n",
68
+ "import sglang as sgl\n",
69
+ "import sglang.test.doc_patch\n",
70
+ "from sglang.utils import async_stream_and_merge, stream_and_merge\n",
71
+ "\n",
72
+ "llm = sgl.Engine(model_path=\"qwen/qwen2.5-0.5b-instruct\")"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "markdown",
77
+ "metadata": {},
78
+ "source": [
79
+ "### Non-streaming Synchronous Generation"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": null,
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "prompts = [\n",
89
+ " \"Hello, my name is\",\n",
90
+ " \"The president of the United States is\",\n",
91
+ " \"The capital of France is\",\n",
92
+ " \"The future of AI is\",\n",
93
+ "]\n",
94
+ "\n",
95
+ "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
96
+ "\n",
97
+ "outputs = llm.generate(prompts, sampling_params)\n",
98
+ "for prompt, output in zip(prompts, outputs):\n",
99
+ " print(\"===============================\")\n",
100
+ " print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "markdown",
105
+ "metadata": {},
106
+ "source": [
107
+ "### Streaming Synchronous Generation"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "metadata": {},
114
+ "outputs": [],
115
+ "source": [
116
+ "prompts = [\n",
117
+ " \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
118
+ " \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
119
+ " \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
120
+ "]\n",
121
+ "\n",
122
+ "sampling_params = {\n",
123
+ " \"temperature\": 0.2,\n",
124
+ " \"top_p\": 0.9,\n",
125
+ "}\n",
126
+ "\n",
127
+ "print(\"\\n=== Testing synchronous streaming generation with overlap removal ===\\n\")\n",
128
+ "\n",
129
+ "for prompt in prompts:\n",
130
+ " print(f\"Prompt: {prompt}\")\n",
131
+ " merged_output = stream_and_merge(llm, prompt, sampling_params)\n",
132
+ " print(\"Generated text:\", merged_output)\n",
133
+ " print()"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "markdown",
138
+ "metadata": {},
139
+ "source": [
140
+ "### Non-streaming Asynchronous Generation"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "prompts = [\n",
150
+ " \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
151
+ " \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
152
+ " \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
153
+ "]\n",
154
+ "\n",
155
+ "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
156
+ "\n",
157
+ "print(\"\\n=== Testing asynchronous batch generation ===\")\n",
158
+ "\n",
159
+ "\n",
160
+ "async def main():\n",
161
+ " outputs = await llm.async_generate(prompts, sampling_params)\n",
162
+ "\n",
163
+ " for prompt, output in zip(prompts, outputs):\n",
164
+ " print(f\"\\nPrompt: {prompt}\")\n",
165
+ " print(f\"Generated text: {output['text']}\")\n",
166
+ "\n",
167
+ "\n",
168
+ "asyncio.run(main())"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "markdown",
173
+ "metadata": {},
174
+ "source": [
175
+ "### Streaming Asynchronous Generation"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": null,
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "prompts = [\n",
185
+ " \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
186
+ " \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
187
+ " \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
188
+ "]\n",
189
+ "\n",
190
+ "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
191
+ "\n",
192
+ "print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n",
193
+ "\n",
194
+ "\n",
195
+ "async def main():\n",
196
+ " for prompt in prompts:\n",
197
+ " print(f\"\\nPrompt: {prompt}\")\n",
198
+ " print(\"Generated text: \", end=\"\", flush=True)\n",
199
+ "\n",
200
+ " # Replace direct calls to async_generate with our custom overlap-aware version\n",
201
+ " async for cleaned_chunk in async_stream_and_merge(llm, prompt, sampling_params):\n",
202
+ " print(cleaned_chunk, end=\"\", flush=True)\n",
203
+ "\n",
204
+ " print() # New line after each prompt\n",
205
+ "\n",
206
+ "\n",
207
+ "asyncio.run(main())"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": null,
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "llm.shutdown()"
217
+ ]
218
+ }
219
+ ],
220
+ "metadata": {
221
+ "language_info": {
222
+ "codemirror_mode": {
223
+ "name": "ipython",
224
+ "version": 3
225
+ },
226
+ "file_extension": ".py",
227
+ "mimetype": "text/x-python",
228
+ "name": "python",
229
+ "nbconvert_exporter": "python",
230
+ "pygments_lexer": "ipython3"
231
+ }
232
+ },
233
+ "nbformat": 4,
234
+ "nbformat_minor": 2
235
+ }
sglang/docs/basic_usage/ollama_api.md ADDED
@@ -0,0 +1,91 @@
1
+ # Ollama-Compatible API
2
+
3
+ SGLang provides Ollama API compatibility, allowing you to use the Ollama CLI and Python library with SGLang as the inference backend.
4
+
5
+ ## Prerequisites
6
+
7
+ ```bash
8
+ # Install the Ollama Python library (for Python client usage)
9
+ pip install ollama
10
+ ```
11
+
12
+ > **Note**: You don't need the Ollama server installed - SGLang acts as the backend. You only need the `ollama` CLI or Python library as the client.
13
+
14
+ ## Endpoints
15
+
16
+ | Endpoint | Method | Description |
17
+ |----------|--------|-------------|
18
+ | `/` | GET, HEAD | Health check for Ollama CLI |
19
+ | `/api/tags` | GET | List available models |
20
+ | `/api/chat` | POST | Chat completions (streaming & non-streaming) |
21
+ | `/api/generate` | POST | Text generation (streaming & non-streaming) |
22
+ | `/api/show` | POST | Model information |
23
+
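For clients that speak raw HTTP rather than the `ollama` library, `/api/chat` accepts the standard Ollama-style request body. A minimal non-streaming payload might look like the following (field names follow the upstream Ollama API; the model name must match what you passed to `--model`):

```json
{
  "model": "Qwen/Qwen2.5-1.5B-Instruct",
  "messages": [
    {"role": "user", "content": "Hello!"}
  ],
  "stream": false
}
```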
24
+ ## Quick Start
25
+
26
+ ### 1. Launch SGLang Server
27
+
28
+ ```bash
29
+ python -m sglang.launch_server \
30
+ --model Qwen/Qwen2.5-1.5B-Instruct \
31
+ --port 30001 \
32
+ --host 0.0.0.0
33
+ ```
34
+
35
+ > **Note**: The model name used with `ollama run` must match exactly what you passed to `--model`.
36
+
37
+ ### 2. Use Ollama CLI
38
+
39
+ ```bash
40
+ # List available models
41
+ OLLAMA_HOST=http://localhost:30001 ollama list
42
+
43
+ # Interactive chat
44
+ OLLAMA_HOST=http://localhost:30001 ollama run "Qwen/Qwen2.5-1.5B-Instruct"
45
+ ```
46
+
47
+ If connecting to a remote server behind a firewall:
48
+
49
+ ```bash
50
+ # SSH tunnel
51
+ ssh -L 30001:localhost:30001 user@gpu-server -N &
52
+
53
+ # Then use Ollama CLI as above
54
+ OLLAMA_HOST=http://localhost:30001 ollama list
55
+ ```
56
+
57
+ ### 3. Use Ollama Python Library
58
+
59
+ ```python
60
+ import ollama
61
+
62
+ client = ollama.Client(host='http://localhost:30001')
63
+
64
+ # Non-streaming
65
+ response = client.chat(
66
+ model='Qwen/Qwen2.5-1.5B-Instruct',
67
+ messages=[{'role': 'user', 'content': 'Hello!'}]
68
+ )
69
+ print(response['message']['content'])
70
+
71
+ # Streaming
72
+ stream = client.chat(
73
+ model='Qwen/Qwen2.5-1.5B-Instruct',
74
+ messages=[{'role': 'user', 'content': 'Tell me a story'}],
75
+ stream=True
76
+ )
77
+ for chunk in stream:
78
+ print(chunk['message']['content'], end='', flush=True)
79
+ ```
80
+
81
+ ## Smart Router
82
+
83
+ For intelligent routing between local Ollama (fast) and remote SGLang (powerful) using an LLM judge, see the [Smart Router documentation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/ollama/README.md).
84
+
85
+ ## Summary
86
+
87
+ | Component | Purpose |
88
+ |-----------|---------|
89
+ | **Ollama API** | Familiar CLI/API that developers already know |
90
+ | **SGLang Backend** | High-performance inference engine |
91
+ | **Smart Router** | Intelligent routing - fast local for simple tasks, powerful remote for complex tasks |
sglang/docs/basic_usage/openai_api.rst ADDED
@@ -0,0 +1,9 @@
1
+ OpenAI-Compatible APIs
2
+ ======================
3
+
4
+ .. toctree::
5
+ :maxdepth: 1
6
+
7
+ openai_api_completions.ipynb
8
+ openai_api_vision.ipynb
9
+ openai_api_embeddings.ipynb
sglang/docs/basic_usage/openai_api_completions.ipynb ADDED
@@ -0,0 +1,552 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# OpenAI APIs - Completions\n",
8
+ "\n",
9
+ "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
10
+ "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n",
11
+ "\n",
12
+ "This tutorial covers the following popular APIs:\n",
13
+ "\n",
14
+ "- `chat/completions`\n",
15
+ "- `completions`\n",
16
+ "\n",
17
+ "Check out other tutorials to learn about [vision APIs](openai_api_vision.ipynb) for vision-language models and [embedding APIs](openai_api_embeddings.ipynb) for embedding models."
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {},
23
+ "source": [
24
+ "## Launch A Server\n",
25
+ "\n",
26
+ "Launch the server in your terminal and wait for it to initialize."
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "from sglang.test.doc_patch import launch_server_cmd\n",
36
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
37
+ "\n",
38
+ "server_process, port = launch_server_cmd(\n",
39
+ " \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
40
+ ")\n",
41
+ "\n",
42
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)\n",
43
+ "print(f\"Server started on http://localhost:{port}\")"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "metadata": {},
49
+ "source": [
50
+ "## Chat Completions\n",
51
+ "\n",
52
+ "### Usage\n",
53
+ "\n",
54
+ "The server fully implements the OpenAI API.\n",
55
+ "It will automatically apply the chat template specified in the Hugging Face tokenizer, if one is available.\n",
56
+ "You can also specify a custom chat template with `--chat-template` when launching the server."
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "import openai\n",
66
+ "\n",
67
+ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
68
+ "\n",
69
+ "response = client.chat.completions.create(\n",
70
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
71
+ " messages=[\n",
72
+ " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
73
+ " ],\n",
74
+ " temperature=0,\n",
75
+ " max_tokens=64,\n",
76
+ ")\n",
77
+ "\n",
78
+ "print_highlight(f\"Response: {response}\")"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "markdown",
83
+ "metadata": {},
84
+ "source": [
85
+ "### Model Thinking/Reasoning Support\n",
86
+ "\n",
87
+ "Some models support internal reasoning or thinking processes that can be exposed in the API response. SGLang provides unified support for various reasoning models through the `chat_template_kwargs` parameter and compatible reasoning parsers.\n",
88
+ "\n",
89
+ "#### Supported Models and Configuration\n",
90
+ "\n",
91
+ "| Model Family | Chat Template Parameter | Reasoning Parser | Notes |\n",
92
+ "|--------------|------------------------|------------------|--------|\n",
93
+ "| DeepSeek-R1 (R1, R1-0528, R1-Distill) | `enable_thinking` | `--reasoning-parser deepseek-r1` | Standard reasoning models |\n",
94
+ "| DeepSeek-V3.1 | `thinking` | `--reasoning-parser deepseek-v3` | Hybrid model (thinking/non-thinking modes) |\n",
95
+ "| Qwen3 (standard) | `enable_thinking` | `--reasoning-parser qwen3` | Hybrid model (thinking/non-thinking modes) |\n",
96
+ "| Qwen3-Thinking | N/A (always enabled) | `--reasoning-parser qwen3-thinking` | Always generates reasoning |\n",
97
+ "| Kimi | N/A (always enabled) | `--reasoning-parser kimi` | Kimi thinking models |\n",
98
+ "| GPT-OSS | N/A (always enabled) | `--reasoning-parser gpt-oss` | GPT-OSS thinking models |\n",
99
+ "\n",
100
+ "#### Basic Usage\n",
101
+ "\n",
102
+ "To enable reasoning output, you need to:\n",
103
+ "1. Launch the server with the appropriate reasoning parser\n",
104
+ "2. Set the model-specific parameter in `chat_template_kwargs`\n",
105
+ "3. Optionally set `separate_reasoning: False` to return the reasoning inline instead of in a separate field (defaults to `True`)\n",
106
+ "\n",
107
+ "**Note for Qwen3-Thinking models:** These models always generate thinking content and do not support the `enable_thinking` parameter. Use `--reasoning-parser qwen3-thinking` or `--reasoning-parser qwen3` to parse the thinking content.\n"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "markdown",
112
+ "metadata": {},
113
+ "source": [
114
+ "#### Example: Qwen3 Models\n",
115
+ "\n",
116
+ "```python\n",
117
+ "# Launch server:\n",
118
+ "# python3 -m sglang.launch_server --model Qwen/Qwen3-4B --reasoning-parser qwen3\n",
119
+ "\n",
120
+ "from openai import OpenAI\n",
121
+ "\n",
122
+ "client = OpenAI(\n",
123
+ " api_key=\"EMPTY\",\n",
124
+ " base_url=f\"http://127.0.0.1:30000/v1\",\n",
125
+ ")\n",
126
+ "\n",
127
+ "model = \"Qwen/Qwen3-4B\"\n",
128
+ "messages = [{\"role\": \"user\", \"content\": \"How many r's are in 'strawberry'?\"}]\n",
129
+ "\n",
130
+ "response = client.chat.completions.create(\n",
131
+ " model=model,\n",
132
+ " messages=messages,\n",
133
+ " extra_body={\n",
134
+ " \"chat_template_kwargs\": {\"enable_thinking\": True},\n",
135
+ " \"separate_reasoning\": True\n",
136
+ " }\n",
137
+ ")\n",
138
+ "\n",
139
+ "print(\"Reasoning:\", response.choices[0].message.reasoning_content)\n",
140
+ "print(\"-\"*100)\n",
141
+ "print(\"Answer:\", response.choices[0].message.content)\n",
142
+ "```\n",
143
+ "\n",
144
+ "**Example Output:**\n",
145
+ "```\n",
146
+ "Reasoning: Okay, so the user is asking how many 'r's are in the word 'strawberry'. Let me think. First, I need to make sure I have the word spelled correctly. Strawberry... S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me break it down.\n",
147
+ "\n",
148
+ "Starting with 'strawberry', let's write out the letters one by one. S, T, R, A, W, B, E, R, R, Y. Hmm, wait, that's 10 letters. Let me check again. S (1), T (2), R (3), A (4), W (5), B (6), E (7), R (8), R (9), Y (10). So the letters are S-T-R-A-W-B-E-R-R-Y. \n",
149
+ "...\n",
150
+ "Therefore, the answer should be three R's in 'strawberry'. But I need to make sure I'm not counting any other letters as R. Let me check again. S, T, R, A, W, B, E, R, R, Y. No other R's. So three in total. Yeah, that seems right.\n",
151
+ "\n",
152
+ "----------------------------------------------------------------------------------------------------\n",
153
+ "Answer: The word \"strawberry\" contains **three** letters 'r'. Here's the breakdown:\n",
154
+ "\n",
155
+ "1. **S-T-R-A-W-B-E-R-R-Y** \n",
156
+ " - The **third letter** is 'R'. \n",
157
+ " - The **eighth and ninth letters** are also 'R's. \n",
158
+ "\n",
159
+ "Thus, the total count is **3**. \n",
160
+ "\n",
161
+ "**Answer:** 3.\n",
162
+ "```\n",
163
+ "\n",
164
+ "**Note:** Setting `\"enable_thinking\": False` (or omitting it) will result in `reasoning_content` being `None`. Qwen3-Thinking models always generate reasoning content and don't support the `enable_thinking` parameter.\n"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "markdown",
169
+ "metadata": {},
170
+ "source": [
171
+ "#### Logit Bias Support\n",
172
+ "\n",
173
+ "SGLang supports the `logit_bias` parameter for both chat completions and completions APIs. This parameter allows you to modify the likelihood of specific tokens being generated by adding bias values to their logits. The bias values can range from -100 to 100, where:\n",
174
+ "\n",
175
+ "- **Positive values** (0 to 100) increase the likelihood of the token being selected\n",
176
+ "- **Negative values** (-100 to 0) decrease the likelihood of the token being selected\n",
177
+ "- **-100** effectively prevents the token from being generated\n",
178
+ "\n",
179
+ "The `logit_bias` parameter accepts a dictionary where keys are token IDs (as strings) and values are the bias amounts (as floats).\n"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "markdown",
184
+ "metadata": {},
185
+ "source": [
186
+ "#### Getting Token IDs\n",
187
+ "\n",
188
+ "To use `logit_bias` effectively, you need to know the token IDs for the words you want to bias. Here's how to get token IDs:\n",
189
+ "\n",
190
+ "```python\n",
191
+ "# Get tokenizer to find token IDs\n",
192
+ "import tiktoken\n",
193
+ "\n",
194
+ "# For OpenAI models, use the appropriate encoding\n",
195
+ "tokenizer = tiktoken.encoding_for_model(\"gpt-3.5-turbo\") # or your model\n",
196
+ "\n",
197
+ "# Get token IDs for specific words\n",
198
+ "word = \"sunny\"\n",
199
+ "token_ids = tokenizer.encode(word)\n",
200
+ "print(f\"Token IDs for '{word}': {token_ids}\")\n",
201
+ "\n",
202
+ "# For SGLang models, you can access the tokenizer through the client\n",
203
+ "# and get token IDs for bias\n",
204
+ "```\n",
205
+ "\n",
206
+ "**Important:** The `logit_bias` parameter uses token IDs as string keys, not the actual words.\n"
207
+ ]
208
+ },
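The rules above (string token-ID keys, values clamped to the [-100, 100] range) can be sketched as a small helper. `make_logit_bias` is an illustrative name, not part of SGLang or the OpenAI client:

```python
def make_logit_bias(token_bias):
    """Build a logit_bias payload: string token-ID keys, values clamped to [-100, 100]."""
    return {str(tid): float(max(-100.0, min(100.0, b))) for tid, b in token_bias.items()}


# Out-of-range values are clamped to the documented limits.
bias = make_logit_bias({12345: 150, 67890: -250, 11111: 25})
print(bias)  # {'12345': 100.0, '67890': -100.0, '11111': 25.0}
```

The resulting dictionary can be passed directly as the `logit_bias` argument of a chat completions or completions request.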
209
+ {
210
+ "cell_type": "markdown",
211
+ "metadata": {},
212
+ "source": [
213
+ "#### Example: DeepSeek-V3 Models\n",
214
+ "\n",
215
+ "DeepSeek-V3 models support thinking mode through the `thinking` parameter:\n",
216
+ "\n",
217
+ "```python\n",
218
+ "# Launch server:\n",
219
+ "# python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.1 --tp 8 --reasoning-parser deepseek-v3\n",
220
+ "\n",
221
+ "from openai import OpenAI\n",
222
+ "\n",
223
+ "client = OpenAI(\n",
224
+ " api_key=\"EMPTY\",\n",
225
+ " base_url=f\"http://127.0.0.1:30000/v1\",\n",
226
+ ")\n",
227
+ "\n",
228
+ "model = \"deepseek-ai/DeepSeek-V3.1\"\n",
229
+ "messages = [{\"role\": \"user\", \"content\": \"How many r's are in 'strawberry'?\"}]\n",
230
+ "\n",
231
+ "response = client.chat.completions.create(\n",
232
+ " model=model,\n",
233
+ " messages=messages,\n",
234
+ " extra_body={\n",
235
+ " \"chat_template_kwargs\": {\"thinking\": True},\n",
236
+ " \"separate_reasoning\": True\n",
237
+ " }\n",
238
+ ")\n",
239
+ "\n",
240
+ "print(\"Reasoning:\", response.choices[0].message.reasoning_content)\n",
241
+ "print(\"-\"*100)\n",
242
+ "print(\"Answer:\", response.choices[0].message.content)\n",
243
+ "```\n",
244
+ "\n",
245
+ "**Example Output:**\n",
246
+ "```\n",
247
+ "Reasoning: First, the question is: \"How many r's are in 'strawberry'?\"\n",
248
+ "\n",
249
+ "I need to count the number of times the letter 'r' appears in the word \"strawberry\".\n",
250
+ "\n",
251
+ "Let me write out the word: S-T-R-A-W-B-E-R-R-Y.\n",
252
+ "\n",
253
+ "Now, I'll go through each letter and count the 'r's.\n",
254
+ "...\n",
255
+ "So, I have three 'r's in \"strawberry\".\n",
256
+ "\n",
257
+ "I should double-check. The word is spelled S-T-R-A-W-B-E-R-R-Y. The letters are at positions: 3, 8, and 9 are 'r's. Yes, that's correct.\n",
258
+ "\n",
259
+ "Therefore, the answer should be 3.\n",
260
+ "----------------------------------------------------------------------------------------------------\n",
261
+ "Answer: The word \"strawberry\" contains **3** instances of the letter \"r\". Here's a breakdown for clarity:\n",
262
+ "\n",
263
+ "- The word is spelled: S-T-R-A-W-B-E-R-R-Y\n",
264
+ "- The \"r\" appears at the 3rd, 8th, and 9th positions.\n",
265
+ "```\n",
266
+ "\n",
267
+ "**Note:** DeepSeek-V3 models use the `thinking` parameter (not `enable_thinking`) to control reasoning output.\n"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": null,
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": [
276
+ "# Example with logit_bias parameter\n",
277
+ "# Note: You need to get the actual token IDs from your tokenizer\n",
278
+ "# For demonstration, we'll use some example token IDs\n",
279
+ "response = client.chat.completions.create(\n",
280
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
281
+ " messages=[\n",
282
+ " {\"role\": \"user\", \"content\": \"Complete this sentence: The weather today is\"}\n",
283
+ " ],\n",
284
+ " temperature=0.7,\n",
285
+ " max_tokens=20,\n",
286
+ " logit_bias={\n",
287
+ " \"12345\": 50, # Increase likelihood of token ID 12345\n",
288
+ " \"67890\": -50, # Decrease likelihood of token ID 67890\n",
289
+ " \"11111\": 25, # Slightly increase likelihood of token ID 11111\n",
290
+ " },\n",
291
+ ")\n",
292
+ "\n",
293
+ "print_highlight(f\"Response with logit bias: {response.choices[0].message.content}\")"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "markdown",
298
+ "metadata": {},
299
+ "source": [
300
+ "### Parameters\n",
301
+ "\n",
302
+ "The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n",
303
+ "\n",
304
+ "SGLang extends the standard API with the `extra_body` parameter, allowing for additional customization. One key option within `extra_body` is `chat_template_kwargs`, which can be used to pass arguments to the chat template processor."
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": null,
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": [
313
+ "response = client.chat.completions.create(\n",
314
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
315
+ " messages=[\n",
316
+ " {\n",
317
+ " \"role\": \"system\",\n",
318
+ " \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n",
319
+ " },\n",
320
+ " {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n",
321
+ " {\n",
322
+ " \"role\": \"assistant\",\n",
323
+ " \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n",
324
+ " },\n",
325
+ " {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n",
326
+ " ],\n",
327
+ " temperature=0.3, # Lower temperature for more focused responses\n",
328
+ " max_tokens=128, # Reasonable length for a concise response\n",
329
+ " top_p=0.95, # Slightly higher for better fluency\n",
330
+ " presence_penalty=0.2, # Mild penalty to avoid repetition\n",
331
+ " frequency_penalty=0.2, # Mild penalty for more natural language\n",
332
+ " n=1, # Single response is usually more stable\n",
333
+ " seed=42, # Keep for reproducibility\n",
334
+ ")\n",
335
+ "\n",
336
+ "print_highlight(response.choices[0].message.content)"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "markdown",
341
+ "metadata": {},
342
+ "source": [
343
+ "Streaming mode is also supported."
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "markdown",
348
+ "metadata": {},
349
+ "source": [
350
+ "#### Logit Bias Support\n",
351
+ "\n",
352
+ "The completions API also supports the `logit_bias` parameter with the same functionality as described in the chat completions section above.\n"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": null,
358
+ "metadata": {},
359
+ "outputs": [],
360
+ "source": [
361
+ "stream = client.chat.completions.create(\n",
362
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
363
+ " messages=[{\"role\": \"user\", \"content\": \"Say this is a test\"}],\n",
364
+ " stream=True,\n",
365
+ ")\n",
366
+ "for chunk in stream:\n",
367
+ " if chunk.choices[0].delta.content is not None:\n",
368
+ " print(chunk.choices[0].delta.content, end=\"\")"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "markdown",
373
+ "metadata": {},
374
+ "source": [
375
+ "#### Returning Routed Experts (MoE Models)\n",
376
+ "\n",
377
+ "For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`."
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": null,
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": [
386
+ "# Example with logit_bias parameter for completions API\n",
387
+ "# Note: You need to get the actual token IDs from your tokenizer\n",
388
+ "# For demonstration, we'll use some example token IDs\n",
389
+ "response = client.completions.create(\n",
390
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
391
+ " prompt=\"The best programming language for AI is\",\n",
392
+ " temperature=0.7,\n",
393
+ " max_tokens=20,\n",
394
+ " logit_bias={\n",
395
+ " \"12345\": 75, # Strongly favor token ID 12345\n",
396
+ " \"67890\": -100, # Completely avoid token ID 67890\n",
397
+ " \"11111\": -25, # Slightly discourage token ID 11111\n",
398
+ " },\n",
399
+ ")\n",
400
+ "\n",
401
+ "print_highlight(f\"Response with logit bias: {response.choices[0].text}\")"
402
+ ]
403
+ },
404
+ {
405
+ "cell_type": "markdown",
406
+ "metadata": {},
407
+ "source": [
408
+ "## Completions\n",
409
+ "\n",
410
+ "### Usage\n",
411
+ "The Completions API is similar to the Chat Completions API, but it uses a raw `prompt` instead of the `messages` parameter and applies no chat template."
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": null,
417
+ "metadata": {},
418
+ "outputs": [],
419
+ "source": [
420
+ "response = client.completions.create(\n",
421
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
422
+ " prompt=\"List 3 countries and their capitals.\",\n",
423
+ " temperature=0,\n",
424
+ " max_tokens=64,\n",
425
+ " n=1,\n",
426
+ " stop=None,\n",
427
+ ")\n",
428
+ "\n",
429
+ "print_highlight(f\"Response: {response}\")"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "markdown",
434
+ "metadata": {},
435
+ "source": [
436
+ "### Parameters\n",
437
+ "\n",
438
+ "The completions API accepts OpenAI Completions API's parameters. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n",
439
+ "\n",
440
+ "Here is an example of a detailed completions request:"
441
+ ]
442
+ },
443
+ {
444
+ "cell_type": "code",
445
+ "execution_count": null,
446
+ "metadata": {},
447
+ "outputs": [],
448
+ "source": [
449
+ "response = client.completions.create(\n",
450
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
451
+ " prompt=\"Write a short story about a space explorer.\",\n",
452
+ " temperature=0.7, # Moderate temperature for creative writing\n",
453
+ " max_tokens=150, # Longer response for a story\n",
454
+ " top_p=0.9, # Balanced diversity in word choice\n",
455
+ " stop=[\"\\n\\n\", \"THE END\"], # Multiple stop sequences\n",
456
+ " presence_penalty=0.3, # Encourage novel elements\n",
457
+ " frequency_penalty=0.3, # Reduce repetitive phrases\n",
458
+ " n=1, # Generate one completion\n",
459
+ " seed=123, # For reproducible results\n",
460
+ ")\n",
461
+ "\n",
462
+ "print_highlight(f\"Response: {response}\")"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "markdown",
467
+ "metadata": {},
468
+ "source": [
469
+ "#### Returning Routed Experts (MoE Models)\n",
470
+ "\n",
471
+ "For MoE models, set `return_routed_experts: true` in `extra_body` to return expert routing data. Requires `--enable-return-routed-experts` server flag. The `routed_experts` field will be returned in the `sgl_ext` object on each choice, containing base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`."
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "markdown",
476
+ "metadata": {},
477
+ "source": [
478
+ "## Structured Outputs (JSON, Regex, EBNF)\n",
479
+ "\n",
480
+ "For OpenAI compatible structured outputs API, refer to [Structured Outputs](../advanced_features/structured_outputs.ipynb) for more details.\n"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "markdown",
485
+ "metadata": {},
486
+ "source": [
487
+ "## Using LoRA Adapters\n",
488
+ "\n",
489
+ "SGLang supports LoRA (Low-Rank Adaptation) adapters with OpenAI-compatible APIs. You can specify which adapter to use directly in the `model` parameter using the `base-model:adapter-name` syntax.\n",
490
+ "\n",
491
+ "**Server Setup:**\n",
492
+ "```bash\n",
493
+ "python -m sglang.launch_server \\\n",
494
+ " --model-path qwen/qwen2.5-0.5b-instruct \\\n",
495
+ " --enable-lora \\\n",
496
+ " --lora-paths adapter_a=/path/to/adapter_a adapter_b=/path/to/adapter_b\n",
497
+ "```\n",
498
+ "\n",
499
+ "For more details on LoRA serving configuration, see the [LoRA documentation](../advanced_features/lora.ipynb).\n",
500
+ "\n",
501
+ "**API Call:**\n",
502
+ "\n",
503
+ "(Recommended) Use the `model:adapter` syntax to specify which adapter to use:\n",
504
+ "```python\n",
505
+ "response = client.chat.completions.create(\n",
506
+ " model=\"qwen/qwen2.5-0.5b-instruct:adapter_a\", # ← base-model:adapter-name\n",
507
+ " messages=[{\"role\": \"user\", \"content\": \"Convert to SQL: show all users\"}],\n",
508
+ " max_tokens=50,\n",
509
+ ")\n",
510
+ "```\n",
511
+ "\n",
512
+ "**Backward Compatible: Using `extra_body`**\n",
513
+ "\n",
514
+ "The old `extra_body` method is still supported for backward compatibility:\n",
515
+ "```python\n",
516
+ "# Backward compatible method\n",
517
+ "response = client.chat.completions.create(\n",
518
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
519
+ " messages=[{\"role\": \"user\", \"content\": \"Convert to SQL: show all users\"}],\n",
520
+ " extra_body={\"lora_path\": \"adapter_a\"}, # ← old method\n",
521
+ " max_tokens=50,\n",
522
+ ")\n",
523
+ "```\n",
524
+ "**Note:** When both `model:adapter` and `extra_body[\"lora_path\"]` are specified, the `model:adapter` syntax takes precedence."
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "code",
529
+ "execution_count": null,
530
+ "metadata": {},
531
+ "outputs": [],
532
+ "source": [
533
+ "terminate_process(server_process)"
534
+ ]
535
+ }
536
+ ],
537
+ "metadata": {
538
+ "language_info": {
539
+ "codemirror_mode": {
540
+ "name": "ipython",
541
+ "version": 3
542
+ },
543
+ "file_extension": ".py",
544
+ "mimetype": "text/x-python",
545
+ "name": "python",
546
+ "nbconvert_exporter": "python",
547
+ "pygments_lexer": "ipython3"
548
+ }
549
+ },
550
+ "nbformat": 4,
551
+ "nbformat_minor": 2
552
+ }
sglang/docs/basic_usage/openai_api_embeddings.ipynb ADDED
@@ -0,0 +1,193 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# OpenAI APIs - Embedding\n",
8
+ "\n",
9
+ "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
10
+ "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/embeddings).\n",
11
+ "\n",
12
+ "This tutorial covers the embedding APIs for embedding models. For a list of the supported models see the [corresponding overview page](../supported_models/retrieval_ranking/embedding_models.md)\n"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "markdown",
17
+ "metadata": {},
18
+ "source": [
19
+ "## Launch A Server\n",
20
+ "\n",
21
+ "Launch the server in your terminal and wait for it to initialize. Remember to add `--is-embedding` to the command."
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "from sglang.test.doc_patch import launch_server_cmd\n",
31
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
32
+ "\n",
33
+ "embedding_process, port = launch_server_cmd(\"\"\"\n",
34
+ "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
35
+ " --host 0.0.0.0 --is-embedding --log-level warning\n",
36
+ "\"\"\")\n",
37
+ "\n",
38
+ "wait_for_server(f\"http://localhost:{port}\", process=embedding_process)"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "markdown",
43
+ "metadata": {},
44
+ "source": [
45
+ "## Using cURL"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "import subprocess, json\n",
55
+ "\n",
56
+ "text = \"Once upon a time\"\n",
57
+ "\n",
58
+ "curl_text = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
59
+ " -H \"Content-Type: application/json\" \\\n",
60
+ " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
61
+ "\n",
62
+ "result = subprocess.check_output(curl_text, shell=True)\n",
63
+ "\n",
64
+ "print(result)\n",
65
+ "\n",
66
+ "text_embedding = json.loads(result)[\"data\"][0][\"embedding\"]\n",
67
+ "\n",
68
+ "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "markdown",
73
+ "metadata": {},
74
+ "source": [
75
+ "## Using Python Requests"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "import requests\n",
85
+ "\n",
86
+ "text = \"Once upon a time\"\n",
87
+ "\n",
88
+ "response = requests.post(\n",
89
+ " f\"http://localhost:{port}/v1/embeddings\",\n",
90
+ " json={\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": text},\n",
91
+ ")\n",
92
+ "\n",
93
+ "text_embedding = response.json()[\"data\"][0][\"embedding\"]\n",
94
+ "\n",
95
+ "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "markdown",
100
+ "metadata": {},
101
+ "source": [
102
+ "## Using OpenAI Python Client"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": null,
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "import openai\n",
112
+ "\n",
113
+ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
114
+ "\n",
115
+ "# Text embedding example\n",
116
+ "response = client.embeddings.create(\n",
117
+ " model=\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\",\n",
118
+ " input=text,\n",
119
+ ")\n",
120
+ "\n",
121
+ "embedding = response.data[0].embedding[:10]\n",
122
+ "print_highlight(f\"Text embedding (first 10): {embedding}\")"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "markdown",
127
+ "metadata": {},
128
+ "source": [
129
+ "## Using Input IDs\n",
130
+ "\n",
131
+ "SGLang also supports `input_ids` as input to get the embedding."
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "import json\n",
141
+ "import os\n",
142
+ "from transformers import AutoTokenizer\n",
143
+ "\n",
144
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
145
+ "\n",
146
+ "tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\")\n",
147
+ "input_ids = tokenizer.encode(text)\n",
148
+ "\n",
149
+ "curl_ids = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
150
+ " -H \"Content-Type: application/json\" \\\n",
151
+ " -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
152
+ "\n",
153
+ "input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",
154
+ " 0\n",
155
+ "][\"embedding\"]\n",
156
+ "\n",
157
+ "print_highlight(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": null,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "terminate_process(embedding_process)"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "metadata": {},
172
+ "source": [
173
+ "## Multi-Modal Embedding Model\n",
174
+ "Please refer to [Multi-Modal Embedding Model](../supported_models/retrieval_ranking/embedding_models.md)"
175
+ ]
176
+ }
177
+ ],
178
+ "metadata": {
179
+ "language_info": {
180
+ "codemirror_mode": {
181
+ "name": "ipython",
182
+ "version": 3
183
+ },
184
+ "file_extension": ".py",
185
+ "mimetype": "text/x-python",
186
+ "name": "python",
187
+ "nbconvert_exporter": "python",
188
+ "pygments_lexer": "ipython3"
189
+ }
190
+ },
191
+ "nbformat": 4,
192
+ "nbformat_minor": 2
193
+ }
sglang/docs/basic_usage/openai_api_vision.ipynb ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# OpenAI APIs - Vision\n",
8
+ "\n",
9
+ "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
10
+ "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/vision).\n",
11
+ "This tutorial covers the vision APIs for vision language models.\n",
12
+ "\n",
13
+ "SGLang supports various vision language models such as Llama 3.2, LLaVA-OneVision, Qwen2.5-VL, Gemma3 and [more](../supported_models/text_generation/multimodal_language_models.md).\n",
14
+ "\n",
15
+ "As an alternative to the OpenAI API, you can also use the [SGLang offline engine](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py)."
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "markdown",
20
+ "metadata": {},
21
+ "source": [
22
+ "## Launch A Server\n",
23
+ "\n",
24
+ "Launch the server in your terminal and wait for it to initialize."
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "from sglang.test.doc_patch import launch_server_cmd\n",
34
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
35
+ "\n",
36
+ "vision_process, port = launch_server_cmd(\"\"\"\n",
37
+ "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n",
38
+ "\"\"\")\n",
39
+ "\n",
40
+ "wait_for_server(f\"http://localhost:{port}\", process=vision_process)"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "metadata": {},
46
+ "source": [
47
+ "## Using cURL\n",
48
+ "\n",
49
+ "Once the server is up, you can send test requests using curl or requests."
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "import subprocess\n",
59
+ "\n",
60
+ "curl_command = f\"\"\"\n",
61
+ "curl -s http://localhost:{port}/v1/chat/completions \\\\\n",
62
+ " -H \"Content-Type: application/json\" \\\\\n",
63
+ " -d '{{\n",
64
+ " \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
65
+ " \"messages\": [\n",
66
+ " {{\n",
67
+ " \"role\": \"user\",\n",
68
+ " \"content\": [\n",
69
+ " {{\n",
70
+ " \"type\": \"text\",\n",
71
+ " \"text\": \"What’s in this image?\"\n",
72
+ " }},\n",
73
+ " {{\n",
74
+ " \"type\": \"image_url\",\n",
75
+ " \"image_url\": {{\n",
76
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
77
+ " }}\n",
78
+ " }}\n",
79
+ " ]\n",
80
+ " }}\n",
81
+ " ],\n",
82
+ " \"max_tokens\": 300\n",
83
+ " }}'\n",
84
+ "\"\"\"\n",
85
+ "\n",
86
+ "response = subprocess.check_output(curl_command, shell=True).decode()\n",
87
+ "print_highlight(response)\n",
88
+ "\n",
89
+ "\n",
90
+ "response = subprocess.check_output(curl_command, shell=True).decode()\n",
91
+ "print_highlight(response)"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "markdown",
96
+ "metadata": {},
97
+ "source": [
98
+ "## Using Python Requests"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": null,
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "import requests\n",
108
+ "\n",
109
+ "url = f\"http://localhost:{port}/v1/chat/completions\"\n",
110
+ "\n",
111
+ "data = {\n",
112
+ " \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
113
+ " \"messages\": [\n",
114
+ " {\n",
115
+ " \"role\": \"user\",\n",
116
+ " \"content\": [\n",
117
+ " {\"type\": \"text\", \"text\": \"What’s in this image?\"},\n",
118
+ " {\n",
119
+ " \"type\": \"image_url\",\n",
120
+ " \"image_url\": {\n",
121
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
122
+ " },\n",
123
+ " },\n",
124
+ " ],\n",
125
+ " }\n",
126
+ " ],\n",
127
+ " \"max_tokens\": 300,\n",
128
+ "}\n",
129
+ "\n",
130
+ "response = requests.post(url, json=data)\n",
131
+ "print_highlight(response.text)"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "markdown",
136
+ "metadata": {},
137
+ "source": [
138
+ "## Using OpenAI Python Client"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": [
147
+ "from openai import OpenAI\n",
148
+ "\n",
149
+ "client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
150
+ "\n",
151
+ "response = client.chat.completions.create(\n",
152
+ " model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
153
+ " messages=[\n",
154
+ " {\n",
155
+ " \"role\": \"user\",\n",
156
+ " \"content\": [\n",
157
+ " {\n",
158
+ " \"type\": \"text\",\n",
159
+ " \"text\": \"What is in this image?\",\n",
160
+ " },\n",
161
+ " {\n",
162
+ " \"type\": \"image_url\",\n",
163
+ " \"image_url\": {\n",
164
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\"\n",
165
+ " },\n",
166
+ " },\n",
167
+ " ],\n",
168
+ " }\n",
169
+ " ],\n",
170
+ " max_tokens=300,\n",
171
+ ")\n",
172
+ "\n",
173
+ "print_highlight(response.choices[0].message.content)"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "metadata": {},
179
+ "source": [
180
+ "## Multiple-Image Inputs\n",
181
+ "\n",
182
+ "The server also supports multiple images and interleaved text and images if the model supports it."
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "from openai import OpenAI\n",
192
+ "\n",
193
+ "client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
194
+ "\n",
195
+ "response = client.chat.completions.create(\n",
196
+ " model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
197
+ " messages=[\n",
198
+ " {\n",
199
+ " \"role\": \"user\",\n",
200
+ " \"content\": [\n",
201
+ " {\n",
202
+ " \"type\": \"image_url\",\n",
203
+ " \"image_url\": {\n",
204
+ " \"url\": \"https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true\",\n",
205
+ " },\n",
206
+ " },\n",
207
+ " {\n",
208
+ " \"type\": \"image_url\",\n",
209
+ " \"image_url\": {\n",
210
+ " \"url\": \"https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png\",\n",
211
+ " },\n",
212
+ " },\n",
213
+ " {\n",
214
+ " \"type\": \"text\",\n",
215
+ " \"text\": \"I have two very different images. They are not related at all. \"\n",
216
+ " \"Please describe the first image in one sentence, and then describe the second image in another sentence.\",\n",
217
+ " },\n",
218
+ " ],\n",
219
+ " }\n",
220
+ " ],\n",
221
+ " temperature=0,\n",
222
+ ")\n",
223
+ "\n",
224
+ "print_highlight(response.choices[0].message.content)"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "terminate_process(vision_process)"
234
+ ]
235
+ }
236
+ ],
237
+ "metadata": {
238
+ "language_info": {
239
+ "codemirror_mode": {
240
+ "name": "ipython",
241
+ "version": 3
242
+ },
243
+ "file_extension": ".py",
244
+ "mimetype": "text/x-python",
245
+ "name": "python",
246
+ "nbconvert_exporter": "python",
247
+ "pygments_lexer": "ipython3"
248
+ }
249
+ },
250
+ "nbformat": 4,
251
+ "nbformat_minor": 2
252
+ }
sglang/docs/basic_usage/popular_model_usage.rst ADDED
@@ -0,0 +1,19 @@
1
+ Popular Model Usage (DeepSeek, GPT-OSS, GLM, Llama, MiniMax, Qwen, and more)
2
+ ============================================================================
3
+
4
+ For more usage examples and recipes, visit the `SGLang Cookbook <https://cookbook.sglang.io/>`_.
5
+
6
+ .. toctree::
7
+ :maxdepth: 1
8
+
9
+ deepseek_v3.md
10
+ deepseek_v32.md
11
+ glm45.md
12
+ glmv.md
13
+ gpt_oss.md
14
+ minimax_m2.md
15
+ qwen3.md
16
+ qwen3_5.md
17
+ qwen3_vl.md
18
+ deepseek_ocr.md
19
+ llama4.md
sglang/docs/basic_usage/qwen3.md ADDED
@@ -0,0 +1,39 @@
1
+ # Qwen3-Next Usage
2
+
3
+ SGLang has supported Qwen3-Next-80B-A3B-Instruct and Qwen3-Next-80B-A3B-Thinking since [this PR](https://github.com/sgl-project/sglang/pull/10233).
4
+
5
+ ## Launch Qwen3-Next with SGLang
6
+
7
+ To serve Qwen3-Next models on 4xH100/H200 GPUs:
8
+
9
+ ```bash
10
+ python3 -m sglang.launch_server --model Qwen/Qwen3-Next-80B-A3B-Instruct --tp 4
11
+ ```
12
+
13
+ ### Configuration Tips
14
+ - `--max-mamba-cache-size`: Increases the mamba cache space and the maximum number of running requests, at the cost of reduced KV cache space. Adjust it according to your workload.
15
+ - `--mamba-ssm-dtype`: `bfloat16` or `float32`. Use `bfloat16` to reduce the mamba cache size, or `float32` for more accurate results. The default is `float32`.
16
+ - `--mamba-full-memory-ratio`: The ratio of mamba state memory to full kv cache memory. The default is 0.9.
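For example, a launch command combining these flags might look like this (the cache size value is illustrative; tune it for your workload):

```bash
python3 -m sglang.launch_server \
  --model Qwen/Qwen3-Next-80B-A3B-Instruct \
  --tp 4 \
  --max-mamba-cache-size 512 \
  --mamba-ssm-dtype bfloat16 \
  --mamba-full-memory-ratio 0.9
```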
17
+
18
+ ### Mamba Radix Cache
19
+ SGLang supports prefix caching for Qwen3-Next models named `MambaRadixCache`, which improves inference speed by reusing computation results. There are two versions of `MambaRadixCache`:
20
+ - `no_buffer`: The default version, which other hybrid linear models also use. When it is enabled, SGLang automatically disables the overlap schedule for compatibility reasons.
21
+ - `extra_buffer`: An optimized version that is compatible with features like page size > 1, overlap schedule, and speculative decoding. It also supports storing the mamba state at branching positions. However, it requires two extra mamba state slots per request for a ping-pong buffer. To enable it, add the argument `--mamba-scheduler-strategy extra_buffer` when launching the server.
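To enable the `extra_buffer` strategy, the flag is simply appended to the usual launch command, e.g.:

```bash
python3 -m sglang.launch_server \
  --model Qwen/Qwen3-Next-80B-A3B-Instruct \
  --tp 4 \
  --mamba-scheduler-strategy extra_buffer
```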
22
+
23
+ ### EAGLE Speculative Decoding
24
+ **Description**: SGLang has supported Qwen3-Next models with [EAGLE speculative decoding](https://docs.sglang.io/advanced_features/speculative_decoding.html#EAGLE-Decoding).
25
+
26
+ **Usage**:
27
+ Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
28
+
29
+ ``` bash
30
+ python3 -m sglang.launch_server \
31
+ --model Qwen/Qwen3-Next-80B-A3B-Instruct \
32
+ --tp 4 \
33
+ --speculative-num-steps 3 \
34
+ --speculative-eagle-topk 1 \
35
+ --speculative-num-draft-tokens 4 \
36
+ --speculative-algo NEXTN
37
+ ```
38
+
39
+ Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/10233).
sglang/docs/basic_usage/qwen3_vl.md ADDED
@@ -0,0 +1,130 @@
1
+ # Qwen3-VL Usage
2
+
3
+ [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl)
4
+ is Alibaba’s latest multimodal large language model with strong text, vision, and reasoning capabilities.
5
+ SGLang supports the Qwen3-VL family of models with image and video input.
6
+
7
+ ## Launch commands for SGLang
8
+
9
+ Below are suggested launch commands tailored for different hardware and precision modes.
10
+
11
+ ### FP8 (quantised) mode
12
+ For memory-efficient, latency-optimized deployments (e.g., on H100 or H200) where the FP8 checkpoint is supported:
13
+ ```bash
14
+ python3 -m sglang.launch_server \
15
+ --model-path Qwen/Qwen3-VL-235B-A22B-Instruct-FP8 \
16
+ --tp 8 \
17
+ --ep 8 \
18
+ --host 0.0.0.0 \
19
+ --port 30000 \
20
+ --keep-mm-feature-on-device
21
+ ```
22
+
23
+ ### Non-FP8 (BF16 / full precision) mode
24
+ For deployments on A100/H100 where BF16 is used (or FP8 snapshot not used):
25
+ ```bash
26
+ python3 -m sglang.launch_server \
27
+ --model-path Qwen/Qwen3-VL-235B-A22B-Instruct \
28
+ --tp 8 \
29
+ --ep 8 \
30
+ --host 0.0.0.0 \
31
+ --port 30000 \
32
+ ```
33
+
34
+ ## Hardware-specific notes / recommendations
35
+
36
+ - On H100 with FP8: Use the FP8 checkpoint for best memory efficiency.
37
+ - On A100 / H100 with BF16 (non-FP8): It’s recommended to use `--mm-max-concurrent-calls` to control parallel throughput and GPU memory usage during image/video inference.
38
+ - On H200 & B200: The model can be run “out of the box”, supporting full context length plus concurrent image + video processing.
39
+
40
+ ## Sending Image/Video Requests
41
+
42
+ ### Image input:
43
+
44
+ ```python
45
+ import requests
46
+
47
+ url = "http://localhost:30000/v1/chat/completions"
48
+
49
+ data = {
50
+ "model": "Qwen/Qwen3-VL-30B-A3B-Instruct",
51
+ "messages": [
52
+ {
53
+ "role": "user",
54
+ "content": [
55
+ {"type": "text", "text": "What’s in this image?"},
56
+ {
57
+ "type": "image_url",
58
+ "image_url": {
59
+ "url": "https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true"
60
+ },
61
+ },
62
+ ],
63
+ }
64
+ ],
65
+ "max_tokens": 300,
66
+ }
67
+
68
+ response = requests.post(url, json=data)
69
+ print(response.text)
70
+ ```
71
+
72
+ ### Video Input:
73
+
74
+ ```python
75
+ import requests
76
+
77
+ url = "http://localhost:30000/v1/chat/completions"
78
+
79
+ data = {
80
+ "model": "Qwen/Qwen3-VL-30B-A3B-Instruct",
81
+ "messages": [
82
+ {
83
+ "role": "user",
84
+ "content": [
85
+ {"type": "text", "text": "What’s happening in this video?"},
86
+ {
87
+ "type": "video_url",
88
+ "video_url": {
89
+ "url": "https://github.com/sgl-project/sgl-test-files/raw/refs/heads/main/videos/jobs_presenting_ipod.mp4"
90
+ },
91
+ },
92
+ ],
93
+ }
94
+ ],
95
+ "max_tokens": 300,
96
+ }
97
+
98
+ response = requests.post(url, json=data)
99
+ print(response.text)
100
+ ```
101
+
102
+ ## Important Server Parameters and Flags
103
+
104
+ When launching the model server for **multimodal support**, you can use the following command-line arguments to fine-tune performance and behavior:
105
+
106
+ - `--mm-attention-backend`: Specifies the multimodal attention backend, e.g. `fa3` (FlashAttention 3).
107
+ - `--mm-max-concurrent-calls <value>`: Specifies the **maximum number of concurrent asynchronous multimodal data processing calls** allowed on the server. Use this to control parallel throughput and GPU memory usage during image/video inference.
108
+ - `--mm-per-request-timeout <seconds>`: Defines the **timeout duration (in seconds)** for each multimodal request. If a request exceeds this time limit (e.g., for very large video inputs), it will be automatically terminated.
109
+ - `--keep-mm-feature-on-device`: Instructs the server to **retain multimodal feature tensors on the GPU** after processing. This avoids device-to-host (D2H) memory copies and improves performance for repeated or high-frequency inference workloads.
110
+ - `SGLANG_USE_CUDA_IPC_TRANSPORT=1`: Enables shared-memory-pool-based CUDA IPC for multimodal data transport, which can significantly improve end-to-end latency.
111
+
112
+ ### Example usage with the above optimizations:
113
+ ```bash
114
+ SGLANG_USE_CUDA_IPC_TRANSPORT=1 \
115
+ SGLANG_VLM_CACHE_SIZE_MB=0 \
116
+ python -m sglang.launch_server \
117
+ --model-path Qwen/Qwen3-VL-235B-A22B-Instruct \
118
+ --host 0.0.0.0 \
119
+ --port 30000 \
120
+ --trust-remote-code \
121
+ --tp-size 8 \
122
+ --enable-cache-report \
123
+ --log-level info \
124
+ --max-running-requests 64 \
125
+ --mem-fraction-static 0.65 \
126
+ --chunked-prefill-size 8192 \
127
+ --attention-backend fa3 \
128
+ --mm-attention-backend fa3 \
129
+ --enable-metrics
130
+ ```
sglang/docs/basic_usage/sampling_params.md ADDED
@@ -0,0 +1,347 @@
1
+ # Sampling Parameters
2
+
3
+ This doc describes the sampling parameters of the SGLang Runtime. It is the low-level endpoint of the runtime.
4
+ If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](openai_api_completions.ipynb).
5
+
6
+ ## `/generate` Endpoint
7
+
8
+ The `/generate` endpoint accepts the following parameters in JSON format. For detailed usage, see the [native API doc](native_api.ipynb). The object is defined at `io_struct.py::GenerateReqInput`. You can also read the source code to find more arguments and docs.
9
+
10
+ | Argument | Type/Default | Description |
11
+ |----------------------------|------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
12
+ | text | `Optional[Union[List[str], str]] = None` | The input prompt. Can be a single prompt or a batch of prompts. |
13
+ | input_ids | `Optional[Union[List[List[int]], List[int]]] = None` | The token IDs for text; one can specify either text or input_ids. |
14
+ | input_embeds | `Optional[Union[List[List[List[float]]], List[List[float]]]] = None` | The embeddings for input_ids; one can specify either text, input_ids, or input_embeds. |
15
+ | image_data | `Optional[Union[List[List[ImageDataItem]], List[ImageDataItem], ImageDataItem]] = None` | The image input. Supports three formats: (1) **Raw images**: PIL Image, file path, URL, or base64 string; (2) **Processor output**: Dict with `format: "processor_output"` containing HuggingFace processor outputs; (3) **Precomputed embeddings**: Dict with `format: "precomputed_embedding"` and `feature` containing pre-calculated visual embeddings. Can be a single image, list of images, or list of lists of images. See [Multimodal Input Formats](#multimodal-input-formats) for details. |
16
+ | audio_data | `Optional[Union[List[AudioDataItem], AudioDataItem]] = None` | The audio input. Can be a file name, URL, or base64 encoded string. |
17
+ | sampling_params | `Optional[Union[List[Dict], Dict]] = None` | The sampling parameters as described in the sections below. |
18
+ | rid | `Optional[Union[List[str], str]] = None` | The request ID. |
19
+ | return_logprob | `Optional[Union[List[bool], bool]] = None` | Whether to return log probabilities for tokens. |
20
+ | logprob_start_len | `Optional[Union[List[int], int]] = None` | If return_logprob, the start location in the prompt for returning logprobs. Default is "-1", which returns logprobs for output tokens only. |
21
+ | top_logprobs_num | `Optional[Union[List[int], int]] = None` | If return_logprob, the number of top logprobs to return at each position. |
22
+ | token_ids_logprob | `Optional[Union[List[List[int]], List[int]]] = None` | If return_logprob, the token IDs to return logprob for. |
23
+ | return_text_in_logprobs | `bool = False` | Whether to detokenize tokens in text in the returned logprobs. |
24
+ | stream | `bool = False` | Whether to stream output. |
25
+ | lora_path | `Optional[Union[List[Optional[str]], Optional[str]]] = None` | The path to the LoRA. |
26
+ | custom_logit_processor | `Optional[Union[List[Optional[str]], str]] = None` | Custom logit processor for advanced sampling control. Must be a serialized instance of `CustomLogitProcessor` using its `to_str()` method. For usage see below. |
27
+ | return_hidden_states | `Union[List[bool], bool] = False` | Whether to return hidden states. |
28
+ | return_routed_experts | `bool = False` | Whether to return routed experts for MoE models. Requires `--enable-return-routed-experts` server flag. Returns base64-encoded int32 expert IDs as a flattened array with logical shape `[num_tokens, num_layers, top_k]`. |
29
+
30
+ ## Sampling parameters
31
+
32
+ The object is defined at `sampling_params.py::SamplingParams`. You can also read the source code to find more arguments and docs.
33
+
34
+ ### Note on defaults
35
+
36
+ By default, SGLang initializes several sampling parameters from the model's `generation_config.json` (when the server is launched with `--sampling-defaults model`, which is the default). To use SGLang/OpenAI constant defaults instead, start the server with `--sampling-defaults openai`. You can always override any parameter per request via `sampling_params`.
37
+
38
+ ```bash
39
+ # Use model-provided defaults from generation_config.json (default behavior)
40
+ python -m sglang.launch_server --model-path <MODEL> --sampling-defaults model
41
+
42
+ # Use SGLang/OpenAI constant defaults instead
43
+ python -m sglang.launch_server --model-path <MODEL> --sampling-defaults openai
44
+ ```
45
+
46
+ ### Core parameters
47
+
48
+ | Argument | Type/Default | Description |
49
+ |-----------------|----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
50
+ | max_new_tokens | `int = 128` | The maximum output length measured in tokens. |
51
+ | stop | `Optional[Union[str, List[str]]] = None` | One or multiple [stop words](https://platform.openai.com/docs/api-reference/chat/create#chat-create-stop). Generation will stop if one of these words is sampled. |
52
+ | stop_token_ids | `Optional[List[int]] = None` | Provide stop words in the form of token IDs. Generation will stop if one of these token IDs is sampled. |
53
+ | stop_regex | `Optional[Union[str, List[str]]] = None` | Stop when hitting any of the regex patterns in this list |
54
+ | temperature | `float (model default; fallback 1.0)` | [Temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature) when sampling the next token. `temperature = 0` corresponds to greedy sampling, a higher temperature leads to more diversity. |
55
+ | top_p | `float (model default; fallback 1.0)` | [Top-p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) selects tokens from the smallest sorted set whose cumulative probability exceeds `top_p`. When `top_p = 1`, this reduces to unrestricted sampling from all tokens. |
56
+ | top_k | `int (model default; fallback -1)` | [Top-k](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#predictability_vs_creativity) randomly selects from the `k` highest-probability tokens. |
57
+ | min_p | `float (model default; fallback 0.0)` | [Min-p](https://github.com/huggingface/transformers/issues/27670) samples from tokens with probability larger than `min_p * highest_token_probability`. |
58
+
59
+ ### Penalizers
60
+
61
+ | Argument | Type/Default | Description |
62
+ |--------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
63
+ | frequency_penalty | `float = 0.0` | Penalizes tokens based on their frequency in generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of penalization grows linearly with each appearance of a token. |
64
+ | presence_penalty | `float = 0.0` | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occurred. |
65
+ | repetition_penalty | `float = 1.0` | Scales the logits of previously generated tokens to discourage (values > 1) or encourage (values < 1) repetition. Valid range is `[0, 2]`; `1.0` leaves probabilities unchanged. |
66
+ | min_new_tokens | `int = 0` | Forces the model to generate at least `min_new_tokens` until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. |
67
+
68
+ ### Constrained decoding
69
+
70
+ Please refer to our dedicated guide on [constrained decoding](../advanced_features/structured_outputs.ipynb) for the following parameters.
71
+
72
+ | Argument | Type/Default | Description |
73
+ |-----------------|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
74
+ | json_schema | `Optional[str] = None` | JSON schema for structured outputs. |
75
+ | regex | `Optional[str] = None` | Regex for structured outputs. |
76
+ | ebnf | `Optional[str] = None` | EBNF for structured outputs. |
77
+ | structural_tag | `Optional[str] = None` | The structural tag for structured outputs. |
78
+
79
+ ### Other options
80
+
81
+ | Argument | Type/Default | Description |
82
+ |-------------------------------|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
83
+ | n | `int = 1` | Specifies the number of output sequences to generate per request. (Generating multiple outputs in one request (n > 1) is discouraged; repeating the same prompts several times offers better control and efficiency.) |
84
+ | ignore_eos | `bool = False` | Don't stop generation when EOS token is sampled. |
85
+ | skip_special_tokens | `bool = True` | Remove special tokens during decoding. |
86
+ | spaces_between_special_tokens | `bool = True` | Whether or not to add spaces between special tokens during detokenization. |
87
+ | no_stop_trim | `bool = False` | Don't trim stop words or EOS token from the generated text. |
88
+ | custom_params | `Optional[List[Optional[Dict[str, Any]]]] = None` | Used when employing `CustomLogitProcessor`. For usage, see below. |
89
+
90
+ ## Examples
91
+
92
+ ### Normal
93
+
94
+ Launch a server:
95
+
96
+ ```bash
97
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
98
+ ```
99
+
100
+ Send a request:
101
+
102
+ ```python
103
+ import requests
104
+
105
+ response = requests.post(
106
+ "http://localhost:30000/generate",
107
+ json={
108
+ "text": "The capital of France is",
109
+ "sampling_params": {
110
+ "temperature": 0,
111
+ "max_new_tokens": 32,
112
+ },
113
+ },
114
+ )
115
+ print(response.json())
116
+ ```
117
+
118
+ Detailed example in [send request](./send_request.ipynb).
119
+
120
+ ### Streaming
121
+
122
+ Send a request and stream the output:
123
+
124
+ ```python
125
+ import requests, json
126
+
127
+ response = requests.post(
128
+ "http://localhost:30000/generate",
129
+ json={
130
+ "text": "The capital of France is",
131
+ "sampling_params": {
132
+ "temperature": 0,
133
+ "max_new_tokens": 32,
134
+ },
135
+ "stream": True,
136
+ },
137
+ stream=True,
138
+ )
139
+
140
+ prev = 0
141
+ for chunk in response.iter_lines(decode_unicode=False):
142
+ chunk = chunk.decode("utf-8")
143
+ if chunk and chunk.startswith("data:"):
144
+ if chunk == "data: [DONE]":
145
+ break
146
+ data = json.loads(chunk[5:].strip("\n"))
147
+ output = data["text"].strip()
148
+ print(output[prev:], end="", flush=True)
149
+ prev = len(output)
150
+ print("")
151
+ ```
152
+
153
+ Detailed example in [openai compatible api](openai_api_completions.ipynb).
154
+
155
+ ### Multimodal
156
+
157
+ Launch a server:
158
+
159
+ ```bash
160
+ python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov
161
+ ```
162
+
163
+ Download an image:
164
+
165
+ ```bash
166
+ curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/examples/assets/example_image.png?raw=true
167
+ ```
168
+
169
+ Send a request:
170
+
171
+ ```python
172
+ import requests
173
+
174
+ response = requests.post(
175
+ "http://localhost:30000/generate",
176
+ json={
177
+ "text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
178
+ "<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n"
179
+ "<|im_start|>assistant\n",
180
+ "image_data": "example_image.png",
181
+ "sampling_params": {
182
+ "temperature": 0,
183
+ "max_new_tokens": 32,
184
+ },
185
+ },
186
+ )
187
+ print(response.json())
188
+ ```
189
+
190
+ The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
191
+
192
+ Streaming is supported in a similar manner as [above](#streaming).
193
+
194
+ Detailed example in [OpenAI API Vision](openai_api_vision.ipynb).
195
+
196
+ ### Structured Outputs (JSON, Regex, EBNF)
197
+
198
+ You can specify a JSON schema, regular expression or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.
199
+
200
+ SGLang supports two grammar backends:
201
+
202
+ - [XGrammar](https://github.com/mlc-ai/xgrammar) (default): Supports JSON schema, regular expression, and EBNF constraints.
203
+ - XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md).
204
+ - [Outlines](https://github.com/dottxt-ai/outlines): Supports JSON schema and regular expression constraints.
205
+
206
+ If you want to use the Outlines backend instead, pass the `--grammar-backend outlines` flag:
207
+
208
+ ```bash
209
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
210
+ --port 30000 --host 0.0.0.0 --grammar-backend [xgrammar|outlines] # xgrammar or outlines (default: xgrammar)
211
+ ```
212
+
213
+ ```python
214
+ import json
215
+ import requests
216
+
217
+ json_schema = json.dumps({
218
+ "type": "object",
219
+ "properties": {
220
+ "name": {"type": "string", "pattern": "^[\\w]+$"},
221
+ "population": {"type": "integer"},
222
+ },
223
+ "required": ["name", "population"],
224
+ })
225
+
226
+ # JSON (works with both Outlines and XGrammar)
227
+ response = requests.post(
228
+ "http://localhost:30000/generate",
229
+ json={
230
+ "text": "Here is the information of the capital of France in the JSON format.\n",
231
+ "sampling_params": {
232
+ "temperature": 0,
233
+ "max_new_tokens": 64,
234
+ "json_schema": json_schema,
235
+ },
236
+ },
237
+ )
238
+ print(response.json())
239
+
240
+ # Regular expression (Outlines backend only)
241
+ response = requests.post(
242
+ "http://localhost:30000/generate",
243
+ json={
244
+ "text": "Paris is the capital of",
245
+ "sampling_params": {
246
+ "temperature": 0,
247
+ "max_new_tokens": 64,
248
+ "regex": "(France|England)",
249
+ },
250
+ },
251
+ )
252
+ print(response.json())
253
+
254
+ # EBNF (XGrammar backend only)
255
+ response = requests.post(
256
+ "http://localhost:30000/generate",
257
+ json={
258
+ "text": "Write a greeting.",
259
+ "sampling_params": {
260
+ "temperature": 0,
261
+ "max_new_tokens": 64,
262
+ "ebnf": 'root ::= "Hello" | "Hi" | "Hey"',
263
+ },
264
+ },
265
+ )
266
+ print(response.json())
267
+ ```
268
+
269
+ Detailed example in [structured outputs](../advanced_features/structured_outputs.ipynb).
270
+
271
+ ### Custom logit processor
272
+
273
+ Launch a server with `--enable-custom-logit-processor` flag on.
274
+
275
+ ```bash
276
+ python -m sglang.launch_server \
277
+ --model-path meta-llama/Meta-Llama-3-8B-Instruct \
278
+ --port 30000 \
279
+ --enable-custom-logit-processor
280
+ ```
281
+
282
+ Define a custom logit processor that will always sample a specific token id.
283
+
284
+ ```python
285
+ from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
286
+
287
+ class DeterministicLogitProcessor(CustomLogitProcessor):
288
+ """A dummy logit processor that changes the logits to always
289
+ sample the given token id.
290
+ """
291
+
292
+ def __call__(self, logits, custom_param_list):
293
+ # Check that the number of logits matches the number of custom parameters
294
+ assert logits.shape[0] == len(custom_param_list)
295
+ key = "token_id"
296
+
297
+ for i, param_dict in enumerate(custom_param_list):
298
+ # Mask all other tokens
299
+ logits[i, :] = -float("inf")
300
+ # Assign highest probability to the specified token
301
+ logits[i, param_dict[key]] = 0.0
302
+ return logits
303
+ ```
304
+
305
+ Send a request:
306
+
307
+ ```python
308
+ import requests
309
+
310
+ response = requests.post(
311
+ "http://localhost:30000/generate",
312
+ json={
313
+ "text": "The capital of France is",
314
+ "custom_logit_processor": DeterministicLogitProcessor().to_str(),
315
+ "sampling_params": {
316
+ "temperature": 0.0,
317
+ "max_new_tokens": 32,
318
+ "custom_params": {"token_id": 5},
319
+ },
320
+ },
321
+ )
322
+ print(response.json())
323
+ ```
324
+
325
+ Send an OpenAI chat completion request:
326
+
327
+ ```python
328
+ import openai
329
+ from sglang.utils import print_highlight
330
+
331
+ client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None")
332
+
333
+ response = client.chat.completions.create(
334
+ model="meta-llama/Meta-Llama-3-8B-Instruct",
335
+ messages=[
336
+ {"role": "user", "content": "List 3 countries and their capitals."},
337
+ ],
338
+ temperature=0.0,
339
+ max_tokens=32,
340
+ extra_body={
341
+ "custom_logit_processor": DeterministicLogitProcessor().to_str(),
342
+ "custom_params": {"token_id": 5},
343
+ },
344
+ )
345
+
346
+ print_highlight(f"Response: {response}")
347
+ ```
sglang/docs/basic_usage/send_request.ipynb ADDED
@@ -0,0 +1,251 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Sending Requests\n",
8
+ "This notebook provides a quick-start guide to use SGLang in chat completions after installation. Once your server is running, API documentation is available at `http://localhost:30000/docs` (Swagger UI), `http://localhost:30000/redoc` (ReDoc), or `http://localhost:30000/openapi.json` (OpenAPI spec, useful for AI agents). Replace `30000` with your port if using a different one.\n",
9
+ "\n",
10
+ "- For Vision Language Models, see [OpenAI APIs - Vision](openai_api_vision.ipynb).\n",
11
+ "- For Embedding Models, see [OpenAI APIs - Embedding](openai_api_embeddings.ipynb) and [Encode (embedding model)](native_api.html#Encode-(embedding-model)).\n",
12
+ "- For Reward Models, see [Classify (reward model)](native_api.html#Classify-(reward-model))."
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "markdown",
17
+ "metadata": {},
18
+ "source": [
19
+ "## Launch A Server"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "from sglang.test.doc_patch import launch_server_cmd\n",
29
+ "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
30
+ "\n",
31
+ "# This is equivalent to running the following command in your terminal\n",
32
+ "# python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\n",
33
+ "\n",
34
+ "server_process, port = launch_server_cmd(\"\"\"\n",
35
+ "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
36
+ " --host 0.0.0.0 --log-level warning\n",
37
+ "\"\"\")\n",
38
+ "\n",
39
+ "wait_for_server(f\"http://localhost:{port}\", process=server_process)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "markdown",
44
+ "metadata": {},
45
+ "source": [
46
+ "## Using cURL\n"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "import subprocess, json\n",
56
+ "\n",
57
+ "curl_command = f\"\"\"\n",
58
+ "curl -s http://localhost:{port}/v1/chat/completions \\\n",
59
+ " -H \"Content-Type: application/json\" \\\n",
60
+ " -d '{{\"model\": \"qwen/qwen2.5-0.5b-instruct\", \"messages\": [{{\"role\": \"user\", \"content\": \"What is the capital of France?\"}}]}}'\n",
61
+ "\"\"\"\n",
62
+ "\n",
63
+ "response = json.loads(subprocess.check_output(curl_command, shell=True))\n",
64
+ "print_highlight(response)"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "markdown",
69
+ "metadata": {},
70
+ "source": [
71
+ "## Using Python Requests"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "import requests\n",
81
+ "\n",
82
+ "url = f\"http://localhost:{port}/v1/chat/completions\"\n",
83
+ "\n",
84
+ "data = {\n",
85
+ " \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
86
+ " \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
87
+ "}\n",
88
+ "\n",
89
+ "response = requests.post(url, json=data)\n",
90
+ "print_highlight(response.json())"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "markdown",
95
+ "metadata": {},
96
+ "source": [
97
+ "## Using OpenAI Python Client"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "import openai\n",
107
+ "\n",
108
+ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
109
+ "\n",
110
+ "response = client.chat.completions.create(\n",
111
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
112
+ " messages=[\n",
113
+ " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
114
+ " ],\n",
115
+ " temperature=0,\n",
116
+ " max_tokens=64,\n",
117
+ ")\n",
118
+ "print_highlight(response)"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "metadata": {},
124
+ "source": [
125
+ "### Streaming"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": null,
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "import openai\n",
135
+ "\n",
136
+ "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
137
+ "\n",
138
+ "# Use stream=True for streaming responses\n",
139
+ "response = client.chat.completions.create(\n",
140
+ " model=\"qwen/qwen2.5-0.5b-instruct\",\n",
141
+ " messages=[\n",
142
+ " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
143
+ " ],\n",
144
+ " temperature=0,\n",
145
+ " max_tokens=64,\n",
146
+ " stream=True,\n",
147
+ ")\n",
148
+ "\n",
149
+ "# Handle the streaming output\n",
150
+ "for chunk in response:\n",
151
+ " if chunk.choices[0].delta.content:\n",
152
+ " print(chunk.choices[0].delta.content, end=\"\", flush=True)"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "markdown",
157
+ "metadata": {},
158
+ "source": [
159
+ "## Using Native Generation APIs\n",
160
+ "\n",
161
+ "You can also use the native `/generate` endpoint with requests, which provides more flexibility. An API reference is available at [Sampling Parameters](sampling_params.md)."
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": null,
167
+ "metadata": {},
168
+ "outputs": [],
169
+ "source": [
170
+ "import requests\n",
171
+ "\n",
172
+ "response = requests.post(\n",
173
+ " f\"http://localhost:{port}/generate\",\n",
174
+ " json={\n",
175
+ " \"text\": \"The capital of France is\",\n",
176
+ " \"sampling_params\": {\n",
177
+ " \"temperature\": 0,\n",
178
+ " \"max_new_tokens\": 32,\n",
179
+ " },\n",
180
+ " },\n",
181
+ ")\n",
182
+ "\n",
183
+ "print_highlight(response.json())"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "markdown",
188
+ "metadata": {},
189
+ "source": [
190
+ "### Streaming"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "import requests, json\n",
200
+ "\n",
201
+ "response = requests.post(\n",
202
+ " f\"http://localhost:{port}/generate\",\n",
203
+ " json={\n",
204
+ " \"text\": \"The capital of France is\",\n",
205
+ " \"sampling_params\": {\n",
206
+ " \"temperature\": 0,\n",
207
+ " \"max_new_tokens\": 32,\n",
208
+ " },\n",
209
+ " \"stream\": True,\n",
210
+ " },\n",
211
+ " stream=True,\n",
212
+ ")\n",
213
+ "\n",
214
+ "prev = 0\n",
215
+ "for chunk in response.iter_lines(decode_unicode=False):\n",
216
+ " chunk = chunk.decode(\"utf-8\")\n",
217
+ " if chunk and chunk.startswith(\"data:\"):\n",
218
+ " if chunk == \"data: [DONE]\":\n",
219
+ " break\n",
220
+ " data = json.loads(chunk[5:].strip(\"\\n\"))\n",
221
+ " output = data[\"text\"]\n",
222
+ " print(output[prev:], end=\"\", flush=True)\n",
223
+ " prev = len(output)"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": null,
229
+ "metadata": {},
230
+ "outputs": [],
231
+ "source": [
232
+ "terminate_process(server_process)"
233
+ ]
234
+ }
235
+ ],
236
+ "metadata": {
237
+ "language_info": {
238
+ "codemirror_mode": {
239
+ "name": "ipython",
240
+ "version": 3
241
+ },
242
+ "file_extension": ".py",
243
+ "mimetype": "text/x-python",
244
+ "name": "python",
245
+ "nbconvert_exporter": "python",
246
+ "pygments_lexer": "ipython3"
247
+ }
248
+ },
249
+ "nbformat": 4,
250
+ "nbformat_minor": 2
251
+ }
sglang/docs/developer_guide/bench_serving.md ADDED
@@ -0,0 +1,355 @@
1
+ # Bench Serving Guide
2
+
3
+ This guide explains how to benchmark online serving throughput and latency using `python -m sglang.bench_serving`. It supports multiple inference backends via OpenAI-compatible and native endpoints, and produces both console metrics and optional JSONL outputs.
4
+
5
+ ### What it does
6
+
7
+ - Generates synthetic or dataset-driven prompts and submits them to a target serving endpoint
8
+ - Measures throughput, time-to-first-token (TTFT), inter-token latency (ITL), per-request end-to-end latency, and more
9
+ - Supports streaming or non-streaming modes, rate control, and concurrency limits
10
+
11
+ ### Supported backends and endpoints
12
+
13
+ - `sglang` / `sglang-native`: `POST /generate`
14
+ - `sglang-oai`, `vllm`, `lmdeploy`: `POST /v1/completions`
15
+ - `sglang-oai-chat`, `vllm-chat`, `lmdeploy-chat`: `POST /v1/chat/completions`
16
+ - `trt` (TensorRT-LLM): `POST /v2/models/ensemble/generate_stream`
17
+ - `gserver`: Custom server (Not Implemented yet in this script)
18
+ - `truss`: `POST /v1/models/model:predict`
19
+
20
+ If `--base-url` is provided, requests are sent to it. Otherwise, `--host` and `--port` are used. When `--model` is not provided, the script will attempt to query `GET /v1/models` for an available model ID (OpenAI-compatible endpoints).
21
+
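The model-autodetection fallback described above can be reproduced by hand. A minimal sketch of the response parsing, assuming an OpenAI-compatible `GET /v1/models` body (the model ID shown is hypothetical):

```python
import json

def first_model_id(models_json: str) -> str:
    # Parse a GET /v1/models response body and return the first model ID,
    # mirroring the fallback bench_serving uses when --model is omitted.
    return json.loads(models_json)["data"][0]["id"]

# Hypothetical response body from an OpenAI-compatible server:
body = '{"object": "list", "data": [{"id": "meta-llama/Llama-3.1-8B-Instruct", "object": "model"}]}'
print(first_model_id(body))
```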
22
+ ### Prerequisites
23
+
24
+ - Python 3.8+
25
+ - Dependencies typically used by this script: `aiohttp`, `numpy`, `requests`, `tqdm`, `transformers`, and for some datasets `datasets`, `pillow`, `pybase64`. Install as needed.
26
+ - An inference server running and reachable via the endpoints above
27
+ - If your server requires authentication, set environment variable `OPENAI_API_KEY` (used as `Authorization: Bearer <key>`)
28
+
29
+ ### Quick start
30
+
31
+ Run a basic benchmark against an sglang server exposing `/generate`:
32
+
33
+ ```bash
34
+ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
35
+ ```
36
+
37
+ ```bash
38
+ python3 -m sglang.bench_serving \
39
+ --backend sglang \
40
+ --host 127.0.0.1 --port 30000 \
41
+ --num-prompts 1000 \
42
+ --model meta-llama/Llama-3.1-8B-Instruct
43
+ ```
44
+
45
+ Or, using an OpenAI-compatible endpoint (completions):
46
+
47
+ ```bash
48
+ python3 -m sglang.bench_serving \
49
+ --backend vllm \
50
+ --base-url http://127.0.0.1:8000 \
51
+ --num-prompts 1000 \
52
+ --model meta-llama/Llama-3.1-8B-Instruct
53
+ ```
54
+
55
+ ### Datasets
56
+
57
+ Select with `--dataset-name`:
58
+
59
+ - `sharegpt` (default): loads ShareGPT-style pairs; optionally restrict with `--sharegpt-context-len` and override outputs with `--sharegpt-output-len`
60
+ - `random`: random text lengths; sampled from ShareGPT token space
61
+ - `random-ids`: random token ids (can lead to gibberish)
62
+ - `image`: generates images and wraps them in chat messages; supports custom resolutions, multiple formats, and different content types
63
+ - `generated-shared-prefix`: synthetic dataset with shared long system prompts and short questions
64
+ - `mmmu`: samples from MMMU (Math split) and includes images
65
+
66
+ Common dataset flags:
67
+
68
+ - `--num-prompts N`: number of requests
69
+ - `--random-input-len`, `--random-output-len`, `--random-range-ratio`: for random/random-ids/image
70
+ - `--apply-chat-template`: apply tokenizer chat template when constructing prompts
73
+ - `--dataset-path PATH`: file path for ShareGPT json; if blank and missing, it will be downloaded and cached
74
+
75
+ Generated Shared Prefix flags (for `generated-shared-prefix`):
76
+
77
+ - `--gsp-num-groups`
78
+ - `--gsp-prompts-per-group`
79
+ - `--gsp-system-prompt-len`
80
+ - `--gsp-question-len`
81
+ - `--gsp-output-len`
82
+
83
+ Image dataset flags (for `image`):
84
+
85
+ - `--image-count`: Number of images per request
86
+ - `--image-resolution`: Image resolution; supports presets (4k, 1080p, 720p, 360p) or custom 'heightxwidth' format (e.g., 1080x1920, 512x768)
87
+ - `--image-format`: Image format (jpeg or png)
88
+ - `--image-content`: Image content type (random or blank)
89
+
90
+ ### Examples
91
+
92
+ 1. To benchmark the image dataset with 3 images per request, 500 prompts, an input length of 512, and an output length of 512, run:
93
+
94
+ ```bash
95
+ python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-3B-Instruct --disable-radix-cache
96
+ ```
97
+
98
+ ```bash
99
+ python -m sglang.bench_serving \
100
+ --backend sglang-oai-chat \
101
+ --dataset-name image \
102
+ --num-prompts 500 \
103
+ --image-count 3 \
104
+ --image-resolution 720p \
105
+ --random-input-len 512 \
106
+ --random-output-len 512
107
+ ```
108
+
109
+ 2. To benchmark the random dataset with 3000 prompts, an input length of 1024, and an output length of 1024, run:
110
+
111
+ ```bash
112
+ python -m sglang.launch_server --model-path Qwen/Qwen2.5-3B-Instruct
113
+ ```
114
+
115
+ ```bash
116
+ python3 -m sglang.bench_serving \
117
+ --backend sglang \
118
+ --dataset-name random \
119
+ --num-prompts 3000 \
120
+ --random-input-len 1024 \
+ --random-output-len 1024 \
122
+ --random-range-ratio 0.5
123
+ ```
124
+
125
+ ### Choosing model and tokenizer
126
+
127
+ - `--model` is required unless the backend exposes `GET /v1/models`, in which case the first model ID is auto-selected.
128
+ - `--tokenizer` defaults to `--model`. Both can be HF model IDs or local paths.
129
+ - For ModelScope workflows, setting `SGLANG_USE_MODELSCOPE=true` enables fetching via ModelScope (weights are skipped for speed).
130
+ - If your tokenizer lacks a chat template, the script warns because token counting can be less robust for gibberish outputs.
131
+
132
+ ### Rate, concurrency, and streaming
133
+
134
+ - `--request-rate`: requests per second. `inf` sends all immediately (burst). Non-infinite rate uses a Poisson process for arrival times.
135
+ - `--max-concurrency`: caps concurrent in-flight requests regardless of arrival rate.
136
+ - `--disable-stream`: switch to non-streaming mode when supported; TTFT then equals total latency for chat completions.
137
+
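The Poisson arrival process mentioned above amounts to drawing exponentially distributed inter-arrival gaps with mean `1 / request_rate` and accumulating them into send times. A sketch under that assumption (illustrative, not the script's exact code):

```python
import random

def poisson_arrival_times(num_requests, request_rate, seed=0):
    # Exponential gaps with mean 1/rate yield a Poisson arrival process;
    # the running sum gives each request's send time in seconds.
    rng = random.Random(seed)
    t, times = 0.0, []
    for _ in range(num_requests):
        t += rng.expovariate(request_rate)
        times.append(t)
    return times

# 1000 requests at 100 req/s should span roughly 10 seconds
times = poisson_arrival_times(1000, request_rate=100.0)
```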
138
+ ### Other key options
139
+
140
+ - `--output-file FILE.jsonl`: append JSONL results to file; auto-named if unspecified
141
+ - `--output-details`: include per-request arrays (generated texts, errors, ttfts, itls, input/output lens)
142
+ - `--extra-request-body '{"top_p":0.9,"temperature":0.6}'`: merged into payload (sampling params, etc.)
143
+ - `--disable-ignore-eos`: pass through EOS behavior (varies by backend)
144
+ - `--warmup-requests N`: run warmup requests with short output first (default 1)
145
+ - `--flush-cache`: call `/flush_cache` (sglang) before main run
146
+ - `--profile`: call `/start_profile` and `/stop_profile` (requires server to enable profiling, e.g., `SGLANG_TORCH_PROFILER_DIR`)
147
+ - `--lora-name name1 name2 ...`: randomly pick one per request and pass to backend (e.g., `lora_path` for sglang)
148
+ - `--tokenize-prompt`: send integer IDs instead of text (currently supports `--backend sglang` only)
149
+
150
+ ### Authentication
151
+
152
+ If your target endpoint requires OpenAI-style auth, set:
153
+
154
+ ```bash
155
+ export OPENAI_API_KEY=sk-...yourkey...
156
+ ```
157
+
158
+ The script will add `Authorization: Bearer $OPENAI_API_KEY` automatically for OpenAI-compatible routes.
159
+
160
+ ### Metrics explained
161
+
162
+ Printed after each run:
163
+
164
+ - Request throughput (req/s)
165
+ - Input token throughput (tok/s) - includes both text and vision tokens
166
+ - Output token throughput (tok/s)
167
+ - Total token throughput (tok/s) - includes both text and vision tokens
168
+ - Total input text tokens and Total input vision tokens - per-modality breakdown
169
+ - Concurrency: aggregate time of all requests divided by wall time
170
+ - End-to-End Latency (ms): mean/median/std/p99 per-request total latency
171
+ - Time to First Token (TTFT, ms): mean/median/std/p99 for streaming mode
172
+ - Inter-Token Latency (ITL, ms): mean/median/std/p95/p99/max between tokens
173
+ - TPOT (ms): Token processing time after first token, i.e., `(latency - ttft)/(tokens-1)`
174
+ - Accept length (sglang-only, if available): speculative decoding accept length
175
+
176
+ The script also retokenizes generated text with the configured tokenizer and reports "retokenized" counts.
177
+
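For example, TPOT as defined above can be computed per request from the total latency, TTFT, and output token count. A small sketch with hypothetical numbers (not bench_serving's internal code):

```python
def tpot_ms(latency_ms, ttft_ms, output_tokens):
    # Token processing time after the first token:
    # (latency - ttft) / (tokens - 1)
    return (latency_ms - ttft_ms) / (output_tokens - 1)

# Hypothetical request: 2 s total, 200 ms to first token, 101 output tokens
print(tpot_ms(2000.0, 200.0, 101))  # 18.0 ms per subsequent token
```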
178
+ ### JSONL output format
179
+
180
+ When `--output-file` is set, one JSON object is appended per run. Base fields:
181
+
182
+ - Arguments summary: backend, dataset, request_rate, max_concurrency, etc.
183
+ - Duration and totals: completed, total_input_tokens, total_output_tokens, retokenized totals
184
+ - Throughputs and latency statistics as printed in the console
185
+ - `accept_length` when available (sglang)
186
+
187
+ With `--output-details`, an extended object also includes arrays:
188
+
189
+ - `input_lens`, `output_lens`
190
+ - `ttfts`, `itls` (per request: ITL arrays)
191
+ - `generated_texts`, `errors`
192
+
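Because the output is plain JSONL, runs can be loaded and compared with a few lines of Python. A sketch; the `request_throughput` field name here is an assumption and may differ across versions, so check your own output file:

```python
import json

def load_runs(path):
    # Each bench_serving run appends one JSON object per line.
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

# Example with two hypothetical run records:
lines = [
    '{"backend": "sglang", "request_throughput": 12.5}',
    '{"backend": "vllm", "request_throughput": 10.1}',
]
runs = [json.loads(line) for line in lines]
best = max(runs, key=lambda r: r["request_throughput"])
print(best["backend"])
```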
193
+ ### End-to-end examples
194
+
195
+ 1) sglang native `/generate` (streaming):
196
+
197
+ ```bash
198
+ python3 -m sglang.bench_serving \
199
+ --backend sglang \
200
+ --host 127.0.0.1 --port 30000 \
201
+ --model meta-llama/Llama-3.1-8B-Instruct \
202
+ --dataset-name random \
203
+ --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.5 \
204
+ --num-prompts 2000 \
205
+ --request-rate 100 \
206
+ --max-concurrency 512 \
207
+ --output-file sglang_random.jsonl --output-details
208
+ ```
209
+
210
+ 2) OpenAI-compatible Completions (e.g., vLLM):
211
+
212
+ ```bash
213
+ python3 -m sglang.bench_serving \
214
+ --backend vllm \
215
+ --base-url http://127.0.0.1:8000 \
216
+ --model meta-llama/Llama-3.1-8B-Instruct \
217
+ --dataset-name sharegpt \
218
+ --num-prompts 1000 \
219
+ --sharegpt-output-len 256
220
+ ```
221
+
222
+ 3) OpenAI-compatible Chat Completions (streaming):
223
+
224
+ ```bash
225
+ python3 -m sglang.bench_serving \
226
+ --backend vllm-chat \
227
+ --base-url http://127.0.0.1:8000 \
228
+ --model meta-llama/Llama-3.1-8B-Instruct \
229
+ --dataset-name random \
230
+ --num-prompts 500 \
231
+ --apply-chat-template
232
+ ```
233
+
234
+ 4) Images (VLM) with chat template:
235
+
236
+ ```bash
237
+ python3 -m sglang.bench_serving \
238
+ --backend sglang \
239
+ --host 127.0.0.1 --port 30000 \
240
+ --model your-vlm-model \
241
+ --dataset-name image \
242
+ --image-count 2 \
243
+ --image-resolution 720p \
244
+ --random-input-len 128 --random-output-len 256 \
245
+ --num-prompts 200 \
246
+ --apply-chat-template
247
+ ```
248
+
249
+ 4a) Images with custom resolution:
250
+
251
+ ```bash
252
+ python3 -m sglang.bench_serving \
253
+ --backend sglang \
254
+ --host 127.0.0.1 --port 30000 \
255
+ --model your-vlm-model \
256
+ --dataset-name image \
257
+ --image-count 1 \
258
+ --image-resolution 512x768 \
259
+ --random-input-len 64 --random-output-len 128 \
260
+ --num-prompts 100 \
261
+ --apply-chat-template
262
+ ```
263
+
264
+ 4b) 1080p images with PNG format and blank content:
265
+
266
+ ```bash
267
+ python3 -m sglang.bench_serving \
268
+ --backend sglang \
269
+ --host 127.0.0.1 --port 30000 \
270
+ --model your-vlm-model \
271
+ --dataset-name image \
272
+ --image-count 1 \
273
+ --image-resolution 1080p \
274
+ --image-format png \
275
+ --image-content blank \
276
+ --random-input-len 64 --random-output-len 128 \
277
+ --num-prompts 100 \
278
+ --apply-chat-template
279
+ ```
280
+
281
+ 5) Generated shared prefix (long system prompts + short questions):
282
+
283
+ ```bash
284
+ python3 -m sglang.bench_serving \
285
+ --backend sglang \
286
+ --host 127.0.0.1 --port 30000 \
287
+ --model meta-llama/Llama-3.1-8B-Instruct \
288
+ --dataset-name generated-shared-prefix \
289
+ --gsp-num-groups 64 --gsp-prompts-per-group 16 \
290
+ --gsp-system-prompt-len 2048 --gsp-question-len 128 --gsp-output-len 256 \
291
+ --num-prompts 1024
292
+ ```
293
+
294
+ 6) Tokenized prompts (ids) for strict length control (sglang only):
295
+
296
+ ```bash
297
+ python3 -m sglang.bench_serving \
298
+ --backend sglang \
299
+ --host 127.0.0.1 --port 30000 \
300
+ --model meta-llama/Llama-3.1-8B-Instruct \
301
+ --dataset-name random \
302
+ --tokenize-prompt \
303
+ --random-input-len 2048 --random-output-len 256 --random-range-ratio 0.2
304
+ ```
305
+
306
+ 7) Profiling and cache flush (sglang):
307
+
308
+ ```bash
309
+ python3 -m sglang.bench_serving \
310
+ --backend sglang \
311
+ --host 127.0.0.1 --port 30000 \
312
+ --model meta-llama/Llama-3.1-8B-Instruct \
313
+ --profile \
314
+ --flush-cache
315
+ ```
316
+
317
+ 8) TensorRT-LLM streaming endpoint:
318
+
319
+ ```bash
320
+ python3 -m sglang.bench_serving \
321
+ --backend trt \
322
+ --base-url http://127.0.0.1:8000 \
323
+ --model your-trt-llm-model \
324
+ --dataset-name random \
325
+ --num-prompts 100 \
326
+ --disable-ignore-eos
327
+ ```
328
+
329
+ 9) Evaluating large-scale KVCache sharing with mooncake trace (sglang only):
330
+
331
+ ```bash
332
+ python3 -m sglang.bench_serving \
333
+ --backend sglang \
334
+ --host 127.0.0.1 --port 30000 \
335
+ --model model-name \
336
+ --dataset-name mooncake \
337
+ --mooncake-slowdown-factor 1.0 \
338
+ --mooncake-num-rounds 1000 \
339
+ --mooncake-workload <conversation|mooncake|agent|synthetic> \
340
+ --use-trace-timestamps true \
341
+ --random-output-len 256
342
+ ```
343
+
344
+ ### Troubleshooting
345
+
346
+ - All requests failed: verify `--backend`, server URL/port, `--model`, and authentication. Check warmup errors printed by the script.
347
+ - Throughput seems too low: adjust `--request-rate` and `--max-concurrency`; verify server batch size/scheduling; ensure streaming is enabled if appropriate.
348
+ - Token counts look odd: prefer chat/instruct models with proper chat templates; otherwise tokenization of gibberish may be inconsistent.
349
+ - Image/MMMU datasets: ensure you installed extra deps (`pillow`, `datasets`, `pybase64`).
350
+ - Authentication errors (401/403): set `OPENAI_API_KEY` or disable auth on your server.
351
+
352
+ ### Notes
353
+
354
+ - The script raises the file descriptor soft limit (`RLIMIT_NOFILE`) to help with many concurrent connections.
355
+ - For sglang, `/get_server_info` is queried post-run to report speculative decoding accept length when available.
sglang/docs/developer_guide/benchmark_and_profiling.md ADDED
@@ -0,0 +1,467 @@
1
+ # Benchmark and Profiling
2
+
3
+ ## Benchmark
4
+
5
+ SGLang provides four benchmark tools that operate at different levels of the stack. The table below summarizes their key differences:
6
+
7
+ | Tool | HTTP Server | Scheduler | Use Case |
8
+ | -------------------------- | --------------------------------------------- | --------------------------------------- | -------------------------------------------------------------------------- |
9
+ | `bench_serving` | Yes (async HTTP client to a running server) | Yes (indirectly, via server) | Realistic online serving benchmarks with latency metrics (TTFT, TPOT, ITL) |
10
+ | `bench_one_batch_server` | Yes (sends HTTP requests to a running server) | Yes (indirectly, via server) | End-to-end single-batch latency including HTTP and scheduler overhead |
11
+ | `bench_offline_throughput` | No | Yes (directly uses `Engine` in-process) | Maximum throughput measurement without HTTP overhead |
12
+ | `bench_one_batch` | No | No (directly calls `ModelRunner`) | Kernel-level latency profiling of a single static batch |
13
+
14
+ Use `bench_serving` by default unless there are specific needs.
15
+
16
+ **`bench_serving`** is an async HTTP load-testing client that sends requests at controlled rates with configurable concurrency to a running server. It measures realistic online serving metrics including time-to-first-token (TTFT), time-per-output-token (TPOT), inter-token latency (ITL), and throughput. Use `num-prompts >= 5 * max-concurrency` to measure steady-state performance. Launch a server with `sglang.launch_server` first.
17
+
18
+ ```bash
19
+ python3 -m sglang.bench_serving --backend sglang --max-concurrency 16 --num-prompts 80 --random-input-len 256 --random-output-len 32 --dataset-name random
20
+ ```
21
+
22
+ **`bench_one_batch_server`** sends a single batch as one HTTP request to a running server. Because only a single batch is sent, the server never reaches a steady state, so the reported metrics are biased. Launch a server with `sglang.launch_server` first.
23
+
24
+ ```bash
25
+ python3 -m sglang.bench_one_batch_server --base-url http://127.0.0.1:30000 --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 32 --input-len 256 --output-len 32
26
+ ```
27
+
28
+ **`bench_offline_throughput`** directly instantiates the `Engine` object in-process (no HTTP server) and submits all requests at once via `engine.generate()`. The engine's scheduler handles batching and execution. This measures maximum achievable throughput without any network overhead.
29
+
30
+ ```bash
31
+ python3 -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
32
+ ```
33
+
34
+ **`bench_one_batch`** is the lowest-level tool. It directly instantiates a `ModelRunner` and calls `extend()` / `decode()` on a fixed static batch, bypassing the scheduler entirely. The prefill and decode phases are run separately, making profiling easier but rendering the metrics unrealistic. Because there is no dynamic batching, it may run out of memory for batch sizes that a real server can handle (a real server chunks prefill into smaller batches). This is best suited for profiling individual kernel performance.
35
+
36
+ ```bash
37
+ python3 -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 32 --input-len 256 --output-len 32
38
+ ```
39
+
40
+ ## Profile with PyTorch Profiler
41
+
42
+ [Pytorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) is a convenient basic tool to inspect kernel execution time, call stack, and kernel overlap and occupancy.
43
+
44
+ ### Profile a server with `sglang.bench_serving`
45
+
46
+ ```bash
47
+ # set trace path
48
+ export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
49
+
50
+ # start server
51
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
52
+
53
+ # send profiling request from client
54
+ python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --sharegpt-output-len 100 --profile
55
+ ```
56
+
57
+ The `SGLANG_TORCH_PROFILER_DIR` environment variable must be set on both the server and client side; otherwise, the trace file will not be generated correctly. A reliable way to do this is to set it in your shell's resource file (e.g., `~/.bashrc` for bash).
58
+
59
+ For more details, please refer to [Bench Serving Guide](./bench_serving.md).
60
+
61
+ ### Profile In PD Disaggregation Mode
62
+
63
+ When profiling in PD disaggregation mode, prefill and decode workers **must be profiled separately** due to torch profiler limitations. The `bench_serving` command provides dedicated options for this:
64
+
65
+ #### Profile Prefill Workers
66
+
67
+ ```bash
68
+ # set trace path
69
+ export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
70
+
71
+ # start prefill and decode servers (see PD disaggregation docs for setup)
72
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill
73
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1
74
+
75
+ # start router
76
+ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
77
+
78
+ # send profiling request targeting prefill workers
79
+ python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000
80
+ ```
81
+
82
+ #### Profile Decode Workers
83
+
84
+ ```bash
85
+ # send profiling request targeting decode workers
86
+ python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001
87
+ ```
88
+
89
+ #### Important Notes
90
+
91
+ - `--profile-prefill-url` and `--profile-decode-url` are **mutually exclusive** - you cannot profile both at the same time
92
+ - Both options support multiple worker URLs for multi-instance setups:
93
+ ```bash
94
+ # Profile multiple prefill workers
95
+ python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000 http://127.0.0.1:30002
96
+
97
+ # Profile multiple decode workers
98
+ python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001 http://127.0.0.1:30003
99
+ ```
100
+ - Make sure `SGLANG_TORCH_PROFILER_DIR` is set on all worker nodes before starting the servers
101
+ - For more details on setting up PD disaggregation, see [PD Disaggregation Guide](../advanced_features/pd_disaggregation.md)
102
+
103
+ ### Profile with `sglang.bench_one_batch` and `sglang.bench_offline_throughput`
104
+ ```bash
105
+ export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
106
+
107
+ # profile one batch with bench_one_batch.py
108
+ # batch size can be controlled with --batch argument
109
+ python3 -m sglang.bench_one_batch --model-path meta-llama/Llama-3.1-8B-Instruct --batch 32 --input-len 1024 --output-len 10 --profile
110
+
111
+ # profile multiple batches with bench_offline_throughput.py
112
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
113
+ ```
114
+
115
+ ### Profile a server with `sglang.profiler`
116
+
117
+ When the server is running (e.g., processing a decoding request), you can start live profiling immediately by sending a profile request to the server.
118
+
119
+ You can do this by running `python3 -m sglang.profiler`. For example:
120
+
121
+ ```bash
122
+ # Terminal 1: Send a generation request
123
+ python3 -m sglang.test.send_one
124
+
125
+ # Terminal 2: Before the above request finishes, quickly launch the following command in a separate terminal.
126
+ # It will generate a profile of the above request for several decoding batches.
127
+ python3 -m sglang.profiler
128
+ ```
129
+
130
+ You can also combine the above operations into a single command:
131
+
132
+ ```bash
133
+ python3 -m sglang.test.send_one --profile
134
+ ```
135
+
136
+ ### Profile a server with HTTP API endpoints
137
+
138
+ SGLang provides HTTP API endpoints to control profiling on a running server. This allows you to start and stop profiling programmatically, which is useful for capturing specific workload patterns.
139
+
140
+ #### Using `/start_profile` endpoint
141
+
142
+ The `/start_profile` endpoint starts profiling on the server. You can control when profiling begins and how long it runs using the following parameters:
143
+
144
+ **Basic usage:**
145
+
146
+ ```bash
147
+ # Start profiling immediately for 10 steps
148
+ curl -X POST http://127.0.0.1:30000/start_profile \
149
+ -H "Content-Type: application/json" \
150
+ -d '{
151
+ "num_steps": 10
152
+ }'
153
+ ```
154
+
155
+ **Parameters:**
156
+
157
+ - `output_dir` (optional): Directory where profile traces will be saved. If not specified, uses `SGLANG_TORCH_PROFILER_DIR` environment variable, or `/tmp` as the default
158
+ - `num_steps` (optional): Number of steps to profile. If not specified, profiling continues until manually stopped with `/end_profile`
159
+ - `start_step` (optional): Step number at which to start profiling (inclusive). Useful for skipping warmup iterations
160
+ - `activities` (optional): List of activities to profile, e.g., `["CPU", "GPU"]`. Default is `["CPU", "GPU"]`
161
+ - `merge_profiles` (optional): Whether to merge distributed traces. Default is `false`
162
+
163
+ **Note on step ranges:** Profiling starts at `start_step` (inclusive) and continues for `num_steps` iterations. For example, with `start_step=3` and `num_steps=10`, profiling captures steps 3, 4, 5, 6, 7, 8, 9, 10, 11, and 12 (10 steps total, starting from step 3).
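The step-range rule above can be sketched in a couple of lines (illustrative only; the server implements this logic internally):

```python
def profiled_steps(start_step: int, num_steps: int) -> list:
    # Profiling begins at start_step (inclusive) and runs for num_steps iterations.
    return list(range(start_step, start_step + num_steps))

# Matches the example above: start_step=3, num_steps=10 -> steps 3 through 12
print(profiled_steps(3, 10))
```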
164
+
165
+ **Advanced usage with `start_step`:**
166
+
167
+ ```bash
168
+ # Wait 5 steps (warmup), then profile for 10 steps
169
+ curl -X POST http://127.0.0.1:30000/start_profile \
170
+ -H "Content-Type: application/json" \
171
+ -d '{
172
+ "output_dir": "/tmp/profiles",
173
+ "start_step": 5,
174
+ "num_steps": 10,
175
+ "activities": ["CPU", "GPU"]
176
+ }'
177
+ ```
178
+
179
+ **Continuous profiling (manual stop):**
180
+
181
+ ```bash
182
+ # Start profiling without num_steps - must manually stop with /end_profile
183
+ curl -X POST http://127.0.0.1:30000/start_profile
184
+ ```
185
+
186
+ #### Using `/end_profile` endpoint
187
+
188
+ The `/end_profile` endpoint stops an ongoing profiling session and saves the trace file.
189
+
190
+ ```bash
191
+ # Stop profiling and save traces
192
+ curl -X POST http://127.0.0.1:30000/end_profile
193
+ ```
194
+
195
+ This is only needed when you start profiling without specifying `num_steps`. If `num_steps` is specified, profiling will automatically stop after that many steps.
196
+
197
+ #### Example workflow
198
+
199
+ ```bash
200
+ # Terminal 1: Start the server
201
+ export SGLANG_TORCH_PROFILER_DIR=/tmp/profiles
202
+ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
203
+
204
+ # Terminal 2: Start continuous profiling
205
+ curl -X POST http://127.0.0.1:30000/start_profile \
206
+ -H "Content-Type: application/json" \
207
+ -d '{
208
+ "start_step": 3
209
+ }'
210
+
211
+ # Terminal 3: Send requests to generate load
212
+ python -m sglang.bench_serving --backend sglang --num-prompts 100
213
+
214
+ # Terminal 2: Stop profiling when done
215
+ curl -X POST http://127.0.0.1:30000/end_profile
216
+ ```
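The same workflow can be driven from Python instead of `curl`. The sketch below only builds the JSON request bodies; sending them to the assumed address `http://127.0.0.1:30000` (e.g., via `requests.post`) requires a running server:

```python
def start_profile_payload(output_dir=None, num_steps=None, start_step=None,
                          activities=None, merge_profiles=False):
    # Build the JSON body for POST /start_profile; omit unset optional fields
    # so the server falls back to its documented defaults.
    payload = {"merge_profiles": merge_profiles}
    if output_dir is not None:
        payload["output_dir"] = output_dir
    if num_steps is not None:
        payload["num_steps"] = num_steps
    if start_step is not None:
        payload["start_step"] = start_step
    if activities is not None:
        payload["activities"] = activities
    return payload

# Equivalent of the continuous-profiling curl command above: warm up 3 steps, no step limit
print(start_profile_payload(start_step=3))
```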
217
+
218
+ ### Profiler Trace Merger for Distributed Traces
219
+
220
+ SGLang supports automatic merging of profiling traces from distributed setups with multiple parallelism types (TP, DP, PP, EP). This feature is particularly useful for analyzing performance across distributed runs.
221
+
222
+ #### Multi-Node Profiling and Shared Storage Considerations
223
+
224
+ Merging profiler output on a single node is fully supported. When profiling distributed environments that span multiple nodes, the output directory must reside on shared storage (e.g., NFS, Lustre) accessible by all nodes so that the trace files can be merged.
225
+
226
+ If no shared storage is accessible across the nodes, automatic merging of trace files during profiling is currently not supported.
227
+
228
+ #### HTTP API Usage
229
+
230
+ ```bash
231
+ # Start profiling with automatic trace merging enabled
232
+ curl -X POST <BASE_URL>/start_profile \
233
+ -H "Content-Type: application/json" \
234
+ -d '{
235
+ "output_dir": "/tmp/profiles", # where to store profile traces
236
+ "num_steps": 10,
237
+ "activities": ["CPU", "GPU"],
238
+ "merge_profiles": true # optional argument to merge profile traces (default=False)
239
+ }'
240
+ ```
241
+
242
+ #### Command Line Usage
243
+
244
+ ```bash
245
+ # Start profiling with merge enabled
246
+ python -m sglang.profiler \
247
+ --num-steps 10 \
248
+ --cpu \
249
+ --gpu \
250
+ --output-dir /tmp/profiles \
251
+ --merge-profiles # optional argument to merge profile traces (default=False)
252
+ ```
253
+
254
+ #### Output Files
255
+
256
+ The profile merger generates:
257
+ - Individual rank trace files: `{profile_id}-TP-{tp}-DP-{dp}-PP-{pp}-EP-{ep}.trace.json.gz`
258
+ - Merged trace file: `merged-{profile_id}.trace.json.gz`
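For scripting around the merger output, the file-name patterns listed above can be reproduced directly (a sketch based only on the templates shown here):

```python
def rank_trace_filename(profile_id: str, tp: int, dp: int, pp: int, ep: int) -> str:
    # Per-rank trace file, one per (TP, DP, PP, EP) coordinate
    return f"{profile_id}-TP-{tp}-DP-{dp}-PP-{pp}-EP-{ep}.trace.json.gz"

def merged_trace_filename(profile_id: str) -> str:
    # Single merged trace produced when merge_profiles is enabled
    return f"merged-{profile_id}.trace.json.gz"

print(rank_trace_filename("prof123", 0, 0, 0, 0))
```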
259
+
260
+ ### Possible PyTorch bugs
261
+ If you encounter the following error (for example, when profiling Qwen2.5-VL):
262
+ ```bash
263
+ RuntimeError: !stack.empty() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/autograd/profiler_python.cpp":983, please report a bug to PyTorch. Python replay stack is empty.
264
+ ```
265
+ This is likely a PyTorch bug reported in [Bug: vLLM Profiler](https://github.com/vllm-project/vllm/issues/18240) and [Bug: torch.profiler.profile](https://github.com/pytorch/pytorch/issues/101632). As a workaround, you can disable `with_stack` with an environment variable, for example:
266
+ ```bash
267
+ export SGLANG_PROFILE_WITH_STACK=False
268
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
269
+ ```
270
+
271
+ ### View traces
272
+
273
+ Trace files can be loaded and visualized from:
274
+
275
+ 1. https://ui.perfetto.dev/ (any browser)
276
+ 2. chrome://tracing (Chrome browser only)
277
+
278
+ If the browser cannot open a trace file due to its large size,
279
+ you can generate a smaller trace file (<100 MB) by limiting the number of prompts and the lengths of the outputs.
280
+ For example, when profiling a server:
281
+
282
+ ```bash
283
+ python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 2 --sharegpt-output-len 100 --profile
284
+ ```
285
+
286
+ This command sets the number of prompts to 2 with the `--num-prompts` argument and limits the output length to 100 tokens with the `--sharegpt-output-len` argument, producing a trace file small enough for the browser to open smoothly.
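To check a trace's size before loading it in the browser, you can inspect it offline. Exported traces are gzipped Chrome-trace JSON, which is typically either a plain event list or an object with a `traceEvents` field; the snippet below is a small sketch that writes a tiny sample trace and counts its events:

```python
import gzip
import json
import os
import tempfile

def count_trace_events(path):
    # Chrome-trace files are gzipped JSON: either a list of events,
    # or an object whose "traceEvents" field holds the list.
    with gzip.open(path, "rt") as f:
        trace = json.load(f)
    if isinstance(trace, list):
        return len(trace)
    return len(trace.get("traceEvents", []))

# Write a tiny sample trace and inspect it
sample = os.path.join(tempfile.gettempdir(), "sample.trace.json.gz")
with gzip.open(sample, "wt") as f:
    json.dump({"traceEvents": [{"name": "kernel", "ph": "X"}]}, f)
print(count_trace_events(sample))
```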
287
+
288
+ Additionally, if you want to map a CUDA kernel in the trace back to the SGLang Python source code, you need to disable CUDA Graph when starting the server. This can be done with the `--disable-cuda-graph` flag in the launch command.
289
+
290
+ ## Profile with Nsight
291
+
292
+ [Nsight Systems](https://docs.nvidia.com/nsight-systems/) is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions, and low-level CUDA APIs and events.
293
+
294
+ 1. Prerequisite:
295
+
296
+ Install using apt, or run inside an [NVIDIA Docker container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) or an [SGLang Docker container](https://github.com/sgl-project/sglang/tree/main/docker).
297
+
298
+ ```bash
299
+ # install nsys
300
+ # https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html
301
+ apt update
302
+ apt install -y --no-install-recommends gnupg
303
+ echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
304
+ apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
305
+ apt update
306
+ apt install nsight-systems-cli
307
+ ```
308
+
309
+ 2. To profile a single batch, use
310
+
311
+ ```bash
312
+ nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node python3 -m sglang.bench_one_batch --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512
313
+ ```
314
+
315
+ 3. To profile a server, e.g.
316
+
317
+ ```bash
318
+ # Launch the server; set the delay and duration according to your needs.
319
+ # After the duration has elapsed, the server will be killed by nsys.
320
+
321
+ nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
322
+
323
+ # client
324
+ python3 -m sglang.bench_serving --backend sglang --num-prompts 1000 --dataset-name random --random-input 1024 --random-output 512
325
+ ```
326
+
327
+ In practice, we recommend setting the `--duration` argument to a large value. When you want the server to stop profiling, first run:
328
+
329
+ ```bash
330
+ nsys sessions list
331
+ ```
332
+
333
+ to get the session id in the form of `profile-XXXXX`, then run:
334
+
335
+ ```bash
336
+ nsys stop --session=profile-XXXXX
337
+ ```
338
+
339
+ to stop the profiler manually and generate the `.nsys-rep` file immediately.
340
+
341
+ 4. Use NVTX to annotate code regions, e.g. to see their execution time.
342
+
343
+ ```bash
344
+ # install nvtx
345
+ pip install nvtx
346
+ ```
347
+
348
+ ```python
349
+ # Example: annotate a critical region so its span shows up in the Nsight Systems timeline
350
+ import nvtx
351
+ with nvtx.annotate("my_critical_region", color="green"):
352
+     pass  # replace with the critical code to measure
353
+ ```
354
+
355
+ ### Layer-wise NVTX Profiling with Nsight Systems
356
+
357
+ SGLang provides built-in layerwise NVTX annotations that can be combined with the CUDA Profiler for detailed per-layer profiling in Nsight Systems. This is particularly useful for identifying performance bottlenecks at the layer level.
358
+
359
+ #### Using `--enable-layerwise-nvtx-marker` with Nsight Systems and `/start_profile`
360
+
361
+ The `--enable-layerwise-nvtx-marker` flag automatically adds NVTX markers to every layer in your model. This is particularly powerful when combined with Nsight Systems profiling to see detailed per-layer performance.
362
+
363
+ **Method 1: Using `/start_profile` with CUDA_PROFILER (for programmatic control)**
364
+
365
+ This method allows you to control exactly when profiling starts/stops via HTTP API while Nsight Systems is running.
366
+
367
+ 1. Launch the server with layerwise NVTX enabled under Nsight Systems:
368
+
369
+ ```bash
370
+ # Terminal 1: Start server with nsys and capture-range option
371
+ nsys profile --trace-fork-before-exec=true \
372
+ --cuda-graph-trace=node \
373
+ --capture-range=cudaProfilerApi \
374
+ --capture-range-end=stop \
375
+ -o layerwise_profile \
376
+ python -m sglang.launch_server \
377
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
378
+ --enable-layerwise-nvtx-marker \
379
+ --disable-cuda-graph
380
+ ```
381
+
382
+ Note: NVTX markers are not emitted for kernel launches captured by CUDA graphs. Use `--disable-cuda-graph` to ensure all layerwise NVTX markers are emitted in the trace.
383
+
384
+ 2. In another terminal, control profiling via `/start_profile` with `CUDA_PROFILER` activity:
385
+
386
+ ```bash
387
+ # Terminal 2: Wait for server to be ready, then start CUDA profiling
388
+ # Wait 3 steps for warmup, then profile for 10 steps
389
+ curl -X POST http://127.0.0.1:30000/start_profile \
390
+ -H "Content-Type: application/json" \
391
+ -d '{
392
+ "start_step": 3,
393
+ "num_steps": 10,
394
+ "activities": ["CUDA_PROFILER"]
395
+ }'
396
+ ```
397
+
398
+ 3. Send requests to generate load:
399
+
400
+ ```bash
401
+ # Terminal 3: Generate workload
402
+ python -m sglang.bench_serving --backend sglang --num-prompts 100
403
+ ```
404
+
405
+ 4. Profiling will automatically stop after 10 steps (due to `num_steps: 10`). If you hadn't specified `num_steps`, you would need to manually stop it:
406
+
407
+ ```bash
408
+ # Terminal 2: Only needed if num_steps was not specified
409
+ curl -X POST http://127.0.0.1:30000/end_profile
410
+ ```
411
+
412
+ The `--capture-range=cudaProfilerApi` option tells Nsight Systems to only capture data between `cudaProfilerStart()` and `cudaProfilerStop()` calls (triggered by `/start_profile` and `/end_profile`), reducing overhead and file size. The `start_step` parameter skips the first 3 steps to avoid capturing warmup overhead.
413
+
414
+ **Method 2: Simpler approach without `/start_profile` API**
415
+
416
+ For simpler use cases where you don't need fine-grained control over profiling start/stop, you can profile with Nsight Systems capturing the entire workload:
417
+
418
+ ```bash
419
+ # Terminal 1: Start server with layerwise NVTX
420
+ # Note: --disable-cuda-graph ensures all NVTX markers are emitted
421
+ python -m sglang.launch_server \
422
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
423
+ --enable-layerwise-nvtx-marker \
424
+ --disable-cuda-graph
425
+
426
+ # Terminal 2: Profile the benchmarking client
427
+ nsys profile --trace-fork-before-exec=true \
428
+ --cuda-graph-trace=node \
429
+ -o layerwise_profile \
430
+ python -m sglang.bench_serving --backend sglang --num-prompts 10
431
+ ```
432
+
433
+ This approach profiles the entire client execution, including all server interactions. The layerwise NVTX markers will be visible in the Nsight Systems timeline.
434
+
435
+ **Viewing the profiling results:**
436
+
437
+ Open the generated report file (`.qdrep`, or `.nsys-rep` on newer Nsight Systems versions) with Nsight Systems:
438
+
439
+ ```bash
440
+ nsys-ui layerwise_profile.qdrep
441
+ ```
442
+
443
+ In the Nsight Systems GUI, you'll see:
444
+ - **NVTX ranges**: Each layer appears as a labeled range in the timeline with detailed information in the marker metadata
445
+ - **CUDA kernels**: All GPU kernels are shown alongside the layer annotations
446
+ - **Layer hierarchy**: The full module path (e.g., `meta-llama/Meta-Llama-3.1-8B-Instruct.model.layers.0.self_attn.qkv_proj`) helps identify specific layers. The prefix uses the full model path from `--model-path`.
447
+ - **Tensor shapes**: Input/output dimensions and parameter shapes are included in the NVTX marker data
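When post-processing exported NVTX ranges, the layer index can be recovered from the module path shown above (illustrative parsing only; the exact marker format may vary):

```python
def layer_index(marker: str):
    # e.g. "meta-llama/Meta-Llama-3.1-8B-Instruct.model.layers.0.self_attn.qkv_proj" -> 0
    parts = marker.split(".")
    if "layers" not in parts:
        return None  # not a per-layer marker (e.g., embeddings or the lm_head)
    return int(parts[parts.index("layers") + 1])

print(layer_index("meta-llama/Meta-Llama-3.1-8B-Instruct.model.layers.0.self_attn.qkv_proj"))
```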
448
+
449
+ **Benefits of layerwise NVTX profiling:**
450
+
451
+ - **Granular visibility**: See exactly which layers are taking the most time
452
+ - **Memory tracking**: Identify layers with large memory allocations
453
+ - **Bottleneck identification**: Quickly locate inefficient operations
454
+ - **Communication overhead**: In multi-GPU setups, see per-layer communication costs
455
+ - **Development debugging**: Validate that model architecture changes have the expected performance impact
456
+
457
+ ## Other tips
458
+
459
+ 1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
460
+ 2. You can benchmark a model with modified configs (e.g., less layers) by using `--json-model-override-args`. For example, you can benchmark a model with only 2 layers and 2 kv heads using:
461
+
462
+ ```bash
463
+ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32 --load-format dummy --json-model-override-args '{"num_hidden_layers": 1, "num_key_value_heads": 1}'
464
+ ```
465
+
466
+ 3. You can use `--python-backtrace=cuda` to see python call stack for all CUDA kernels, as in PyTorch Profiler. (Caveat: this can cause inaccurately long kernel runtimes for CUDA event based timing)
467
+ 4. For more arguments see [Nsight Systems User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html).
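Conceptually, `--json-model-override-args` (tip 2 above) applies a JSON dictionary on top of the fields loaded from `config.json`. A minimal sketch of that merge, with illustrative config values (the actual loader lives in SGLang's config code):

```python
import json

# Fields as they might appear in a Llama-style config.json (illustrative values)
config = {"num_hidden_layers": 32, "num_key_value_heads": 8, "hidden_size": 4096}

# The override string passed on the command line
override = '{"num_hidden_layers": 1, "num_key_value_heads": 1}'

# Overridden keys replace the originals; untouched keys are kept
config.update(json.loads(override))
print(config["num_hidden_layers"], config["num_key_value_heads"])
```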
sglang/docs/developer_guide/contribution_guide.md ADDED
@@ -0,0 +1,147 @@
1
+ # Contribution Guide
2
+
3
+ Welcome to **SGLang**! We appreciate your interest in contributing. This guide provides a concise overview of how to set up your environment, run tests, build documentation, and open a Pull Request (PR). Whether you’re fixing a small bug or developing a major feature, we encourage following these steps for a smooth contribution process.
4
+
5
+ ## Install SGLang from Source
6
+
7
+ ### Fork and clone the repository
8
+
9
+ **Note**: New contributors do **not** have the write permission to push to the official SGLang repo. Please fork the repository under your GitHub account, then clone your fork locally.
10
+
11
+ ```bash
12
+ git clone https://github.com/<your_user_name>/sglang.git
13
+ ```
14
+
15
+ ### Build from source
16
+
17
+ Refer to [Install SGLang from Source](../get_started/install.md#method-2-from-source).
18
+
19
+ ## Format code with pre-commit
20
+
21
+ We use [pre-commit](https://pre-commit.com/) to maintain consistent code style checks. Before pushing your changes, please run:
22
+
23
+ ```bash
24
+ pip3 install pre-commit
25
+ pre-commit install
26
+ pre-commit run --all-files
27
+ ```
28
+
29
+ - **`pre-commit run --all-files`** manually runs all configured checks, applying fixes if possible. If it fails the first time, re-run it to ensure lint errors are fully resolved. Make sure your code passes all checks **before** creating a Pull Request.
30
+ - **Do not commit** directly to the `main` branch. Always create a new branch (e.g., `feature/my-new-feature`), push your changes, and open a PR from that branch.
31
+
32
+ ## Run and add unit tests
33
+
34
+ If you add a new feature or fix a bug, please add corresponding unit tests to ensure coverage and prevent regression.
35
+ SGLang uses Python's built-in [unittest](https://docs.python.org/3/library/unittest.html) framework.
36
+ For detailed instructions on running tests and integrating them into CI, refer to [test/README.md](https://github.com/sgl-project/sglang/tree/main/test/README.md).
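A minimal skeleton in that framework (the class and test names here are placeholders, not an actual SGLang test):

```python
import unittest

class TestMyFeature(unittest.TestCase):
    def test_basic_output(self):
        # Replace with a real check against your feature's behavior
        result = sorted([3, 1, 2])
        self.assertEqual(result, [1, 2, 3])
```

Run it with `python -m unittest path/to/test_file.py`.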
37
+
38
+ ## Write documentation
39
+
40
+ We recommend that new contributors start by writing documentation, which helps you quickly understand the SGLang codebase.
41
+ For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md).
42
+
43
+ ## Test the accuracy
44
+ If your code changes the model output, please run the accuracy tests. A quick sanity check is the few-shot GSM8K.
45
+
46
+ ```
47
+ # Launch a server
48
+ python3 -m sglang.launch_server --model Qwen/Qwen2-7B-Instruct
49
+
50
+ # Evaluate
51
+ python3 -m sglang.test.few_shot_gsm8k --num-questions 200
52
+ ```
53
+
54
+ Please note that the above script is primarily a sanity check, not a rigorous accuracy or speed test.
55
+ This test can have significant variance (1%–5%) in accuracy due to batching and the non-deterministic nature of the inference engine.
56
+ Also, do not rely on the "Latency/Output throughput" from this script, as it is not a proper speed test.
57
+
58
+ GSM8K is too easy for state-of-the-art models nowadays. Please try your own more challenging accuracy tests.
59
+ You can find additional accuracy eval examples in:
60
+ - [test_eval_accuracy_large.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_eval_accuracy_large.py)
61
+ - [test_gpt_oss_1gpu.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_gpt_oss_1gpu.py)
62
+
63
+ ## Benchmark the speed
64
+ Refer to [Benchmark and Profiling](../developer_guide/benchmark_and_profiling.md).
65
+
66
+ ## Requesting a review for merge
67
+ You can follow the pull request merge process described in [MAINTAINER.md](https://github.com/sgl-project/sglang/blob/main/.github/MAINTAINER.md).
68
+ You will need to work with the Merge Oncall, Codeowner, and other reviewers to get their approvals.
69
+ Then your PR can be merged.
70
+
71
+ ## How to Trigger CI Tests
72
+
73
+ We have a lot of open PRs but limited CI machines, so only top and trusted contributors have permission to trigger CI tests.
74
+ Users with permission are listed in [CI_PERMISSIONS.json](https://github.com/sgl-project/sglang/blob/main/.github/CI_PERMISSIONS.json).
75
+
76
+ **PR authors** can always use `/rerun-failed-ci` on their own PRs, even if they are not listed in `CI_PERMISSIONS.json`.
77
+
78
+ For CI to run on a pull request, it must have the "run-ci" label. Authorized users can add the label or rerun failed tests by commenting on the PR with one of these commands:
79
+
80
+ - `/tag-run-ci-label`: Adds the "run-ci" label. Every future commit will trigger CI.
81
+ - `/rerun-failed-ci`: Reruns the failed or flaky tests from the most recent commit.
82
+ - `/tag-and-rerun-ci`: A single command that performs both `/tag-run-ci-label` and `/rerun-failed-ci`.
83
+ - `/rerun-stage <stage-name>`: Reruns a specific test stage without waiting for its dependencies. This is useful when you want to quickly validate a fix for a specific test failure instead of waiting ~30 minutes for preceding stages to complete.
84
+
85
+ If you have permission, the [Slash Command Handler](https://github.com/sgl-project/sglang/actions/workflows/slash-command-handler.yml) will run your command and react with a 👍 to your comment. It may take up to a few minutes for the reaction to appear. Here’s a usage [example](https://github.com/sgl-project/sglang/pull/14253#issuecomment-3599509302).
86
+
87
+ To avoid spamming a PR with too many `/rerun-failed-ci` comments, you can also trigger the command by editing an existing comment and adding any suffix (e.g., `/rerun-failed-ci try again`).
88
+
89
+ Example of rerunning a single test stage: `/rerun-stage unit-test-backend-4-gpu`.
90
+
91
+ If you don’t have permission and you’re not the PR author, please ask maintainers to trigger CI for you.
92
+
93
+ ### CI rate limits
94
+
95
+ Due to CI scheduling and limited resources, higher-priority PRs may preempt running jobs. In such cases, you may need to rerun the tests.
96
+
97
+ We apply CI rate limits to prevent abuse and ensure fair usage of our CI resources.
98
+
99
+ Each CI workflow has a default limit defined in its workflow configuration file. For example, in [pr-gate.yml](https://github.com/sgl-project/sglang/blob/main/.github/workflows/pr-gate.yml), the default cooldown period is 120 minutes, and each workflow can override it via the `cool-down-minutes` input parameter:
100
+
101
+ ```yaml
102
+ cool-down-minutes:
103
+ description: "Default cooldown period in minutes; 0 disables rate limiting"
104
+ type: number
105
+ default: 120
106
+ ```
107
+
108
+ Users listed in [CI_PERMISSIONS.json](https://github.com/sgl-project/sglang/blob/main/.github/CI_PERMISSIONS.json) may have a per-user cooldown interval. In practice, we use the minimum of the workflow’s default window and the user-specific interval.
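The effective rate-limit window described above can be expressed as follows (a sketch of the stated rule, not the actual CI code):

```python
def effective_cooldown(workflow_default_minutes, user_interval_minutes=None):
    # A workflow default of 0 disables rate limiting; when a per-user interval
    # exists, the effective window is the minimum of the two values.
    if user_interval_minutes is None:
        return workflow_default_minutes
    return min(workflow_default_minutes, user_interval_minutes)

print(effective_cooldown(120, 30))
```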
109
+
110
+
111
+ ## Code style guidance
112
+ - Avoid code duplication. If the same code snippet (more than five lines) appears multiple times, extract it into a shared function.
113
+ - Minimize device synchronization. Reduce expensive CPU-GPU synchronization operations, such as `tensor.item()` or `tensor.cpu()`, whenever possible. Use vectorized code.
114
+ - Prioritize extreme efficiency. SGLang is a runtime, and most of your code runs on the critical path for every request. Optimize all minor overheads as much as possible, especially in the model forward code.
115
+ - A common pattern is some runtime checks in the model forward pass (e.g., [this](https://github.com/sgl-project/sglang/blob/f1b0eda55c2c4838e8ab90a0fac7fb1e3d7064ab/python/sglang/srt/models/deepseek_v2.py#L486-L491)). These are very likely the same for every layer. Please cache the result as a single boolean value whenever possible.
116
+ - Make functions as pure as possible. Avoid in-place modification of arguments.
117
+ - Keep files concise. If a file exceeds 2,000 lines of code, split it into multiple smaller files. (e.g., `scheduler.py`, `scheduler_output_processor_mixin.py`)
118
+ - Keep tests fast.
119
+   - If a single test file runs longer than 500 seconds, split it into multiple smaller files (e.g., `test_eagle_infer_a.py`, `test_eagle_infer_b.py`).
120
+   - If a single job in a GitHub workflow runs longer than 30 minutes, split it into smaller jobs/steps.
121
+ - Reuse server launches in your unit tests to make tests run faster.
122
+ - When supporting new hardware or features, follow these guidelines:
123
+ - Do not drastically change existing code.
124
+ - Always prefer new files to introduce specific components for your new hardware (e.g., `allocator_ascend.py`).
125
+ - If you write multiple if/else blocks for new features, ensure the common path (e.g., NVIDIA hardware or the existing code path) is the first branch.
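The "cache the runtime check" guidance above can be illustrated as follows (hypothetical class and config names, not actual SGLang code):

```python
class AttentionLayer:
    def __init__(self, config):
        # Evaluate the condition once at construction time instead of on
        # every forward call; it is the same for every layer and every step.
        self.use_fused_kernel = (
            config.get("quantization") == "fp8" and config.get("tp_size", 1) > 1
        )

    def forward(self, x):
        # The hot path only pays for a cheap boolean check.
        if self.use_fused_kernel:
            return self._fused_forward(x)
        return self._default_forward(x)

    def _fused_forward(self, x):
        return x  # placeholder for the optimized path

    def _default_forward(self, x):
        return x  # placeholder for the common path

layer = AttentionLayer({"quantization": "fp8", "tp_size": 4})
print(layer.use_fused_kernel)
```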
126
+
127
+ ## How to update sgl-kernel
128
+ Since sglang and sgl-kernel are separate Python packages, our current GitHub CI infrastructure does not support updating a kernel and using it immediately within the same pull request (PR).
129
+ To add a new kernel or modify an existing one in the sgl-kernel package, you must use multiple PRs.
130
+
131
+ Follow these steps:
132
+
133
+ 1. Submit a PR to update the sgl-kernel source code without using it in the sglang Python package (e.g., [#8884](https://github.com/sgl-project/sglang/pull/8884/files)).
134
+ 2. Bump the version of sgl-kernel (e.g., [#9220](https://github.com/sgl-project/sglang/pull/9220/files)).
135
+ - Once merged, this will trigger an automatic release of the sgl-kernel wheel to PyPI.
136
+ - If not urgent, you can wait for other people to release the wheel. A new version will typically be released within one week.
137
+ 3. Apply the changes:
138
+ - Update the sgl-kernel version in `sglang/python/pyproject.toml` to use the modified kernels.
139
+ - Update the related caller code in sglang to use the new kernel.
140
+
141
+ ## Tips for newcomers
142
+
143
+ If you want to contribute but don’t have a specific idea in mind, pick issues labeled [“good first issue” or “help wanted”](https://github.com/sgl-project/sglang/issues?q=is%3Aissue+label%3A%22good+first+issue%22%2C%22help+wanted%22). These tasks typically have lower complexity and provide an excellent introduction to the codebase. Also check out this [code walk-through](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/sglang/code-walk-through) for a deeper look into SGLang’s workflow.
144
+
145
+ If you have any questions or want to start a discussion, please feel free to ask in our [Slack channel](https://slack.sglang.io).
146
+
147
+ Thank you for your interest in SGLang. Happy coding!
sglang/docs/developer_guide/development_guide_using_docker.md ADDED
@@ -0,0 +1,108 @@
1
+ # Development Guide Using Docker
2
+
3
+ ## Setup VSCode on a Remote Host
4
+ (Optional - you can skip this step if you plan to run sglang dev container locally)
5
+
6
+ 1. In the remote host, download the `code` CLI from [code.visualstudio.com/download](https://code.visualstudio.com/download) and run `code tunnel` in a shell.
7
+
8
+ Example
9
+ ```bash
10
+ wget https://vscode.download.prss.microsoft.com/dbazure/download/stable/fabdb6a30b49f79a7aba0f2ad9df9b399473380f/vscode_cli_alpine_x64_cli.tar.gz
11
+ tar xf vscode_cli_alpine_x64_cli.tar.gz
12
+
13
+ # https://code.visualstudio.com/docs/remote/tunnels
14
+ ./code tunnel
15
+ ```
16
+
17
+ 2. In your local machine, press F1 in VSCode and choose "Remote Tunnels: Connect to Tunnel".
18
+
19
+ ## Setup Docker Container
20
+
21
+ ### Option 1. Use the default dev container automatically from VSCode
22
+ There is a `.devcontainer` folder in the sglang repository root folder to allow VSCode to automatically start up within dev container. You can read more about this VSCode extension in VSCode official document [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers).
23
+ ![image](https://github.com/user-attachments/assets/6a245da8-2d4d-4ea8-8db1-5a05b3a66f6d)
24
+ (*Figure 1: Diagram from VSCode official documentation [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers).*)
25
+
26
+ To enable this, you only need to:
27
+ 1. Start Visual Studio Code and install [VSCode dev container extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers).
28
+ 2. Press F1, then type and choose "Dev Container: Open Folder in Container".
29
+ 3. Enter the path of your local `sglang` repo and press Enter.
30
+
31
+ The first time you open the dev container may take longer due to the Docker image pull and build. Once it succeeds, the status bar at the bottom left should show that you are in a dev container:
32
+
33
+ ![image](https://github.com/user-attachments/assets/650bba0b-c023-455f-91f9-ab357340106b)
34
+
35
+ Now when you run `sglang.launch_server` in the VSCode terminal or start debugging using F5, sglang server will be started in the dev container with all your local changes applied automatically:
36
+
37
+ ![image](https://github.com/user-attachments/assets/748c85ba-7f8c-465e-8599-2bf7a8dde895)
38
+
39
+
40
+ ### Option 2. Start up containers manually (advanced)
41
+
42
+ The following startup command is an example for internal development by the SGLang team. You can **modify or add directory mappings as needed**, especially for model weight downloads, to prevent repeated downloads by different Docker containers.
43
+
44
+ ❗️ **Note on RDMA**
45
+
46
+ 1. `--network host` and `--privileged` are required by RDMA. If you don't need RDMA, you can remove them, but keeping them does no harm. Thus, we enable these two flags by default in the commands below.
47
+ 2. You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`.
48
+
49
+ ```bash
50
+ # Change the name to yours
51
+ docker run -itd --shm-size 32g --gpus all -v <volumes-to-mount> --ipc=host --network=host --privileged --name sglang_dev lmsysorg/sglang:dev /bin/zsh
52
+ docker exec -it sglang_dev /bin/zsh
53
+ ```
54
+ Some useful volumes to mount are:
55
+ 1. **Hugging Face model cache**: mounting the model cache avoids re-downloading models every time the Docker container restarts. The default location on Linux is `~/.cache/huggingface/`.
56
+ 2. **SGLang repository**: code changes in your local SGLang repository will be automatically synced to the dev container.
57
+
58
+ Example 1: Mounting the local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes into the dev container.
59
+ ```bash
60
+ docker run -itd --shm-size 32g --gpus all -v /opt/dlami/nvme/.cache:/root/.cache --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
61
+ docker exec -it sglang_zhyncs /bin/zsh
62
+ ```
63
+ Example 2: Mounting both the Hugging Face cache and the local SGLang repo. Local code changes are automatically synced to the dev container, as SGLang is installed in editable mode in the dev image.
64
+ ```bash
65
+ docker run -itd --shm-size 32g --gpus all -v $HOME/.cache/huggingface/:/root/.cache/huggingface -v $HOME/src/sglang:/sgl-workspace/sglang --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
66
+ docker exec -it sglang_zhyncs /bin/zsh
67
+ ```
68
+ ## Debug SGLang with VSCode Debugger
69
+ 1. Open `launch.json` in VSCode (create it if it does not exist).
70
+ 2. Add the following config and save. Please note that you can edit the script as needed to apply different parameters or debug a different program (e.g. benchmark script).
71
+ ```JSON
72
+ {
73
+ "version": "0.2.0",
74
+ "configurations": [
75
+ {
76
+ "name": "Python Debugger: launch_server",
77
+ "type": "debugpy",
78
+ "request": "launch",
79
+ "module": "sglang.launch_server",
80
+ "console": "integratedTerminal",
81
+ "args": [
82
+ "--model-path", "meta-llama/Llama-3.2-1B",
83
+ "--host", "0.0.0.0",
84
+ "--port", "30000",
85
+ "--trust-remote-code",
86
+ ],
87
+ "justMyCode": false
88
+ }
89
+ ]
90
+ }
91
+ ```
92
+
93
+ 3. Press "F5" to start. The VSCode debugger will pause the program at breakpoints even when it is running on a remote SSH/Tunnel host or inside a dev container.
94
+
95
+ ## Profile
96
+
97
+ ```bash
98
+ # Change batch size, input, output and add `--disable-cuda-graph` (for easier analysis)
99
+ # e.g. DeepSeek V3
100
+ nsys profile -o deepseek_v3 python3 -m sglang.bench_one_batch --batch-size 1 --input 128 --output 256 --model deepseek-ai/DeepSeek-V3 --trust-remote-code --tp 8 --disable-cuda-graph
101
+ ```
102
+
103
+ ## Evaluation
104
+
105
+ ```bash
106
+ # e.g. gsm8k 8 shot
107
+ python3 benchmark/gsm8k/bench_sglang.py --num-questions 2000 --parallel 2000 --num-shots 8
108
+ ```
sglang/docs/developer_guide/development_jit_kernel_guide.md ADDED
@@ -0,0 +1,259 @@
+ # Development Guide for JIT Kernels
2
+
3
+ ## Environment Setup
4
+
5
+ We strongly recommend using `clangd` as the language server for JIT kernel development.
6
+ For Ubuntu/Debian, you can download clangd from [apt.llvm.org](https://apt.llvm.org/).
7
+ If you are using VS Code, we recommend installing the `clangd` extension for better IDE integration.
8
+
9
+ All JIT-related files are located in `python/sglang/jit_kernel`.
10
+ Unlike `sgl-kernel`, which compiles CUDA/C++ binaries ahead of time (AOT), just-in-time (JIT) kernels are compiled at runtime.
11
+ Consequently, a static `compile_commands.json` cannot be generated.
12
+ To enable code completion with `clangd`, run `python -m sglang.jit_kernel` to generate a `.clangd` configuration file in your current directory.
13
+ After generating the file, restart the clangd language server. It should now recognize all JIT kernel files.
14
+
15
+ ## Code Structure
16
+
17
+ ### C++ Implementation
18
+
19
+ C++ source code is located in `python/sglang/jit_kernel/csrc`.
20
+ Reusable functions should be placed in `python/sglang/jit_kernel/include`.
21
+
22
+ We use [tvm-ffi](https://github.com/apache/tvm-ffi) for efficient foreign language bindings.
23
+ Refer to the [documentation](https://tvm.apache.org/ffi/) for advanced usage, such as exporting C++ objects.
24
+ Typically, `tvm::ffi::TensorView` is sufficient for passing PyTorch Tensors from Python.
25
+
26
+ ### Python Interface
27
+
28
+ Python interfaces are defined in `python/sglang/jit_kernel`.
29
+ The `load_jit` utility function in `python/sglang/jit_kernel/utils.py` loads and returns the compiled module.
30
+ To export a C++ function (e.g., `cpp_func`), pass `cuda_wrappers=[("func", "cpp_func")]` to `load_jit`.
31
+ The function can then be called in Python as `module.func`.
32
+
33
+ For caching compiled modules, prefer `sglang.jit_kernel.utils.cache_once` over `functools.lru_cache`.
34
+ `functools.lru_cache` is not compatible with `torch.compile`.
35
+
36
+ ### C++ Utilities
37
+
38
+ The following C++ utilities are available:
39
+
40
+ #### Integer Range
41
+
42
+ Similar to PyTorch, we provide an `irange` function to represent an integer range.
43
+
44
+ ```C++
45
+ #include <sgl_kernel/utils.h>
46
+
47
+ void test() {
48
+ for (auto i : host::irange(100)) { // [0, 100)
49
+ // do something
50
+ }
51
+ for (auto i : host::irange(0, 100)) { // [0, 100)
52
+ // do something
53
+ }
54
+ }
55
+
56
+ ```
57
+
58
+ #### Runtime Checking
59
+
60
+ `RuntimeCheck` validates conditions at runtime. It accepts optional arguments for error reporting.
61
+ If the check fails, these arguments are output to aid debugging.
62
+ `RuntimeDeviceCheck` verifies the status of the last kernel launch.
63
+
64
+ ```C++
65
+ #include <sgl_kernel/utils.h>
66
+ #include <sgl_kernel/utils.cuh>
67
+
68
+ void test() {
69
+ host::RuntimeCheck(1 + 1 == 2, 1 + 1, " != ", 2);
70
+ host::RuntimeDeviceCheck();
71
+ // check the provided `cudaError_t`
72
+ host::RuntimeDeviceCheck(cudaGetLastError());
73
+ }
74
+
75
+ ```
76
+
77
+ #### Tensor Checking
78
+
79
+ `TensorMatcher` provides a readable way to validate and extract tensor shape information.
80
+
81
+ ```cpp
82
+ #include <sgl_kernel/tensor.h>
83
+
84
+ void test(const tvm::ffi::TensorView k_cache, const tvm::ffi::TensorView v_cache) {
85
+ using namespace host;
86
+
87
+ auto D = SymbolicSize{"D"}; // cache dimension
88
+ auto N = SymbolicSize{"N"}; // kvcache stride
89
+ auto dtype = SymbolicDType{};
90
+ auto device = SymbolicDevice{};
91
+
92
+ TensorMatcher({-1, D}) //
93
+ .with_strides({N, 1})
94
+ .with_dtype<int32_t, int64_t>(dtype)
95
+ .with_device<kDLCUDA, kDLCPU>(device)
96
+ .verify(k_cache)
97
+ .verify(v_cache);
98
+ }
99
+ ```
100
+
101
+ Configure the `TensorMatcher` with expected stride, dtype, and device properties before verification.
102
+ - If `with_strides` is omitted, the tensor is expected to be contiguous.
103
+ - Template arguments in `with_dtype` restrict the allowed data types.
104
+ - Template arguments in `with_device` restrict the allowed devices.
105
+ - Values passed to `with_xxx` methods enforce equality checks.
106
+ - Passing `-1` for size or stride allows matching any value.
107
+
108
+ A `Symbolic` variable must resolve to the same value across all verifications.
109
+ Use `.unwrap()` to retrieve the matched value after verification.
110
+
111
+ > Note: `TensorMatcher` is a temporary expression and should not be stored in a variable.
112
+
113
+ > Tip: Add `//` at the end of the `TensorMatcher` chain to enforce proper indentation.
114
+
115
+ #### Kernel Launching
116
+
117
+ `LaunchKernel::resolve_device` retrieves the current `cudaStream` from PyTorch.
118
+ Kernels can also be launched directly using `LaunchKernel`.
119
+
120
+ ```cpp
121
+ #include <sgl_kernel/utils.cuh>
122
+
123
+ #include <dlpack/dlpack.h>
124
+
125
+ __global__ void kernel() {}
126
+
127
+ void test() {
128
+ const auto num_blocks = 1;
129
+ const auto num_threads = 32;
130
+ const auto dynamic_smem = 0;
131
+
132
+ DLDevice dev; // suppose this is initialized properly
133
+ host::LaunchKernel(num_blocks, num_threads, dev)(kernel);
134
+
135
+ cudaStream_t stream = host::LaunchKernel::resolve_device(dev);
136
+ host::LaunchKernel(num_blocks, num_threads, stream, dynamic_smem)(kernel);
137
+ }
138
+
139
+ ```
140
+
141
+ ## Add new kernels
142
+
143
+ This section walks through a complete, end-to-end example of adding a new JIT kernel to the system.
144
+ We use a simple `add_constant` kernel as a running example, which adds a constant integer value to every element of an input tensor.
145
+
146
+ Conceptually, the Python interface looks like this:
147
+
148
+ ```python
149
+ def add_constant(src: torch.Tensor, c: int):
150
+ return src + c
151
+ ```
152
+
153
+ ### STEP 1: Write the C++ kernel
154
+
155
+ Write your CUDA kernel in [jit_kernel/csrc/add_constant.cuh](../../python/sglang/jit_kernel/csrc/add_constant.cuh). For demonstration purposes, we pass the constant value as a template parameter.
156
+
157
+ ```cpp
158
+ #include <sgl_kernel/tensor.h> // For TensorMatcher, SymbolicSize, SymbolicDevice
159
+ #include <sgl_kernel/utils.cuh> // For LaunchKernel
160
+ #include <sgl_kernel/utils.h> // For div_ceil, RuntimeCheck
161
+
162
+ #include <dlpack/dlpack.h>
163
+ #include <tvm/ffi/container/tensor.h>
164
+
165
+ #include <cstddef>
166
+ #include <cstdint>
167
+
168
+ namespace {
169
+
170
+ template <int32_t kConstant>
171
+ __global__ void add_constant_kernel(int32_t* dst, const int32_t* src, size_t length) {
172
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
173
+ if (idx < length) {
174
+ dst[idx] = src[idx] + kConstant;
175
+ }
176
+ }
177
+
178
+ constexpr size_t kBlockSize = 256;
179
+
180
+ // You can also use struct with static method as an alternative
181
+ template <int32_t kConstant>
182
+ void add_constant(tvm::ffi::TensorView dst, tvm::ffi::TensorView src) {
183
+ using namespace host;
184
+
185
+ // 1. Validate input tensors
186
+ SymbolicSize N = {"num_elements"};
187
+ SymbolicDevice device_;
188
+ TensorMatcher({N}) // 1D tensor, must be contiguous
189
+ .with_dtype<int32_t>() // must be int32
190
+ .with_device<kDLCUDA>(device_) // must be on CUDA device
191
+ .verify(dst) // check tensor dst
192
+ .verify(src); // check tensor src
193
+
194
+ // 2. Extract required parameters, prepare for kernel launch
195
+ const size_t num_elements = N.unwrap();
196
+ const size_t grid_size = div_ceil(num_elements, kBlockSize);
197
+ const DLDevice device = device_.unwrap();
198
+ // some extra runtime checks using host::RuntimeCheck
199
+ RuntimeCheck(num_elements > 0, "We only support non-empty tensors, got num_elements = ", num_elements);
200
+
201
+ // 3. Launch the kernel. Error code will be automatically checked.
202
+ LaunchKernel(grid_size, kBlockSize, device /*, dynamic_smem*/)(
203
+ // kernel function
204
+ add_constant_kernel<kConstant>,
205
+ // kernel arguments
206
+ static_cast<int32_t*>(dst.data_ptr()),
207
+ static_cast<int32_t*>(src.data_ptr()),
208
+ num_elements);
209
+ }
210
+
211
+ } // namespace
212
+
213
+ ```
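In the launch step above, `grid_size = div_ceil(num_elements, kBlockSize)` rounds up so that every element is covered by a thread. In Python terms (an illustrative helper mirroring the C++ `div_ceil`, included here as a sketch rather than the actual implementation):

```python
def div_ceil(a: int, b: int) -> int:
    """Ceiling division: the smallest g such that g * b >= a."""
    return (a + b - 1) // b

# With a block size of 256 threads, a 1000-element tensor needs 4 blocks.
print(div_ceil(1000, 256))  # 4
```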
214
+
215
+ ### STEP 2: Create Python Interfaces
216
+
217
+ Next, expose the kernel through a Python wrapper.
218
+ Create a new file at [jit_kernel/add_constant.py](../../python/sglang/jit_kernel/add_constant.py) and expose the needed interfaces.
219
+
220
+ ```python
221
+ from __future__ import annotations
222
+ from typing import TYPE_CHECKING
223
+
224
+ import torch
225
+
226
+ from sglang.jit_kernel.utils import cache_once, load_jit, make_cpp_args
227
+
228
+ if TYPE_CHECKING:
229
+ from tvm_ffi.module import Module
230
+
231
+
232
+ @cache_once
233
+ def _jit_add_constant_module(constant: int) -> Module:
234
+ args = make_cpp_args(constant) # pass all the template argument
235
+ return load_jit(
236
+ "add_constant",
237
+ *args,
238
+ cuda_files=["add_constant.cuh"],
239
+ cuda_wrappers=[("add_constant", f"add_constant<{args}>")],
240
+ )
241
+
242
+
243
+ def add_constant(src: torch.Tensor, constant: int) -> torch.Tensor:
244
+ dst = torch.empty_like(src)
245
+ module = _jit_add_constant_module(constant)
246
+ module.add_constant(dst, src)
247
+ return dst
248
+
249
+ ```
250
+
251
+ ### STEP 3: Use your kernel
252
+
253
+ Finally, import and use the kernel like a regular Python function:
254
+
255
+ ```python
256
+ from sglang.jit_kernel.add_constant import add_constant
257
+ ```
258
+
259
+ For a complete, runnable example, refer to [test_add_constant.py](../../python/sglang/jit_kernel/tests/test_add_constant.py).
sglang/docs/developer_guide/evaluating_new_models.md ADDED
@@ -0,0 +1,146 @@
+ # Evaluating New Models with SGLang
2
+
3
+ This document provides commands for evaluating models' accuracy and performance. Before open-sourcing new models, we strongly suggest running these commands to verify whether the score matches your internal benchmark results.
4
+
5
+ **For cross-verification, when open-sourcing your models, please submit the installation, server-launch, and benchmark commands, along with all scores and hardware requirements.**
6
+
7
+ [Reference: MiniMax M2](https://github.com/sgl-project/sglang/pull/12129)
8
+
9
+ ## Accuracy
10
+
11
+ ### LLMs
12
+
13
+ SGLang provides built-in scripts to evaluate common benchmarks.
14
+
15
+ **MMLU**
16
+
17
+ ```bash
18
+ python -m sglang.test.run_eval \
19
+ --eval-name mmlu \
20
+ --port 30000 \
21
+ --num-examples 1000 \
22
+ --max-tokens 8192
23
+ ```
24
+
25
+ **GSM8K**
26
+
27
+ ```bash
28
+ python -m sglang.test.few_shot_gsm8k \
29
+ --host 127.0.0.1 \
30
+ --port 30000 \
31
+ --num-questions 200 \
32
+ --num-shots 5
33
+ ```
34
+
35
+ **HellaSwag**
36
+
37
+ ```bash
38
+ python benchmark/hellaswag/bench_sglang.py \
39
+ --host 127.0.0.1 \
40
+ --port 30000 \
41
+ --num-questions 200 \
42
+ --num-shots 20
43
+ ```
44
+
45
+ **GPQA**
46
+
47
+ ```bash
48
+ python -m sglang.test.run_eval \
49
+ --eval-name gpqa \
50
+ --port 30000 \
51
+ --num-examples 198 \
52
+ --max-tokens 120000 \
53
+ --repeat 8
54
+ ```
55
+
56
+ ```{tip}
57
+ For reasoning models, add `--thinking-mode <mode>` (e.g., `qwen3`, `deepseek-v3`). You may skip it if the model has forced thinking enabled.
58
+ ```
59
+
60
+ **HumanEval**
61
+
62
+ ```bash
63
+ pip install human_eval
64
+
65
+ python -m sglang.test.run_eval \
66
+ --eval-name humaneval \
67
+ --num-examples 10 \
68
+ --port 30000
69
+ ```
70
+
71
+ ### VLMs
72
+
73
+ **MMMU**
74
+
75
+ ```bash
76
+ python benchmark/mmmu/bench_sglang.py \
77
+ --port 30000 \
78
+ --concurrency 64
79
+ ```
80
+
81
+ ```{tip}
82
+ You can set max tokens by passing `--extra-request-body '{"max_tokens": 4096}'`.
83
+ ```
84
+
85
+ For models capable of processing video, we recommend extending the evaluation to include `VideoMME`, `MVBench`, and other relevant benchmarks.
86
+
87
+ ## Performance
88
+
89
+ Performance benchmarks measure **Latency** (Time To First Token - TTFT) and **Throughput** (tokens/second).
90
+
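These two metrics can be pictured with a pair of hypothetical helper functions (illustrative only, not part of the SGLang benchmark scripts):

```python
def ttft_ms(request_start_s: float, first_token_s: float) -> float:
    """Time To First Token, in milliseconds."""
    return (first_token_s - request_start_s) * 1000.0

def throughput_tok_per_s(output_tokens: int, wall_time_s: float) -> float:
    """Output tokens generated per second of wall-clock time."""
    return output_tokens / wall_time_s

print(ttft_ms(10.0, 10.25))             # 250.0
print(throughput_tok_per_s(4096, 8.0))  # 512.0
```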
91
+ ### LLMs
92
+
93
+ **Latency-Sensitive Benchmark**
94
+
95
+ This simulates a scenario with low concurrency (e.g., single user) to measure latency.
96
+
97
+ ```bash
98
+ python -m sglang.bench_serving \
99
+ --backend sglang \
100
+ --host 0.0.0.0 \
101
+ --port 30000 \
102
+ --dataset-name random \
103
+ --num-prompts 10 \
104
+ --max-concurrency 1
105
+ ```
106
+
107
+ **Throughput-Sensitive Benchmark**
108
+
109
+ This simulates a high-traffic scenario to measure maximum system throughput.
110
+
111
+ ```bash
112
+ python -m sglang.bench_serving \
113
+ --backend sglang \
114
+ --host 0.0.0.0 \
115
+ --port 30000 \
116
+ --dataset-name random \
117
+ --num-prompts 1000 \
118
+ --max-concurrency 100
119
+ ```
120
+
121
+ **Single Batch Performance**
122
+
123
+ You can also benchmark the performance of processing a single batch offline.
124
+
125
+ ```bash
126
+ python -m sglang.bench_one_batch_server \
127
+ --model <model-path> \
128
+ --batch-size 8 \
129
+ --input-len 1024 \
130
+ --output-len 1024
131
+ ```
132
+
133
+ You can run more granular benchmarks:
134
+
135
+ - **Low Concurrency**: `--num-prompts 10 --max-concurrency 1`
136
+ - **Medium Concurrency**: `--num-prompts 80 --max-concurrency 16`
137
+ - **High Concurrency**: `--num-prompts 500 --max-concurrency 100`
138
+
139
+ ## Reporting Results
140
+
141
+ For each evaluation, please report:
142
+
143
+ 1. **Metric Score**: Accuracy % (LLMs and VLMs); Latency (ms) and Throughput (tok/s) (LLMs only).
144
+ 2. **Environment settings**: GPU type/count, SGLang commit hash.
145
+ 3. **Launch configuration**: Model path, TP size, and any special flags.
146
+ 4. **Evaluation parameters**: Number of shots, examples, max tokens.
sglang/docs/developer_guide/release_process.md ADDED
@@ -0,0 +1,18 @@
+ # PyPI Package Release Process
2
+
3
+ ## Update the version in code
4
+ Update the package version in `python/pyproject.toml` and `python/sglang/__init__.py`.
5
+
6
+ ## Upload the PyPI package
7
+
8
+ ```
9
+ pip install build twine
10
+ ```
11
+
12
+ ```
13
+ cd python
14
+ bash upload_pypi.sh
15
+ ```
16
+
17
+ ## Make a release in GitHub
18
+ Make a new release at https://github.com/sgl-project/sglang/releases/new.
sglang/docs/developer_guide/setup_github_runner.md ADDED
@@ -0,0 +1,51 @@
+ # Set Up Self-Hosted Runners for GitHub Actions
2
+
3
+ ## Add a Runner
4
+
5
+ ### Step 1: Start a docker container.
6
+
7
+ **You can mount a folder for the shared huggingface model weights cache.**
8
+ The command below uses `/tmp/huggingface` as an example.
9
+
10
+ ```
11
+ docker pull nvidia/cuda:12.9.1-devel-ubuntu22.04
12
+ # Nvidia
13
+ docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.9.1-devel-ubuntu22.04 /bin/bash
14
+ # AMD
15
+ docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.5.8-rocm700-mi30x /bin/bash
16
+ # AMD just the last 2 GPUs
17
+ docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.5.8-rocm700-mi30x /bin/bash
18
+ ```
19
+
20
+ ### Step 2: Configure the runner with `config.sh`
21
+
22
+ Run these commands inside the container.
23
+
24
+ ```
25
+ apt update && apt install -y curl python3-pip git
26
+ pip install --upgrade pip
27
+ export RUNNER_ALLOW_RUNASROOT=1
28
+ ```
29
+
30
+ Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?arch=x64&os=linux to run `config.sh`
31
+
32
+ **Notes**
+ - You do not need to specify the runner group.
+ - Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be edited later in GitHub Settings.
+ - You do not need to change the work folder.
36
+
37
+ ### Step 3: Run the runner with `run.sh`
38
+
39
+ - Set up environment variables
40
+ ```
41
+ export HF_HOME=/hf_home
42
+ export SGLANG_IS_IN_CI=true
43
+ export HF_TOKEN=hf_xxx
44
+ export OPENAI_API_KEY=sk-xxx
45
+ export CUDA_VISIBLE_DEVICES=0
46
+ ```
47
+
48
+ - Run it forever
49
+ ```
50
+ while true; do ./run.sh; echo "Restarting..."; sleep 2; done
51
+ ```
sglang/docs/diffusion/api/cli.md ADDED
@@ -0,0 +1,332 @@
+ # SGLang diffusion CLI Inference
2
+
3
+ The SGLang-diffusion CLI provides a quick way to access the inference pipeline for image and video generation.
4
+
5
+ ## Prerequisites
6
+
7
+ - A working SGLang diffusion installation and the `sglang` CLI available in `$PATH`.
8
+
9
+
10
+ ## Supported Arguments
11
+
12
+ ### Server Arguments
13
+
14
+ - `--model-path {MODEL_PATH}`: Path to the model or model ID
15
+ - `--lora-path {LORA_PATH}`: Path to a LoRA adapter (local path or HuggingFace model ID). If not specified, LoRA will not be applied.
16
+ - `--lora-nickname {NAME}`: Nickname for the LoRA adapter (default: `default`).
17
+ - `--num-gpus {NUM_GPUS}`: Number of GPUs to use
18
+ - `--tp-size {TP_SIZE}`: Tensor parallelism size (only for the encoder; should not be larger than 1 if text encoder offload is enabled, as layer-wise offload plus prefetch is faster)
19
+ - `--sp-degree {SP_SIZE}`: Sequence parallelism size (typically should match the number of GPUs)
20
+ - `--ulysses-degree {ULYSSES_DEGREE}`: The degree of DeepSpeed-Ulysses-style SP in USP
21
+ - `--ring-degree {RING_DEGREE}`: The degree of ring attention-style SP in USP
22
+ - `--attention-backend {BACKEND}`: Attention backend to use. For SGLang-native pipelines use `fa`, `torch_sdpa`, `sage_attn`, etc. For diffusers pipelines use diffusers backend names like `flash`, `_flash_3_hub`, `sage`, `xformers`.
23
+ - `--attention-backend-config {CONFIG}`: Configuration for the attention backend. Can be a JSON string (e.g., '{"k": "v"}'), a path to a JSON/YAML file, or key=value pairs (e.g., "k=v,k2=v2").
24
+ - `--cache-dit-config {PATH}`: Path to a Cache-DiT YAML/JSON config (diffusers backend only)
25
+ - `--dit-precision {DTYPE}`: Precision for the DiT model (currently supports fp32, fp16, and bf16).
26
+
27
+
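The three accepted forms of `--attention-backend-config` can be sketched with a small parser (an illustrative reimplementation, not the actual SGLang code; YAML file handling is omitted):

```python
import json
import os

def parse_backend_config(value: str) -> dict:
    """Parse a JSON file path, an inline JSON string, or 'k=v,k2=v2' pairs."""
    if os.path.exists(value):               # a path to a JSON file
        with open(value) as f:
            return json.load(f)
    try:                                    # an inline JSON string
        return json.loads(value)
    except json.JSONDecodeError:            # comma-separated key=value pairs
        return dict(pair.split("=", 1) for pair in value.split(","))

print(parse_backend_config('{"k": "v"}'))   # {'k': 'v'}
print(parse_backend_config("k=v,k2=v2"))    # {'k': 'v', 'k2': 'v2'}
```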
28
+ ### Sampling Parameters
29
+
30
+ - `--prompt {PROMPT}`: Text description for the video you want to generate
31
+ - `--num-inference-steps {STEPS}`: Number of denoising steps
32
+ - `--negative-prompt {PROMPT}`: Negative prompt to guide generation away from certain concepts
33
+ - `--seed {SEED}`: Random seed for reproducible generation
34
+
35
+
36
+ **Image/Video Configuration**
37
+
38
+ - `--height {HEIGHT}`: Height of the generated output
39
+ - `--width {WIDTH}`: Width of the generated output
40
+ - `--num-frames {NUM_FRAMES}`: Number of frames to generate
41
+ - `--fps {FPS}`: Frames per second for the saved output, if this is a video-generation task
42
+
43
+
44
+ **Frame Interpolation** (video only)
45
+
46
+ Frame interpolation is a post-processing step that synthesizes new frames
47
+ between each pair of consecutive generated frames, producing smoother
48
+ motion without re-running the diffusion model. The `--frame-interpolation-exp`
49
+ flag controls how many rounds of interpolation to apply: each round inserts one
50
+ new frame into every gap between adjacent frames, so the output frame count
51
+ follows the formula **(N − 1) × 2^exp + 1** (e.g. 5 original frames with
52
+ `exp=1` → 4 gaps × 1 new frame + 5 originals = **9** frames; with `exp=2` →
53
+ **17** frames).
54
+
55
+ - `--enable-frame-interpolation`: Enable frame interpolation. Model weights are downloaded automatically on first use.
56
+ - `--frame-interpolation-exp {EXP}`: Interpolation exponent — `1` = 2× temporal resolution, `2` = 4×, etc. (default: `1`)
57
+ - `--frame-interpolation-scale {SCALE}`: RIFE inference scale; use `0.5` for high-resolution inputs to save memory (default: `1.0`)
58
+ - `--frame-interpolation-model-path {PATH}`: Local directory or HuggingFace repo ID containing RIFE `flownet.pkl` weights (default: `elfgum/RIFE-4.22.lite`, downloaded automatically)
59
+
60
+ Example — generate a 5-frame video and interpolate to 9 frames ((5 − 1) × 2¹ + 1 = 9):
61
+
62
+ ```bash
63
+ sglang generate \
64
+ --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \
65
+ --prompt "A dog running through a park" \
66
+ --num-frames 5 \
67
+ --enable-frame-interpolation \
68
+ --frame-interpolation-exp 1 \
69
+ --save-output
70
+ ```
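The frame-count formula above can be checked with a one-line helper:

```python
def interpolated_frame_count(num_frames: int, exp: int) -> int:
    """Frames produced after `exp` rounds of pairwise frame interpolation."""
    return (num_frames - 1) * 2**exp + 1

print(interpolated_frame_count(5, 1))  # 9
print(interpolated_frame_count(5, 2))  # 17
```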
71
+
72
+ **Output Options**
73
+
74
+ - `--output-path {PATH}`: Directory to save the generated video
75
+ - `--save-output`: Whether to save the image/video to disk
76
+ - `--return-frames`: Whether to return the raw frames
77
+
78
+ ### Using Configuration Files
79
+
80
+ Instead of specifying all parameters on the command line, you can use a configuration file:
81
+
82
+ ```bash
83
+ sglang generate --config {CONFIG_FILE_PATH}
84
+ ```
85
+
86
+ The configuration file should be in JSON or YAML format with the same parameter names as the CLI options. Command-line arguments take precedence over settings in the configuration file, allowing you to override specific values while keeping the rest from the configuration file.
87
+
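The precedence rule amounts to a simple dictionary merge, where only CLI flags that were actually set override the file (an illustrative sketch, not the actual implementation):

```python
def merge_config(file_config: dict, cli_overrides: dict) -> dict:
    """CLI values override the config file; unset CLI flags (None) are ignored."""
    merged = dict(file_config)
    merged.update({k: v for k, v in cli_overrides.items() if v is not None})
    return merged

cfg = merge_config({"num_frames": 45, "seed": 1024}, {"seed": 42, "fps": None})
print(cfg)  # {'num_frames': 45, 'seed': 42}
```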
88
+ Example configuration file (config.json):
89
+
90
+ ```json
91
+ {
92
+ "model_path": "FastVideo/FastHunyuan-diffusers",
93
+ "prompt": "A beautiful woman in a red dress walking down a street",
94
+ "output_path": "outputs/",
95
+ "num_gpus": 2,
96
+ "sp_size": 2,
97
+ "tp_size": 1,
98
+ "num_frames": 45,
99
+ "height": 720,
100
+ "width": 1280,
101
+ "num_inference_steps": 6,
102
+ "seed": 1024,
103
+ "fps": 24,
104
+ "precision": "bf16",
105
+ "vae_precision": "fp16",
106
+ "vae_tiling": true,
107
+ "vae_sp": true,
108
+ "vae_config": {
109
+ "load_encoder": false,
110
+ "load_decoder": true,
111
+ "tile_sample_min_height": 256,
112
+ "tile_sample_min_width": 256
113
+ },
114
+ "text_encoder_precisions": [
115
+ "fp16",
116
+ "fp16"
117
+ ],
118
+ "mask_strategy_file_path": null,
119
+ "enable_torch_compile": false
120
+ }
121
+ ```
122
+
123
+ Or using YAML format (config.yaml):
124
+
125
+ ```yaml
126
+ model_path: "FastVideo/FastHunyuan-diffusers"
127
+ prompt: "A beautiful woman in a red dress walking down a street"
128
+ output_path: "outputs/"
129
+ num_gpus: 2
130
+ sp_size: 2
131
+ tp_size: 1
132
+ num_frames: 45
133
+ height: 720
134
+ width: 1280
135
+ num_inference_steps: 6
136
+ seed: 1024
137
+ fps: 24
138
+ precision: "bf16"
139
+ vae_precision: "fp16"
140
+ vae_tiling: true
141
+ vae_sp: true
142
+ vae_config:
143
+ load_encoder: false
144
+ load_decoder: true
145
+ tile_sample_min_height: 256
146
+ tile_sample_min_width: 256
147
+ text_encoder_precisions:
148
+ - "fp16"
149
+ - "fp16"
150
+ mask_strategy_file_path: null
151
+ enable_torch_compile: false
152
+ ```
153
+
154
+
155
+ To see all the options, you can use the `--help` flag:
156
+
157
+ ```bash
158
+ sglang generate --help
159
+ ```
160
+
161
+ ## Serve
162
+
163
+ Launch the SGLang diffusion HTTP server and interact with it using the OpenAI SDK and curl.
164
+
165
+ ### Start the server
166
+
167
+ Use the following command to launch the server:
168
+
169
+ ```bash
170
+ SERVER_ARGS=(
171
+ --model-path Wan-AI/Wan2.1-T2V-1.3B-Diffusers
172
+ --text-encoder-cpu-offload
173
+ --pin-cpu-memory
174
+ --num-gpus 4
175
+ --ulysses-degree=2
176
+ --ring-degree=2
177
+ )
178
+
179
+ sglang serve "${SERVER_ARGS[@]}"
180
+ ```
181
+
182
+ - **--model-path**: Which model to load. The example uses `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`.
183
+ - **--port**: HTTP port to listen on (the default here is `30010`).
184
+
185
+ For detailed API usage, including Image, Video Generation and LoRA management, please refer to the [OpenAI API Documentation](openai_api.md).
186
+
187
+ ### Cloud Storage Support
188
+
189
+ SGLang diffusion supports automatically uploading generated images and videos to S3-compatible cloud storage (e.g., AWS S3, MinIO, Alibaba Cloud OSS, Tencent Cloud COS).
190
+
191
+ When enabled, the server follows a **Generate -> Upload -> Delete** workflow:
192
+ 1. The artifact is generated to a temporary local file.
193
+ 2. The file is immediately uploaded to the configured S3 bucket in a background thread.
194
+ 3. Upon successful upload, the local file is deleted.
195
+ 4. The API response returns the public URL of the uploaded object.
196
+
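Steps 2-4 can be sketched as follows, assuming a boto3-style client exposing `upload_file(filename, bucket, key)`; the helper name and the returned URL format are illustrative assumptions, not the actual server code:

```python
import os

def upload_and_cleanup(s3_client, bucket: str, key: str, local_path: str) -> str:
    """Upload a generated artifact, delete the local copy, and return its URL."""
    s3_client.upload_file(local_path, bucket, key)      # step 2: upload in background
    os.remove(local_path)                               # step 3: delete local file
    return f"https://{bucket}.s3.amazonaws.com/{key}"   # step 4: URL for the response
```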
197
+ **Configuration**
198
+
199
+ Cloud storage is enabled via environment variables. Note that `boto3` must be installed separately (`pip install boto3`) to use this feature.
200
+
201
+ ```bash
202
+ # Enable S3 storage
203
+ export SGLANG_CLOUD_STORAGE_TYPE=s3
204
+ export SGLANG_S3_BUCKET_NAME=my-bucket
205
+ export SGLANG_S3_ACCESS_KEY_ID=your-access-key
206
+ export SGLANG_S3_SECRET_ACCESS_KEY=your-secret-key
207
+
208
+ # Optional: Custom endpoint for MinIO/OSS/COS
209
+ export SGLANG_S3_ENDPOINT_URL=https://minio.example.com
210
+ ```
211
+
212
+ See [Environment Variables Documentation](../environment_variables.md) for more details.
213
+
214
+ ## Generate
215
+
216
+ Run a one-off generation task without launching a persistent server.
217
+
218
+ To use it, pass both server arguments and sampling parameters in one command, after the `generate` subcommand, for example:
219
+
220
+ ```bash
221
+ SERVER_ARGS=(
222
+ --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers
223
+ --text-encoder-cpu-offload
224
+ --pin-cpu-memory
225
+ --num-gpus 4
226
+ --ulysses-degree=2
227
+ --ring-degree=2
228
+ )
229
+
230
+ SAMPLING_ARGS=(
231
+ --prompt "A curious raccoon"
232
+ --save-output
233
+ --output-path outputs
234
+ --output-file-name "A curious raccoon.mp4"
235
+ )
236
+
237
+ sglang generate "${SERVER_ARGS[@]}" "${SAMPLING_ARGS[@]}"
238
+
239
+ # Or, users can set `SGLANG_CACHE_DIT_ENABLED` env as `true` to enable cache acceleration
240
+ SGLANG_CACHE_DIT_ENABLED=true sglang generate "${SERVER_ARGS[@]}" "${SAMPLING_ARGS[@]}"
241
+ ```
242
+
243
+ Once the generation task has finished, the server will shut down automatically.
244
+
245
+ > [!NOTE]
246
+ > The HTTP server-related arguments are ignored in this subcommand.
247
+
248
+ ## Component Path Overrides
249
+
250
+ SGLang diffusion allows you to override any pipeline component (e.g., `vae`, `transformer`, `text_encoder`) by specifying a custom checkpoint path. This is useful for swapping in alternative component weights, such as a distilled VAE for faster decoding.
251
+
252
+ ### Example: FLUX.2-dev with Tiny AutoEncoder
253
+
254
+ You can override **any** component by using `--<component>-path`, where `<component>` matches the key in the model's `model_index.json`:
255
+
256
+ For example, replace the default VAE with a distilled tiny autoencoder for ~3x faster decoding:
257
+
258
+ ```bash
+ # with a Huggingface Repo ID
+ sglang serve \
+   --model-path=black-forest-labs/FLUX.2-dev \
+   --vae-path=fal/FLUX.2-Tiny-AutoEncoder
+
+ # or use a local path
+ sglang serve \
+   --model-path=black-forest-labs/FLUX.2-dev \
+   --vae-path=~/.cache/huggingface/hub/models--fal--FLUX.2-Tiny-AutoEncoder/snapshots/.../vae
+ ```
266
+
267
+ **Important:**
268
+ - The component key must match the one in your model's `model_index.json` (e.g., `vae`).
269
+ - The path must:
270
+ - either be a Huggingface Repo ID (e.g., fal/FLUX.2-Tiny-AutoEncoder)
271
+ - or point to a **complete component folder**, containing `config.json` and safetensors files
272
+
273
+
274
+ ## Diffusers Backend
275
+
276
+ SGLang diffusion supports a **diffusers backend** that allows you to run any diffusers-compatible model through SGLang's infrastructure using vanilla diffusers pipelines. This is useful for running models without native SGLang implementations or models with custom pipeline classes.
277
+
278
+ ### Arguments
279
+
280
+ | Argument | Values | Description |
281
+ |----------|--------|-------------|
282
+ | `--backend` | `auto` (default), `sglang`, `diffusers` | `auto`: prefer native SGLang, fallback to diffusers. `sglang`: force native (fails if unavailable). `diffusers`: force vanilla diffusers pipeline. |
283
+ | `--diffusers-attention-backend` | `flash`, `_flash_3_hub`, `sage`, `xformers`, `native` | Attention backend for diffusers pipelines. See [diffusers attention backends](https://huggingface.co/docs/diffusers/main/en/optimization/attention_backends). |
284
+ | `--trust-remote-code` | flag | Required for models with custom pipeline classes (e.g., Ovis). |
285
+ | `--vae-tiling` | flag | Enable VAE tiling for large image support (decodes tile-by-tile). |
286
+ | `--vae-slicing` | flag | Enable VAE slicing for lower memory usage (decodes slice-by-slice). |
287
+ | `--dit-precision` | `fp16`, `bf16`, `fp32` | Precision for the diffusion transformer. |
288
+ | `--vae-precision` | `fp16`, `bf16`, `fp32` | Precision for the VAE. |
289
+ | `--enable-torch-compile` | flag | Enable `torch.compile` for diffusers pipelines. |
290
+ | `--cache-dit-config` | `{PATH}` | Path to a Cache-DiT YAML/JSON config file for accelerating diffusers pipelines with Cache-DiT. |
291
+
292
+ ### Example: Running Ovis-Image-7B
293
+
294
+ [Ovis-Image-7B](https://huggingface.co/AIDC-AI/Ovis-Image-7B) is a 7B text-to-image model optimized for high-quality text rendering.
295
+
296
+ ```bash
297
+ sglang generate \
298
+ --model-path AIDC-AI/Ovis-Image-7B \
299
+ --backend diffusers \
300
+ --trust-remote-code \
301
+ --diffusers-attention-backend flash \
302
+ --prompt "A serene Japanese garden with cherry blossoms" \
303
+ --height 1024 \
304
+ --width 1024 \
305
+ --num-inference-steps 30 \
306
+ --save-output \
307
+ --output-path outputs \
308
+ --output-file-name ovis_garden.png
309
+ ```
310
+
311
+ ### Extra Diffusers Arguments
312
+
313
+ For pipeline-specific parameters not exposed via CLI, use `diffusers_kwargs` in a config file:
314
+
315
+ ```json
316
+ {
317
+ "model_path": "AIDC-AI/Ovis-Image-7B",
318
+ "backend": "diffusers",
319
+ "prompt": "A beautiful landscape",
320
+ "diffusers_kwargs": {
321
+ "cross_attention_kwargs": {"scale": 0.5}
322
+ }
323
+ }
324
+ ```
325
+
326
+ ```bash
327
+ sglang generate --config config.json
328
+ ```
329
+
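The config file above can also be generated programmatically. A minimal sketch that writes the same keys shown above:

```python
import json

config = {
    "model_path": "AIDC-AI/Ovis-Image-7B",
    "backend": "diffusers",
    "prompt": "A beautiful landscape",
    # Pipeline-specific parameters not exposed via CLI go here.
    "diffusers_kwargs": {"cross_attention_kwargs": {"scale": 0.5}},
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=2)
```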
330
+ ### Cache-DiT Acceleration
331
+
332
+ The diffusers backend can also leverage Cache-DiT acceleration, loading custom cache configs from a YAML file to boost pipeline performance. See the [Cache-DiT Acceleration](https://docs.sglang.io/diffusion/performance/cache/cache_dit.html) documentation for details.
sglang/docs/diffusion/api/openai_api.md ADDED
@@ -0,0 +1,420 @@
1
+ # SGLang Diffusion OpenAI API
2
+
3
+ The SGLang diffusion HTTP server implements an OpenAI-compatible API for image and video generation, as well as LoRA adapter management.
4
+
5
+ ## Prerequisites
6
+
7
+ - Python 3.11+ if you plan to use the OpenAI Python SDK.
8
+
9
+ ## Serve
10
+
11
+ Launch the server using the `sglang serve` command.
12
+
13
+ ### Start the server
14
+
15
+ ```bash
16
+ SERVER_ARGS=(
17
+ --model-path Wan-AI/Wan2.1-T2V-1.3B-Diffusers
18
+ --text-encoder-cpu-offload
19
+ --pin-cpu-memory
20
+ --num-gpus 4
21
+ --ulysses-degree=2
22
+ --ring-degree=2
23
+ --port 30010
24
+ )
25
+
26
+ sglang serve "${SERVER_ARGS[@]}"
27
+ ```
28
+
29
+ - **--model-path**: Path to the model or model ID.
30
+ - **--port**: HTTP port to listen on (default: `30000`).
31
+
32
+ **Get Model Information**
33
+
34
+ **Endpoint:** `GET /models`
35
+
36
+ Returns information about the model served by this server, including model path, task type, pipeline configuration, and precision settings.
37
+
38
+ **Curl Example:**
39
+
40
+ ```bash
41
+ curl -sS -X GET "http://localhost:30010/models"
42
+ ```
43
+
44
+ **Response Example:**
45
+
46
+ ```json
47
+ {
48
+ "model_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
49
+ "task_type": "T2V",
50
+ "pipeline_name": "wan_pipeline",
51
+ "pipeline_class": "WanPipeline",
52
+ "num_gpus": 4,
53
+ "dit_precision": "bf16",
54
+ "vae_precision": "fp16"
55
+ }
56
+ ```
57
+
58
+ ---
59
+
60
+ ## Endpoints
61
+
62
+ ### Image Generation
63
+
64
+ The server implements an OpenAI-compatible Images API under the `/v1/images` namespace.
65
+
66
+ **Create an image**
67
+
68
+ **Endpoint:** `POST /v1/images/generations`
69
+
70
+ **Python Example (b64_json response):**
71
+
72
+ ```python
73
+ import base64
74
+ from openai import OpenAI
75
+
76
+ client = OpenAI(api_key="sk-proj-1234567890", base_url="http://localhost:30010/v1")
77
+
78
+ img = client.images.generate(
79
+ prompt="A calico cat playing a piano on stage",
80
+ size="1024x1024",
81
+ n=1,
82
+ response_format="b64_json",
83
+ )
84
+
85
+ image_bytes = base64.b64decode(img.data[0].b64_json)
86
+ with open("output.png", "wb") as f:
87
+ f.write(image_bytes)
88
+ ```
89
+
90
+ **Curl Example:**
91
+
92
+ ```bash
93
+ curl -sS -X POST "http://localhost:30010/v1/images/generations" \
94
+ -H "Content-Type: application/json" \
95
+ -H "Authorization: Bearer sk-proj-1234567890" \
96
+ -d '{
97
+ "prompt": "A calico cat playing a piano on stage",
98
+ "size": "1024x1024",
99
+ "n": 1,
100
+ "response_format": "b64_json"
101
+ }'
102
+ ```
103
+
104
+ > **Note**
105
+ > If `response_format=url` is used and cloud storage is not configured, the API returns
106
+ > a relative URL like `/v1/images/<IMAGE_ID>/content`.
107
+
108
+ **Edit an image**
109
+
110
+ **Endpoint:** `POST /v1/images/edits`
111
+
112
+ This endpoint accepts a multipart form upload with input images and a text prompt. The server can return either a base64-encoded image or a URL to download the image.
113
+
114
+ **Curl Example (b64_json response):**
115
+
116
+ ```bash
117
+ curl -sS -X POST "http://localhost:30010/v1/images/edits" \
118
+ -H "Authorization: Bearer sk-proj-1234567890" \
119
+ -F "image=@local_input_image.png" \
120
+ -F "url=image_url.jpg" \
121
+ -F "prompt=A calico cat playing a piano on stage" \
122
+ -F "size=1024x1024" \
123
+ -F "response_format=b64_json"
124
+ ```
125
+
126
+ **Curl Example (URL response):**
127
+
128
+ ```bash
129
+ curl -sS -X POST "http://localhost:30010/v1/images/edits" \
130
+ -H "Authorization: Bearer sk-proj-1234567890" \
131
+ -F "image=@local_input_image.png" \
132
+ -F "url=image_url.jpg" \
133
+ -F "prompt=A calico cat playing a piano on stage" \
134
+ -F "size=1024x1024" \
135
+ -F "response_format=url"
136
+ ```
137
+
138
+ **Download image content**
139
+
140
+ When `response_format=url` is used with `POST /v1/images/generations` or `POST /v1/images/edits`,
141
+ the API returns a relative URL like `/v1/images/<IMAGE_ID>/content`.
142
+
143
+ **Endpoint:** `GET /v1/images/{image_id}/content`
144
+
145
+ **Curl Example:**
146
+
147
+ ```bash
148
+ curl -sS -L "http://localhost:30010/v1/images/<IMAGE_ID>/content" \
149
+ -H "Authorization: Bearer sk-proj-1234567890" \
150
+ -o output.png
151
+ ```
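Since the returned URL is relative, join it with the server's base URL before downloading. A short sketch (`img_123` is a placeholder image ID):

```python
from urllib.parse import urljoin

base_url = "http://localhost:30010"           # the server address
relative = "/v1/images/img_123/content"       # relative URL returned by the API
download_url = urljoin(base_url, relative)    # absolute URL to fetch
```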
152
+
153
+ ### Video Generation
154
+
155
+ The server implements a subset of the OpenAI Videos API under the `/v1/videos` namespace.
156
+
157
+ **Create a video**
158
+
159
+ **Endpoint:** `POST /v1/videos`
160
+
161
+ **Python Example:**
162
+
163
+ ```python
164
+ from openai import OpenAI
165
+
166
+ client = OpenAI(api_key="sk-proj-1234567890", base_url="http://localhost:30010/v1")
167
+
168
+ video = client.videos.create(
169
+ prompt="A calico cat playing a piano on stage",
170
+ size="1280x720"
171
+ )
172
+ print(f"Video ID: {video.id}, Status: {video.status}")
173
+ ```
174
+
175
+ **Curl Example:**
176
+
177
+ ```bash
178
+ curl -sS -X POST "http://localhost:30010/v1/videos" \
179
+ -H "Content-Type: application/json" \
180
+ -H "Authorization: Bearer sk-proj-1234567890" \
181
+ -d '{
182
+ "prompt": "A calico cat playing a piano on stage",
183
+ "size": "1280x720"
184
+ }'
185
+ ```
186
+
187
+ **List videos**
188
+
189
+ **Endpoint:** `GET /v1/videos`
190
+
191
+ **Python Example:**
192
+
193
+ ```python
194
+ videos = client.videos.list()
195
+ for item in videos.data:
196
+ print(item.id, item.status)
197
+ ```
198
+
199
+ **Curl Example:**
200
+
201
+ ```bash
202
+ curl -sS -X GET "http://localhost:30010/v1/videos" \
203
+ -H "Authorization: Bearer sk-proj-1234567890"
204
+ ```
205
+
206
+ **Download video content**
207
+
208
+ **Endpoint:** `GET /v1/videos/{video_id}/content`
209
+
210
+ **Python Example:**
211
+
212
+ ```python
213
+ import time
+
+ video_id = video.id  # ID returned by the create call above
214
+
215
+ # Poll for completion
216
+ while True:
217
+ page = client.videos.list()
218
+ item = next((v for v in page.data if v.id == video_id), None)
219
+ if item and item.status == "completed":
220
+ break
221
+ time.sleep(5)
222
+
223
+ # Download content
224
+ resp = client.videos.download_content(video_id=video_id)
225
+ with open("output.mp4", "wb") as f:
226
+ f.write(resp.read())
227
+ ```
228
+
229
+ **Curl Example:**
230
+
231
+ ```bash
232
+ curl -sS -L "http://localhost:30010/v1/videos/<VIDEO_ID>/content" \
233
+ -H "Authorization: Bearer sk-proj-1234567890" \
234
+ -o output.mp4
235
+ ```
236
+
237
+ ---
238
+
239
+ ### LoRA Management
240
+
241
+ The server supports dynamic loading, merging, and unmerging of LoRA adapters.
242
+
243
+ **Important Notes:**
244
+ - Mutual Exclusion: Only one LoRA configuration (a single adapter or one set of adapters) can be *merged* (active) at a time
245
+ - Switching: To switch LoRAs, you must first `unmerge` the current one, then `set` the new one
246
+ - Caching: The server caches loaded LoRA weights in memory. Switching back to a previously loaded LoRA (same path) has little cost
247
+
248
+ **Set LoRA Adapter**
249
+
250
+ Loads one or more LoRA adapters and merges their weights into the model. Supports both single LoRA (backward compatible) and multiple LoRA adapters.
251
+
252
+ **Endpoint:** `POST /v1/set_lora`
253
+
254
+ **Parameters:**
255
+ - `lora_nickname` (string or list of strings, required): A unique identifier for the LoRA adapter(s). Can be a single string or a list of strings for multiple LoRAs
256
+ - `lora_path` (string or list of strings/None, optional): Path to the `.safetensors` file(s) or Hugging Face repo ID(s). Required for the first load; optional if re-activating a cached nickname. If a list, must match the length of `lora_nickname`
257
+ - `target` (string or list of strings, optional): Which transformer(s) to apply the LoRA to. If a list, must match the length of `lora_nickname`. Valid values:
258
+ - `"all"` (default): Apply to all transformers
259
+ - `"transformer"`: Apply only to the primary transformer (high noise for Wan2.2)
260
+ - `"transformer_2"`: Apply only to transformer_2 (low noise for Wan2.2)
261
+ - `"critic"`: Apply only to the critic model
262
+ - `strength` (float or list of floats, optional): LoRA strength for merge, default 1.0. If a list, must match the length of `lora_nickname`. Values < 1.0 reduce the effect, values > 1.0 amplify the effect
263
+
264
+ **Single LoRA Example:**
265
+
266
+ ```bash
267
+ curl -X POST http://localhost:30010/v1/set_lora \
268
+ -H "Content-Type: application/json" \
269
+ -d '{
270
+ "lora_nickname": "lora_name",
271
+ "lora_path": "/path/to/lora.safetensors",
272
+ "target": "all",
273
+ "strength": 0.8
274
+ }'
275
+ ```
276
+
277
+ **Multiple LoRA Example:**
278
+
279
+ ```bash
280
+ curl -X POST http://localhost:30010/v1/set_lora \
281
+ -H "Content-Type: application/json" \
282
+ -d '{
283
+ "lora_nickname": ["lora_1", "lora_2"],
284
+ "lora_path": ["/path/to/lora1.safetensors", "/path/to/lora2.safetensors"],
285
+ "target": ["transformer", "transformer_2"],
286
+ "strength": [0.8, 1.0]
287
+ }'
288
+ ```
289
+
290
+ **Multiple LoRA with Same Target:**
291
+
292
+ ```bash
293
+ curl -X POST http://localhost:30010/v1/set_lora \
294
+ -H "Content-Type: application/json" \
295
+ -d '{
296
+ "lora_nickname": ["style_lora", "character_lora"],
297
+ "lora_path": ["/path/to/style.safetensors", "/path/to/character.safetensors"],
298
+ "target": "all",
299
+ "strength": [0.7, 0.9]
300
+ }'
301
+ ```
302
+
303
+ > [!NOTE]
304
+ > When using multiple LoRAs:
305
+ > - All list parameters (`lora_nickname`, `lora_path`, `target`, `strength`) must have the same length
306
+ > - If `target` or `strength` is a single value, it will be applied to all LoRAs
307
+ > - Multiple LoRAs applied to the same target will be merged in order
308
+
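The length-matching and broadcasting rules above can be sketched as a small client-side normalizer. This helper is hypothetical, for illustration only; it is not part of the server.

```python
def normalize_lora_params(nicknames, paths=None, target="all", strength=1.0):
    """Normalize single-or-list LoRA parameters per the rules above."""
    if isinstance(nicknames, str):
        nicknames = [nicknames]
    n = len(nicknames)

    def broadcast(value, default):
        # A single value applies to all LoRAs; a list must match in length.
        if value is None:
            value = default
        if not isinstance(value, list):
            return [value] * n
        if len(value) != n:
            raise ValueError("list parameters must match len(lora_nickname)")
        return value

    return {
        "lora_nickname": nicknames,
        "lora_path": broadcast(paths, None),
        "target": broadcast(target, "all"),
        "strength": broadcast(strength, 1.0),
    }
```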
309
+
310
+ **Merge LoRA Weights**
311
+
312
+ Manually merges the currently set LoRA weights into the base model.
313
+
314
+ > [!NOTE]
315
+ > `set_lora` automatically performs a merge, so this is typically only needed if you have manually unmerged but want to re-apply the same LoRA without calling `set_lora` again.
316
+
317
+ **Endpoint:** `POST /v1/merge_lora_weights`
318
+
319
+ **Parameters:**
320
+ - `target` (string, optional): Which transformer(s) to merge. One of "all" (default), "transformer", "transformer_2", "critic"
321
+ - `strength` (float, optional): LoRA strength for merge, default 1.0. Values < 1.0 reduce the effect, values > 1.0 amplify the effect
322
+
323
+ **Curl Example:**
324
+
325
+ ```bash
326
+ curl -X POST http://localhost:30010/v1/merge_lora_weights \
327
+ -H "Content-Type: application/json" \
328
+ -d '{"strength": 0.8}'
329
+ ```
330
+
331
+
332
+ **Unmerge LoRA Weights**
333
+
334
+ Unmerges the currently active LoRA weights from the base model, restoring it to its original state. This **must** be called before setting a different LoRA.
335
+
336
+ **Endpoint:** `POST /v1/unmerge_lora_weights`
337
+
338
+ **Curl Example:**
339
+
340
+ ```bash
341
+ curl -X POST http://localhost:30010/v1/unmerge_lora_weights \
342
+ -H "Content-Type: application/json"
343
+ ```
344
+
345
+ **List LoRA Adapters**
346
+
347
+ Returns loaded LoRA adapters and current application status per module.
348
+
349
+ **Endpoint:** `GET /v1/list_loras`
350
+
351
+ **Curl Example:**
352
+
353
+ ```bash
354
+ curl -sS -X GET "http://localhost:30010/v1/list_loras"
355
+ ```
356
+
357
+ **Response Example:**
358
+
359
+ ```json
360
+ {
361
+ "loaded_adapters": [
362
+ { "nickname": "lora_a", "path": "/weights/lora_a.safetensors" },
363
+ { "nickname": "lora_b", "path": "/weights/lora_b.safetensors" }
364
+ ],
365
+ "active": {
366
+ "transformer": [
367
+ {
368
+ "nickname": "lora2",
369
+ "path": "tarn59/pixel_art_style_lora_z_image_turbo",
370
+ "merged": true,
371
+ "strength": 1.0
372
+ }
373
+ ]
374
+ }
375
+ }
376
+ ```
377
+
378
+ Notes:
379
+ - If LoRA is not enabled for the current pipeline, the server will return an error.
380
+ - `num_lora_layers_with_weights` counts only layers that have LoRA weights applied for the active adapter.
381
+
382
+ ### Example: Switching LoRAs
383
+
384
+ 1. Set LoRA A:
385
+ ```bash
386
+ curl -X POST http://localhost:30010/v1/set_lora -d '{"lora_nickname": "lora_a", "lora_path": "path/to/A"}'
387
+ ```
388
+ 2. Generate with LoRA A...
389
+ 3. Unmerge LoRA A:
390
+ ```bash
391
+ curl -X POST http://localhost:30010/v1/unmerge_lora_weights
392
+ ```
393
+ 4. Set LoRA B:
394
+ ```bash
395
+ curl -X POST http://localhost:30010/v1/set_lora -d '{"lora_nickname": "lora_b", "lora_path": "path/to/B"}'
396
+ ```
397
+ 5. Generate with LoRA B...
398
+
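The unmerge-then-set sequence above can be expressed as a small client-side helper that builds the two requests. The helper is illustrative; sending the payloads is left to your HTTP client.

```python
def lora_switch_requests(new_nickname, new_path):
    """The unmerge-then-set sequence, as (endpoint, JSON payload) pairs."""
    return [
        ("POST /v1/unmerge_lora_weights", {}),
        ("POST /v1/set_lora", {"lora_nickname": new_nickname,
                               "lora_path": new_path}),
    ]
```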
399
+ ### Adjust Output Quality
400
+
401
+ The server supports adjusting output quality and compression levels for both image and video generation through the `output-quality` and `output-compression` parameters.
402
+
403
+ #### Parameters
404
+
405
+ - **`output-quality`** (string, optional): Preset quality level that automatically sets compression. **Default is `"default"`**. Valid values:
406
+ - `"maximum"`: Highest quality (100)
407
+ - `"high"`: High quality (90)
408
+ - `"medium"`: Medium quality (55)
409
+ - `"low"`: Lower quality (35)
410
+ - `"default"`: Auto-adjust based on media type (50 for video, 75 for image)
411
+
412
+ - **`output-compression`** (integer, optional): Direct compression level override (0-100). **Default is `None`**. When provided (not `None`), takes precedence over `output-quality`.
413
+ - `0`: Lowest quality, smallest file size
414
+ - `100`: Highest quality, largest file size
415
+
416
+ #### Notes
417
+
418
+ - **Precedence**: When both `output-quality` and `output-compression` are provided, `output-compression` takes precedence
419
+ - **Format Support**: Quality settings apply to JPEG and video formats. PNG uses lossless compression and ignores these settings
420
+ - **File Size vs Quality**: Lower compression values (or the "low" quality preset) produce smaller files but may show visible artifacts
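The precedence and preset mapping above can be sketched as follows (an illustrative helper, not the server's actual code):

```python
PRESETS = {"maximum": 100, "high": 90, "medium": 55, "low": 35}

def effective_compression(quality="default", compression=None, media="image"):
    """Resolve the final 0-100 level per the documented precedence."""
    if compression is not None:            # explicit override wins
        return compression
    if quality == "default":               # auto-adjust by media type
        return 50 if media == "video" else 75
    return PRESETS[quality]
```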
sglang/docs/diffusion/ci_perf.md ADDED
@@ -0,0 +1,29 @@
1
+ ## Perf Baseline Generation Script
2
+
3
+ `python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py` starts a local diffusion server, issues requests for selected test cases, aggregates stage/denoise-step/E2E timings from the perf log, and writes the results back to the `scenarios` section of `perf_baselines.json`.
4
+
5
+ ### Usage
6
+
7
+ Update a single case:
8
+
9
+ ```bash
10
+ python python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py --case qwen_image_t2i
11
+ ```
12
+
13
+ Select by regex:
14
+
15
+ ```bash
16
+ python python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py --match 'qwen_image_.*'
17
+ ```
18
+
19
+ Run all keys from the baseline file `scenarios`:
20
+
21
+ ```bash
22
+ python python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py --all-from-baseline
23
+ ```
24
+
25
+ Specify input/output paths and timeout:
26
+
27
+ ```bash
28
+ python python/sglang/multimodal_gen/test/scripts/gen_perf_baselines.py --baseline python/sglang/multimodal_gen/test/server/perf_baselines.json --out /tmp/perf_baselines.json --timeout 600
29
+ ```
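The `--match` selection can be pictured as a regex filter over the `scenarios` keys. This is an illustrative sketch; see the script itself for the exact behavior.

```python
import re

def select_cases(scenarios, pattern):
    """Return the scenario names that fully match the given regex."""
    return [name for name in scenarios if re.fullmatch(pattern, name)]
```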
sglang/docs/diffusion/compatibility_matrix.md ADDED
@@ -0,0 +1,78 @@
1
+ # Compatibility Matrix
2
+
3
+ The table below shows every supported model and the optimizations supported for them.
4
+
5
+ The symbols used have the following meanings:
6
+
7
+ - ✅ = Full compatibility
8
+ - ❌ = No compatibility
9
+ - ⭕ = Does not apply to this model
10
+
11
+ ## Models x Optimization
12
+
13
+ The `HuggingFace Model ID` can be passed directly to `from_pretrained()` methods, and sglang-diffusion will use the
14
+ optimal
15
+ default parameters when initializing and generating videos.
16
+
17
+ ### Video Generation Models
18
+
19
+ | Model Name | Hugging Face Model ID | Resolutions | TeaCache | Sliding Tile Attn | Sage Attn | Video Sparse Attention (VSA) | Sparse Linear Attention (SLA) | Sage Sparse Linear Attention (SageSLA) | Sparse Video Gen 2 (SVG2) |
20
+ |:-----------------------------|:--------------------------------------------------|:--------------------|:--------:|:-----------------:|:---------:|:----------------------------:|:----------------------------:|:-----------------------------------------------:|:----------------------------------:|
21
+ | FastWan2.1 T2V 1.3B | `FastVideo/FastWan2.1-T2V-1.3B-Diffusers` | 480p | ⭕ | ⭕ | ⭕ | ✅ | ❌ | ❌ | ❌ |
22
+ | FastWan2.2 TI2V 5B Full Attn | `FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers` | 720p | ⭕ | ⭕ | ⭕ | ✅ | ❌ | ❌ | ❌ |
23
+ | Wan2.2 TI2V 5B | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | 720p | ⭕ | ⭕ | ✅ | ⭕ | ❌ | ❌ | ❌ |
24
+ | Wan2.2 T2V A14B | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | 480p<br>720p | ❌ | ❌ | ✅ | ⭕ | ❌ | ❌ | ❌ |
25
+ | Wan2.2 I2V A14B | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | 480p<br>720p | ❌ | ❌ | ✅ | ⭕ | ❌ | ❌ | ❌ |
26
+ | HunyuanVideo | `hunyuanvideo-community/HunyuanVideo` | 720×1280<br>544×960 | ❌ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
27
+ | FastHunyuan | `FastVideo/FastHunyuan-diffusers` | 720×1280<br>544×960 | ❌ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
28
+ | Wan2.1 T2V 1.3B | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | 480p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
29
+ | Wan2.1 T2V 14B | `Wan-AI/Wan2.1-T2V-14B-Diffusers` | 480p, 720p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
30
+ | Wan2.1 I2V 480P | `Wan-AI/Wan2.1-I2V-14B-480P-Diffusers` | 480p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
31
+ | Wan2.1 I2V 720P | `Wan-AI/Wan2.1-I2V-14B-720P-Diffusers` | 720p | ✅ | ✅ | ✅ | ⭕ | ❌ | ❌ | ✅ |
32
+ | TurboWan2.1 T2V 1.3B | `IPostYellow/TurboWan2.1-T2V-1.3B-Diffusers` | 480p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ |
33
+ | TurboWan2.1 T2V 14B | `IPostYellow/TurboWan2.1-T2V-14B-Diffusers` | 480p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ |
34
+ | TurboWan2.1 T2V 14B 720P | `IPostYellow/TurboWan2.1-T2V-14B-720P-Diffusers` | 720p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ |
35
+ | TurboWan2.2 I2V A14B | `IPostYellow/TurboWan2.2-I2V-A14B-Diffusers` | 720p | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ⭕ |
36
+
37
+ **Note**:
38
+ 1.Wan2.2 TI2V 5B has some quality issues when performing I2V generation. We are working on fixing this issue.
39
+ 2.SageSLA Based on SpargeAttn. Install it first with `pip install git+https://github.com/thu-ml/SpargeAttn.git --no-build-isolation`
40
+
41
+ ### Image Generation Models
42
+
43
+ | Model Name | HuggingFace Model ID | Resolutions |
44
+ |:-----------------|:----------------------------------------|:---------------|
45
+ | FLUX.1-dev | `black-forest-labs/FLUX.1-dev` | Any resolution |
46
+ | FLUX.2-dev | `black-forest-labs/FLUX.2-dev` | Any resolution |
47
+ | FLUX.2-Klein | `black-forest-labs/FLUX.2-klein-4B` | Any resolution |
48
+ | Z-Image-Turbo | `Tongyi-MAI/Z-Image-Turbo` | Any resolution |
49
+ | GLM-Image | `zai-org/GLM-Image` | Any resolution |
50
+ | Qwen Image | `Qwen/Qwen-Image` | Any resolution |
51
+ | Qwen Image 2512 | `Qwen/Qwen-Image-2512` | Any resolution |
52
+ | Qwen Image Edit | `Qwen/Qwen-Image-Edit` | Any resolution |
53
+
54
+ ## Verified LoRA Examples
55
+
56
+ This section lists example LoRAs that have been explicitly tested and verified with each base model in the **SGLang Diffusion** pipeline.
57
+
58
+ > Important:
59
+ > LoRAs that are not listed here are not necessarily incompatible.
60
+ > In practice, most standard LoRAs are expected to work, especially those following common Diffusers or SD-style conventions.
61
+ > The entries below simply reflect configurations that have been manually validated by the SGLang team.
62
+
63
+ ### Verified LoRAs by Base Model
64
+
65
+ | Base Model | Supported LoRAs |
66
+ |:-----------------|:----------------|
67
+ | Wan2.2 | `lightx2v/Wan2.2-Distill-Loras`<br>`Cseti/wan2.2-14B-Arcane_Jinx-lora-v1` |
68
+ | Wan2.1 | `lightx2v/Wan2.1-Distill-Loras` |
69
+ | Z-Image-Turbo | `tarn59/pixel_art_style_lora_z_image_turbo`<br>`wcde/Z-Image-Turbo-DeJPEG-Lora` |
70
+ | Qwen-Image | `lightx2v/Qwen-Image-Lightning`<br>`flymy-ai/qwen-image-realism-lora`<br>`prithivMLmods/Qwen-Image-HeadshotX`<br>`starsfriday/Qwen-Image-EVA-LoRA` |
71
+ | Qwen-Image-Edit | `ostris/qwen_image_edit_inpainting`<br>`lightx2v/Qwen-Image-Edit-2511-Lightning` |
72
+ | Flux | `dvyio/flux-lora-simple-illustration`<br>`XLabs-AI/flux-furry-lora`<br>`XLabs-AI/flux-RealismLora` |
73
+
74
+ ## Special requirements
75
+
76
+ ### Sliding Tile Attention
77
+
78
+ - Currently, only Hopper GPUs (H100s) are supported.
sglang/docs/diffusion/contributing.md ADDED
@@ -0,0 +1,67 @@
1
+ # Contributing to SGLang Diffusion
2
+
3
+ This guide outlines the requirements for contributing to the SGLang Diffusion module (`sglang.multimodal_gen`).
4
+
5
+ ## On AI-Assisted ("Vibe Coding") PRs
6
+
7
+ Vibe-coded PRs are welcome — we judge code quality, not how it was produced. The bar is the same for all PRs:
8
+
9
+ - **No over-commenting.** If the name says it all, skip the docstring.
10
+ - **No over-catching.** Don't guard against errors that virtually never happen in practice.
11
+ - **Test before submitting.** AI-generated code can be subtly wrong — verify correctness end-to-end.
12
+
13
+ ## Commit Message Convention
14
+
15
+ We follow a structured commit message format to maintain a clean history.
16
+
17
+ **Format:**
18
+ ```text
19
+ [diffusion] <scope>: <subject>
20
+ ```
21
+
22
+ **Examples:**
23
+ - `[diffusion] cli: add --perf-dump-path argument`
24
+ - `[diffusion] scheduler: fix deadlock in batch processing`
25
+ - `[diffusion] model: support Stable Diffusion 3.5`
26
+
27
+ **Rules:**
28
+ - **Prefix**: Always start with `[diffusion]`.
29
+ - **Scope** (Optional): `cli`, `scheduler`, `model`, `pipeline`, `docs`, etc.
30
+ - **Subject**: Imperative mood, short and clear (e.g., "add feature" not "added feature").
31
+
32
+ ## Performance Reporting
33
+
34
+ For PRs that impact **latency**, **throughput**, or **memory usage**, you **should** provide a performance comparison report.
35
+
36
+ ### How to Generate a Report
37
+
38
+ 1. **Baseline**: run the benchmark (for a single generation task)
39
+ ```bash
40
+ $ sglang generate --model-path <model> --prompt "A benchmark prompt" --perf-dump-path baseline.json
41
+ ```
42
+
43
+ 2. **New**: run the same benchmark, without modifying any server_args or sampling_params
44
+ ```bash
45
+ $ sglang generate --model-path <model> --prompt "A benchmark prompt" --perf-dump-path new.json
46
+ ```
47
+
48
+ 3. **Compare**: run the compare script, which will print a Markdown table to the console
49
+ ```bash
50
+ $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json [new2.json ...]
51
+ ### Performance Comparison Report
52
+ ...
53
+ ```
54
+ 4. **Paste**: paste the table into the PR description
55
+
56
+ ## CI-Based Change Protection
57
+
58
+ Consider adding tests to the `pr-test` or `nightly-test` suites to safeguard your changes, especially for PRs that:
59
+
60
+ - support a new model
61
+ - add a testcase for this new model to `testcase_configs.py`
62
+ - support or fix important features
63
+ - significantly improve performance
64
+
65
+ Please run the according testcase, then update/add the baseline to `perf_baselines.json` by following the instruction in console if applicable.
66
+
67
+ See [test](https://github.com/sgl-project/sglang/tree/main/python/sglang/multimodal_gen/test) for examples
sglang/docs/diffusion/environment_variables.md ADDED
1
+ ## Caching Acceleration
2
+
3
+ These variables configure caching acceleration for Diffusion Transformer (DiT) models.
4
+ SGLang supports multiple caching strategies - see [caching documentation](performance/cache/index.md) for an overview.
5
+
6
+ ### Cache-DiT Configuration
7
+
8
+ See [cache-dit documentation](performance/cache/cache_dit.md) for detailed configuration.
9
+
10
+ | Environment Variable | Default | Description |
11
+ |-------------------------------------|---------|------------------------------------------|
12
+ | `SGLANG_CACHE_DIT_ENABLED` | false | Enable Cache-DiT acceleration |
13
+ | `SGLANG_CACHE_DIT_FN` | 1 | First N blocks to always compute |
14
+ | `SGLANG_CACHE_DIT_BN` | 0 | Last N blocks to always compute |
15
+ | `SGLANG_CACHE_DIT_WARMUP` | 4 | Warmup steps before caching |
16
+ | `SGLANG_CACHE_DIT_RDT` | 0.24 | Residual difference threshold |
17
+ | `SGLANG_CACHE_DIT_MC` | 3 | Max continuous cached steps |
18
+ | `SGLANG_CACHE_DIT_TAYLORSEER` | false | Enable TaylorSeer calibrator |
19
+ | `SGLANG_CACHE_DIT_TS_ORDER` | 1 | TaylorSeer order (1 or 2) |
20
+ | `SGLANG_CACHE_DIT_SCM_PRESET` | none | SCM preset (none/slow/medium/fast/ultra) |
21
+ | `SGLANG_CACHE_DIT_SCM_POLICY` | dynamic | SCM caching policy |
22
+ | `SGLANG_CACHE_DIT_SCM_COMPUTE_BINS` | not set | Custom SCM compute bins |
23
+ | `SGLANG_CACHE_DIT_SCM_CACHE_BINS` | not set | Custom SCM cache bins |
24
+
25
+ ## Cloud Storage
26
+
27
+ These variables configure S3-compatible cloud storage for automatically uploading generated images and videos.
28
+
29
+ | Environment Variable | Default | Description |
30
+ |---------------------------------|---------|--------------------------------------------------------|
31
+ | `SGLANG_CLOUD_STORAGE_TYPE` | not set | Set to `s3` to enable cloud storage |
32
+ | `SGLANG_S3_BUCKET_NAME` | not set | The name of the S3 bucket |
33
+ | `SGLANG_S3_ENDPOINT_URL` | not set | Custom endpoint URL (for MinIO, OSS, etc.) |
34
+ | `SGLANG_S3_REGION_NAME` | us-east-1 | AWS region name |
35
+ | `SGLANG_S3_ACCESS_KEY_ID` | not set | AWS Access Key ID |
36
+ | `SGLANG_S3_SECRET_ACCESS_KEY` | not set | AWS Secret Access Key |
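A sketch of how these variables resolve into an S3 configuration (the helper and dict layout are illustrative, not SGLang's internal API):

```python
import os

def s3_config(env=os.environ):
    """Return S3 settings, or None when cloud storage is disabled."""
    if env.get("SGLANG_CLOUD_STORAGE_TYPE") != "s3":
        return None
    return {
        "bucket": env["SGLANG_S3_BUCKET_NAME"],          # required
        "endpoint_url": env.get("SGLANG_S3_ENDPOINT_URL"),
        "region": env.get("SGLANG_S3_REGION_NAME", "us-east-1"),
    }
```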
sglang/docs/diffusion/index.md ADDED
@@ -0,0 +1,98 @@
1
+ # SGLang Diffusion
2
+
3
+ SGLang Diffusion is an inference framework for accelerated image and video generation using diffusion models. It provides an end-to-end unified pipeline with optimized kernels and an efficient scheduler loop.
4
+
5
+ ## Key Features
6
+
7
+ - **Broad Model Support**: Wan series, FastWan series, Hunyuan, Qwen-Image, Qwen-Image-Edit, Flux, Z-Image, GLM-Image, and more
8
+ - **Fast Inference**: Optimized kernels, efficient scheduler loop, and Cache-DiT acceleration
9
+ - **Ease of Use**: OpenAI-compatible API, CLI, and Python SDK
10
+ - **Multi-Platform**: NVIDIA GPUs (H100, H200, A100, B200, 4090), AMD GPUs (MI300X, MI325X) and Ascend NPU (A2, A3)
11
+
12
+ ---
13
+
14
+ ## Quick Start
15
+
16
+ ### Installation
17
+
18
+ ```bash
19
+ uv pip install "sglang[diffusion]" --prerelease=allow
20
+ ```
21
+
22
+ See [Installation Guide](installation.md) for more installation methods and ROCm-specific instructions.
23
+
24
+ ### Basic Usage
25
+
26
+ Generate an image with the CLI:
27
+
28
+ ```bash
29
+ sglang generate --model-path Qwen/Qwen-Image \
30
+ --prompt "A beautiful sunset over the mountains" \
31
+ --save-output
32
+ ```
33
+
34
+ Or start a server with the OpenAI-compatible API:
35
+
36
+ ```bash
37
+ sglang serve --model-path Qwen/Qwen-Image --port 30010
38
+ ```
39
+
40
+ ---
41
+
42
+ ## Documentation
43
+
44
+ ### Getting Started
45
+
46
+ - **[Installation](installation.md)** - Install SGLang Diffusion via pip, uv, Docker, or from source
47
+ - **[Compatibility Matrix](compatibility_matrix.md)** - Supported models and optimization compatibility
48
+
49
+ ### Usage
50
+
51
+ - **[CLI Documentation](api/cli.md)** - Command-line interface for `sglang generate` and `sglang serve`
52
+ - **[OpenAI API](api/openai_api.md)** - OpenAI-compatible API for image/video generation and LoRA management
53
+
54
+ ### Performance Optimization
55
+
56
+ - **[Performance Overview](performance/index.md)** - Overview of all performance optimization strategies
57
+ - **[Attention Backends](performance/attention_backends.md)** - Available attention backends (FlashAttention, SageAttention, etc.)
58
+ - **[Caching Strategies](performance/cache/)** - Cache-DiT and TeaCache acceleration
59
+ - **[Profiling](performance/profiling.md)** - Profiling techniques with PyTorch Profiler and Nsight Systems
60
+
61
+ ### Reference
62
+
63
+ - **[Environment Variables](environment_variables.md)** - Configuration via environment variables
64
+ - **[Support New Models](support_new_models.md)** - Guide for adding new diffusion models
65
+ - **[Contributing](contributing.md)** - Contribution guidelines and commit message conventions
66
+ - **[CI Performance](ci_perf.md)** - Performance baseline generation script
67
+
68
+ ---
69
+
70
+ ## CLI Quick Reference
71
+
72
+ ### Generate (one-off generation)
73
+
74
+ ```bash
75
+ sglang generate --model-path <MODEL> --prompt "<PROMPT>" --save-output
76
+ ```
77
+
78
+ ### Serve (HTTP server)
79
+
80
+ ```bash
81
+ sglang serve --model-path <MODEL> --port 30010
82
+ ```
83
+
84
+ ### Enable Cache-DiT acceleration
85
+
86
+ ```bash
87
+ SGLANG_CACHE_DIT_ENABLED=true sglang generate --model-path <MODEL> --prompt "<PROMPT>"
88
+ ```
89
+
90
+ ---
91
+
92
+ ## References
93
+
94
+ - [SGLang GitHub](https://github.com/sgl-project/sglang)
95
+ - [Cache-DiT](https://github.com/vipshop/cache-dit)
96
+ - [FastVideo](https://github.com/hao-ai-lab/FastVideo)
97
+ - [xDiT](https://github.com/xdit-project/xDiT)
98
+ - [Diffusers](https://github.com/huggingface/diffusers)
sglang/docs/diffusion/installation.md ADDED
@@ -0,0 +1,95 @@
1
+ # Install SGLang-Diffusion
2
+
3
+ You can install SGLang-Diffusion using one of the methods below.
4
+
5
+ ## Standard Installation (NVIDIA GPUs)
6
+
7
+ ### Method 1: With pip or uv
8
+
9
+ It is recommended to use uv for a faster installation:
10
+
11
+ ```bash
12
+ pip install --upgrade pip
13
+ pip install uv
14
+ uv pip install "sglang[diffusion]" --prerelease=allow
15
+ ```
16
+
17
+ ### Method 2: From source
18
+
19
+ ```bash
20
+ # Use the latest release branch
21
+ git clone https://github.com/sgl-project/sglang.git
22
+ cd sglang
23
+
24
+ # Install the Python packages
25
+ pip install --upgrade pip
26
+ pip install -e "python[diffusion]"
27
+
28
+ # With uv
29
+ uv pip install -e "python[diffusion]" --prerelease=allow
30
+ ```
31
+
32
+ ### Method 3: Using Docker
33
+
34
+ The Docker images are available on Docker Hub at [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang), built from the [Dockerfile](https://github.com/sgl-project/sglang/blob/main/docker/Dockerfile).
35
+ Replace `<secret>` below with your HuggingFace Hub [token](https://huggingface.co/docs/hub/en/security-tokens).
36
+
37
+ ```bash
38
+ docker run --gpus all \
39
+ --shm-size 32g \
40
+ -p 30000:30000 \
41
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
42
+ --env "HF_TOKEN=<secret>" \
43
+ --ipc=host \
44
+ lmsysorg/sglang:dev \
45
+ zsh -c '\
46
+ echo "Installing diffusion dependencies..." && \
47
+ pip install -e "python[diffusion]" && \
48
+ echo "Starting SGLang-Diffusion..." && \
49
+ sglang generate \
50
+ --model-path black-forest-labs/FLUX.1-dev \
51
+ --prompt "A logo With Bold Large text: SGL Diffusion" \
52
+ --save-output \
53
+ '
54
+ ```
55
+
56
+ ## Platform-Specific: ROCm (AMD GPUs)
57
+
58
+ For AMD Instinct GPUs (e.g., MI300X), you can use the ROCm-enabled Docker image:
59
+
60
+ ```bash
61
+ docker run --device=/dev/kfd --device=/dev/dri --ipc=host \
62
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
63
+ --env HF_TOKEN=<secret> \
64
+ lmsysorg/sglang:v0.5.5.post2-rocm700-mi30x \
65
+ sglang generate --model-path black-forest-labs/FLUX.1-dev --prompt "A logo With Bold Large text: SGL Diffusion" --save-output
66
+ ```
67
+
68
+ For detailed ROCm system configuration and installation from source, see [AMD GPUs](../platforms/amd_gpu.md).
69
+
70
+ ## Platform-Specific: MUSA (Moore Threads GPUs)
71
+
72
+ For Moore Threads GPUs (MTGPU) with the MUSA software stack:
73
+
74
+ ```bash
75
+ # Clone the repository
76
+ git clone https://github.com/sgl-project/sglang.git
77
+ cd sglang
78
+
79
+ # Install the Python packages
80
+ pip install --upgrade pip
81
+ rm -f python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml
82
+ pip install -e "python[all_musa]"
83
+ ```
84
+
85
+ ## Platform-Specific: Ascend NPU
86
+
87
+ For Ascend NPU, please follow the [NPU installation guide](../platforms/ascend_npu.md).
88
+
89
+ Quick test:
90
+
91
+ ```bash
92
+ sglang generate --model-path black-forest-labs/FLUX.1-dev \
93
+ --prompt "A logo With Bold Large text: SGL Diffusion" \
94
+ --save-output
95
+ ```
sglang/docs/diffusion/performance/attention_backends.md ADDED
@@ -0,0 +1,131 @@
1
+ # Attention Backends
2
+
3
+ This document describes the attention backends available in SGLang Diffusion (`sglang.multimodal_gen`) and how to select them.
4
+
5
+ ## Overview
6
+
7
+ Attention backends are defined by `AttentionBackendEnum` (`sglang.multimodal_gen.runtime.platforms.interface.AttentionBackendEnum`) and selected via the CLI flag `--attention-backend`.
8
+
9
+ Backend selection is performed by the shared attention layers (e.g. `LocalAttention` / `USPAttention` / `UlyssesAttention` in `sglang.multimodal_gen.runtime.layers.attention.layer`) and therefore applies to any model component using these layers (e.g. diffusion transformer / DiT and encoders).
10
+
11
+ When using the diffusers backend, `--attention-backend` is passed through to diffusers'
12
+ `set_attention_backend` (e.g., `flash`, `_flash_3_hub`, `sage`, `xformers`, `native`).
13
+
14
+ - **CUDA**: prefers FlashAttention (FA3/FA4) when supported; otherwise falls back to PyTorch SDPA.
15
+ - **ROCm**: uses FlashAttention when available; otherwise falls back to PyTorch SDPA.
16
+ - **MPS**: always uses PyTorch SDPA.
17
+ - **NPU**: always uses PyTorch SDPA.
18
+
19
+ ## Backend options
20
+
21
+ For SGLang-native pipelines, the CLI accepts the lowercase names of `AttentionBackendEnum`. The table below lists the backends implemented by the built-in platforms. `fa3`/`fa4` are accepted as aliases for `fa`.
22
+
23
+ | CLI value | Enum value | Notes |
24
+ |---|---|---|
25
+ | `fa` / `fa3` / `fa4` | `FA` | FlashAttention. `fa3/fa4` are normalized to `fa` during argument parsing (`ServerArgs.__post_init__`). |
26
+ | `torch_sdpa` | `TORCH_SDPA` | PyTorch `scaled_dot_product_attention`. |
27
+ | `sliding_tile_attn` | `SLIDING_TILE_ATTN` | Sliding Tile Attention (STA). Requires `st_attn`. Configure via `--attention-backend-config`. |
28
+ | `sage_attn` | `SAGE_ATTN` | Requires `sageattention`. Upstream SageAttention CUDA extensions target SM80/SM86/SM89/SM90/SM120 (compute capability 8.0/8.6/8.9/9.0/12.0); see upstream `setup.py`: https://github.com/thu-ml/SageAttention/blob/main/setup.py. |
29
+ | `sage_attn_3` | `SAGE_ATTN_3` | Requires SageAttention3 installed per upstream instructions. |
30
+ | `video_sparse_attn` | `VIDEO_SPARSE_ATTN` | Requires `vsa`. Configure `sparsity` via `--attention-backend-config`. |
31
+ | `vmoba_attn` | `VMOBA_ATTN` | Requires `kernel.attn.vmoba_attn.vmoba`. Configure via `--attention-backend-config`. |
32
+ | `aiter` | `AITER` | Requires `aiter`. |
33
+ | `sparse_video_gen_2_attn` | `SPARSE_VIDEO_GEN_2_ATTN` | Requires `svg`. See installation instructions at https://github.com/svg-project/Sparse-VideoGen. |
34
+
35
+ ## Selection priority
36
+
37
+ The selection order in `runtime/layers/attention/selector.py` is:
38
+
39
+ 1. `global_force_attn_backend(...)` / `global_force_attn_backend_context_manager(...)`
40
+ 2. CLI `--attention-backend` (`ServerArgs.attention_backend`)
41
+ 3. Auto selection (platform capability, dtype, and installed packages)
42
+
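The priority above can be sketched as a simple resolution function (illustrative only; the function and parameter names here are hypothetical, not part of the actual selector module):

```python
def resolve_attn_backend(forced, cli_choice, auto_select):
    """Resolve the attention backend in priority order:
    forced override > CLI flag > automatic selection."""
    if forced is not None:        # global_force_attn_backend(...) override
        return forced
    if cli_choice is not None:    # --attention-backend
        return cli_choice
    return auto_select()          # platform capability / dtype / installed packages
```

For example, with no forced override, `resolve_attn_backend(None, "fa", lambda: "torch_sdpa")` yields `"fa"`.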
43
+ ## Configuration
44
+
45
+ Some backends require additional configuration. You can pass these parameters via `--attention-backend-config`. This argument accepts:
46
+ - A path to a JSON or YAML configuration file.
47
+ - A JSON string (e.g., `'{"sparsity": 0.5}'`).
48
+ - Key-value pairs (e.g., `"sparsity=0.5,enable_x=true"`).
49
+
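As a rough sketch, the three accepted forms could be interpreted as follows (a hypothetical helper for illustration, not the actual parser in `ServerArgs`):

```python
import json

def parse_backend_config(value: str) -> dict:
    """Interpret --attention-backend-config style values (illustrative sketch)."""
    if value.lstrip().startswith("{"):
        return json.loads(value)  # JSON string form
    if value.endswith((".json", ".yaml", ".yml")):
        raise NotImplementedError("file loading omitted in this sketch")
    out = {}
    for pair in value.split(","):  # key=value form
        key, raw = pair.split("=", 1)
        if raw in ("true", "false"):
            out[key] = raw == "true"
        else:
            try:
                out[key] = float(raw) if "." in raw else int(raw)
            except ValueError:
                out[key] = raw  # keep as string
    return out
```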
50
+ ### Supported Configuration Parameters
51
+
52
+ **Sliding Tile Attention (`sliding_tile_attn`)**
53
+
54
+ | Parameter | Type | Description | Default |
55
+ | :--- | :--- | :--- | :--- |
56
+ | `mask_strategy_file_path` | `str` | **Required.** Path to the mask strategy JSON file. | - |
57
+ | `sta_mode` | `str` | Mode of STA. | `STA_inference` |
58
+ | `skip_time_steps` | `int` | Number of steps to use full attention before switching to sparse attention. | `15` |
59
+
60
+ **Video Sparse Attention (`video_sparse_attn`)**
61
+
62
+ | Parameter | Type | Description | Default |
63
+ | :--- | :--- | :--- | :--- |
64
+ | `sparsity` | `float` | Validation sparsity (0.0 - 1.0). | `0.0` |
65
+
66
+ **V-MoBA (`vmoba_attn`)**
67
+
68
+ | Parameter | Type | Description | Default |
69
+ | :--- | :--- | :--- | :--- |
70
+ | `temporal_chunk_size` | `int` | Chunk size for temporal dimension. | - |
71
+ | `temporal_topk` | `int` | Top-K tokens to select in temporal dimension. | - |
72
+ | `spatial_chunk_size` | `list[int]` | Chunk size for spatial dimension (H, W). | - |
73
+ | `spatial_topk` | `int` | Top-K tokens to select in spatial dimension. | - |
74
+ | `st_chunk_size` | `list[int]` | Chunk size for spatiotemporal dimension (T, H, W). | - |
75
+ | `st_topk` | `int` | Top-K tokens to select in spatiotemporal dimension. | - |
76
+ | `moba_select_mode` | `str` | Selection mode (e.g., `threshold`). | `threshold` |
77
+ | `moba_threshold` | `float` | Threshold value for selection. | `0.25` |
78
+ | `moba_threshold_type` | `str` | Type of thresholding (e.g., `query_head`). | `query_head` |
79
+ | `first_full_step` | `int` | Number of initial steps to use full attention. | `12` |
80
+ | `first_full_layer` | `int` | Number of initial layers to use full attention. | `0` |
81
+ | `temporal_layer` | `int` | Number of temporal layers. | `1` |
82
+ | `spatial_layer` | `int` | Number of spatial layers. | `1` |
83
+ | `st_layer` | `int` | Number of spatiotemporal layers. | `1` |
84
+
85
+ ## Platform support matrix
86
+
87
+ | Backend | CUDA | ROCm | MPS | NPU | Notes |
88
+ |---|---:|---:|---:|---:|---|
89
+ | `fa` | ✅ | ✅ | ❌ | ❌ | CUDA requires SM80+ and fp16/bf16. FlashAttention is only used when the required runtime is installed; otherwise it falls back to `torch_sdpa`. |
90
+ | `torch_sdpa` | ✅ | ✅ | ✅ | ✅ | Most compatible option across platforms. |
91
+ | `sliding_tile_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `st_attn`. Configure via `--attention-backend-config`. |
92
+ | `sage_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only (optional dependency). |
93
+ | `sage_attn_3` | ✅ | ❌ | ❌ | ❌ | CUDA-only (optional dependency). |
94
+ | `video_sparse_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `vsa`. Configure `sparsity` via `--attention-backend-config`. |
95
+ | `vmoba_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `kernel.attn.vmoba_attn.vmoba`. Configure via `--attention-backend-config`. |
96
+ | `aiter` | ❌ | ✅ | ❌ | ❌ | Requires `aiter` (AMD's AI Tensor Engine for ROCm). |
97
+ | `sparse_video_gen_2_attn` | ✅ | ❌ | ❌ | ❌ | CUDA-only. Requires `svg`. |
98
+
99
+ ## Usage
100
+
101
+ ### Select a backend via CLI
102
+
103
+ ```bash
104
+ sglang generate \
105
+ --model-path <MODEL_PATH_OR_ID> \
106
+ --prompt "..." \
107
+ --attention-backend fa
108
+ ```
109
+
110
+ ```bash
111
+ sglang generate \
112
+ --model-path <MODEL_PATH_OR_ID> \
113
+ --prompt "..." \
114
+ --attention-backend torch_sdpa
115
+ ```
116
+
117
+ ### Using Sliding Tile Attention (STA)
118
+
119
+ ```bash
120
+ # Pass the mask strategy file path via config
121
+ sglang generate \
122
+ --model-path <MODEL_PATH_OR_ID> \
123
+ --prompt "..." \
124
+ --attention-backend sliding_tile_attn \
125
+ --attention-backend-config "mask_strategy_file_path=/abs/path/to/mask_strategy.json"
126
+ ```
127
+
128
+ ### Notes for ROCm / MPS
129
+
130
+ - ROCm: use `--attention-backend torch_sdpa` or `fa` depending on what is available in your environment.
131
+ - MPS: the platform implementation always uses `torch_sdpa`.
sglang/docs/diffusion/performance/cache/cache_dit.md ADDED
@@ -0,0 +1,273 @@
1
+ # Cache-DiT Acceleration
2
+
3
+ SGLang integrates [Cache-DiT](https://github.com/vipshop/cache-dit), a caching acceleration engine for Diffusion Transformers (DiT), to achieve up to **1.69x inference speedup** with minimal quality loss.
4
+
5
+ ## Overview
6
+
7
+ **Cache-DiT** uses intelligent caching strategies to skip redundant computation in the denoising loop:
8
+
9
+ - **DBCache (Dual Block Cache)**: Dynamically decides when to cache transformer blocks based on residual differences
10
+ - **TaylorSeer**: Uses Taylor expansion for calibration to optimize caching decisions
11
+ - **SCM (Step Computation Masking)**: Step-level caching control for additional speedup
12
+
13
+ ## Basic Usage
14
+
15
+ Enable Cache-DiT by exporting the environment variable and using `sglang generate` or `sglang serve`:
16
+
17
+ ```bash
18
+ SGLANG_CACHE_DIT_ENABLED=true \
19
+ sglang generate --model-path Qwen/Qwen-Image \
20
+ --prompt "A beautiful sunset over the mountains"
21
+ ```
22
+
23
+ ## Diffusers Backend
24
+
25
+ Cache-DiT supports loading acceleration configs from a custom YAML file. For
26
+ diffusers pipelines (`diffusers` backend), pass the YAML/JSON path via `--cache-dit-config`. This
27
+ flow requires cache-dit >= 1.2.0 (`cache_dit.load_configs`).
28
+
29
+ ### Single GPU inference
30
+
31
+ Define a `cache.yaml` file that contains:
32
+
33
+ ```yaml
34
+ cache_config:
35
+ max_warmup_steps: 8
36
+ warmup_interval: 2
37
+ max_cached_steps: -1
38
+ max_continuous_cached_steps: 2
39
+ Fn_compute_blocks: 1
40
+ Bn_compute_blocks: 0
41
+ residual_diff_threshold: 0.12
42
+ enable_taylorseer: true
43
+ taylorseer_order: 1
44
+ ```
45
+
46
+ Then apply the config with:
47
+
48
+ ```bash
49
+ sglang generate \
50
+ --backend diffusers \
51
+ --model-path Qwen/Qwen-Image \
52
+ --cache-dit-config cache.yaml \
53
+ --prompt "A beautiful sunset over the mountains"
54
+ ```
55
+
56
+ ### Distributed inference
57
+
58
+ - 1D Parallelism
59
+
60
+ Define a parallelism-only config YAML file `parallel.yaml` that contains:
61
+
62
+ ```yaml
63
+ parallelism_config:
64
+ ulysses_size: auto
65
+ parallel_kwargs:
66
+ attention_backend: native
67
+ extra_parallel_modules: ["text_encoder", "vae"]
68
+ ```
69
+
70
+ Here `ulysses_size: auto` means cache-dit will auto-detect the `world_size` as the Ulysses size; otherwise, set it to a specific integer, e.g., 4.
71
+
72
+ Then apply the distributed config (note: add `--num-gpus N` to specify the number of GPUs for distributed inference):
73
+
74
+ ```bash
75
+ sglang generate \
76
+ --backend diffusers \
77
+ --num-gpus 4 \
78
+ --model-path Qwen/Qwen-Image \
79
+ --cache-dit-config parallel.yaml \
80
+ --prompt "A futuristic cityscape at sunset"
81
+ ```
82
+
83
+ - 2D Parallelism
84
+
85
+ You can also define a 2D parallelism config yaml `parallel_2d.yaml` file that contains:
86
+
87
+ ```yaml
88
+ parallelism_config:
89
+ ulysses_size: auto
90
+ tp_size: 2
91
+ parallel_kwargs:
92
+ attention_backend: native
93
+ extra_parallel_modules: ["text_encoder", "vae"]
94
+ ```
95
+ Then apply the 2D parallelism config from the YAML. Here `tp_size: 2` enables tensor parallelism of size 2, and `ulysses_size: auto` means cache-dit will auto-detect `world_size // tp_size` as the Ulysses size.
96
+
97
+ - 3D Parallelism
98
+
99
+ You can also define a 3D parallelism config yaml `parallel_3d.yaml` file that contains:
100
+
101
+ ```yaml
102
+ parallelism_config:
103
+ ulysses_size: 2
104
+ ring_size: 2
105
+ tp_size: 2
106
+ parallel_kwargs:
107
+ attention_backend: native
108
+ extra_parallel_modules: ["text_encoder", "vae"]
109
+ ```
110
+ Then apply the 3D parallelism config from the YAML. Here `ulysses_size: 2`, `ring_size: 2`, and `tp_size: 2` enable Ulysses, ring, and tensor parallelism, each with size 2.
111
+
112
+ ### Hybrid Cache and Parallelism
113
+
114
+ Define a hybrid cache and parallel acceleration config yaml `hybrid.yaml` file that contains:
115
+
116
+ ```yaml
117
+ cache_config:
118
+ max_warmup_steps: 8
119
+ warmup_interval: 2
120
+ max_cached_steps: -1
121
+ max_continuous_cached_steps: 2
122
+ Fn_compute_blocks: 1
123
+ Bn_compute_blocks: 0
124
+ residual_diff_threshold: 0.12
125
+ enable_taylorseer: true
126
+ taylorseer_order: 1
127
+ parallelism_config:
128
+ ulysses_size: auto
129
+ parallel_kwargs:
130
+ attention_backend: native
131
+ extra_parallel_modules: ["text_encoder", "vae"]
132
+ ```
133
+
134
+ Then, apply the hybrid cache and parallel acceleration config from yaml.
135
+
136
+ ```bash
137
+ sglang generate \
138
+ --backend diffusers \
139
+ --num-gpus 4 \
140
+ --model-path Qwen/Qwen-Image \
141
+ --cache-dit-config hybrid.yaml \
142
+ --prompt "A beautiful sunset over the mountains"
143
+ ```
144
+
145
+ ## Advanced Configuration
146
+
147
+ ### DBCache Parameters
148
+
149
+ DBCache controls block-level caching behavior:
150
+
151
+ | Parameter | Env Variable | Default | Description |
152
+ |-----------|---------------------------|---------|------------------------------------------|
153
+ | Fn | `SGLANG_CACHE_DIT_FN` | 1 | Number of first blocks to always compute |
154
+ | Bn | `SGLANG_CACHE_DIT_BN` | 0 | Number of last blocks to always compute |
155
+ | W | `SGLANG_CACHE_DIT_WARMUP` | 4 | Warmup steps before caching starts |
156
+ | R | `SGLANG_CACHE_DIT_RDT` | 0.24 | Residual difference threshold |
157
+ | MC | `SGLANG_CACHE_DIT_MC` | 3 | Maximum continuous cached steps |
158
+
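The Fn/Bn split can be illustrated with a small sketch (a hypothetical helper; the real DBCache logic lives in cache-dit):

```python
def blocks_to_compute(num_blocks: int, fn: int, bn: int, cache_hit: bool) -> list:
    """Return the transformer block indices that still run on a given step.

    On a cache hit, only the first Fn and last Bn blocks are computed;
    the middle blocks reuse cached residuals.
    """
    if not cache_hit:
        return list(range(num_blocks))       # compute everything
    head = list(range(fn))                   # Fn leading blocks
    tail = list(range(num_blocks - bn, num_blocks)) if bn else []
    return head + tail
```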
159
+ ### TaylorSeer Configuration
160
+
161
+ TaylorSeer improves caching accuracy using Taylor expansion:
162
+
163
+ | Parameter | Env Variable | Default | Description |
164
+ |-----------|-------------------------------|---------|---------------------------------|
165
+ | Enable | `SGLANG_CACHE_DIT_TAYLORSEER` | false | Enable TaylorSeer calibrator |
166
+ | Order | `SGLANG_CACHE_DIT_TS_ORDER` | 1 | Taylor expansion order (1 or 2) |
167
+
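The idea behind the order parameter can be sketched with a finite-difference Taylor extrapolation (illustrative only; this is not the actual cache-dit implementation):

```python
def taylor_predict(history, order=1):
    """Extrapolate the next cached feature value from its recent history.

    order=1 uses the first difference; order=2 adds a curvature term.
    """
    d1 = history[-1] - history[-2]                        # first-order difference
    if order == 1:
        return history[-1] + d1
    d2 = history[-1] - 2 * history[-2] + history[-3]      # second-order difference
    return history[-1] + d1 + 0.5 * d2
```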
168
+ ### Combined Configuration Example
169
+
170
+ DBCache and TaylorSeer are complementary strategies; you can configure both sets of parameters
171
+ simultaneously:
172
+
173
+ ```bash
174
+ SGLANG_CACHE_DIT_ENABLED=true \
175
+ SGLANG_CACHE_DIT_FN=2 \
176
+ SGLANG_CACHE_DIT_BN=1 \
177
+ SGLANG_CACHE_DIT_WARMUP=4 \
178
+ SGLANG_CACHE_DIT_RDT=0.4 \
179
+ SGLANG_CACHE_DIT_MC=4 \
180
+ SGLANG_CACHE_DIT_TAYLORSEER=true \
181
+ SGLANG_CACHE_DIT_TS_ORDER=2 \
182
+ sglang generate --model-path black-forest-labs/FLUX.1-dev \
183
+ --prompt "A curious raccoon in a forest"
184
+ ```
185
+
186
+ ### SCM (Step Computation Masking)
187
+
188
+ SCM provides step-level caching control for additional speedup. It decides which denoising steps to compute fully and
189
+ which reuse cached results.
190
+
191
+ **SCM Presets**
192
+
193
+ SCM is configured with presets:
194
+
195
+ | Preset | Compute Ratio | Speed | Quality |
196
+ |----------|---------------|----------|------------|
197
+ | `none` | 100% | Baseline | Best |
198
+ | `slow` | ~75% | ~1.3x | High |
199
+ | `medium` | ~50% | ~2x | Good |
200
+ | `fast` | ~35% | ~3x | Acceptable |
201
+ | `ultra` | ~25% | ~4x | Lower |
202
+
203
+ **Usage**
204
+
205
+ ```bash
206
+ SGLANG_CACHE_DIT_ENABLED=true \
207
+ SGLANG_CACHE_DIT_SCM_PRESET=medium \
208
+ sglang generate --model-path Qwen/Qwen-Image \
209
+ --prompt "A futuristic cityscape at sunset"
210
+ ```
211
+
212
+ **Custom SCM Bins**
213
+
214
+ For fine-grained control over which steps to compute vs cache:
215
+
216
+ ```bash
217
+ SGLANG_CACHE_DIT_ENABLED=true \
218
+ SGLANG_CACHE_DIT_SCM_COMPUTE_BINS="8,3,3,2,2" \
219
+ SGLANG_CACHE_DIT_SCM_CACHE_BINS="1,2,2,2,3" \
220
+ sglang generate --model-path Qwen/Qwen-Image \
221
+ --prompt "A futuristic cityscape at sunset"
222
+ ```
223
+
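The bins alternate runs of computed and cached steps. A sketch of how such bins could expand into a per-step compute mask (a hypothetical helper; the actual Cache-DiT scheduling may differ):

```python
def expand_scm_bins(compute_bins, cache_bins):
    """Interleave compute/cache bins into a per-step mask (True = compute)."""
    mask = []
    for computed, cached in zip(compute_bins, cache_bins):
        mask += [True] * computed + [False] * cached
    return mask

# "8,3,3,2,2" vs "1,2,2,2,3" covers 28 steps, 18 of them fully computed
mask = expand_scm_bins([8, 3, 3, 2, 2], [1, 2, 2, 2, 3])
```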
224
+ **SCM Policy**
225
+
226
+ | Policy | Env Variable | Description |
227
+ |-----------|---------------------------------------|---------------------------------------------|
228
+ | `dynamic` | `SGLANG_CACHE_DIT_SCM_POLICY=dynamic` | Adaptive caching based on content (default) |
229
+ | `static` | `SGLANG_CACHE_DIT_SCM_POLICY=static` | Fixed caching pattern |
230
+
231
+ ## Environment Variables
232
+
233
+ All Cache-DiT parameters can be configured via environment variables.
234
+ See [Environment Variables](../../environment_variables.md) for the complete list.
235
+
236
+ ## Supported Models
237
+
238
+ SGLang Diffusion with Cache-DiT supports almost all models that SGLang Diffusion supports natively:
239
+
240
+ | Model Family | Example Models |
241
+ |--------------|-----------------------------|
242
+ | Wan | Wan2.1, Wan2.2 |
243
+ | Flux | FLUX.1-dev, FLUX.2-dev |
244
+ | Z-Image | Z-Image-Turbo |
245
+ | Qwen | Qwen-Image, Qwen-Image-Edit |
246
+ | Hunyuan | HunyuanVideo |
247
+
248
+ ## Performance Tips
249
+
250
+ 1. **Start with defaults**: The default parameters work well for most models
251
+ 2. **Use TaylorSeer**: It typically improves both speed and quality
252
+ 3. **Tune R threshold**: Lower values = better quality, higher values = faster
253
+ 4. **SCM for extra speed**: Use `medium` preset for good speed/quality balance
254
+ 5. **Warmup matters**: Higher warmup = more stable caching decisions
255
+
256
+ ## Limitations
257
+
258
+ - **SGLang-native pipelines**: Distributed support (TP/SP) is not yet validated; Cache-DiT will be automatically
259
+ disabled when `world_size > 1`.
260
+ - **SCM minimum steps**: SCM requires >= 8 inference steps to be effective
261
+ - **Model support**: Only models registered in Cache-DiT's BlockAdapterRegister are supported
262
+
263
+ ## Troubleshooting
264
+
265
+ ### SCM disabled for low step count
266
+
267
+ For models with < 8 inference steps (e.g., DMD distilled models), SCM will be automatically disabled. DBCache
268
+ acceleration still works.
269
+
270
+ ## References
271
+
272
+ - [Cache-DiT](https://github.com/vipshop/cache-dit)
273
+ - [SGLang Diffusion](../index.md)
sglang/docs/diffusion/performance/cache/index.md ADDED
@@ -0,0 +1,60 @@
1
+ # Caching Acceleration for Diffusion Models
2
+
3
+ SGLang provides multiple caching acceleration strategies for Diffusion Transformer (DiT) models. These strategies can significantly reduce inference time by skipping redundant computation.
4
+
5
+ ## Overview
6
+
7
+ SGLang supports two complementary caching approaches:
8
+
9
+ | Strategy | Scope | Mechanism | Best For |
10
+ |----------|-------|-----------|----------|
11
+ | **Cache-DiT** | Block-level | Skip individual transformer blocks dynamically | Advanced, higher speedup |
12
+ | **TeaCache** | Timestep-level | Skip entire denoising steps based on L1 similarity | Simple, built-in |
13
+
14
+
15
+
16
+ ## Cache-DiT
17
+
18
+ [Cache-DiT](https://github.com/vipshop/cache-dit) provides block-level caching with
19
+ advanced strategies like DBCache and TaylorSeer. It can achieve up to **1.69x speedup**.
20
+
21
+ See [cache_dit.md](cache_dit.md) for detailed configuration.
22
+
23
+ ### Quick Start
24
+
25
+ ```bash
26
+ SGLANG_CACHE_DIT_ENABLED=true \
27
+ sglang generate --model-path Qwen/Qwen-Image \
28
+ --prompt "A beautiful sunset over the mountains"
29
+ ```
30
+
31
+ ### Key Features
32
+
33
+ - **DBCache**: Dynamic block-level caching based on residual differences
34
+ - **TaylorSeer**: Taylor expansion-based calibration for optimized caching
35
+ - **SCM**: Step-level computation masking for additional speedup
36
+
37
+ ## TeaCache
38
+
39
+ TeaCache (Temporal similarity-based caching) accelerates diffusion inference by detecting when consecutive denoising steps are similar enough to skip computation entirely.
40
+
41
+ See [teacache.md](teacache.md) for detailed documentation.
42
+
43
+ ### Quick Overview
44
+
45
+ - Tracks L1 distance between modulated inputs across timesteps
46
+ - When accumulated distance is below threshold, reuses cached residual
47
+ - Supports CFG with separate positive/negative caches
48
+
49
+ ### Supported Models
50
+
51
+ - Wan (wan2.1, wan2.2)
52
+ - Hunyuan (HunyuanVideo)
53
+ - Z-Image
54
+
55
+ For Flux and Qwen models, TeaCache is automatically disabled when CFG is enabled.
56
+
57
+ ## References
58
+
59
+ - [Cache-DiT Repository](https://github.com/vipshop/cache-dit)
60
+ - [TeaCache Paper](https://arxiv.org/abs/2411.14324)
sglang/docs/diffusion/performance/cache/teacache.md ADDED
@@ -0,0 +1,84 @@
1
+ # TeaCache Acceleration
2
+
3
+ > **Note**: This is one of two caching strategies available in SGLang.
4
+ > For an overview of all caching options, see the [caching overview](index.md).
5
+
6
+ TeaCache (Temporal similarity-based caching) accelerates diffusion inference by detecting when consecutive denoising steps are similar enough to skip computation entirely.
7
+
8
+ ## Overview
9
+
10
+ TeaCache works by:
11
+ 1. Tracking the L1 distance between modulated inputs across consecutive timesteps
12
+ 2. Accumulating the rescaled L1 distance over steps
13
+ 3. When accumulated distance is below a threshold, reusing the cached residual
14
+ 4. Supporting CFG (Classifier-Free Guidance) with separate positive/negative caches
15
+
16
+ ## How It Works
17
+
18
+ ### L1 Distance Tracking
19
+
20
+ At each denoising step, TeaCache computes the relative L1 distance between the current and previous modulated inputs:
21
+
22
+ ```
23
+ rel_l1 = |current - previous|.mean() / |previous|.mean()
24
+ ```
25
+
26
+ This distance is then rescaled using polynomial coefficients and accumulated:
27
+
28
+ ```
29
+ accumulated += poly(coefficients)(rel_l1)
30
+ ```
31
+
32
+ ### Cache Decision
33
+
34
+ - If `accumulated >= threshold`: Force computation, reset accumulator
35
+ - If `accumulated < threshold`: Skip computation, use cached residual
36
+
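A minimal sketch of the decision logic described above (the class and attribute names here are illustrative; the real implementation lives in SGLang's model code):

```python
import numpy as np

class TeaCacheState:
    """Track modulated inputs and decide when a step can reuse the cache."""

    def __init__(self, thresh, coefficients):
        self.thresh = thresh
        self.poly = np.polynomial.Polynomial(coefficients)  # rescaling polynomial
        self.accumulated = 0.0
        self.previous = None

    def should_compute(self, modulated_input):
        if self.previous is None:
            self.previous = modulated_input
            return True  # first step always computes
        rel_l1 = (np.abs(modulated_input - self.previous).mean()
                  / np.abs(self.previous).mean())
        self.accumulated += float(self.poly(rel_l1))
        self.previous = modulated_input
        if self.accumulated >= self.thresh:
            self.accumulated = 0.0  # force computation, reset accumulator
            return True
        return False  # reuse cached residual
```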
37
+ ### CFG Support
38
+
39
+ For models that support CFG cache separation (Wan, Hunyuan, Z-Image), TeaCache maintains separate caches for positive and negative branches:
40
+ - `previous_modulated_input` / `previous_residual` for positive branch
41
+ - `previous_modulated_input_negative` / `previous_residual_negative` for negative branch
42
+
43
+ For models that don't support CFG separation (Flux, Qwen), TeaCache is automatically disabled when CFG is enabled.
44
+
45
+ ## Configuration
46
+
47
+ TeaCache is configured via `TeaCacheParams` in the sampling parameters:
48
+
49
+ ```python
50
+ from sglang.multimodal_gen.configs.sample.teacache import TeaCacheParams
51
+
52
+ params = TeaCacheParams(
53
+ teacache_thresh=0.1, # Threshold for accumulated L1 distance
54
+ coefficients=[1.0, 0.0, 0.0], # Polynomial coefficients for L1 rescaling
55
+ )
56
+ ```
57
+
58
+ ### Parameters
59
+
60
+ | Parameter | Type | Description |
61
+ |-----------|------|-------------|
62
+ | `teacache_thresh` | float | Threshold for accumulated L1 distance. Lower = more caching, faster but potentially lower quality |
63
+ | `coefficients` | list[float] | Polynomial coefficients for L1 rescaling. Model-specific tuning |
64
+
65
+ ### Model-Specific Configurations
66
+
67
+ Different models may have different optimal configurations. The coefficients are typically tuned per-model to balance speed and quality.
68
+
69
+ ## Supported Models
70
+
71
+ TeaCache is built into the following model families:
72
+
73
+ | Model Family | CFG Cache Separation | Notes |
74
+ |--------------|---------------------|-------|
75
+ | Wan (wan2.1, wan2.2) | Yes | Full support |
76
+ | Hunyuan (HunyuanVideo) | Yes | To be supported |
77
+ | Z-Image | Yes | To be supported |
78
+ | Flux | No | To be supported |
79
+ | Qwen | No | To be supported |
80
+
81
+
82
+ ## References
83
+
84
+ - [TeaCache: Accelerating Diffusion Models with Temporal Similarity](https://arxiv.org/abs/2411.14324)
sglang/docs/diffusion/performance/index.md ADDED
@@ -0,0 +1,72 @@
1
+ # Performance Optimization
2
+
3
+ SGLang-Diffusion provides multiple performance optimization strategies to accelerate inference. This section covers all available performance tuning options.
4
+
5
+ ## Overview
6
+
7
+ | Optimization | Type | Description |
8
+ |--------------|------|-------------|
9
+ | **Cache-DiT** | Caching | Block-level caching with DBCache, TaylorSeer, and SCM |
10
+ | **TeaCache** | Caching | Timestep-level caching using L1 similarity |
11
+ | **Attention Backends** | Kernel | Optimized attention implementations (FlashAttention, SageAttention, etc.) |
12
+ | **Profiling** | Diagnostics | PyTorch Profiler and Nsight Systems guidance |
13
+
14
+ ## Caching Strategies
15
+
16
+ SGLang supports two complementary caching approaches:
17
+
18
+ ### Cache-DiT
19
+
20
+ [Cache-DiT](https://github.com/vipshop/cache-dit) provides block-level caching with advanced strategies. It can achieve up to **1.69x speedup**.
21
+
22
+ **Quick Start:**
23
+ ```bash
24
+ SGLANG_CACHE_DIT_ENABLED=true \
25
+ sglang generate --model-path Qwen/Qwen-Image \
26
+ --prompt "A beautiful sunset over the mountains"
27
+ ```
28
+
29
+ **Key Features:**
30
+ - **DBCache**: Dynamic block-level caching based on residual differences
31
+ - **TaylorSeer**: Taylor expansion-based calibration for optimized caching
32
+ - **SCM**: Step-level computation masking for additional speedup
33
+
34
+ See [Cache-DiT Documentation](cache/cache_dit.md) for detailed configuration.
35
+
36
+ ### TeaCache
37
+
38
+ TeaCache (Temporal similarity-based caching) accelerates diffusion inference by detecting when consecutive denoising steps are similar enough to skip computation entirely.
39
+
40
+ **Quick Overview:**
41
+ - Tracks L1 distance between modulated inputs across timesteps
42
+ - When accumulated distance is below threshold, reuses cached residual
43
+ - Supports CFG with separate positive/negative caches
44
+
45
+ **Supported Models:** Wan (wan2.1, wan2.2), Hunyuan (HunyuanVideo), Z-Image
46
+
47
+ See [TeaCache Documentation](cache/teacache.md) for detailed configuration.
48
+
49
+ ## Attention Backends
50
+
51
+ Different attention backends offer varying performance characteristics depending on your hardware and model:
52
+
53
+ - **FlashAttention**: Fastest on NVIDIA GPUs with fp16/bf16
54
+ - **SageAttention**: Alternative optimized implementation
55
+ - **xformers**: Memory-efficient attention
56
+ - **SDPA**: PyTorch native scaled dot-product attention
57
+
58
+ See [Attention Backends](attention_backends.md) for platform support and configuration options.
59
+
60
+ ## Profiling
61
+
62
+ To diagnose performance bottlenecks, SGLang-Diffusion supports profiling tools:
63
+
64
+ - **PyTorch Profiler**: Built-in Python profiling
65
+ - **Nsight Systems**: GPU kernel-level analysis
66
+
67
+ See [Profiling Guide](profiling.md) for detailed instructions.
68
+
69
+ ## References
70
+
71
+ - [Cache-DiT Repository](https://github.com/vipshop/cache-dit)
72
+ - [TeaCache Paper](https://arxiv.org/abs/2411.14324)