Spaces · ShoufaChen committed · Commit 4bfb360 · Parent(s): 4d20c2f

vllm
Browse files
- app.py +30 -51
- app_naive.py +160 -0
- requirements.txt +2 -1
- serve/README.md +63 -0
- serve/gpt_model.py +369 -0
- serve/gpu_executor.py +201 -0
- serve/llm.py +267 -0
- serve/llm_engine.py +671 -0
- serve/model_runner.py +1223 -0
- serve/sample_c2i.py +97 -0
- serve/sampler.py +868 -0
- serve/worker.py +349 -0
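
Overview (not part of the original commit message): judging from the diff below, this commit replaces the Gradio demo's `models.generate` sampling loop with a vLLM 0.4.1-based serving stack under `serve/`, and keeps the previous demo around as `app_naive.py`.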
app.py
CHANGED
```diff
@@ -8,12 +8,12 @@ torch.backends.cudnn.allow_tf32 = True
 torch.set_float32_matmul_precision('high')
 setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
 setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)
-
+from vllm import SamplingParams
 import time
 import argparse
 from tokenizer_image.vq_model import VQ_models
-from models.gpt import GPT_models
-from models.generate import generate
+# from models.generate import generate
+from serve.llm import LLM
 
 device = "cuda"
 
@@ -38,46 +38,16 @@ def load_model(args):
     del checkpoint
     print(f"image tokenizer is loaded")
 
-    # create and load gpt model
-    precision = {'none': torch.float32, 'bf16': torch.bfloat16, 'fp16': torch.float16}[args.precision]
-    latent_size = image_size // args.downsample_size
-    gpt_model = GPT_models[args.gpt_model](
-        vocab_size=args.codebook_size,
-        block_size=latent_size ** 2,
-        num_classes=args.num_classes,
-        cls_token_num=args.cls_token_num,
-        model_type=args.gpt_type,
-    ).to(device=device, dtype=precision)
-
-    checkpoint = torch.load(f"{ckpt_folder}{gpt_ckpt}", map_location="cpu")
-    if args.from_fsdp: # fspd
-        model_weight = checkpoint
-    elif "model" in checkpoint: # ddp
-        model_weight = checkpoint["model"]
-    elif "module" in checkpoint: # deepspeed
-        model_weight = checkpoint["module"]
-    elif "state_dict" in checkpoint:
-        model_weight = checkpoint["state_dict"]
-    else:
-        raise Exception("please check model weight")
-    # if 'freqs_cis' in model_weight:
-    #     model_weight.pop('freqs_cis')
-    gpt_model.load_state_dict(model_weight, strict=False)
-    gpt_model.eval()
-    del checkpoint
+    # Create an LLM.
+    args.image_size = image_size
+    args.gpt_ckpt = f"{ckpt_folder}{gpt_ckpt}"
+    llm = LLM(
+        args=args,
+        model='serve/fake_json/{}.json'.format(args.gpt_model),
+        gpu_memory_utilization=0.6,
+        skip_tokenizer_init=True)
     print(f"gpt model is loaded")
-
-    if args.compile:
-        print(f"compiling the model...")
-        gpt_model = torch.compile(
-            gpt_model,
-            mode="reduce-overhead",
-            fullgraph=True
-        ) # requires PyTorch 2.0 (optional)
-    else:
-        print(f"no need to compile model in demo")
-
-    return vq_model, gpt_model, image_size
+    return vq_model, llm, image_size
 
 
 def infer(cfg_scale, top_k, top_p, temperature, class_label, seed):
@@ -85,20 +55,29 @@ def infer(cfg_scale, top_k, top_p, temperature, class_label, seed):
     latent_size = image_size // args.downsample_size
     # Labels to condition the model with (feel free to change):
     class_labels = [class_label for _ in range(n)]
-    c_indices = torch.tensor(class_labels, device=device)
     qzshape = [len(class_labels), args.codebook_embed_dim, latent_size, latent_size]
 
+    prompt_token_ids = [[cind] for cind in class_labels]
+    if cfg_scale > 1.0:
+        prompt_token_ids.extend([[args.num_classes] for _ in range(len(prompt_token_ids))])
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(
+        temperature=temperature, top_p=top_p, top_k=top_k,
+        max_tokens=latent_size ** 2)
+
     t1 = time.time()
     torch.manual_seed(seed)
-    index_sample = generate(
-        gpt_model, c_indices, latent_size ** 2,
-        cfg_scale=cfg_scale, cfg_interval=args.cfg_interval,
-        temperature=temperature, top_k=top_k,
-        top_p=top_p, sample_logits=True,
-        )
+    outputs = llm.generate(
+        prompt_token_ids=prompt_token_ids,
+        sampling_params=sampling_params,
+        use_tqdm=False)
     sampling_time = time.time() - t1
     print(f"gpt sampling takes about {sampling_time:.2f} seconds.")
 
+    index_sample = torch.tensor([output.outputs[0].token_ids for output in outputs], device=device)
+    if args.cfg_scale > 1.0:
+        index_sample = index_sample[:len(class_labels)]
     t2 = time.time()
     samples = vq_model.decode_code(index_sample, qzshape) # output value is between [-1, 1]
     decoder_time = time.time() - t2
@@ -110,7 +89,7 @@ def infer(cfg_scale, top_k, top_p, temperature, class_label, seed):
 
 
 parser = argparse.ArgumentParser()
-parser.add_argument("--gpt-model", type=str, choices=list(GPT_models.keys()), default="GPT-XL")
+parser.add_argument("--gpt-model", type=str, default="GPT-XL")
 parser.add_argument("--gpt-type", type=str, choices=['c2i', 't2i'], default="c2i", help="class-conditional or text-conditional")
 parser.add_argument("--from-fsdp", action='store_true')
 parser.add_argument("--cls-token-num", type=int, default=1, help="max token number of condition input")
@@ -129,7 +108,7 @@ parser.add_argument("--temperature", type=float, default=1.0, help="temperature
 parser.add_argument("--top-p", type=float, default=1.0, help="top-p value to sample with")
 args = parser.parse_args()
 
-vq_model, gpt_model, image_size = load_model(args)
+vq_model, llm, image_size = load_model(args)
 
 with gr.Blocks() as demo:
     gr.Markdown("<h1 style='text-align: center'>Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation</h1>")
```
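For readers skimming the diff above: the new `infer()` handles classifier-free guidance by doubling the batch instead of calling the model twice. The sketch below is an illustration only (example class ids, not code from this commit); the actual guidance mixing appears to happen in the custom `Sampler` from `serve/sampler.py`, which `serve/gpt_model.py` constructs with `cfg_scale`.

```python
# Illustration only: how the new infer() lays out prompts for CFG in one vLLM batch.
num_classes = 1000                      # class id 1000 acts as the "null" label
class_labels = [207, 207, 207, 207]     # 4 conditional prompts, one class token each
cfg_scale = 4.0

prompt_token_ids = [[c] for c in class_labels]
if cfg_scale > 1.0:
    # append one unconditional (null-class) prompt per conditional prompt
    prompt_token_ids.extend([[num_classes] for _ in range(len(prompt_token_ids))])

print(prompt_token_ids)  # [[207], [207], [207], [207], [1000], [1000], [1000], [1000]]
# llm.generate(...) returns one sequence per prompt; only the first
# len(class_labels) sequences (the conditional half) are decoded to images.
```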
app_naive.py
ADDED
@@ -0,0 +1,160 @@
```python
from PIL import Image
import gradio as gr
from imagenet_en_cn import IMAGENET_1K_CLASSES
from huggingface_hub import hf_hub_download
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision('high')
setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)

import time
import argparse
from tokenizer_image.vq_model import VQ_models
from models.gpt import GPT_models
from models.generate import generate

device = "cuda"

model2ckpt = {
    "GPT-XL": ("vq_ds16_c2i.pt", "c2i_XL_384.pt", 384),
    "GPT-B": ("vq_ds16_c2i.pt", "c2i_B_256.pt", 256),
}

def load_model(args):
    ckpt_folder = "./"
    vq_ckpt, gpt_ckpt, image_size = model2ckpt[args.gpt_model]
    hf_hub_download(repo_id="FoundationVision/LlamaGen", filename=vq_ckpt, local_dir=ckpt_folder)
    hf_hub_download(repo_id="FoundationVision/LlamaGen", filename=gpt_ckpt, local_dir=ckpt_folder)
    # create and load model
    vq_model = VQ_models[args.vq_model](
        codebook_size=args.codebook_size,
        codebook_embed_dim=args.codebook_embed_dim)
    vq_model.to(device)
    vq_model.eval()
    checkpoint = torch.load(f"{ckpt_folder}{vq_ckpt}", map_location="cpu")
    vq_model.load_state_dict(checkpoint["model"])
    del checkpoint
    print(f"image tokenizer is loaded")

    # create and load gpt model
    precision = {'none': torch.float32, 'bf16': torch.bfloat16, 'fp16': torch.float16}[args.precision]
    latent_size = image_size // args.downsample_size
    gpt_model = GPT_models[args.gpt_model](
        vocab_size=args.codebook_size,
        block_size=latent_size ** 2,
        num_classes=args.num_classes,
        cls_token_num=args.cls_token_num,
        model_type=args.gpt_type,
    ).to(device=device, dtype=precision)

    checkpoint = torch.load(f"{ckpt_folder}{gpt_ckpt}", map_location="cpu")
    if args.from_fsdp: # fsdp
        model_weight = checkpoint
    elif "model" in checkpoint: # ddp
        model_weight = checkpoint["model"]
    elif "module" in checkpoint: # deepspeed
        model_weight = checkpoint["module"]
    elif "state_dict" in checkpoint:
        model_weight = checkpoint["state_dict"]
    else:
        raise Exception("please check model weight")
    # if 'freqs_cis' in model_weight:
    #     model_weight.pop('freqs_cis')
    gpt_model.load_state_dict(model_weight, strict=False)
    gpt_model.eval()
    del checkpoint
    print(f"gpt model is loaded")

    if args.compile:
        print(f"compiling the model...")
        gpt_model = torch.compile(
            gpt_model,
            mode="reduce-overhead",
            fullgraph=True
        ) # requires PyTorch 2.0 (optional)
    else:
        print(f"no need to compile model in demo")

    return vq_model, gpt_model, image_size


def infer(cfg_scale, top_k, top_p, temperature, class_label, seed):
    n = 4
    latent_size = image_size // args.downsample_size
    # Labels to condition the model with (feel free to change):
    class_labels = [class_label for _ in range(n)]
    c_indices = torch.tensor(class_labels, device=device)
    qzshape = [len(class_labels), args.codebook_embed_dim, latent_size, latent_size]

    t1 = time.time()
    torch.manual_seed(seed)
    index_sample = generate(
        gpt_model, c_indices, latent_size ** 2,
        cfg_scale=cfg_scale, cfg_interval=args.cfg_interval,
        temperature=temperature, top_k=top_k,
        top_p=top_p, sample_logits=True,
        )
    sampling_time = time.time() - t1
    print(f"gpt sampling takes about {sampling_time:.2f} seconds.")

    t2 = time.time()
    samples = vq_model.decode_code(index_sample, qzshape) # output value is between [-1, 1]
    decoder_time = time.time() - t2
    print(f"decoder takes about {decoder_time:.2f} seconds.")
    # Convert to PIL.Image format:
    samples = samples.mul(127.5).add_(128.0).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()
    samples = [Image.fromarray(sample) for sample in samples]
    return samples


parser = argparse.ArgumentParser()
parser.add_argument("--gpt-model", type=str, choices=list(GPT_models.keys()), default="GPT-XL")
parser.add_argument("--gpt-type", type=str, choices=['c2i', 't2i'], default="c2i", help="class-conditional or text-conditional")
parser.add_argument("--from-fsdp", action='store_true')
parser.add_argument("--cls-token-num", type=int, default=1, help="max token number of condition input")
parser.add_argument("--precision", type=str, default='bf16', choices=["none", "fp16", "bf16"])
parser.add_argument("--compile", action='store_true', default=False)
parser.add_argument("--vq-model", type=str, choices=list(VQ_models.keys()), default="VQ-16")
parser.add_argument("--codebook-size", type=int, default=16384, help="codebook size for vector quantization")
parser.add_argument("--codebook-embed-dim", type=int, default=8, help="codebook dimension for vector quantization")
parser.add_argument("--downsample-size", type=int, choices=[8, 16], default=16)
parser.add_argument("--num-classes", type=int, default=1000)
parser.add_argument("--cfg-scale", type=float, default=4.0)
parser.add_argument("--cfg-interval", type=float, default=-1)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--top-k", type=int, default=2000, help="top-k value to sample with")
parser.add_argument("--temperature", type=float, default=1.0, help="temperature value to sample with")
parser.add_argument("--top-p", type=float, default=1.0, help="top-p value to sample with")
args = parser.parse_args()

vq_model, gpt_model, image_size = load_model(args)

with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align: center'>Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation</h1>")

    with gr.Tabs():
        with gr.TabItem('Generate'):
            with gr.Row():
                with gr.Column():
                    # with gr.Row():
                    #     image_size = gr.Radio(choices=[384], value=384, label='Peize Model Resolution')
                    with gr.Row():
                        i1k_class = gr.Dropdown(
                            list(IMAGENET_1K_CLASSES.values()),
                            value='Eskimo dog, husky [爱斯基摩犬,哈士奇]',
                            type="index", label='ImageNet-1K Class'
                        )
                    cfg_scale = gr.Slider(minimum=1, maximum=25, step=0.1, value=4.0, label='Classifier-free Guidance Scale')
                    top_k = gr.Slider(minimum=1, maximum=16384, step=1, value=4000, label='Top-K')
                    top_p = gr.Slider(minimum=0., maximum=1.0, step=0.1, value=1.0, label="Top-P")
                    temperature = gr.Slider(minimum=0., maximum=1.0, step=0.1, value=1.0, label='Temperature')
                    seed = gr.Slider(minimum=0, maximum=1000, step=1, value=42, label='Seed')
                    # seed = gr.Number(value=0, label='Seed')
                    button = gr.Button("Generate", variant="primary")
                with gr.Column():
                    output = gr.Gallery(label='Generated Images', height=700)
            button.click(infer, inputs=[cfg_scale, top_k, top_p, temperature, i1k_class, seed], outputs=[output])
    demo.queue()
    demo.launch(debug=True)
```
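
Note (not part of the commit): `app_naive.py` is a verbatim copy of the previous `app.py`, so it appears to be kept as a non-vLLM fallback for the demo.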
requirements.txt
CHANGED
```diff
@@ -1 +1,2 @@
-
+vllm==0.4.1
+torchvision==0.17.1
```
serve/README.md
ADDED
@@ -0,0 +1,63 @@

## serving by vLLM

### Install
```
pip install vllm==0.4.1
pip install torchvision==0.17.1
```

### Demo
```
cd ${THIS_REPO_ROOT}
python3 autoregressive/serve/sample_c2i.py --vq-ckpt /path/to/vq_ds16size16384dim8.pt --gpt-ckpt /path/to/GPT-B/checkpoints/1500000.pt --gpt-model GPT-B
```

### Comparison (A100)

Method | params | baseline(s) | vllm(s) | speed-up ratio
--- |:---:|:---:|:---:|:---:
GPT-B | 100M | 7.80 | 2.39 | 326 %
GPT-L | 300M | 13.72 | 3.48 | 380 %
GPT-XL | 700M | 19.76 | 4.84 | 408 %
GPT-XXL | 1.4B | 26.38 | 6.36 | 414 %
GPT-3B | 3.1B | - | - | -

```
### GPT-B
# 7.80 seconds
python3 autoregressive/sample/sample_c2i.py --vq-ckpt /mnt/bn/foundation-lq/peize.sun/models/vq_ds16size16384dim8.pt --gpt-ckpt /mnt/bn/foundation-lq/peize.sun/vqgan_arnold/2024-04-24-20-56-19/002-GPT-B/checkpoints/1500000.pt

# 2.39 seconds
python3 autoregressive/serve/sample_c2i.py --vq-ckpt /mnt/bn/foundation-lq/peize.sun/models/vq_ds16size16384dim8.pt --gpt-ckpt /mnt/bn/foundation-lq/peize.sun/vqgan_arnold/2024-04-24-20-56-19/002-GPT-B/checkpoints/1500000.pt


### GPT-L
# 13.72 seconds
python3 autoregressive/sample/sample_c2i.py --vq-ckpt /mnt/bn/foundation-lq/peize.sun/models/vq_ds16size16384dim8.pt --gpt-ckpt /mnt/bn/foundation-lq/peize.sun/vqgan_arnold/2024-04-27-14-27-57/011-GPT-L/checkpoints/1500000.pt --gpt-model GPT-L

# 3.48 seconds
python3 autoregressive/serve/sample_c2i.py --vq-ckpt /mnt/bn/foundation-lq/peize.sun/models/vq_ds16size16384dim8.pt --gpt-ckpt /mnt/bn/foundation-lq/peize.sun/vqgan_arnold/2024-04-27-14-27-57/011-GPT-L/checkpoints/1500000.pt --gpt-model GPT-L


### GPT-XL
# 19.76 seconds
python3 autoregressive/sample/sample_c2i.py --vq-ckpt /mnt/bn/foundation-lq/peize.sun/models/vq_ds16size16384dim8.pt --gpt-ckpt /mnt/bn/foundation-lq/peize.sun/vqgan_arnold/2024-05-05-13-15-40/000-GPT-XL/checkpoints/1500000.pt --gpt-model GPT-XL

# 4.84 seconds
python3 autoregressive/serve/sample_c2i.py --vq-ckpt /mnt/bn/foundation-lq/peize.sun/models/vq_ds16size16384dim8.pt --gpt-ckpt /mnt/bn/foundation-lq/peize.sun/vqgan_arnold/2024-05-05-13-15-40/000-GPT-XL/checkpoints/1500000.pt --gpt-model GPT-XL


### GPT-XXL
# 26.38 seconds
python3 autoregressive/sample/sample_c2i.py --vq-ckpt /mnt/bn/foundation-lq/peize.sun/models/vq_ds16size16384dim8.pt --gpt-ckpt /mnt/bn/foundation-lq/peize.sun/vqgan_arnold/20240506150815-GPT-XXXL/0125000/consolidated.pth --from-fsdp --gpt-model GPT-XXXL

# 6.36 seconds
python3 autoregressive/serve/sample_c2i.py --vq-ckpt /mnt/bn/foundation-lq/peize.sun/models/vq_ds16size16384dim8.pt --gpt-ckpt /mnt/bn/foundation-lq/peize.sun/vqgan_arnold/20240506150815-GPT-XXXL/0125000/consolidated.pth --from-fsdp --gpt-model GPT-XXXL

```

In 3B model, head size 100 is not supported by PagedAttention, supported head sizes are: [64, 80, 96, 112, 128, 256]
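
Note on the table (not part of the README): the speed-up ratio appears to be the baseline time divided by the vLLM time, expressed as a percentage; for GPT-B, 7.80 s / 2.39 s ≈ 3.26, i.e. about 326 %.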
serve/gpt_model.py
ADDED
@@ -0,0 +1,369 @@
```python
from dataclasses import dataclass
from typing import Optional, List

import torch
import torch.nn as nn

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import SamplerOutput

from vllm.attention import AttentionMetadata
from vllm.attention import Attention as pagedAttention

from vllm.model_executor.layers.logits_processor import LogitsProcessor
from serve.sampler import Sampler

def find_multiple(n: int, k: int):
    if n % k == 0:
        return n
    return n + k - (n % k)

@dataclass
class ModelArgs:
    dim: int = 4096
    n_layer: int = 32
    n_head: int = 32
    n_kv_head: Optional[int] = None
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    ffn_dim_multiplier: Optional[float] = None
    rope_base: float = 10000
    norm_eps: float = 1e-5
    initializer_range: float = 0.02

    num_classes: int = 1000
    class_dropout_prob: float = 0.1
    model_type: str = 'c2i'
    cfg_scale: float = 4.0

    vocab_size: int = 16384
    cls_token_num: int = 1
    block_size: int = 256
    max_batch_size: int = 32
    max_seq_len: int = 2048


#################################################################################
#                      Embedding Layers for Class Labels                        #
#################################################################################
class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
    """
    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    # def token_drop(self, labels, force_drop_ids=None):
    #     """
    #     Drops labels to enable classifier-free guidance.
    #     """
    #     if force_drop_ids is None:
    #         drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
    #     else:
    #         drop_ids = force_drop_ids == 1
    #     labels = torch.where(drop_ids, self.num_classes, labels)
    #     return labels

    # def forward(self, labels, train, force_drop_ids=None):
    def forward(self, labels):
        # use_dropout = self.dropout_prob > 0
        # if (train and use_dropout) or (force_drop_ids is not None):
        #     labels = self.token_drop(labels, force_drop_ids)
        embeddings = self.embedding_table(labels)
        return embeddings


#################################################################################
#                                  GPT Model                                    #
#################################################################################
# class RMSNorm(torch.nn.Module):
#     def __init__(self, dim: int, eps: float = 1e-5):
#         super().__init__()
#         self.eps = eps
#         self.weight = nn.Parameter(torch.ones(dim))

#     def _norm(self, x):
#         return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)

#     def forward(self, x):
#         output = self._norm(x.float()).type_as(x)
#         return output * self.weight


class FeedForward(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        hidden_dim = 4 * config.dim
        hidden_dim = int(2 * hidden_dim / 3)
        # custom dim factor multiplier
        if config.ffn_dim_multiplier is not None:
            hidden_dim = int(config.ffn_dim_multiplier * hidden_dim)
        hidden_dim = find_multiple(hidden_dim, config.multiple_of)

        # self.w1 = nn.Linear(config.dim, hidden_dim, bias=False)
        # self.w3 = nn.Linear(config.dim, hidden_dim, bias=False)
        self.w_merged = nn.Linear(config.dim, hidden_dim * 2, bias=False)
        self.act_fn = SiluAndMul()

        self.w2 = nn.Linear(hidden_dim, config.dim, bias=False)
        # self.ffn_dropout = nn.Dropout(config.ffn_dropout_p)

    # def forward(self, x):
    #     return self.ffn_dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))

    def forward(self, x):
        x = self.w_merged(x)
        x = self.act_fn(x)
        x = self.w2(x)
        # return self.ffn_dropout(x)
        return x


class Attention(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        assert config.dim % config.n_head == 0
        self.dim = config.dim
        self.head_dim = config.dim // config.n_head
        self.n_head = config.n_head
        self.n_kv_head = config.n_kv_head if config.n_kv_head is not None else config.n_head
        total_kv_dim = (self.n_head + 2 * self.n_kv_head) * self.head_dim

        # key, query, value projections for all heads, but in a batch
        self.wqkv = nn.Linear(config.dim, total_kv_dim, bias=False)
        self.wo = nn.Linear(config.dim, config.dim, bias=False)

        # pagedAttention
        self.attn = pagedAttention(self.n_head,
                                   self.head_dim,
                                   self.head_dim**-0.5,
                                   num_kv_heads=self.n_kv_head,
                                   )

        # 2d rotary pos embedding
        grid_size = int(config.block_size ** 0.5)
        assert grid_size * grid_size == config.block_size
        freqs_cis = precompute_freqs_cis_2d(grid_size, config.dim // config.n_head, config.rope_base, config.cls_token_num)
        self.register_buffer('freqs_cis', freqs_cis)


    def forward(
        self,
        x: torch.Tensor,
        positions: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ):
        kv_size = self.n_kv_head * self.head_dim
        xq, xk, xv = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)

        xq = xq.view(*xq.shape[:-1], 1, self.n_head, self.head_dim)
        xk = xk.view(*xk.shape[:-1], 1, self.n_kv_head, self.head_dim)
        freqs_cis = self.freqs_cis[positions].unsqueeze(1)
        xq = apply_rotary_emb_bs(xq, freqs_cis)
        xk = apply_rotary_emb_bs(xk, freqs_cis)
        xq = xq.flatten(1)
        xk = xk.flatten(1)

        output = self.attn(xq, xk, xv, kv_cache, attn_metadata)
        output = self.wo(output)

        return output



class TransformerBlock(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        self.attention = Attention(config)
        self.feed_forward = FeedForward(config)
        self.attention_norm = RMSNorm(config.dim, eps=config.norm_eps)
        self.ffn_norm = RMSNorm(config.dim, eps=config.norm_eps)

    def forward(self, x: torch.Tensor, positions: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata):
        h = x + self.attention(self.attention_norm(x), positions, kv_cache, attn_metadata)
        out = h + self.feed_forward(self.ffn_norm(h))
        return out


class Transformer(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        self.config = config
        self.vocab_size = config.vocab_size
        self.n_layer = config.n_layer
        self.block_size = config.block_size
        self.num_classes = config.num_classes
        self.model_type = config.model_type
        self.cls_token_num = config.cls_token_num
        self.cfg_scale = config.cfg_scale
        if self.model_type == 'c2i':
            self.cls_embedding = LabelEmbedder(config.num_classes, config.dim, config.class_dropout_prob)
        else:
            raise Exception("vllm only supports c2i now, please check model type")
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)

        self.layers = torch.nn.ModuleList()
        for layer_id in range(config.n_layer):
            self.layers.append(TransformerBlock(config))

        # output layer
        self.norm = RMSNorm(config.dim, eps=config.norm_eps)
        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)

        self.logits_processor = LogitsProcessor(config.vocab_size)

        self.sampler = Sampler(config.cfg_scale)

    def forward(
        self,
        input_ids: torch.Tensor=None,
        positions: torch.Tensor=None,
        kv_caches: List[torch.Tensor]=None,
        attn_metadata: AttentionMetadata=None,
    ):
        # if positions.max() == 0: # prefill in inference
        #     token_embeddings = self.cls_embedding(input_ids)
        # else: # decode_n_tokens(kv cache) in inference
        #     token_embeddings = self.tok_embeddings(input_ids)
        cond_ids = torch.clamp(input_ids, max=self.num_classes)
        token_embeddings = self.cls_embedding(cond_ids) * (positions.max() == 0) + \
            self.tok_embeddings(input_ids) * (positions.max() != 0)

        hh = token_embeddings
        # transformer blocks
        for layer_id, layer in enumerate(self.layers):
            hh = layer(hh, positions, kv_caches[layer_id], attn_metadata)

        # output layers
        hh = self.norm(hh)
        return hh

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        logits = self.logits_processor(self.output.weight, hidden_states, sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens


    def custom_load_state_dict(self, model_weights):
        model_weights = model_weights.copy()
        for layer_id in range(len(self.layers)):
            branch1 = f'layers.{layer_id}.feed_forward.w1.weight'
            branch3 = f'layers.{layer_id}.feed_forward.w3.weight'
            branch_merged = f'layers.{layer_id}.feed_forward.w_merged.weight'
            model_weights[branch_merged] = torch.cat(
                [model_weights[branch1], model_weights[branch3]], dim=0
            )
            model_weights.pop(branch1)
            model_weights.pop(branch3)

        if 'freqs_cis' in model_weights:
            model_weights.pop('freqs_cis')

        self.load_state_dict(model_weights, strict=False)



#################################################################################
#                      Rotary Positional Embedding Functions                    #
#################################################################################
# https://github.com/pytorch-labs/gpt-fast/blob/main/model.py
def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000, cls_token_num=120):
    freqs = 1.0 / (base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem))
    t = torch.arange(seq_len, device=freqs.device)
    freqs = torch.outer(t, freqs)  # (seq_len, head_dim // 2)
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)  # (cls_token_num+seq_len, head_dim // 2, 2)
    cond_cache = torch.cat([torch.zeros(cls_token_num, n_elem // 2, 2), cache])  # (cls_token_num+seq_len, head_dim // 2, 2)
    return cond_cache


def precompute_freqs_cis_2d(grid_size: int, n_elem: int, base: int = 10000, cls_token_num=120):
    # split the dimension into half, one for x and one for y
    half_dim = n_elem // 2
    freqs = 1.0 / (base ** (torch.arange(0, half_dim, 2)[: (half_dim // 2)].float() / half_dim))
    t = torch.arange(grid_size, device=freqs.device)
    freqs = torch.outer(t, freqs)  # (grid_size, head_dim // 2)
    freqs_grid = torch.concat([
        freqs[:, None, :].expand(-1, grid_size, -1),
        freqs[None, :, :].expand(grid_size, -1, -1),
    ], dim=-1)  # (grid_size, grid_size, head_dim // 2)
    cache_grid = torch.stack([torch.cos(freqs_grid), torch.sin(freqs_grid)], dim=-1)  # (grid_size, grid_size, head_dim // 2, 2)
    cache = cache_grid.flatten(0, 1)
    cond_cache = torch.cat([torch.zeros(cls_token_num, n_elem // 2, 2), cache])  # (cls_token_num+grid_size**2, head_dim // 2, 2)
    return cond_cache


def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor):
    # x: (bs, seq_len, n_head, head_dim)
    # freqs_cis (seq_len, head_dim // 2, 2)
    xshaped = x.float().reshape(*x.shape[:-1], -1, 2)  # (bs, seq_len, n_head, head_dim//2, 2)
    freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)  # (1, seq_len, 1, head_dim//2, 2)
    x_out2 = torch.stack([
        xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
        xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
    ], dim=-1)
    x_out2 = x_out2.flatten(3)
    return x_out2.type_as(x)


def apply_rotary_emb_bs(x: torch.Tensor, freqs_cis: torch.Tensor):
    # x: (bs, seq_len, n_head, head_dim)
    # freqs_cis (seq_len, head_dim // 2, 2)
    xshaped = x.float().reshape(*x.shape[:-1], -1, 2)  # (bs, seq_len, n_head, head_dim//2, 2)
    freqs_cis = freqs_cis.view(xshaped.size(0), xshaped.size(1), 1, xshaped.size(3), 2)  # (bs, seq_len, 1, head_dim//2, 2)
    x_out2 = torch.stack([
        xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
        xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
    ], dim=-1)
    x_out2 = x_out2.flatten(3)
    return x_out2.type_as(x)


#################################################################################
#                                GPT Configs                                    #
#################################################################################
### text-conditional
def GPT_7B(**kwargs):
    return Transformer(ModelArgs(n_layer=32, n_head=32, dim=4096, **kwargs))  # 6.6B

def GPT_3B(**kwargs):
    return Transformer(ModelArgs(n_layer=24, n_head=32, dim=3200, **kwargs))  # 3.1B

def GPT_1B(**kwargs):
    return Transformer(ModelArgs(n_layer=22, n_head=32, dim=2048, **kwargs))  # 1.2B

### class-conditional
def GPT_XXXL(**kwargs):
    return Transformer(ModelArgs(n_layer=48, n_head=40, dim=2560, **kwargs))  # 3.9B

def GPT_XXL(**kwargs):
    return Transformer(ModelArgs(n_layer=48, n_head=24, dim=1536, **kwargs))  # 1.4B

def GPT_XL(**kwargs):
    return Transformer(ModelArgs(n_layer=36, n_head=20, dim=1280, **kwargs))  # 775M

def GPT_L(**kwargs):
    return Transformer(ModelArgs(n_layer=24, n_head=16, dim=1024, **kwargs))  # 343M

def GPT_B(**kwargs):
    return Transformer(ModelArgs(n_layer=12, n_head=12, dim=768, **kwargs))  # 111M


GPT_models = {
    'GPT-B': GPT_B, 'GPT-L': GPT_L, 'GPT-XL': GPT_XL, 'GPT-XXL': GPT_XXL, 'GPT-XXXL': GPT_XXXL,
    'GPT-1B': GPT_1B, 'GPT-3B': GPT_3B, 'GPT-7B': GPT_7B,
}
```
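Aside (not part of the commit): `custom_load_state_dict` above concatenates the original `w1` (gate) and `w3` (up) weights along dim 0 because the vLLM-style `FeedForward` fuses them into one `w_merged` projection followed by `SiluAndMul`. A minimal sanity-check sketch with made-up shapes, using `F.silu` as a stand-in for the SiluAndMul kernel:

```python
import torch
import torch.nn.functional as F

dim, hidden = 8, 16
w1 = torch.randn(hidden, dim)   # gate projection weight
w3 = torch.randn(hidden, dim)   # up projection weight
w2 = torch.randn(dim, hidden)   # down projection weight
x = torch.randn(2, dim)

# original SwiGLU feed-forward: w2(silu(w1 x) * (w3 x))
ref = F.linear(F.silu(F.linear(x, w1)) * F.linear(x, w3), w2)

# merged form: one matmul produces [w1 x, w3 x]; silu the first half, multiply by the second
w_merged = torch.cat([w1, w3], dim=0)          # same cat as custom_load_state_dict
gate, up = F.linear(x, w_merged).chunk(2, dim=-1)
merged = F.linear(F.silu(gate) * up, w2)

assert torch.allclose(ref, merged, atol=1e-5)  # the two forms agree
```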
serve/gpu_executor.py
ADDED
@@ -0,0 +1,201 @@
```python
from typing import Dict, List, Set, Tuple, Optional, Set
import argparse

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         ModelConfig, ParallelConfig, SchedulerConfig,
                         SpeculativeConfig, VisionLanguageConfig)
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        make_async)

logger = init_logger(__name__)


class GPUExecutor(ExecutorBase):
    def __init__(
        self,
        args: argparse.ArgumentParser,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        vision_language_config: Optional[VisionLanguageConfig],
        speculative_config: Optional[SpeculativeConfig],
    ) -> None:
        self.args = args
        self.model_config = model_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        self.load_config = load_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.vision_language_config = vision_language_config
        self.speculative_config = speculative_config

        self._init_executor()

    def _init_executor(self) -> None:
        """Initialize the worker and load the model.

        If speculative decoding is enabled, we instead create the speculative
        worker.
        """
        if self.speculative_config is None:
            self._init_non_spec_worker()
        else:
            self._init_spec_worker()

    def _init_non_spec_worker(self):
        # Lazy import the Worker to avoid importing torch.cuda/xformers
        # before CUDA_VISIBLE_DEVICES is set in the Worker
        # from vllm.worker.worker import Worker
        from serve.worker import Worker

        assert self.parallel_config.world_size == 1, (
            "GPUExecutor only supports single GPU.")

        distributed_init_method = get_distributed_init_method(
            get_ip(), get_open_port())
        self.driver_worker = Worker(
            model_config=self.model_config,
            parallel_config=self.parallel_config,
            scheduler_config=self.scheduler_config,
            device_config=self.device_config,
            cache_config=self.cache_config,
            load_config=self.load_config,
            local_rank=0,
            rank=0,
            distributed_init_method=distributed_init_method,
            lora_config=self.lora_config,
            vision_language_config=self.vision_language_config,
            is_driver_worker=True,
        )
        self.driver_worker.init_device()
        self.driver_worker.load_model(self.args)

    def _init_spec_worker(self):
        """Initialize a SpecDecodeWorker, using a draft model for proposals.
        """
        assert self.speculative_config is not None

        from vllm.spec_decode.multi_step_worker import MultiStepWorker
        from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
        from vllm.worker.worker import Worker

        distributed_init_method = get_distributed_init_method(
            get_ip(), get_open_port())

        target_worker = Worker(
            model_config=self.model_config,
            parallel_config=self.parallel_config,
            scheduler_config=self.scheduler_config,
            device_config=self.device_config,
            cache_config=self.cache_config,
            load_config=self.load_config,
            local_rank=0,
            rank=0,
            distributed_init_method=distributed_init_method,
            lora_config=self.lora_config,
            vision_language_config=self.vision_language_config,
            is_driver_worker=True,
        )

        draft_worker = MultiStepWorker(
            model_config=self.speculative_config.draft_model_config,
            parallel_config=self.speculative_config.draft_parallel_config,
            scheduler_config=self.scheduler_config,
            device_config=self.device_config,
            cache_config=self.cache_config,
            load_config=self.load_config,
            local_rank=0,
            rank=0,
            distributed_init_method=distributed_init_method,
            lora_config=self.lora_config,
            vision_language_config=self.vision_language_config,
            is_driver_worker=True,
        )

        spec_decode_worker = SpecDecodeWorker.from_workers(
            proposer_worker=draft_worker, scorer_worker=target_worker)

        assert self.parallel_config.world_size == 1, (
            "GPUExecutor only supports single GPU.")

        self.driver_worker = spec_decode_worker

        # Load model handled in spec decode worker.
        self.driver_worker.init_device()

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks by invoking the
        underlying worker.
        """
        return self.driver_worker.determine_num_available_blocks()

    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
        """Initialize the KV cache by invoking the underlying worker.
        """
        # NOTE: This is logged in the executor because there can be >1 worker
        # with other executors. We could log in the engine level, but work
        # remains to abstract away the device for non-GPU configurations.
        logger.info(f"# GPU blocks: {num_gpu_blocks}, "
                    f"# CPU blocks: {num_cpu_blocks}")

        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def execute_model(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
        num_lookahead_slots: int,
    ) -> List[SamplerOutput]:
        output = self.driver_worker.execute_model(
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=blocks_to_swap_in,
            blocks_to_swap_out=blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy,
            num_lookahead_slots=num_lookahead_slots,
        )
        return output

    def add_lora(self, lora_request: LoRARequest) -> bool:
        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
        return self.driver_worker.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self.driver_worker.remove_lora(lora_id)

    def list_loras(self) -> Set[int]:
        return self.driver_worker.list_loras()

    def check_health(self) -> None:
        # GPUExecutor will always be healthy as long as
        # it's running.
        return


class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase):

    async def execute_model_async(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> SamplerOutput:
        output = await make_async(self.driver_worker.execute_model)(
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=blocks_to_swap_in,
            blocks_to_swap_out=blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy)
        return output
```
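
Note (not part of the commit): judging from the commented-out `from vllm.worker.worker import Worker`, this file appears to track the stock vLLM 0.4.1 `GPUExecutor`, with the visible changes being the extra `args` carried on the constructor, the worker being imported from `serve.worker` instead, and `load_model(self.args)` receiving the demo's arguments so the custom GPT checkpoint can be loaded.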
serve/llm.py
ADDED
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Modified from:
|
2 |
+
# vLLM: https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
|
3 |
+
from typing import List, Optional, Union
|
4 |
+
import argparse
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from tqdm import tqdm
|
8 |
+
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
9 |
+
|
10 |
+
from vllm.engine.arg_utils import EngineArgs
|
11 |
+
# from vllm.engine.llm_engine import LLMEngine
|
12 |
+
from vllm.lora.request import LoRARequest
|
13 |
+
from vllm.outputs import RequestOutput
|
14 |
+
from vllm.sampling_params import SamplingParams
|
15 |
+
from vllm.sequence import MultiModalData
|
16 |
+
from vllm.usage.usage_lib import UsageContext
|
17 |
+
from vllm.utils import Counter
|
18 |
+
|
19 |
+
from serve.llm_engine import LLMEngine
|
20 |
+
|
21 |
+
|
22 |
+
class LLM:
|
23 |
+
"""An LLM for generating texts from given prompts and sampling parameters.
|
24 |
+
|
25 |
+
This class includes a tokenizer, a language model (possibly distributed
|
26 |
+
across multiple GPUs), and GPU memory space allocated for intermediate
|
27 |
+
states (aka KV cache). Given a batch of prompts and sampling parameters,
|
28 |
+
this class generates texts from the model, using an intelligent batching
|
29 |
+
mechanism and efficient memory management.
|
30 |
+
|
31 |
+
NOTE: This class is intended to be used for offline inference. For online
|
32 |
+
serving, use the `AsyncLLMEngine` class instead.
|
33 |
+
NOTE: For the comprehensive list of arguments, see `EngineArgs`.
|
34 |
+
|
35 |
+
Args:
|
36 |
+
model: The name or path of a HuggingFace Transformers model.
|
37 |
+
tokenizer: The name or path of a HuggingFace Transformers tokenizer.
|
38 |
+
tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
|
39 |
+
if available, and "slow" will always use the slow tokenizer.
|
40 |
+
skip_tokenizer_init: If true, skip initialization of tokenizer and
|
41 |
+
detokenizer. Expect valid prompt_token_ids and None for prompt
|
42 |
+
from the input.
|
43 |
+
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
|
44 |
+
downloading the model and tokenizer.
|
45 |
+
tensor_parallel_size: The number of GPUs to use for distributed
|
46 |
+
execution with tensor parallelism.
|
47 |
+
dtype: The data type for the model weights and activations. Currently,
|
48 |
+
we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
|
49 |
+
the `torch_dtype` attribute specified in the model config file.
|
50 |
+
However, if the `torch_dtype` in the config is `float32`, we will
|
51 |
+
use `float16` instead.
|
52 |
+
quantization: The method used to quantize the model weights. Currently,
|
53 |
+
we support "awq", "gptq", "squeezellm", and "fp8" (experimental).
|
54 |
+
If None, we first check the `quantization_config` attribute in the
|
55 |
+
model config file. If that is None, we assume the model weights are
|
56 |
+
not quantized and use `dtype` to determine the data type of
|
57 |
+
the weights.
|
58 |
+
revision: The specific model version to use. It can be a branch name,
|
59 |
+
a tag name, or a commit id.
|
60 |
+
tokenizer_revision: The specific tokenizer version to use. It can be a
|
61 |
+
branch name, a tag name, or a commit id.
|
62 |
+
seed: The seed to initialize the random number generator for sampling.
|
63 |
+
gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
|
64 |
+
reserve for the model weights, activations, and KV cache. Higher
|
65 |
+
values will increase the KV cache size and thus improve the model's
|
66 |
+
throughput. However, if the value is too high, it may cause out-of-
|
67 |
+
memory (OOM) errors.
|
68 |
+
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
|
69 |
+
This can be used for temporarily storing the states of the requests
|
70 |
+
when their `best_of` sampling parameters are larger than 1. If all
|
71 |
+
requests will have `best_of=1`, you can safely set this to 0.
|
72 |
+
Otherwise, too small values may cause out-of-memory (OOM) errors.
|
73 |
+
enforce_eager: Whether to enforce eager execution. If True, we will
|
74 |
+
disable CUDA graph and always execute the model in eager mode.
|
75 |
+
If False, we will use CUDA graph and eager execution in hybrid.
|
76 |
+
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
|
77 |
+
When a sequence has context length larger than this, we fall back
|
78 |
+
to eager mode.
|
79 |
+
disable_custom_all_reduce: See ParallelConfig
|
80 |
+
"""
|
81 |
+
|
82 |
+
def __init__(
|
83 |
+
self,
|
84 |
+
args: argparse.ArgumentParser,
|
85 |
+
model: str,
|
86 |
+
tokenizer: Optional[str] = None,
|
87 |
+
tokenizer_mode: str = "auto",
|
88 |
+
skip_tokenizer_init: bool = False,
|
89 |
+
trust_remote_code: bool = False,
|
90 |
+
tensor_parallel_size: int = 1,
|
91 |
+
dtype: str = "auto",
|
92 |
+
quantization: Optional[str] = None,
|
93 |
+
revision: Optional[str] = None,
|
94 |
+
tokenizer_revision: Optional[str] = None,
|
95 |
+
seed: int = 0,
|
96 |
+
gpu_memory_utilization: float = 0.9,
|
97 |
+
swap_space: int = 4,
|
98 |
+
enforce_eager: bool = False,
|
99 |
+
max_context_len_to_capture: int = 8192,
|
100 |
+
disable_custom_all_reduce: bool = False,
|
101 |
+
**kwargs,
|
102 |
+
) -> None:
|
103 |
+
if "disable_log_stats" not in kwargs:
|
104 |
+
kwargs["disable_log_stats"] = True
|
105 |
+
engine_args = EngineArgs(
|
106 |
+
model=model,
|
107 |
+
tokenizer=tokenizer,
|
108 |
+
tokenizer_mode=tokenizer_mode,
|
109 |
+
skip_tokenizer_init=skip_tokenizer_init,
|
110 |
+
            trust_remote_code=trust_remote_code,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            max_context_len_to_capture=max_context_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            **kwargs,
        )
        self.llm_engine = LLMEngine.from_engine_args(
            engine_args, usage_context=UsageContext.LLM_CLASS, args=args)
        self.request_counter = Counter()

    def get_tokenizer(
            self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        return self.llm_engine.tokenizer.tokenizer

    def set_tokenizer(
        self,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    ) -> None:
        self.llm_engine.tokenizer.tokenizer = tokenizer

    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        sampling_params: Optional[Union[SamplingParams,
                                        List[SamplingParams]]] = None,
        prompt_token_ids: Optional[List[List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        """Generates the completions for the input prompts.

        NOTE: This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your
        prompts into a single list and pass it to this method.

        Args:
            prompts: A list of prompts to generate completions for.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
                When it is a single value, it is applied to every prompt.
                When it is a list, the list must have the same length as the
                prompts and it is paired one by one with the prompt.
            prompt_token_ids: A list of token IDs for the prompts. If None, we
                use the tokenizer to convert the prompts to token IDs.
            use_tqdm: Whether to use tqdm to display the progress bar.
            lora_request: LoRA request to use for generation, if any.
            multi_modal_data: Multi modal data.

        Returns:
            A list of `RequestOutput` objects containing the generated
            completions in the same order as the input prompts.
        """
        if prompts is None and prompt_token_ids is None:
            raise ValueError("Either prompts or prompt_token_ids must be "
                             "provided.")
        if self.llm_engine.model_config.skip_tokenizer_init \
            and prompts is not None:
            raise ValueError("prompts must be None if skip_tokenizer_init "
                             "is True")
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
        if (prompts is not None and prompt_token_ids is not None
                and len(prompts) != len(prompt_token_ids)):
            raise ValueError("The lengths of prompts and prompt_token_ids "
                             "must be the same.")

        if prompts is not None:
            num_requests = len(prompts)
        else:
            assert prompt_token_ids is not None
            num_requests = len(prompt_token_ids)

        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()

        elif isinstance(sampling_params,
                        list) and len(sampling_params) != num_requests:
            raise ValueError("The lengths of prompts and sampling_params "
                             "must be the same.")
        if multi_modal_data:
            multi_modal_data.data = multi_modal_data.data.to(torch.float16)

        # Add requests to the engine.
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
            self._add_request(
                prompt,
                sampling_params[i]
                if isinstance(sampling_params, list) else sampling_params,
                token_ids,
                lora_request=lora_request,
                # Get ith image while maintaining the batch dim.
                multi_modal_data=MultiModalData(
                    type=multi_modal_data.type,
                    data=multi_modal_data.data[i].unsqueeze(0))
                if multi_modal_data else None,
            )
        return self._run_engine(use_tqdm)

    def _add_request(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]],
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> None:
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(request_id,
                                    prompt,
                                    sampling_params,
                                    prompt_token_ids,
                                    lora_request=lora_request,
                                    multi_modal_data=multi_modal_data)

    def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(
                total=num_requests,
                desc="Processed prompts",
                dynamic_ncols=True,
                postfix=f"Generation Speed: {0:.2f} toks/s",
            )
        # Run the engine.
        outputs: List[RequestOutput] = []
        total_toks = 0  # running token count for the tqdm speed readout
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        total_toks += (sum(
                            len(stp.token_ids) for stp in output.outputs))
                        spd = total_toks / pbar.format_dict["elapsed"]
                        pbar.postfix = f"Generation Speed: {spd:.2f} toks/s"
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier than
        # their previous requests.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        return outputs
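Note (illustrative, not part of the diff): because `generate()` accepts pre-built `prompt_token_ids` together with a plain `SamplingParams`, this wrapper can be driven for image-token generation without ever touching a text tokenizer. A minimal offline sketch, assuming the constructor keeps vLLM's `model=` argument and that each class condition is already encoded as a one-token prompt (both assumptions; the checkpoint path and token IDs below are hypothetical):

from serve.llm import LLM  # the class defined above
from vllm import SamplingParams

# Hypothetical model directory; dtype/enforce_eager are among the kwargs shown above.
llm = LLM(model="./hypothetical/gpt_ckpt_dir", dtype="bfloat16", enforce_eager=True)
# One request per image; each "prompt" is a single class-condition token ID (assumed encoding).
class_token_ids = [[207], [360], [387], [974]]
params = SamplingParams(temperature=1.0, top_p=1.0, max_tokens=256)
outputs = llm.generate(prompt_token_ids=class_token_ids, sampling_params=params)
token_sequences = [out.outputs[0].token_ids for out in outputs]  # image tokens, to be decoded by the VQ model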
serve/llm_engine.py
ADDED
@@ -0,0 +1,671 @@
# Modified from:
#   vLLM: https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
import time
from typing import Iterable, List, Optional, Type, Union
import argparse

from transformers import GenerationConfig, PreTrainedTokenizer

import vllm
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
                         LoRAConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig, SpeculativeConfig,
                         VisionLanguageConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics import StatLogger, Stats
from vllm.engine.output_processor.interfaces import (
    SequenceGroupOutputProcessor)
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.engine.output_processor.util import create_output_by_sequence_group
from vllm.engine.ray_utils import initialize_ray_cluster
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import (MultiModalData, SamplerOutput, Sequence,
                           SequenceGroup)
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
                                                     get_tokenizer_group)
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                  usage_message)
from vllm.utils import Counter

logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5


def _load_generation_config_dict(model_config: ModelConfig):
    try:
        return GenerationConfig.from_pretrained(
            model_config.model,
            revision=model_config.revision,
        ).to_diff_dict()
    except OSError:
        # Not found.
        return {}


class LLMEngine:
    """An LLM engine that receives requests and generates texts.

    This is the main class for the vLLM engine. It receives requests
    from clients and generates texts from the LLM. It includes a tokenizer, a
    language model (possibly distributed across multiple GPUs), and GPU memory
    space allocated for intermediate states (aka KV cache). This class utilizes
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.

    The `LLM` class wraps this class for offline batched inference and the
    `AsyncLLMEngine` class wraps this class for online serving.

    NOTE: The config arguments are derived from the `EngineArgs` class. For the
    comprehensive list of arguments, see `EngineArgs`.

    Args:
        model_config: The configuration related to the LLM model.
        cache_config: The configuration related to the KV cache memory
            management.
        parallel_config: The configuration related to distributed execution.
        scheduler_config: The configuration related to the request scheduler.
        device_config: The configuration related to the device.
        lora_config (Optional): The configuration related to serving multi-LoRA.
        vision_language_config (Optional): The configuration related to vision
            language models.
        speculative_config (Optional): The configuration related to speculative
            decoding.
        executor_class: The model executor class for managing distributed
            execution.
        log_stats: Whether to log statistics.
        usage_context: Specified entry point, used for usage info collection
    """

    def __init__(
        self,
        args: argparse.ArgumentParser,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        vision_language_config: Optional[VisionLanguageConfig],
        speculative_config: Optional[SpeculativeConfig],
        decoding_config: Optional[DecodingConfig],
        executor_class: Type[ExecutorBase],
        log_stats: bool,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    ) -> None:
        logger.info(
            f"Initializing an LLM engine (v{vllm.__version__}) with config: "
            f"model={model_config.model!r}, "
            f"speculative_config={speculative_config!r}, "
            f"tokenizer={model_config.tokenizer!r}, "
            f"skip_tokenizer_init={model_config.skip_tokenizer_init}, "
            f"tokenizer_mode={model_config.tokenizer_mode}, "
            f"revision={model_config.revision}, "
            f"tokenizer_revision={model_config.tokenizer_revision}, "
            f"trust_remote_code={model_config.trust_remote_code}, "
            f"dtype={model_config.dtype}, "
            f"max_seq_len={model_config.max_model_len}, "
            f"download_dir={load_config.download_dir!r}, "
            f"load_format={load_config.load_format}, "
            f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
            f"disable_custom_all_reduce="
            f"{parallel_config.disable_custom_all_reduce}, "
            f"quantization={model_config.quantization}, "
            f"enforce_eager={model_config.enforce_eager}, "
            f"kv_cache_dtype={cache_config.cache_dtype}, "
            f"quantization_param_path={model_config.quantization_param_path}, "
            f"device_config={device_config.device}, "
            f"decoding_config={decoding_config!r}, "
            f"seed={model_config.seed})")
        # TODO(woosuk): Print more configs in debug mode.

        self.model_config = model_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        self.vision_language_config = vision_language_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.speculative_config = speculative_config
        self.load_config = load_config
        self.decoding_config = decoding_config or DecodingConfig()
        self.log_stats = log_stats

        if not self.model_config.skip_tokenizer_init:
            self.tokenizer: BaseTokenizerGroup
            self._init_tokenizer()
            self.detokenizer = Detokenizer(self.tokenizer)
        else:
            self.detokenizer = None
            self.tokenizer = None

        self.seq_counter = Counter()
        self.generation_config_fields = _load_generation_config_dict(
            model_config)

        self.model_executor = executor_class(
            args=args,
            model_config=model_config,
            cache_config=cache_config,
            parallel_config=parallel_config,
            scheduler_config=scheduler_config,
            device_config=device_config,
            lora_config=lora_config,
            vision_language_config=vision_language_config,
            speculative_config=speculative_config,
            load_config=load_config,
        )

        self._initialize_kv_caches()

        # If usage stat is enabled, collect relevant info.
        if is_usage_stats_enabled():
            from vllm.model_executor.model_loader import (
                get_architecture_class_name)
            usage_message.report_usage(
                get_architecture_class_name(model_config),
                usage_context,
                extra_kvs={
                    # Common configuration
                    "dtype":
                    str(model_config.dtype),
                    "tensor_parallel_size":
                    parallel_config.tensor_parallel_size,
                    "block_size":
                    cache_config.block_size,
                    "gpu_memory_utilization":
                    cache_config.gpu_memory_utilization,

                    # Quantization
                    "quantization":
                    model_config.quantization,
                    "kv_cache_dtype":
                    cache_config.cache_dtype,

                    # Feature flags
                    "enable_lora":
                    bool(lora_config),
                    "enable_prefix_caching":
                    cache_config.enable_prefix_caching,
                    "enforce_eager":
                    model_config.enforce_eager,
                    "disable_custom_all_reduce":
                    parallel_config.disable_custom_all_reduce,
                })

        if self.tokenizer:
            # Ping the tokenizer to ensure liveness if it runs in a
            # different process.
            self.tokenizer.ping()

        # Create the scheduler.
        # NOTE: the cache_config here have been updated with the numbers of
        # GPU and CPU blocks, which are profiled in the distributed executor.
        self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)

        # Metric Logging.
        if self.log_stats:
            self.stat_logger = StatLogger(
                local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                labels=dict(model_name=model_config.model))
            self.stat_logger.info("cache_config", self.cache_config)

        # Create sequence output processor, e.g. for beam search or
        # speculative decoding.
        self.output_processor = (
            SequenceGroupOutputProcessor.create_output_processor(
                self.scheduler_config,
                self.detokenizer,
                self.scheduler,
                self.seq_counter,
                self.get_tokenizer_for_seq,
                stop_checker=StopChecker(
                    self.scheduler_config.max_model_len,
                    self.get_tokenizer_for_seq,
                ),
            ))

    def _initialize_kv_caches(self) -> None:
        """Initialize the KV cache in the worker(s).

        The workers will determine the number of blocks in both the GPU cache
        and the swap CPU cache.
        """
        num_gpu_blocks, num_cpu_blocks = (
            self.model_executor.determine_num_available_blocks())

        if self.cache_config.num_gpu_blocks_override is not None:
            num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
            logger.info(f"Overriding {num_gpu_blocks=} with "
                        f"{num_gpu_blocks_override=}")
            num_gpu_blocks = num_gpu_blocks_override

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    @classmethod
    def from_engine_args(
        cls,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        args: argparse.ArgumentParser = None,
    ) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_config = engine_args.create_engine_config()

        # Initialize the cluster and specify the executor class.
        if engine_config.device_config.device_type == "neuron":
            from vllm.executor.neuron_executor import NeuronExecutor
            executor_class = NeuronExecutor
        elif engine_config.device_config.device_type == "cpu":
            from vllm.executor.cpu_executor import CPUExecutor
            executor_class = CPUExecutor
        elif engine_config.parallel_config.worker_use_ray:
            initialize_ray_cluster(engine_config.parallel_config)
            from vllm.executor.ray_gpu_executor import RayGPUExecutor
            executor_class = RayGPUExecutor
        else:
            assert engine_config.parallel_config.world_size == 1, (
                "Ray is required if parallel_config.world_size > 1.")
            # from vllm.executor.gpu_executor import GPUExecutor
            from serve.gpu_executor import GPUExecutor
            executor_class = GPUExecutor

        # Create the LLM engine.
        engine = cls(
            **engine_config.to_dict(),
            executor_class=executor_class,
            log_stats=not engine_args.disable_log_stats,
            usage_context=usage_context,
            args=args,
        )
        return engine

    def __reduce__(self):
        # This is to ensure that the LLMEngine is not referenced in
        # the closure used to initialize Ray worker actors
        raise RuntimeError("LLMEngine should not be pickled!")

    def get_tokenizer(self) -> "PreTrainedTokenizer":
        return self.tokenizer.get_lora_tokenizer(None)

    def get_tokenizer_for_seq(self,
                              sequence: Sequence) -> "PreTrainedTokenizer":
        return self.tokenizer.get_lora_tokenizer(sequence.lora_request)

    def _init_tokenizer(self, **tokenizer_init_kwargs):
        init_kwargs = dict(
            tokenizer_id=self.model_config.tokenizer,
            enable_lora=bool(self.lora_config),
            max_num_seqs=self.scheduler_config.max_num_seqs,
            max_input_length=None,
            tokenizer_mode=self.model_config.tokenizer_mode,
            trust_remote_code=self.model_config.trust_remote_code,
            revision=self.model_config.tokenizer_revision)
        init_kwargs.update(tokenizer_init_kwargs)
        self.tokenizer = get_tokenizer_group(
            self.parallel_config.tokenizer_pool_config, **init_kwargs)

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.cache_config.verify_with_parallel_config(self.parallel_config)
        if self.lora_config:
            self.lora_config.verify_with_model_config(self.model_config)
            self.lora_config.verify_with_scheduler_config(
                self.scheduler_config)

    def encode_request(
        self,
        request_id: str,  # pylint: disable=unused-argument
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]] = None,
        lora_request: Optional[LoRARequest] = None,
    ):
        if prompt_token_ids is None:
            assert prompt is not None
            prompt_token_ids = self.tokenizer.encode(request_id=request_id,
                                                     prompt=prompt,
                                                     lora_request=lora_request)
        return prompt_token_ids

    def add_request(
        self,
        request_id: str,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]] = None,
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> None:
        """Add a request to the engine's request pool.

        The request is added to the request pool and will be processed by the
        scheduler as `engine.step()` is called. The exact scheduling policy is
        determined by the scheduler.

        Args:
            request_id: The unique ID of the request.
            prompt: The prompt string. Can be None if prompt_token_ids is
                provided.
            sampling_params: The sampling parameters for text generation.
            prompt_token_ids: The token IDs of the prompt. If None, we
                use the tokenizer to convert the prompts to token IDs.
            arrival_time: The arrival time of the request. If None, we use
                the current monotonic time.
            multi_modal_data: Multi modal data per request.

        Details:
            - Set arrival_time to the current time if it is None.
            - Set prompt_token_ids to the encoded prompt if it is None.
            - Create `best_of` number of :class:`~vllm.Sequence` objects.
            - Create a :class:`~vllm.SequenceGroup` object
              from the list of :class:`~vllm.Sequence`.
            - Add the :class:`~vllm.SequenceGroup` object to the scheduler.

        Example:
            >>> # initialize engine
            >>> engine = LLMEngine.from_engine_args(engine_args)
            >>> # set request arguments
            >>> example_prompt = "Who is the president of the United States?"
            >>> sampling_params = SamplingParams(temperature=0.0)
            >>> request_id = 0
            >>>
            >>> # add the request to the engine
            >>> engine.add_request(
            >>>    str(request_id),
            >>>    example_prompt,
            >>>    SamplingParams(temperature=0.0))
            >>> # continue the request processing
            >>> ...
        """
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")
        max_logprobs = self.get_model_config().max_logprobs
        if (sampling_params.logprobs
                and sampling_params.logprobs > max_logprobs) or (
                    sampling_params.prompt_logprobs
                    and sampling_params.prompt_logprobs > max_logprobs):
            raise ValueError(f"Cannot request more than "
                             f"{max_logprobs} logprobs.")
        if arrival_time is None:
            arrival_time = time.time()
        prompt_token_ids = self.encode_request(
            request_id=request_id,
            prompt=prompt,
            prompt_token_ids=prompt_token_ids,
            lora_request=lora_request)

        # Create the sequences.
        block_size = self.cache_config.block_size
        seq_id = next(self.seq_counter)
        eos_token_id = None
        if self.tokenizer:
            eos_token_id = self.tokenizer.get_lora_tokenizer(
                lora_request).eos_token_id
        else:
            logger.warning("Use None for EOS token id because tokenizer is "
                           "not initialized")
        seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
                       eos_token_id, lora_request)

        # Defensive copy of SamplingParams, which are used by the sampler,
        # this doesn't deep-copy LogitsProcessor objects
        sampling_params = sampling_params.clone()
        # Add the eos token id into the sampling_params to support min_tokens
        # processing
        if seq.eos_token_id is not None:
            sampling_params.all_stop_token_ids.add(seq.eos_token_id)
        sampling_params.update_from_generation_config(
            self.generation_config_fields)

        # Create the sequence group.
        seq_group = SequenceGroup(request_id, [seq], sampling_params,
                                  arrival_time, lora_request, multi_modal_data)

        # Add the sequence group to the scheduler.
        self.scheduler.add_seq_group(seq_group)

    def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
        """Aborts a request(s) with the given ID.

        Args:
            request_id: The ID(s) of the request to abort.

        Details:
            - Refer to the
              :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
              from class :class:`~vllm.core.scheduler.Scheduler`.

        Example:
            >>> # initialize engine and add a request with request_id
            >>> request_id = str(0)
            >>> # abort the request
            >>> engine.abort_request(request_id)
        """
        self.scheduler.abort_seq_group(request_id)

    def get_model_config(self) -> ModelConfig:
        """Gets the model configuration."""
        return self.model_config

    def get_num_unfinished_requests(self) -> int:
        """Gets the number of unfinished requests."""
        return self.scheduler.get_num_unfinished_seq_groups()

    def has_unfinished_requests(self) -> bool:
        """Returns True if there are unfinished requests."""
        return self.scheduler.has_unfinished_seqs()

    def _process_model_outputs(
            self, output: List[SamplerOutput],
            scheduled_seq_groups: List[SequenceGroup],
            ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]:
        """Apply the model output to the sequences in the scheduled seq groups.

        Returns RequestOutputs that can be returned to the client.
        """
        now = time.time()

        # Organize outputs by [sequence group][step] instead of
        # [step][sequence group].
        output_by_sequence_group = create_output_by_sequence_group(
            sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups))

        # Update the scheduled sequence groups with the model outputs.
        for scheduled_seq_group, outputs in zip(scheduled_seq_groups,
                                                output_by_sequence_group):
            seq_group = scheduled_seq_group.seq_group
            seq_group.update_num_computed_tokens(
                scheduled_seq_group.token_chunk_size)
            # If uncomputed tokens > 0, it means prefill is chunked.
            # We don't need to process outputs in that case.
            if seq_group.get_num_uncomputed_tokens() == 0:
                self.output_processor.process_outputs(seq_group, outputs)

        # Free the finished sequence groups.
        self.scheduler.free_finished_seq_groups()

        # Create the outputs.
        request_outputs: List[RequestOutput] = []
        for scheduled_seq_group in scheduled_seq_groups:
            seq_group = scheduled_seq_group.seq_group
            seq_group.maybe_set_first_token_time(now)
            request_output = RequestOutput.from_seq_group(seq_group)
            request_outputs.append(request_output)
        for seq_group in ignored_seq_groups:
            request_output = RequestOutput.from_seq_group(seq_group)
            request_outputs.append(request_output)
        return request_outputs

    def step(self) -> List[RequestOutput]:
        """Performs one decoding iteration and returns newly generated results.

        .. figure:: https://i.imgur.com/sv2HssD.png
            :alt: Overview of the step function
            :align: center

            Overview of the step function.

        Details:
            - Step 1: Schedules the sequences to be executed in the next
              iteration and the token blocks to be swapped in/out/copy.

                - Depending on the scheduling policy,
                  sequences may be `preempted/reordered`.
                - A Sequence Group (SG) refer to a group of sequences
                  that are generated from the same prompt.

            - Step 2: Calls the distributed executor to execute the model.
            - Step 3: Processes the model output. This mainly includes:

                - Decodes the relevant outputs.
                - Updates the scheduled sequence groups with model outputs
                  based on its `sampling parameters` (`use_beam_search` or not).
                - Frees the finished sequence groups.

            - Finally, it creates and returns the newly generated results.

        Example:
            >>> # Please see the example/ folder for more detailed examples.
            >>>
            >>> # initialize engine and request arguments
            >>> engine = LLMEngine.from_engine_args(engine_args)
            >>> example_inputs = [(0, "What is LLM?",
            >>>    SamplingParams(temperature=0.0))]
            >>>
            >>> # Start the engine with an event loop
            >>> while True:
            >>>     if example_inputs:
            >>>         req_id, prompt, sampling_params = example_inputs.pop(0)
            >>>         engine.add_request(str(req_id), prompt, sampling_params)
            >>>
            >>>     # continue the request processing
            >>>     request_outputs = engine.step()
            >>>     for request_output in request_outputs:
            >>>         if request_output.finished:
            >>>             # return or show the request output
            >>>
            >>>     if not (engine.has_unfinished_requests() or example_inputs):
            >>>         break
        """
        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
        if not scheduler_outputs.is_empty():
            output = self.model_executor.execute_model(
                seq_group_metadata_list=seq_group_metadata_list,
                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                blocks_to_copy=scheduler_outputs.blocks_to_copy,
                num_lookahead_slots=scheduler_outputs.num_lookahead_slots)
        else:
            output = []

        request_outputs = self._process_model_outputs(
            output, scheduler_outputs.scheduled_seq_groups,
            scheduler_outputs.ignored_seq_groups)

        # Log stats.
        if self.log_stats:
            self.stat_logger.log(self._get_stats(scheduler_outputs))

        return request_outputs

    def do_log_stats(self) -> None:
        """Forced log when no requests active."""
        if self.log_stats:
            self.stat_logger.log(self._get_stats(scheduler_outputs=None))

    def _get_stats(self,
                   scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
        """Get Stats to be Logged to Prometheus."""
        now = time.time()

        # KV Cache Usage in %.
        num_total_gpu = self.cache_config.num_gpu_blocks
        num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
        gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)

        num_total_cpu = self.cache_config.num_cpu_blocks
        cpu_cache_usage = 0.
        if num_total_cpu > 0:
            num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks(
            )
            cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)

        # Scheduler State
        num_running = len(self.scheduler.running)
        num_swapped = len(self.scheduler.swapped)
        num_waiting = len(self.scheduler.waiting)

        # Iteration stats if we have scheduler output.
        num_prompt_tokens = 0
        num_generation_tokens = 0
        time_to_first_tokens = []
        time_per_output_tokens = []
        time_e2e_requests = []
        if scheduler_outputs is not None:
            prompt_run = scheduler_outputs.num_prefill_groups > 0

            # Number of Tokens.
            if prompt_run:
                num_prompt_tokens = sum(
                    len(scheduled_seq_group.seq_group.prompt_token_ids)
                    for scheduled_seq_group in
                    scheduler_outputs.scheduled_seq_groups)
                num_generation_tokens = sum(
                    scheduled_seq_group.seq_group.num_seqs()
                    for scheduled_seq_group in
                    scheduler_outputs.scheduled_seq_groups)
            else:
                num_generation_tokens = scheduler_outputs.num_batched_tokens

            # Latency Timings.
            time_last_iters = []
            for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
                seq_group = scheduled_seq_group.seq_group
                # Time since last token.
                # (n.b. updates seq_group.metrics.last_token_time)
                time_last_iters.append(seq_group.get_last_latency(now))
                # Time since arrival for all finished requests.
                if seq_group.is_finished():
                    time_e2e_requests.append(now -
                                             seq_group.metrics.arrival_time)

            time_to_first_tokens = time_last_iters if prompt_run else []
            time_per_output_tokens = [] if prompt_run else time_last_iters

        return Stats(
            now=now,
            num_running=num_running,
            num_swapped=num_swapped,
            num_waiting=num_waiting,
            gpu_cache_usage=gpu_cache_usage,
            cpu_cache_usage=cpu_cache_usage,
            num_prompt_tokens=num_prompt_tokens,
            num_generation_tokens=num_generation_tokens,
            time_to_first_tokens=time_to_first_tokens,
            time_per_output_tokens=time_per_output_tokens,
            time_e2e_requests=time_e2e_requests,
        )

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.model_executor.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.model_executor.remove_lora(lora_id)

    def list_loras(self) -> List[int]:
        return self.model_executor.list_loras()

    def check_health(self) -> None:
        self.model_executor.check_health()
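Note (illustrative, not part of the diff): relative to upstream `llm_engine.py`, the functional changes here are the extra `args` namespace threaded from `from_engine_args` through `__init__` into the executor, and the single-GPU path importing `GPUExecutor` from `serve.gpu_executor`. A minimal construction sketch under those assumptions; only a few of the flags the new `ModelRunner.load_model` below reads are shown, and the flag names, defaults, and model path are assumptions:

import argparse
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext
from serve.llm_engine import LLMEngine

parser = argparse.ArgumentParser()
parser.add_argument("--gpt-model", type=str, default="GPT-XL")   # assumed flag set
parser.add_argument("--gpt-ckpt", type=str, default="c2i.pt")    # hypothetical checkpoint name
parser.add_argument("--precision", type=str, default="bf16")
args = parser.parse_args()

# Image-token generation never needs a text tokenizer, hence skip_tokenizer_init
# (assuming the tracked vLLM version exposes this EngineArgs field, as model_config does above).
engine_args = EngineArgs(model="hypothetical-model-dir",
                         skip_tokenizer_init=True,
                         enforce_eager=True)
engine = LLMEngine.from_engine_args(engine_args,
                                    usage_context=UsageContext.LLM_CLASS,
                                    args=args)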
serve/model_runner.py
ADDED
@@ -0,0 +1,1223 @@
import contextlib
import time
from enum import IntEnum
from typing import Dict, List, NamedTuple, Optional, Set, Tuple

import numpy as np
import torch
import torch.nn as nn

from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage,
                            get_attn_backend)
from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
from vllm.distributed import broadcast_tensor_dict, with_pynccl_for_all_reduce
from vllm.distributed.device_communicators import (custom_all_reduce,
                                                    pynccl_utils)
from vllm.logger import init_logger
from vllm.lora.layers import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.model_loader import get_model
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData,
                           SequenceGroupMetadata)
from vllm.utils import (CudaMemoryProfiler, async_tensor_h2d, is_hip,
                        is_pin_memory_available, make_tensor_with_pad,
                        maybe_expand_dim)
from serve.gpt_model import GPT_models

logger = init_logger(__name__)

_PAD_SLOT_ID = -1
LORA_WARMUP_RANK = 8
_BATCH_SIZE_ALIGNMENT = 8
# Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
]


class PreparePromptMetadata(NamedTuple):
    input_tokens: List[int]
    input_positions: List[int]
    attn_metadata: Optional[AttentionMetadataPerStage]
    prompt_lens: List[int]
    subquery_lens: List[int]
    lora_index_mapping: List[int]
    lora_prompt_mapping: List[int]
    lora_requests: Set[LoRARequest]
    multi_modal_input: Optional[torch.Tensor]
    slot_mapping: List[int]

    @classmethod
    def empty(cls):
        return PreparePromptMetadata(
            input_tokens=[],
            input_positions=[],
            attn_metadata=None,
            prompt_lens=[],
            subquery_lens=[],
            lora_index_mapping=[],
            lora_prompt_mapping=[],
            lora_requests=set(),
            multi_modal_input=None,
            slot_mapping=[],
        )


class PrepareDecodeMetadata(NamedTuple):
    input_tokens: List[int]
    input_positions: List[int]
    attn_metadata: Optional[AttentionMetadata]
    lora_index_mapping: List[int]
    lora_prompt_mapping: List[int]
    lora_requests: Set[LoRARequest]
    slot_mapping: List[int]

    @classmethod
    def empty(cls):
        return PrepareDecodeMetadata(
            input_tokens=[],
            input_positions=[],
            attn_metadata=None,
            lora_index_mapping=[],
            lora_prompt_mapping=[],
            lora_requests=set(),
            slot_mapping=[],
        )


# How batches are constructed.
class BatchType(IntEnum):
    # Every batch is prefill.
    PREFILL = 0
    # Every batch is decode.
    DECODE = 1
    # Batch is a mixture of prefill and decode.
    MIXED = 2


class ModelRunner:

    def __init__(
        self,
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        kv_cache_dtype: Optional[str] = "auto",
        is_driver_worker: bool = False,
        vision_language_config: Optional[VisionLanguageConfig] = None,
    ):
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.lora_config = lora_config
        self.load_config = load_config
        self.is_driver_worker = is_driver_worker

        # model_config can be None in tests/samplers/test_sampler.py.
        # FIXME(woosuk): This is a hack to make the tests work. Refactor this.
        self.sliding_window = (model_config.get_sliding_window()
                               if model_config is not None else None)
        self.device_config = (device_config
                              if device_config is not None else DeviceConfig())
        self.device = self.device_config.device

        # Set after load_model.
        self.lora_manager: LRUCacheWorkerLoRAManager = None

        self.graph_runners: Dict[int, CUDAGraphRunner] = {}
        self.graph_memory_pool: Optional[Tuple[
            int, int]] = None  # Set during graph capture.

        self.max_context_len_to_capture = (
            self.model_config.max_context_len_to_capture
            if self.model_config is not None else 0)

        self.pin_memory = is_pin_memory_available()
        self.kv_cache_dtype = kv_cache_dtype
        self.vision_language_config = vision_language_config

        self.attn_backend = get_attn_backend(
            self.model_config.dtype if model_config is not None else None)

        # Lazy initialization
        self.model: torch.nn.Module  # Set after load_model
        self.block_size: int  # Set after initial profiling.
        # When using CUDA graph, the input block tables must be padded to
        # max_context_len_to_capture. However, creating the block table in
        # Python can be expensive. To optimize this, we cache the block table
        # in numpy and only copy the actual input content at every iteration.
        # The shape of the cached block table will be
        # (max batch size to capture, max context len to capture / block size).
        self.graph_block_tables: torch.Tensor  # Set after initial profiling.

    def load_model(self, args) -> None:
        with CudaMemoryProfiler() as m:
            precision = {'none': torch.float32, 'bf16': torch.bfloat16, 'fp16': torch.float16}[args.precision]
            latent_size = args.image_size // args.downsample_size
            gpt_model = GPT_models[args.gpt_model](
                vocab_size=args.codebook_size,
                block_size=latent_size ** 2,
                num_classes=args.num_classes,
                cls_token_num=args.cls_token_num,
                model_type=args.gpt_type,
                cfg_scale=args.cfg_scale,
            ).to(device='cuda', dtype=precision)  # TODO: make device configurable

            checkpoint = torch.load(args.gpt_ckpt, map_location="cpu")
            if args.from_fsdp:  # fsdp
                model_weight = checkpoint
            elif "model" in checkpoint:  # ddp
                model_weight = checkpoint["model"]
            elif "state_dict" in checkpoint:
                model_weight = checkpoint["state_dict"]
            else:
                raise Exception("please check model weight")
            gpt_model.custom_load_state_dict(model_weight)
            gpt_model.eval()
            del checkpoint
            self.model = gpt_model

        self.model_memory_usage = m.consumed_memory
        logger.info(f"Loading model weights took "
                    f"{self.model_memory_usage / float(2**30):.4f} GB")

        if self.lora_config:
            assert hasattr(self.model, "supported_lora_modules"
                           ) and self.model.supported_lora_modules, (
                               "Model does not support LoRA")
            assert hasattr(
                self.model,
                "embedding_modules"), "Model does not have embedding_modules"
            assert hasattr(self.model, "embedding_padding_modules"
                           ), "Model does not have embedding_padding_modules"
            self.lora_manager = LRUCacheWorkerLoRAManager(
                self.scheduler_config.max_num_seqs,
                self.scheduler_config.max_num_batched_tokens, self.vocab_size,
                self.lora_config, self.device, self.model.embedding_modules,
                self.model.embedding_padding_modules)
            self.model = self.lora_manager.create_lora_manager(self.model)

        if self.kv_cache_dtype == "fp8" and is_hip():
            # Currently scaled KV cache is only enabled on ROCm
            if self.model_config.quantization_param_path is not None:
                if callable(getattr(self.model, "load_kv_cache_scales", None)):
                    self.model.load_kv_cache_scales(
                        self.model_config.quantization_param_path)
                else:
                    raise RuntimeError("Using FP8 KV cache and scaling "
                                       "factors provided but model "
                                       f"{self.model.__class__} does not "
                                       "support loading scaling factors.")
            else:
                logger.warn("Using FP8 KV cache but no scaling factors "
                            "provided. Defaulting to scaling factors of 1.0. "
                            "This may lead to less accurate results!")
        elif self.model_config.quantization_param_path is not None:
            logger.warn("KV cache scaling factors provided, "
                        "but the KV cache data type is not FP8. "
                        "KV cache scaling factors will not be used.")

    def set_block_size(self, block_size: int) -> None:
        self.block_size = block_size

        self.graph_block_tables = np.zeros(
            (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()),
            dtype=np.int32)

    def get_max_block_per_batch(self) -> int:
        block_size = self.block_size
        return (self.max_context_len_to_capture + block_size - 1) // block_size

    def _prepare_prompt(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> PreparePromptMetadata:
        input_tokens: List[int] = []
        input_positions: List[int] = []
        slot_mapping: List[int] = []
        lora_index_mapping: List[int] = []
        lora_prompt_mapping: List[int] = []
        lora_requests: Set[LoRARequest] = set()

        prompt_lens: List[int] = []
        context_lens: List[int] = []
        subquery_lens: List[int] = []
        prefix_block_tables: List[List[int]] = []
        multi_modal_input_list: List[torch.Tensor] = []

        if len(seq_group_metadata_list) == 0:
            return PreparePromptMetadata.empty()

        for seq_group_metadata in seq_group_metadata_list:
            assert seq_group_metadata.is_prompt
            seq_ids = list(seq_group_metadata.seq_data.keys())
            assert len(seq_ids) == 1
            seq_id = seq_ids[0]

            computed_block_nums = seq_group_metadata.computed_block_nums
            if (self.scheduler_config is not None
                    and self.scheduler_config.chunked_prefill_enabled
                    and not (computed_block_nums is None
                             or computed_block_nums == [])):
                raise RuntimeError(
                    "chunked prefill cannot be used with prefix caching "
                    "now.")

            token_chunk_size = seq_group_metadata.token_chunk_size
            seq_data = seq_group_metadata.seq_data[seq_id]
            computed_len = seq_data.get_num_computed_tokens()
            # We should use get_len here because in case of preemption
            # it contains output tokens.
            prefill_end = min(seq_data.get_len(),
                              computed_len + token_chunk_size)
            prompt_tokens = seq_data.get_token_ids()[computed_len:prefill_end]
            prompt_len = prefill_end
            prompt_lens.append(prompt_len)

            # NOTE: This only works for oooooooxxx style attention.
            if computed_block_nums is not None and len(
                    computed_block_nums) > 0 and self.sliding_window is None:
                # Prefix is not supported with sliding_window
                computed_len = len(computed_block_nums) * self.block_size
                prompt_tokens = prompt_tokens[computed_len:]
                prefix_block_tables.append(computed_block_nums)
            elif self.scheduler_config.chunked_prefill_enabled:
                if seq_group_metadata.block_tables is not None:
                    # Prefill has chunked before.
                    block_table = seq_group_metadata.block_tables[seq_id]
                    prefix_block_tables.append(block_table)
                else:
                    # The first prefill.
                    prefix_block_tables.append([])
            else:
                prefix_block_tables.append([])
                # Right now, prefill start is always 0. However, this
                # assumption can be changed once chunked prefill is introduced.
                assert computed_len == 0

            # actual prompt lens
            context_lens.append(computed_len)
            subquery_lens.append(prompt_len - computed_len)

            input_tokens.extend(prompt_tokens)
            # NOTE(woosuk): Here we assume that the first token in the prompt
            # is always the first token in the sequence.
            input_positions.extend(list(range(computed_len, prefill_end)))
            lora_id = seq_group_metadata.lora_int_id

            if lora_id > 0:
                lora_requests.add(seq_group_metadata.lora_request)

            lora_index_mapping += [lora_id] * (prompt_len - computed_len)
            lora_prompt_mapping.extend(
                [lora_id] *
                (prompt_len - computed_len
                 if seq_group_metadata.sampling_params.prompt_logprobs else 1))

            if seq_group_metadata.multi_modal_data:
                multi_modal_input_list.append(
                    seq_group_metadata.multi_modal_data.data)

            if seq_group_metadata.block_tables is None:
                # During memory profiling, the block tables are not initialized
                # yet. In this case, we just use a dummy slot mapping.
                slot_mapping.extend([_PAD_SLOT_ID] * prompt_len)
                continue

            # Compute the slot mapping.
            block_table = seq_group_metadata.block_tables[seq_id]
            # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
            # where start_idx is max(0, prompt_len - sliding_window).
            # For example, if the prompt len is 10, sliding window is 8, and
            # block size is 4, the first two tokens are masked and the slot
            # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
            start_idx = 0
            if self.sliding_window is not None:
                assert computed_len == 0, (
                    "Prefix caching is currently not supported with "
                    "sliding window attention")
                start_idx = max(0, prompt_len - self.sliding_window)

            for i in range(computed_len, prefill_end):
                if i < start_idx:
                    slot_mapping.append(_PAD_SLOT_ID)
                    continue

                block_number = block_table[i // self.block_size]
                block_offset = i % self.block_size
                slot = block_number * self.block_size + block_offset
                slot_mapping.append(slot)

        max_subquery_len = max(subquery_lens)
        max_prompt_len = max(prompt_lens)
        assert max_subquery_len > 0

        context_lens_tensor = torch.tensor(context_lens,
                                           dtype=torch.int,
                                           device=self.device)

        if multi_modal_input_list:
            assert self.vision_language_config, (
                "Multi-modal inputs are only supported by "
                "vision language models.")
            multi_modal_input = torch.cat(multi_modal_input_list,
                                          dim=0).to(self.device)
        else:
            multi_modal_input = None

        # Prepare prefix block tables
        max_prompt_block_table_len = max(len(t) for t in prefix_block_tables)
        block_tables = make_tensor_with_pad(
            prefix_block_tables,
            max_len=max_prompt_block_table_len,
            pad=0,
            dtype=torch.int,
            device=self.device,
        )

        # Query length can be shorter than key (i.e., prompt) when prefill
        # is chunked or prefix cached.
        subquery_lens_tensor = torch.tensor(subquery_lens,
                                            dtype=torch.long,
                                            device=self.device)
        subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1,
                                         dtype=torch.int32,
                                         device=self.device)

        prompt_lens_tensor = torch.tensor(prompt_lens,
                                          dtype=torch.long,
                                          device=self.device)
        seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1,
                                    dtype=torch.int32,
                                    device=self.device)

        torch.cumsum(subquery_lens_tensor,
                     dim=0,
                     dtype=subquery_start_loc.dtype,
                     out=subquery_start_loc[1:])

        torch.cumsum(prompt_lens_tensor,
                     dim=0,
                     dtype=seq_start_loc.dtype,
                     out=seq_start_loc[1:])

        attn_metadata = self.attn_backend.make_metadata(
            is_prompt=True,
            prompt_lens=prompt_lens,
            prompt_lens_tensor=prompt_lens_tensor,
            max_subquery_len=max_subquery_len,
            max_context_len=None,
            max_prompt_len=max_prompt_len,
            subquery_start_loc=subquery_start_loc,
            seq_start_loc=seq_start_loc,
            context_lens=context_lens_tensor,
            block_tables=block_tables,
            use_cuda_graph=False,
        )

        return PreparePromptMetadata(
            input_tokens=input_tokens,
            input_positions=input_positions,
            attn_metadata=attn_metadata,
            prompt_lens=prompt_lens,
            subquery_lens=subquery_lens,
            lora_index_mapping=lora_index_mapping,
            lora_prompt_mapping=lora_prompt_mapping,
            lora_requests=lora_requests,
            multi_modal_input=multi_modal_input,
            slot_mapping=slot_mapping,
        )

    def _prepare_decode(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> PrepareDecodeMetadata:
        input_tokens: List[int] = []
        input_positions: List[int] = []
        slot_mapping: List[int] = []
        context_lens: List[int] = []
        block_tables: List[List[int]] = []
        lora_index_mapping: List[int] = []
        lora_prompt_mapping: List[int] = []
        lora_requests: Set[LoRARequest] = set()

        if len(seq_group_metadata_list) == 0:
            return PrepareDecodeMetadata.empty()

        for seq_group_metadata in seq_group_metadata_list:
            assert not seq_group_metadata.is_prompt
            assert seq_group_metadata.token_chunk_size == 1

            seq_ids = list(seq_group_metadata.seq_data.keys())
            lora_id = seq_group_metadata.lora_int_id

            if lora_id > 0:
                lora_requests.add(seq_group_metadata.lora_request)

            for seq_id in seq_ids:
                seq_data = seq_group_metadata.seq_data[seq_id]
                generation_token = seq_data.get_last_token_id()
                input_tokens.append(generation_token)

                seq_len = seq_data.get_len()
                position = seq_len - 1
                input_positions.append(position)

                context_len = seq_len if self.sliding_window is None else min(
                    seq_len, self.sliding_window)
                context_lens.append(context_len)

                block_table = seq_group_metadata.block_tables[seq_id]
                block_number = block_table[position // self.block_size]
                block_offset = position % self.block_size
                slot = block_number * self.block_size + block_offset
                slot_mapping.append(slot)
                lora_index_mapping.append(lora_id)
                lora_prompt_mapping.append(lora_id)

                if self.sliding_window is not None:
                    sliding_window_blocks = (self.sliding_window //
                                             self.block_size)
                    block_table = block_table[-sliding_window_blocks:]
                block_tables.append(block_table)

        # vLLM uses cuda graph only for decoding requests.
        # See `capture_model` API for more details.
        # For decoding requests, batch_size == input_tokens.
        batch_size = len(input_tokens)
        max_context_len = max(context_lens)
        use_captured_graph = (
            not self.model_config.enforce_eager
            and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1]
            and max_context_len <= self.max_context_len_to_capture)
        if use_captured_graph:
            graph_batch_size = _get_graph_batch_size(batch_size)
            assert graph_batch_size >= batch_size
            for _ in range(graph_batch_size - batch_size):
|
505 |
+
input_tokens.append(0)
|
506 |
+
input_positions.append(0)
|
507 |
+
slot_mapping.append(_PAD_SLOT_ID)
|
508 |
+
context_lens.append(1)
|
509 |
+
block_tables.append([])
|
510 |
+
lora_index_mapping.append(0)
|
511 |
+
batch_size = graph_batch_size
|
512 |
+
|
513 |
+
context_lens_tensor = torch.tensor(context_lens,
|
514 |
+
dtype=torch.int,
|
515 |
+
device=self.device)
|
516 |
+
|
517 |
+
if use_captured_graph:
|
518 |
+
# When using cuda-graph all these tensors should be
|
519 |
+
# padded.
|
520 |
+
assert context_lens_tensor.shape[0] == len(input_tokens)
|
521 |
+
assert context_lens_tensor.shape[0] == len(input_positions)
|
522 |
+
assert context_lens_tensor.shape[0] == len(slot_mapping)
|
523 |
+
|
524 |
+
# The shape of graph_block_tables is
|
525 |
+
# [max batch size, max context len // block size].
|
526 |
+
input_block_tables = self.graph_block_tables[:batch_size]
|
527 |
+
for i, block_table in enumerate(block_tables):
|
528 |
+
if block_table:
|
529 |
+
input_block_tables[i, :len(block_table)] = block_table
|
530 |
+
block_tables = torch.tensor(input_block_tables, device=self.device)
|
531 |
+
else:
|
532 |
+
max_block_table_len = max(
|
533 |
+
len(block_table) for block_table in block_tables)
|
534 |
+
block_tables = make_tensor_with_pad(
|
535 |
+
block_tables,
|
536 |
+
max_len=max_block_table_len,
|
537 |
+
pad=0,
|
538 |
+
dtype=torch.int,
|
539 |
+
device=self.device,
|
540 |
+
)
|
541 |
+
|
542 |
+
attn_metadata = self.attn_backend.make_metadata(
|
543 |
+
is_prompt=False,
|
544 |
+
prompt_lens=None,
|
545 |
+
prompt_lens_tensor=None,
|
546 |
+
max_subquery_len=None,
|
547 |
+
max_context_len=max_context_len,
|
548 |
+
max_prompt_len=None,
|
549 |
+
subquery_start_loc=None,
|
550 |
+
seq_start_loc=None,
|
551 |
+
context_lens=context_lens_tensor,
|
552 |
+
block_tables=block_tables,
|
553 |
+
use_cuda_graph=use_captured_graph,
|
554 |
+
)
|
555 |
+
return PrepareDecodeMetadata(
|
556 |
+
input_tokens=input_tokens,
|
557 |
+
input_positions=input_positions,
|
558 |
+
attn_metadata=attn_metadata,
|
559 |
+
lora_index_mapping=lora_index_mapping,
|
560 |
+
lora_prompt_mapping=lora_prompt_mapping,
|
561 |
+
lora_requests=lora_requests,
|
562 |
+
slot_mapping=slot_mapping,
|
563 |
+
)
|
564 |
+
|
565 |
+
def _prepare_sample(
|
566 |
+
self,
|
567 |
+
seq_group_metadata_list: List[SequenceGroupMetadata],
|
568 |
+
prompt_lens: List[int],
|
569 |
+
subquery_lens: Optional[List[int]],
|
570 |
+
) -> SamplingMetadata:
|
571 |
+
seq_groups: List[Tuple[List[int], SamplingParams]] = []
|
572 |
+
selected_token_indices: List[int] = []
|
573 |
+
generators: List[torch.Generator] = []
|
574 |
+
selected_token_start_idx = 0
|
575 |
+
categorized_sample_indices: Dict[SamplingType,
|
576 |
+
List[Tuple[int, int]]] = {
|
577 |
+
t: []
|
578 |
+
for t in SamplingType
|
579 |
+
}
|
580 |
+
categorized_sample_indices_start_idx = 0
|
581 |
+
categorized_sampled_token_indices_start_idx = 0
|
582 |
+
|
583 |
+
for i, seq_group_metadata in enumerate(seq_group_metadata_list):
|
584 |
+
seq_ids = list(seq_group_metadata.seq_data.keys())
|
585 |
+
sampling_params = seq_group_metadata.sampling_params
|
586 |
+
seq_groups.append((seq_ids, sampling_params))
|
587 |
+
|
588 |
+
if seq_group_metadata.is_prompt:
|
589 |
+
assert len(seq_ids) == 1
|
590 |
+
assert subquery_lens is not None
|
591 |
+
subquery_len = subquery_lens[i]
|
592 |
+
if sampling_params.prompt_logprobs is not None:
|
593 |
+
# NOTE: prompt token positions do not need sample, skip
|
594 |
+
categorized_sample_indices_start_idx += subquery_len - 1
|
595 |
+
|
596 |
+
categorized_sample_indices[
|
597 |
+
sampling_params.sampling_type].append(
|
598 |
+
(categorized_sample_indices_start_idx,
|
599 |
+
categorized_sampled_token_indices_start_idx))
|
600 |
+
categorized_sample_indices_start_idx += 1
|
601 |
+
categorized_sampled_token_indices_start_idx += 1
|
602 |
+
|
603 |
+
if sampling_params.prompt_logprobs is not None:
|
604 |
+
selected_token_indices.extend(
|
605 |
+
range(selected_token_start_idx,
|
606 |
+
selected_token_start_idx + subquery_len - 1))
|
607 |
+
selected_token_indices.append(selected_token_start_idx +
|
608 |
+
subquery_len - 1)
|
609 |
+
selected_token_start_idx += subquery_len
|
610 |
+
|
611 |
+
if sampling_params.seed is not None:
|
612 |
+
seq_group_metadata.state.generator = torch.Generator(
|
613 |
+
device=self.device).manual_seed(sampling_params.seed)
|
614 |
+
else:
|
615 |
+
num_seqs = len(seq_ids)
|
616 |
+
selected_token_indices.extend(
|
617 |
+
range(selected_token_start_idx,
|
618 |
+
selected_token_start_idx + num_seqs))
|
619 |
+
selected_token_start_idx += num_seqs
|
620 |
+
|
621 |
+
categorized_sample_indices[
|
622 |
+
sampling_params.sampling_type].extend(
|
623 |
+
list(
|
624 |
+
zip(
|
625 |
+
range(
|
626 |
+
categorized_sample_indices_start_idx,
|
627 |
+
categorized_sample_indices_start_idx +
|
628 |
+
num_seqs),
|
629 |
+
range(
|
630 |
+
categorized_sampled_token_indices_start_idx,
|
631 |
+
categorized_sampled_token_indices_start_idx
|
632 |
+
+ num_seqs))))
|
633 |
+
categorized_sample_indices_start_idx += num_seqs
|
634 |
+
categorized_sampled_token_indices_start_idx += num_seqs
|
635 |
+
|
636 |
+
if sampling_params.seed is not None:
|
637 |
+
generators.append(seq_group_metadata.state.generator)
|
638 |
+
|
639 |
+
selected_token_indices = async_tensor_h2d(selected_token_indices,
|
640 |
+
dtype=torch.long,
|
641 |
+
target_device=self.device,
|
642 |
+
pin_memory=self.pin_memory)
|
643 |
+
|
644 |
+
categorized_sample_indices = {
|
645 |
+
t: maybe_expand_dim(
|
646 |
+
async_tensor_h2d(seq_ids,
|
647 |
+
dtype=torch.int,
|
648 |
+
target_device=self.device,
|
649 |
+
pin_memory=self.pin_memory), 2, 2)
|
650 |
+
for t, seq_ids in categorized_sample_indices.items()
|
651 |
+
}
|
652 |
+
|
653 |
+
seq_data: Dict[int, SequenceData] = {}
|
654 |
+
for seq_group_metadata in seq_group_metadata_list:
|
655 |
+
seq_data.update(seq_group_metadata.seq_data)
|
656 |
+
|
657 |
+
sampling_metadata = SamplingMetadata(
|
658 |
+
seq_groups=seq_groups,
|
659 |
+
seq_data=seq_data,
|
660 |
+
prompt_lens=prompt_lens,
|
661 |
+
selected_token_indices=selected_token_indices,
|
662 |
+
categorized_sample_indices=categorized_sample_indices,
|
663 |
+
generators=generators,
|
664 |
+
)
|
665 |
+
return sampling_metadata
|
666 |
+
|
667 |
+
def prepare_input_tensors(
|
668 |
+
self,
|
669 |
+
seq_group_metadata_list: List[SequenceGroupMetadata],
|
670 |
+
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
|
671 |
+
Set[LoRARequest], LoRAMapping, torch.Tensor]:
|
672 |
+
if self.is_driver_worker:
|
673 |
+
prefill_reqs = []
|
674 |
+
decode_reqs = []
|
675 |
+
for seq_group_meta in seq_group_metadata_list:
|
676 |
+
if seq_group_meta.is_prompt:
|
677 |
+
prefill_reqs.append(seq_group_meta)
|
678 |
+
else:
|
679 |
+
decode_reqs.append(seq_group_meta)
|
680 |
+
|
681 |
+
# Prepare input tensors.
|
682 |
+
(
|
683 |
+
input_tokens,
|
684 |
+
input_positions,
|
685 |
+
prefill_attn_metadata,
|
686 |
+
prompt_lens,
|
687 |
+
subquery_lens,
|
688 |
+
lora_index_mapping,
|
689 |
+
lora_prompt_mapping,
|
690 |
+
lora_requests,
|
691 |
+
multi_modal_input,
|
692 |
+
slot_mapping,
|
693 |
+
) = self._prepare_prompt(prefill_reqs)
|
694 |
+
(
|
695 |
+
decode_input_tokens,
|
696 |
+
decode_input_positions,
|
697 |
+
decode_attn_metadata,
|
698 |
+
decode_lora_index_mapping,
|
699 |
+
decode_lora_prompt_mapping,
|
700 |
+
decode_lora_requests,
|
701 |
+
decode_slot_mapping,
|
702 |
+
) = self._prepare_decode(decode_reqs)
|
703 |
+
sampling_metadata = self._prepare_sample(seq_group_metadata_list,
|
704 |
+
prompt_lens,
|
705 |
+
subquery_lens)
|
706 |
+
|
707 |
+
if not self.scheduler_config.chunked_prefill_enabled:
|
708 |
+
assert (len(prefill_reqs) and len(decode_reqs)) == 0
|
709 |
+
|
710 |
+
num_prefills = len(prompt_lens)
|
711 |
+
num_prefill_tokens = len(input_tokens)
|
712 |
+
num_decode_tokens = len(decode_input_tokens)
|
713 |
+
|
714 |
+
# Coalesce tensors. Note that attn_metadata is currently not
|
715 |
+
# coalesced for simplicity.
|
716 |
+
input_tokens.extend(decode_input_tokens)
|
717 |
+
input_positions.extend(decode_input_positions)
|
718 |
+
slot_mapping.extend(decode_slot_mapping)
|
719 |
+
lora_index_mapping.extend(decode_lora_index_mapping)
|
720 |
+
lora_prompt_mapping.extend(decode_lora_prompt_mapping)
|
721 |
+
lora_requests.update(decode_lora_requests)
|
722 |
+
|
723 |
+
input_tokens = torch.tensor(input_tokens,
|
724 |
+
dtype=torch.long,
|
725 |
+
device=self.device)
|
726 |
+
input_positions = torch.tensor(input_positions,
|
727 |
+
dtype=torch.long,
|
728 |
+
device=self.device)
|
729 |
+
slot_mapping = torch.tensor(slot_mapping,
|
730 |
+
dtype=torch.long,
|
731 |
+
device=self.device)
|
732 |
+
|
733 |
+
if self.lora_config:
|
734 |
+
lora_mapping = LoRAMapping(
|
735 |
+
lora_index_mapping,
|
736 |
+
lora_prompt_mapping,
|
737 |
+
)
|
738 |
+
else:
|
739 |
+
lora_mapping = None
|
740 |
+
|
741 |
+
# Broadcast the metadata.
|
742 |
+
# If batch contains both prefill and decode, it sends 2 broadcasts.
|
743 |
+
# If it only contains 1 type, it triggers a single broadcast.
|
744 |
+
if (prefill_attn_metadata is not None
|
745 |
+
and decode_attn_metadata is not None):
|
746 |
+
batch_type = BatchType.MIXED
|
747 |
+
elif prefill_attn_metadata is not None:
|
748 |
+
batch_type = BatchType.PREFILL
|
749 |
+
else:
|
750 |
+
batch_type = BatchType.DECODE
|
751 |
+
|
752 |
+
metadata_dict = {
|
753 |
+
"input_tokens": input_tokens,
|
754 |
+
"input_positions": input_positions,
|
755 |
+
"selected_token_indices":
|
756 |
+
sampling_metadata.selected_token_indices,
|
757 |
+
"lora_requests": lora_requests,
|
758 |
+
"lora_mapping": lora_mapping,
|
759 |
+
"multi_modal_input": multi_modal_input,
|
760 |
+
"num_prefill_tokens": num_prefill_tokens,
|
761 |
+
"num_decode_tokens": num_decode_tokens,
|
762 |
+
"slot_mapping": slot_mapping,
|
763 |
+
"num_prefills": num_prefills,
|
764 |
+
"batch_type": batch_type,
|
765 |
+
}
|
766 |
+
if prefill_attn_metadata is not None:
|
767 |
+
metadata_dict.update(prefill_attn_metadata.asdict_zerocopy())
|
768 |
+
else:
|
769 |
+
assert decode_attn_metadata is not None
|
770 |
+
metadata_dict.update(decode_attn_metadata.asdict_zerocopy())
|
771 |
+
broadcast_tensor_dict(metadata_dict, src=0)
|
772 |
+
|
773 |
+
# Broadcast decode attn metadata for mixed batch type.
|
774 |
+
# The additional broadcast costs 300us overhead on 4 A10 GPUs.
|
775 |
+
# We can potentially reduce the overhead by coalescing tensors.
|
776 |
+
if batch_type == BatchType.MIXED:
|
777 |
+
assert decode_attn_metadata is not None
|
778 |
+
metadata_dict = decode_attn_metadata.asdict_zerocopy()
|
779 |
+
broadcast_tensor_dict(metadata_dict, src=0)
|
780 |
+
else:
|
781 |
+
metadata_dict = broadcast_tensor_dict(src=0)
|
782 |
+
input_tokens = metadata_dict.pop("input_tokens")
|
783 |
+
input_positions = metadata_dict.pop("input_positions")
|
784 |
+
slot_mapping = metadata_dict.pop("slot_mapping")
|
785 |
+
num_prefills = metadata_dict.pop("num_prefills")
|
786 |
+
selected_token_indices = metadata_dict.pop(
|
787 |
+
"selected_token_indices")
|
788 |
+
lora_mapping = metadata_dict.pop("lora_mapping")
|
789 |
+
lora_requests = metadata_dict.pop("lora_requests")
|
790 |
+
multi_modal_input = metadata_dict.pop("multi_modal_input")
|
791 |
+
num_prefill_tokens = metadata_dict.pop("num_prefill_tokens")
|
792 |
+
num_decode_tokens = metadata_dict.pop("num_decode_tokens")
|
793 |
+
batch_type = metadata_dict.pop("batch_type")
|
794 |
+
|
795 |
+
# Create an attention metadata.
|
796 |
+
prefill_attn_metadata = None
|
797 |
+
decode_attn_metadata = None
|
798 |
+
if batch_type == BatchType.PREFILL or batch_type == BatchType.MIXED:
|
799 |
+
prefill_attn_metadata = self.attn_backend.make_metadata(
|
800 |
+
**metadata_dict)
|
801 |
+
else:
|
802 |
+
decode_attn_metadata = self.attn_backend.make_metadata(
|
803 |
+
**metadata_dict)
|
804 |
+
sampling_metadata = SamplingMetadata(
|
805 |
+
seq_groups=None,
|
806 |
+
seq_data=None,
|
807 |
+
prompt_lens=None,
|
808 |
+
selected_token_indices=selected_token_indices,
|
809 |
+
categorized_sample_indices=None,
|
810 |
+
generators=None,
|
811 |
+
perform_sampling=False,
|
812 |
+
)
|
813 |
+
|
814 |
+
# if it is a mixed batch, decode attn_metadata is broadcasted
|
815 |
+
# separately.
|
816 |
+
if batch_type == BatchType.MIXED:
|
817 |
+
metadata_dict = broadcast_tensor_dict(src=0)
|
818 |
+
decode_attn_metadata = self.attn_backend.make_metadata(
|
819 |
+
**metadata_dict)
|
820 |
+
|
821 |
+
attn_metadata = AttentionMetadata(
|
822 |
+
num_prefills=num_prefills,
|
823 |
+
slot_mapping=slot_mapping,
|
824 |
+
num_prefill_tokens=num_prefill_tokens,
|
825 |
+
num_decode_tokens=num_decode_tokens,
|
826 |
+
prefill_metadata=prefill_attn_metadata,
|
827 |
+
decode_metadata=decode_attn_metadata,
|
828 |
+
kv_cache_dtype=self.kv_cache_dtype,
|
829 |
+
)
|
830 |
+
|
831 |
+
return (input_tokens, input_positions, attn_metadata,
|
832 |
+
sampling_metadata, lora_requests, lora_mapping,
|
833 |
+
multi_modal_input)
|
834 |
+
|
835 |
+
@torch.inference_mode()
|
836 |
+
def execute_model(
|
837 |
+
self,
|
838 |
+
seq_group_metadata_list: List[SequenceGroupMetadata],
|
839 |
+
kv_caches: List[torch.Tensor],
|
840 |
+
) -> Optional[SamplerOutput]:
|
841 |
+
(input_tokens, input_positions, attn_metadata, sampling_metadata,
|
842 |
+
lora_requests, lora_mapping, multi_modal_input
|
843 |
+
) = self.prepare_input_tensors(seq_group_metadata_list)
|
844 |
+
if self.lora_config:
|
845 |
+
self.set_active_loras(lora_requests, lora_mapping)
|
846 |
+
|
847 |
+
# Currently cuda graph is only supported by the decode phase.
|
848 |
+
prefill_meta = attn_metadata.prefill_metadata
|
849 |
+
decode_meta = attn_metadata.decode_metadata
|
850 |
+
if prefill_meta is None and decode_meta.use_cuda_graph:
|
851 |
+
graph_batch_size = input_tokens.shape[0]
|
852 |
+
model_executable = self.graph_runners[graph_batch_size]
|
853 |
+
else:
|
854 |
+
model_executable = self.model
|
855 |
+
execute_model_kwargs = {
|
856 |
+
"input_ids": input_tokens,
|
857 |
+
"positions": input_positions,
|
858 |
+
"kv_caches": kv_caches,
|
859 |
+
"attn_metadata": attn_metadata,
|
860 |
+
}
|
861 |
+
if self.vision_language_config:
|
862 |
+
execute_model_kwargs.update({"image_input": multi_modal_input})
|
863 |
+
hidden_states = model_executable(**execute_model_kwargs)
|
864 |
+
|
865 |
+
# Compute the logits.
|
866 |
+
logits = self.model.compute_logits(hidden_states, sampling_metadata)
|
867 |
+
|
868 |
+
# Only perform sampling in the driver worker.
|
869 |
+
if not sampling_metadata.perform_sampling:
|
870 |
+
return None
|
871 |
+
|
872 |
+
# Sample the next token.
|
873 |
+
output = self.model.sample(
|
874 |
+
logits=logits,
|
875 |
+
sampling_metadata=sampling_metadata,
|
876 |
+
)
|
877 |
+
return output
|
878 |
+
|
879 |
+
@torch.inference_mode()
|
880 |
+
def profile_run(self) -> None:
|
881 |
+
# Enable top-k sampling to reflect the accurate memory usage.
|
882 |
+
sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
|
883 |
+
max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
|
884 |
+
max_num_seqs = self.scheduler_config.max_num_seqs
|
885 |
+
|
886 |
+
# This represents the maximum number of different requests
|
887 |
+
# that will have unique loras, and therefore the max amount of memory
|
888 |
+
# consumption create dummy lora request copies from the lora request
|
889 |
+
# passed in, which contains a lora from the lora warmup path.
|
890 |
+
dummy_lora_requests = []
|
891 |
+
dummy_lora_requests_per_seq = []
|
892 |
+
if self.lora_config:
|
893 |
+
for idx in range(self.lora_config.max_loras):
|
894 |
+
lora_id = idx + 1
|
895 |
+
dummy_lora_request = LoRARequest(
|
896 |
+
lora_name=f"warmup_{lora_id}",
|
897 |
+
lora_int_id=lora_id,
|
898 |
+
lora_local_path="/not/a/real/path",
|
899 |
+
)
|
900 |
+
self.lora_manager.add_dummy_lora(dummy_lora_request,
|
901 |
+
rank=LORA_WARMUP_RANK)
|
902 |
+
dummy_lora_requests.append(dummy_lora_request)
|
903 |
+
dummy_lora_requests_per_seq = [
|
904 |
+
dummy_lora_requests[idx % len(dummy_lora_requests)]
|
905 |
+
for idx in range(max_num_seqs)
|
906 |
+
]
|
907 |
+
|
908 |
+
# Profile memory usage with max_num_sequences sequences and the total
|
909 |
+
# number of tokens equal to max_num_batched_tokens.
|
910 |
+
seqs: List[SequenceGroupMetadata] = []
|
911 |
+
# Additional GPU memory may be needed for vision encoding, which needs
|
912 |
+
# to be accounted for when calculating the GPU blocks for
|
913 |
+
# vLLM block manager.
|
914 |
+
# To exercise the worst scenario for GPU memory consumption,
|
915 |
+
# the number of seqs (batch_size) is chosen to maximize the number
|
916 |
+
# of images processed.
|
917 |
+
if self.vision_language_config:
|
918 |
+
max_num_seqs = min(
|
919 |
+
max_num_seqs,
|
920 |
+
int(max_num_batched_tokens /
|
921 |
+
self.vision_language_config.image_feature_size))
|
922 |
+
for group_id in range(max_num_seqs):
|
923 |
+
seq_len = (max_num_batched_tokens // max_num_seqs +
|
924 |
+
(group_id < max_num_batched_tokens % max_num_seqs))
|
925 |
+
seq_data, fake_multi_modal_input = _prepare_fake_inputs(
|
926 |
+
seq_len, self.vision_language_config)
|
927 |
+
seq = SequenceGroupMetadata(
|
928 |
+
request_id=str(group_id),
|
929 |
+
is_prompt=True,
|
930 |
+
seq_data={group_id: seq_data},
|
931 |
+
sampling_params=sampling_params,
|
932 |
+
block_tables=None,
|
933 |
+
lora_request=dummy_lora_requests_per_seq[group_id]
|
934 |
+
if dummy_lora_requests_per_seq else None,
|
935 |
+
multi_modal_data=fake_multi_modal_input,
|
936 |
+
)
|
937 |
+
seqs.append(seq)
|
938 |
+
|
939 |
+
# Run the model with the dummy inputs.
|
940 |
+
num_layers = self.model_config.get_num_layers(self.parallel_config)
|
941 |
+
kv_caches = [None] * num_layers
|
942 |
+
self.execute_model(seqs, kv_caches)
|
943 |
+
torch.cuda.synchronize()
|
944 |
+
return
|
945 |
+
|
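profile_run above spreads max_num_batched_tokens as evenly as possible over max_num_seqs dummy sequences, with the first `max_num_batched_tokens % max_num_seqs` sequences getting one extra token. A small numeric check of that split, using toy numbers rather than real config values:

max_num_batched_tokens = 10
max_num_seqs = 4

seq_lens = [max_num_batched_tokens // max_num_seqs +
            (group_id < max_num_batched_tokens % max_num_seqs)
            for group_id in range(max_num_seqs)]
print(seq_lens, sum(seq_lens))  # [3, 3, 2, 2] 10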
946 |
+
def remove_all_loras(self) -> bool:
|
947 |
+
if not self.lora_manager:
|
948 |
+
raise RuntimeError("LoRA is not enabled.")
|
949 |
+
return self.lora_manager.remove_all_loras()
|
950 |
+
|
951 |
+
def set_active_loras(self, lora_requests: Set[LoRARequest],
|
952 |
+
lora_mapping: LoRAMapping) -> None:
|
953 |
+
if not self.lora_manager:
|
954 |
+
raise RuntimeError("LoRA is not enabled.")
|
955 |
+
self.lora_manager.set_active_loras(lora_requests, lora_mapping)
|
956 |
+
|
957 |
+
def add_lora(self, lora_request: LoRARequest) -> bool:
|
958 |
+
if not self.lora_manager:
|
959 |
+
raise RuntimeError("LoRA is not enabled.")
|
960 |
+
return self.lora_manager.add_lora(lora_request)
|
961 |
+
|
962 |
+
def remove_lora(self, lora_id: int) -> bool:
|
963 |
+
if not self.lora_manager:
|
964 |
+
raise RuntimeError("LoRA is not enabled.")
|
965 |
+
return self.lora_manager.remove_lora(lora_id)
|
966 |
+
|
967 |
+
def list_loras(self) -> Set[int]:
|
968 |
+
if not self.lora_manager:
|
969 |
+
raise RuntimeError("LoRA is not enabled.")
|
970 |
+
return self.lora_manager.list_loras()
|
971 |
+
|
972 |
+
@torch.inference_mode()
|
973 |
+
def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
|
974 |
+
"""Cuda graph capture a model.
|
975 |
+
|
976 |
+
Note that CUDA graph's performance gain is negligible if number
|
977 |
+
of batched tokens are larger than 200. And since CUDA graph
|
978 |
+
requires fixed sized tensors, supporting large/variable batch
|
979 |
+
size requires high GPU memory overhead. Thus, vLLM only captures
|
980 |
+
decoding requests. Mixed batch (chunked prefill + decoding) or
|
981 |
+
prefill requests are not captured.
|
982 |
+
|
983 |
+
Since it is used for decoding-only, it assumes there's only 1 token
|
984 |
+
per sequence in the batch.
|
985 |
+
"""
|
986 |
+
# NOTE(woosuk): This is a hack to ensure that the NCCL backend is never
|
987 |
+
# deleted before the CUDA graphs.
|
988 |
+
self.pynccl_backend = pynccl_utils.get_nccl_backend()
|
989 |
+
|
990 |
+
assert not self.model_config.enforce_eager
|
991 |
+
logger.info("Capturing the model for CUDA graphs. This may lead to "
|
992 |
+
"unexpected consequences if the model is not static. To "
|
993 |
+
"run the model in eager mode, set 'enforce_eager=True' or "
|
994 |
+
"use '--enforce-eager' in the CLI.")
|
995 |
+
logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
|
996 |
+
"If you are running out of memory, consider decreasing "
|
997 |
+
"`gpu_memory_utilization` or enforcing eager mode. "
|
998 |
+
"You can also reduce the `max_num_seqs` as needed "
|
999 |
+
"to decrease memory usage.")
|
1000 |
+
start_time = time.perf_counter()
|
1001 |
+
|
1002 |
+
# Prepare dummy inputs. These will be reused for all batch sizes.
|
1003 |
+
max_batch_size = max(_BATCH_SIZES_TO_CAPTURE)
|
1004 |
+
input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda()
|
1005 |
+
input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
|
1006 |
+
slot_mapping = torch.empty(max_batch_size, dtype=torch.long).cuda()
|
1007 |
+
slot_mapping.fill_(_PAD_SLOT_ID)
|
1008 |
+
context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
|
1009 |
+
block_tables = torch.from_numpy(self.graph_block_tables).cuda()
|
1010 |
+
|
1011 |
+
graph_batch_size = _get_graph_batch_size(
|
1012 |
+
self.scheduler_config.max_num_seqs)
|
1013 |
+
batch_size_capture_list = [
|
1014 |
+
bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
|
1015 |
+
]
|
1016 |
+
|
1017 |
+
# NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce
|
1018 |
+
# kernel, pynccl, and PyTorch NCCL. When using CUDA graph, we use
|
1019 |
+
# either custom all-reduce kernel or pynccl. When not using CUDA
|
1020 |
+
# graph, we use either custom all-reduce kernel or PyTorch NCCL.
|
1021 |
+
# We always prioritize using custom all-reduce kernel but fall back
|
1022 |
+
# to PyTorch or pynccl if it is disabled or not supported.
|
1023 |
+
with custom_all_reduce.capture():
|
1024 |
+
# NOTE: Capturing the largest batch size first may help reduce the
|
1025 |
+
# memory usage of CUDA graph.
|
1026 |
+
for batch_size in reversed(batch_size_capture_list):
|
1027 |
+
# Create dummy attn_metadata.
|
1028 |
+
decode_metadata = self.attn_backend.make_metadata(
|
1029 |
+
is_prompt=False,
|
1030 |
+
prompt_lens=None,
|
1031 |
+
prompt_lens_tensor=None,
|
1032 |
+
max_subquery_len=None,
|
1033 |
+
max_context_len=self.max_context_len_to_capture,
|
1034 |
+
max_prompt_len=None,
|
1035 |
+
subquery_start_loc=None,
|
1036 |
+
seq_start_loc=None,
|
1037 |
+
context_lens=context_lens[:batch_size],
|
1038 |
+
block_tables=block_tables[:batch_size],
|
1039 |
+
use_cuda_graph=True,
|
1040 |
+
)
|
1041 |
+
attn_metadata = AttentionMetadata(
|
1042 |
+
num_prefills=0,
|
1043 |
+
num_prefill_tokens=0,
|
1044 |
+
num_decode_tokens=batch_size,
|
1045 |
+
slot_mapping=slot_mapping[:batch_size],
|
1046 |
+
prefill_metadata=None,
|
1047 |
+
decode_metadata=decode_metadata,
|
1048 |
+
kv_cache_dtype=self.kv_cache_dtype,
|
1049 |
+
)
|
1050 |
+
|
1051 |
+
if self.lora_config:
|
1052 |
+
lora_mapping = LoRAMapping(
|
1053 |
+
[0] * batch_size,
|
1054 |
+
[0] * batch_size,
|
1055 |
+
)
|
1056 |
+
self.set_active_loras(set(), lora_mapping)
|
1057 |
+
|
1058 |
+
graph_runner = CUDAGraphRunner(self.model)
|
1059 |
+
graph_runner.capture(
|
1060 |
+
input_tokens[:batch_size],
|
1061 |
+
input_positions[:batch_size],
|
1062 |
+
kv_caches,
|
1063 |
+
attn_metadata,
|
1064 |
+
memory_pool=self.graph_memory_pool,
|
1065 |
+
)
|
1066 |
+
self.graph_memory_pool = graph_runner.graph.pool()
|
1067 |
+
self.graph_runners[batch_size] = graph_runner
|
1068 |
+
|
1069 |
+
end_time = time.perf_counter()
|
1070 |
+
elapsed_time = end_time - start_time
|
1071 |
+
# This usually takes < 10 seconds.
|
1072 |
+
logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.")
|
1073 |
+
|
1074 |
+
def __del__(self) -> None:
|
1075 |
+
# Delete the CUDA graphs before deleting the pynccl communicator.
|
1076 |
+
# NOTE(woosuk): This is necessary because otherwise deadlocks can
|
1077 |
+
# happen.
|
1078 |
+
# FIXME(woosuk): This is a bit hacky. Find a more robust solution.
|
1079 |
+
# TODO(youkaichao): when we get enough user feedback that pynccl is
|
1080 |
+
# more stable than cupy, we can remove this, e.g. in v0.4.1.
|
1081 |
+
self.graph_runners.clear()
|
1082 |
+
self.pynccl_backend = None
|
1083 |
+
|
1084 |
+
@property
|
1085 |
+
def vocab_size(self) -> int:
|
1086 |
+
return self.model_config.get_vocab_size()
|
1087 |
+
|
1088 |
+
|
1089 |
+
class CUDAGraphRunner:
|
1090 |
+
|
1091 |
+
def __init__(self, model: nn.Module):
|
1092 |
+
self.model = model
|
1093 |
+
self.input_buffers: Dict[str, torch.Tensor] = {}
|
1094 |
+
self.output_buffers: Dict[str, torch.Tensor] = {}
|
1095 |
+
|
1096 |
+
self._graph: Optional[torch.cuda.CUDAGraph] = None
|
1097 |
+
|
1098 |
+
@property
|
1099 |
+
def graph(self):
|
1100 |
+
assert self._graph is not None
|
1101 |
+
return self._graph
|
1102 |
+
|
1103 |
+
def capture(
|
1104 |
+
self,
|
1105 |
+
input_ids: torch.Tensor,
|
1106 |
+
positions: torch.Tensor,
|
1107 |
+
kv_caches: List[torch.Tensor],
|
1108 |
+
attn_metadata: AttentionMetadata,
|
1109 |
+
memory_pool,
|
1110 |
+
**kwargs,
|
1111 |
+
) -> None:
|
1112 |
+
assert self._graph is None
|
1113 |
+
# Run the model once without capturing the graph.
|
1114 |
+
# This is to make sure that the captured graph does not include the
|
1115 |
+
# kernel launches for initial benchmarking (e.g., Triton autotune).
|
1116 |
+
with _maybe_pynccl():
|
1117 |
+
self.model(
|
1118 |
+
input_ids,
|
1119 |
+
positions,
|
1120 |
+
kv_caches,
|
1121 |
+
attn_metadata,
|
1122 |
+
**kwargs,
|
1123 |
+
)
|
1124 |
+
torch.cuda.synchronize()
|
1125 |
+
|
1126 |
+
# Capture the graph.
|
1127 |
+
# NOTE(woosuk): Python 3.8 does not support multi-line with statements.
|
1128 |
+
# https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
|
1129 |
+
self._graph = torch.cuda.CUDAGraph()
|
1130 |
+
with torch.cuda.graph(self._graph, pool=memory_pool): # noqa: SIM117
|
1131 |
+
with _maybe_pynccl():
|
1132 |
+
hidden_states = self.model(
|
1133 |
+
input_ids,
|
1134 |
+
positions,
|
1135 |
+
kv_caches,
|
1136 |
+
attn_metadata,
|
1137 |
+
**kwargs,
|
1138 |
+
)
|
1139 |
+
torch.cuda.synchronize()
|
1140 |
+
|
1141 |
+
# Save the input and output buffers.
|
1142 |
+
self.input_buffers = {
|
1143 |
+
"input_ids": input_ids,
|
1144 |
+
"positions": positions,
|
1145 |
+
"kv_caches": kv_caches,
|
1146 |
+
"slot_mapping": attn_metadata.slot_mapping,
|
1147 |
+
"context_lens": attn_metadata.decode_metadata.context_lens,
|
1148 |
+
"block_tables": attn_metadata.decode_metadata.block_tables,
|
1149 |
+
}
|
1150 |
+
self.output_buffers = {"hidden_states": hidden_states}
|
1151 |
+
return
|
1152 |
+
|
1153 |
+
def forward(
|
1154 |
+
self,
|
1155 |
+
input_ids: torch.Tensor,
|
1156 |
+
positions: torch.Tensor,
|
1157 |
+
kv_caches: List[torch.Tensor],
|
1158 |
+
attn_metadata: AttentionMetadata,
|
1159 |
+
**kwargs,
|
1160 |
+
) -> torch.Tensor:
|
1161 |
+
# KV caches are fixed tensors, so we don't need to copy them.
|
1162 |
+
del kv_caches
|
1163 |
+
|
1164 |
+
# Copy the input tensors to the input buffers.
|
1165 |
+
self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True)
|
1166 |
+
self.input_buffers["positions"].copy_(positions, non_blocking=True)
|
1167 |
+
self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping,
|
1168 |
+
non_blocking=True)
|
1169 |
+
self.input_buffers["context_lens"].copy_(
|
1170 |
+
attn_metadata.decode_metadata.context_lens, non_blocking=True)
|
1171 |
+
self.input_buffers["block_tables"].copy_(
|
1172 |
+
attn_metadata.decode_metadata.block_tables, non_blocking=True)
|
1173 |
+
# Run the graph.
|
1174 |
+
self.graph.replay()
|
1175 |
+
|
1176 |
+
# Return the output tensor.
|
1177 |
+
return self.output_buffers["hidden_states"]
|
1178 |
+
|
1179 |
+
def __call__(self, *args, **kwargs):
|
1180 |
+
return self.forward(*args, **kwargs)
|
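CUDAGraphRunner above follows the standard capture/replay pattern of torch.cuda.CUDAGraph: inputs are copied into fixed buffers and the recorded kernels are replayed. A minimal, self-contained sketch of that pattern, assuming a CUDA device is available and using a toy linear layer in place of the real model:

import torch

static_x = torch.zeros(8, 16, device="cuda")   # fixed input buffer
model = torch.nn.Linear(16, 16).cuda()         # stand-in for the real model

# Warm-up run outside the graph, mirroring the un-captured forward pass in capture().
model(static_x)
torch.cuda.synchronize()

# Capture: record the kernels launched by one forward pass.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_out = model(static_x)

# Replay: copy fresh data into the fixed buffer, then replay the recorded kernels.
static_x.copy_(torch.randn(8, 16, device="cuda"))
graph.replay()
print(static_out.shape)  # the output buffer is updated in place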
+
+
+@contextlib.contextmanager
+def _maybe_pynccl():
+    if pynccl_utils.is_initialized(
+    ) and not custom_all_reduce.is_initialized():
+        with with_pynccl_for_all_reduce():
+            yield
+    else:
+        yield
+
+
+def _get_graph_batch_size(batch_size: int) -> int:
+    """Returns the padded batch size given actual batch size.
+
+    Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT,
+    2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT...
+    """
+    if batch_size <= 2:
+        return batch_size
+    elif batch_size <= 4:
+        return 4
+    else:
+        return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) //
+                _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)
+
+
+def _prepare_fake_inputs(
+        seq_len: int, vision_language_config: Optional[VisionLanguageConfig]):
+    """Prepare fake inputs for profile run."""
+    if vision_language_config:
+        prompt_tokens = [
+            vision_language_config.image_token_id
+        ] * vision_language_config.image_feature_size + [0] * (
+            seq_len - vision_language_config.image_feature_size)
+        fake_image_input = MultiModalData(
+            type=MultiModalData.Type.IMAGE,
+            data=torch.zeros(vision_language_config.image_input_shape,
+                             dtype=torch.float16))
+    else:
+        prompt_tokens = [0] * seq_len
+        fake_image_input = None
+    return SequenceData(prompt_tokens), fake_image_input
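A quick way to see the padding rule implemented by _get_graph_batch_size above is to run it standalone. The sketch below is illustrative only and assumes an alignment of 8 for _BATCH_SIZE_ALIGNMENT (the real constant is defined earlier in serve/model_runner.py):

_BATCH_SIZE_ALIGNMENT = 8  # assumed value, for illustration only

def pad_batch_size(batch_size: int) -> int:
    # 1 and 2 map to themselves, 3-4 round up to 4, and anything larger
    # rounds up to the next multiple of the alignment.
    if batch_size <= 2:
        return batch_size
    if batch_size <= 4:
        return 4
    return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1)
            // _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)

print([pad_batch_size(b) for b in (1, 3, 5, 9, 17)])  # [1, 4, 8, 16, 24]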
serve/sample_c2i.py
ADDED
@@ -0,0 +1,97 @@
+import time
+import argparse
+import torch
+from torchvision.utils import save_image
+
+from tokenizer.tokenizer_image.vq_model import VQ_models
+from serve.gpt_model import GPT_models
+from serve.llm import LLM
+from vllm import SamplingParams
+
+
+def main(args):
+    # Setup PyTorch:
+    torch.manual_seed(args.seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    torch.set_grad_enabled(False)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # create and load model
+    vq_model = VQ_models[args.vq_model](
+        codebook_size=args.codebook_size,
+        codebook_embed_dim=args.codebook_embed_dim)
+    vq_model.to(device)
+    vq_model.eval()
+    checkpoint = torch.load(args.vq_ckpt, map_location="cpu")
+    vq_model.load_state_dict(checkpoint["model"])
+    del checkpoint
+    print(f"image tokenizer is loaded")
+
+    # Labels to condition the model with (feel free to change):
+    class_labels = [207, 360, 387, 974, 88, 979, 417, 279]
+    latent_size = args.image_size // args.downsample_size
+    qzshape = [len(class_labels), args.codebook_embed_dim, latent_size, latent_size]
+    prompt_token_ids = [[cind] for cind in class_labels]
+    if args.cfg_scale > 1.0:
+        prompt_token_ids.extend([[args.num_classes] for _ in range(len(prompt_token_ids))])
+    # Create an LLM.
+    llm = LLM(
+        args=args,
+        model='autoregressive/serve/fake_json/{}.json'.format(args.gpt_model),
+        gpu_memory_utilization=0.9,
+        skip_tokenizer_init=True)
+    print(f"gpt model is loaded")
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(
+        temperature=args.temperature, top_p=args.top_p, top_k=args.top_k,
+        max_tokens=latent_size ** 2)
+
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    t1 = time.time()
+    outputs = llm.generate(
+        prompt_token_ids=prompt_token_ids,
+        sampling_params=sampling_params,
+        use_tqdm=False)
+    sampling_time = time.time() - t1
+    print(f"gpt sampling takes about {sampling_time:.2f} seconds.")
+
+    # decode to image
+    index_sample = torch.tensor([output.outputs[0].token_ids for output in outputs], device=device)
+    if args.cfg_scale > 1.0:
+        index_sample = index_sample[:len(class_labels)]
+    t2 = time.time()
+    samples = vq_model.decode_code(index_sample, qzshape) # output value is between [-1, 1]
+    decoder_time = time.time() - t2
+    print(f"decoder takes about {decoder_time:.2f} seconds.")
+
+    # Save and display images:
+    save_image(samples, "sample_{}.png".format(args.gpt_type), nrow=4, normalize=True, value_range=(-1, 1))
+    print(f"image is saved to sample_{args.gpt_type}.png")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--gpt-model", type=str, choices=list(GPT_models.keys()), default="GPT-B")
+    parser.add_argument("--gpt-ckpt", type=str, required=True, help="ckpt path for gpt model")
+    parser.add_argument("--gpt-type", type=str, choices=['c2i', 't2i'], default="c2i", help="class-conditional or text-conditional")
+    parser.add_argument("--from-fsdp", action='store_true')
+    parser.add_argument("--cls-token-num", type=int, default=1, help="max token number of condition input")
+    parser.add_argument("--precision", type=str, default='bf16', choices=["none", "fp16", "bf16"])
+    parser.add_argument("--compile", action='store_true', default=False)
+    parser.add_argument("--vq-model", type=str, choices=list(VQ_models.keys()), default="VQ-16")
+    parser.add_argument("--vq-ckpt", type=str, required=True, help="ckpt path for vq model")
+    parser.add_argument("--codebook-size", type=int, default=16384, help="codebook size for vector quantization")
+    parser.add_argument("--codebook-embed-dim", type=int, default=8, help="codebook dimension for vector quantization")
+    parser.add_argument("--image-size", type=int, choices=[256, 384, 512], default=384)
+    parser.add_argument("--downsample-size", type=int, choices=[8, 16], default=16)
+    parser.add_argument("--num-classes", type=int, default=1000)
+    parser.add_argument("--cfg-scale", type=float, default=4.0)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--top-k", type=int, default=2000, help="top-k value to sample with")
+    parser.add_argument("--temperature", type=float, default=1.0, help="temperature value to sample with")
+    parser.add_argument("--top-p", type=float, default=1.0, help="top-p value to sample with")
+    args = parser.parse_args()
+    main(args)
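When cfg_scale > 1.0, the script above doubles the prompt list so that every conditional class label is paired with one copy of the "empty" class id (args.num_classes, 1000 by default). The resulting batch layout, conditional rows first and unconditional rows second, is what serve/sampler.py later splits in half for classifier-free guidance:

class_labels = [207, 360, 387, 974]
num_classes = 1000  # the null / unconditional class id

prompt_token_ids = [[c] for c in class_labels]
prompt_token_ids.extend([[num_classes] for _ in range(len(prompt_token_ids))])
print(prompt_token_ids)
# [[207], [360], [387], [974], [1000], [1000], [1000], [1000]]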
serve/sampler.py
ADDED
@@ -0,0 +1,868 @@
+"""A layer that samples the next tokens from the model's outputs."""
+import itertools
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.layers.ops.sample import sample as sample_triton
+from vllm.model_executor.sampling_metadata import (SamplingMetadata,
+                                                    SamplingTensors)
+from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.sequence import (Logprob, PromptLogprobs, SampleLogprobs,
+                           SamplerOutput, SequenceData, SequenceGroupOutput,
+                           SequenceOutput)
+
+
+class Sampler(nn.Module):
+    """Samples the next tokens from the model's outputs.
+
+    This layer does the following:
+    1. Discard the hidden states that are not used for sampling (i.e., all
+        tokens except the final one in each prompt).
+    2. Compute the logits for the next tokens.
+    3. Apply presence, frequency and repetition penalties.
+    4. Apply temperature scaling.
+    5. Apply top-p and top-k truncation.
+    6. Sample the next tokens.
+    Here, each sequence group within the batch can have different sampling
+    parameters (e.g., sampling method, temperature, top-p, top-k, etc.).
+
+    The structure of the logits tensor is coupled with the seq_groups in
+    sampling_metadata. Typically, each sequence in each seq_group has one row in
+    logits for the next token to be sampled; however, for a seq_group with a
+    prompt request with the prompt_logprobs sampling parameter, there are rows
+    in logits for each token in the input prompt.
+    """
+
+    def __init__(self, cfg_scale=1.0):
+        super().__init__()
+        self.cfg_scale = cfg_scale
+        # Whether or not the SamplerOutput should have on-device tensors
+        # containing the sampled token ids and probabilities. This is used by
+        # speculative decoding.
+        self.include_gpu_probs_tensor = False
+
+    def forward(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        assert logits is not None
+        _, vocab_size = logits.shape
+
+        if self.cfg_scale > 1.0:
+            logits_combined = logits
+            cond_logits, uncond_logits = torch.split(logits_combined, len(logits_combined) // 2, dim=0)
+            logits = uncond_logits + (cond_logits - uncond_logits) * self.cfg_scale
+            logits = torch.cat([logits, logits], dim=0)
+
+        # Apply min_tokens penalty which sets stop tokens to -inf if min_tokens
+        # have not been generated yet
+        logits = _apply_min_tokens_penalty(logits, sampling_metadata)
+
+        # Prepare sampling tensors with pinned memory to avoid blocking.
+        (sampling_tensors, do_penalties, do_top_p_top_k,
+         do_min_p) = SamplingTensors.from_sampling_metadata(
+             sampling_metadata, vocab_size, logits.device, logits.dtype)
+
+        # Apply presence and frequency penalties.
+        if do_penalties:
+            logits = _apply_penalties(logits, sampling_tensors.prompt_tokens,
+                                      sampling_tensors.output_tokens,
+                                      sampling_tensors.presence_penalties,
+                                      sampling_tensors.frequency_penalties,
+                                      sampling_tensors.repetition_penalties)
+
+        # Apply temperature scaling.
+        # Use in-place division to avoid creating a new tensor.
+        logits.div_(sampling_tensors.temperatures.unsqueeze_(dim=1))
+
+        if do_top_p_top_k:
+            logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps,
+                                        sampling_tensors.top_ks)
+
+        if do_min_p:
+            logits = _apply_min_p(logits, sampling_tensors.min_ps)
+
+        # We use float32 for probabilities and log probabilities.
+        # Compute the probabilities.
+        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
+        # Compute the log probabilities.
+        # Use log_softmax to ensure numerical stability.
+        logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
+
+        # Sample the next tokens.
+        sample_results, maybe_sampled_tokens_tensor = _sample(
+            probs,
+            logprobs,
+            sampling_metadata,
+            sampling_tensors,
+            include_gpu_probs_tensor=self.include_gpu_probs_tensor,
+            modify_greedy_probs=self._should_modify_greedy_probs_inplace,
+        )
+
+
+        if self.cfg_scale > 1.0:
+            cond_result = sample_results[:len(sample_results) // 2]
+            sample_results = cond_result + cond_result
+
+
+        if self.include_gpu_probs_tensor:
+            assert maybe_sampled_tokens_tensor is not None
+            sampled_tokens_tensor = maybe_sampled_tokens_tensor
+            on_device_tensors = (probs, sampled_tokens_tensor)
+        else:
+            on_device_tensors = None
+
+        # Get the logprobs query results.
+        prompt_logprobs, sample_logprobs = _get_logprobs(
+            logprobs, sampling_metadata, sample_results)
+        return _build_sampler_output(sample_results,
+                                     sampling_metadata,
+                                     prompt_logprobs,
+                                     sample_logprobs,
+                                     on_device_tensors=on_device_tensors)
+
+    @property
+    def _should_modify_greedy_probs_inplace(self) -> bool:
+        """Whether or not the sampler should modify the probability distribution
+        of greedily-sampled tokens such that multinomial sampling would sample
+        the greedily-sampled token.
+
+        In other words, if True then we set the probability of the greedily-
+        sampled token to 1.
+
+        This is used by speculative decoding, which requires that the sampling
+        method be encoded into the probability distribution.
+        """
+        # Modify greedy probs if include_gpu_probs_tensor is set.
+        return self.include_gpu_probs_tensor
+
+
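To make the classifier-free guidance step in Sampler.forward above concrete, here is the same logits combination on dummy tensors (shapes and values are illustrative only; the batch is laid out conditional-first, unconditional-second, as produced by serve/sample_c2i.py):

import torch

cfg_scale = 4.0
logits_combined = torch.randn(8, 16384)  # 4 conditional rows followed by 4 unconditional rows

cond_logits, uncond_logits = torch.split(logits_combined, len(logits_combined) // 2, dim=0)
guided = uncond_logits + (cond_logits - uncond_logits) * cfg_scale
# Both halves receive the same guided logits so the paired rows stay aligned downstream.
logits = torch.cat([guided, guided], dim=0)
print(logits.shape)  # torch.Size([8, 16384])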
143 |
+
def _get_bin_counts_and_mask(
|
144 |
+
tokens: torch.Tensor,
|
145 |
+
vocab_size: int,
|
146 |
+
num_seqs: int,
|
147 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
148 |
+
# Compute the bin counts for the tokens.
|
149 |
+
# vocab_size + 1 for padding.
|
150 |
+
bin_counts = torch.zeros((num_seqs, vocab_size + 1),
|
151 |
+
dtype=torch.long,
|
152 |
+
device=tokens.device)
|
153 |
+
bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens))
|
154 |
+
bin_counts = bin_counts[:, :vocab_size]
|
155 |
+
mask = bin_counts > 0
|
156 |
+
|
157 |
+
return bin_counts, mask
|
158 |
+
|
159 |
+
|
160 |
+
def _apply_min_tokens_penalty(
|
161 |
+
logits: torch.Tensor,
|
162 |
+
sampling_metadata: SamplingMetadata,
|
163 |
+
) -> torch.Tensor:
|
164 |
+
# list of indices in logits that will be set to -inf
|
165 |
+
logits_to_penalize = []
|
166 |
+
start_idx = 0
|
167 |
+
for i, seq_group in enumerate(sampling_metadata.seq_groups):
|
168 |
+
seq_ids, sampling_params = seq_group
|
169 |
+
|
170 |
+
# handle prompt_logprobs by skipping rows in logits added for the prompt
|
171 |
+
# tokens (prompt logprobs are not penalized)
|
172 |
+
if (i < sampling_metadata.num_prompts
|
173 |
+
and sampling_params.prompt_logprobs is not None):
|
174 |
+
assert len(seq_ids) == 1
|
175 |
+
start_idx += sampling_metadata.prompt_lens[i] - 1
|
176 |
+
|
177 |
+
min_tokens = sampling_params.min_tokens
|
178 |
+
if min_tokens > 0:
|
179 |
+
seqs_to_penalize = []
|
180 |
+
for i, seq_id in enumerate(seq_ids):
|
181 |
+
seq_data = sampling_metadata.seq_data[seq_id]
|
182 |
+
if len(seq_data.output_token_ids) < min_tokens:
|
183 |
+
seqs_to_penalize.append(i)
|
184 |
+
|
185 |
+
if seqs_to_penalize:
|
186 |
+
# convert to the index into logits
|
187 |
+
seqs_to_penalize = [start_idx + i for i in seqs_to_penalize]
|
188 |
+
# use set() to remove any duplicates
|
189 |
+
token_ids_to_penalize = set(sampling_params.stop_token_ids +
|
190 |
+
[sampling_params.eos_token_id])
|
191 |
+
# itertools.product pairs each seq index with every token id
|
192 |
+
logits_to_penalize.extend(
|
193 |
+
itertools.product(seqs_to_penalize, token_ids_to_penalize))
|
194 |
+
|
195 |
+
start_idx += len(seq_ids)
|
196 |
+
|
197 |
+
if logits_to_penalize:
|
198 |
+
# use zip and * to group indices along each dimension
|
199 |
+
# eg. [ (1,2), (1,3), (5,6) ] -> ( (1,1,5), (2,3,6) )
|
200 |
+
logits[tuple(zip(*logits_to_penalize))] = -float("inf")
|
201 |
+
|
202 |
+
# verifies that no rows in logits were missed unexpectedly
|
203 |
+
assert start_idx == logits.shape[0]
|
204 |
+
return logits
|
205 |
+
|
206 |
+
|
207 |
+
def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
|
208 |
+
output_tokens_tensor: torch.Tensor,
|
209 |
+
presence_penalties: torch.Tensor,
|
210 |
+
frequency_penalties: torch.Tensor,
|
211 |
+
repetition_penalties: torch.Tensor) -> torch.Tensor:
|
212 |
+
num_seqs, vocab_size = logits.shape
|
213 |
+
_, prompt_mask = _get_bin_counts_and_mask(prompt_tokens_tensor, vocab_size,
|
214 |
+
num_seqs)
|
215 |
+
output_bin_counts, output_mask = _get_bin_counts_and_mask(
|
216 |
+
output_tokens_tensor, vocab_size, num_seqs)
|
217 |
+
|
218 |
+
repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size)
|
219 |
+
repetition_penalties[~(prompt_mask | output_mask)] = 1.0
|
220 |
+
logits = torch.where(logits > 0, logits / repetition_penalties,
|
221 |
+
logits * repetition_penalties)
|
222 |
+
|
223 |
+
# We follow the definition in OpenAI API.
|
224 |
+
# Refer to https://platform.openai.com/docs/api-reference/parameter-details
|
225 |
+
logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts
|
226 |
+
logits -= presence_penalties.unsqueeze_(dim=1) * output_mask
|
227 |
+
return logits
|
228 |
+
|
229 |
+
|
230 |
+
def _apply_top_k_top_p(
|
231 |
+
logits: torch.Tensor,
|
232 |
+
p: torch.Tensor,
|
233 |
+
k: torch.Tensor,
|
234 |
+
) -> torch.Tensor:
|
235 |
+
logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
|
236 |
+
|
237 |
+
# Apply top-k.
|
238 |
+
top_k_mask = logits_sort.size(1) - k.to(torch.long)
|
239 |
+
# Get all the top_k values.
|
240 |
+
top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1))
|
241 |
+
top_k_mask = logits_sort < top_k_mask
|
242 |
+
logits_sort.masked_fill_(top_k_mask, -float("inf"))
|
243 |
+
|
244 |
+
# Apply top-p.
|
245 |
+
probs_sort = logits_sort.softmax(dim=-1)
|
246 |
+
probs_sum = probs_sort.cumsum(dim=-1)
|
247 |
+
top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1)
|
248 |
+
# at least one
|
249 |
+
top_p_mask[:, -1] = False
|
250 |
+
logits_sort.masked_fill_(top_p_mask, -float("inf"))
|
251 |
+
|
252 |
+
# Re-sort the probabilities.
|
253 |
+
src = torch.arange(logits_idx.shape[-1],
|
254 |
+
device=logits_idx.device).expand_as(logits_idx)
|
255 |
+
logits_idx_inv = torch.empty_like(logits_idx).scatter_(dim=-1,
|
256 |
+
index=logits_idx,
|
257 |
+
src=src)
|
258 |
+
logits = torch.gather(logits_sort, dim=-1, index=logits_idx_inv)
|
259 |
+
return logits
|
260 |
+
|
261 |
+
|
262 |
+
def _apply_min_p(
|
263 |
+
logits: torch.Tensor,
|
264 |
+
min_p: torch.Tensor,
|
265 |
+
) -> torch.Tensor:
|
266 |
+
"""
|
267 |
+
Adapted from
|
268 |
+
https://github.com/oobabooga/text-generation-webui/blob/3146124ec01f02c8fb1650a6517cf1b60b537aaf/modules/sampler_hijack.py#L16C17-L16C17
|
269 |
+
"""
|
270 |
+
probs = torch.softmax(logits, dim=-1)
|
271 |
+
top_probs, _ = probs.max(dim=-1, keepdim=True)
|
272 |
+
scaled_min_p = min_p.unsqueeze_(dim=1) * top_probs
|
273 |
+
tokens_to_remove = probs < scaled_min_p
|
274 |
+
logits = logits.masked_fill_(tokens_to_remove, -float("inf"))
|
275 |
+
|
276 |
+
return logits
|
277 |
+
|
278 |
+
|
279 |
+
def _greedy_sample(
|
280 |
+
selected_seq_groups: List[Tuple[List[int], SamplingParams]],
|
281 |
+
samples: torch.Tensor,
|
282 |
+
) -> List[Tuple[List[int], List[int]]]:
|
283 |
+
samples = samples.tolist()
|
284 |
+
sample_idx = 0
|
285 |
+
results = []
|
286 |
+
for seq_group in selected_seq_groups:
|
287 |
+
seq_ids, _ = seq_group
|
288 |
+
num_parent_seqs = len(seq_ids)
|
289 |
+
assert num_parent_seqs == 1, (
|
290 |
+
"Greedy sampling should have only one seq.")
|
291 |
+
parent_ids = list(range(num_parent_seqs))
|
292 |
+
next_token_ids = [samples[sample_idx]]
|
293 |
+
results.append((next_token_ids, parent_ids))
|
294 |
+
sample_idx += num_parent_seqs
|
295 |
+
return results
|
296 |
+
|
297 |
+
|
298 |
+
def _random_sample(
|
299 |
+
selected_seq_groups: List[Tuple[List[int], SamplingParams]],
|
300 |
+
is_prompts: List[bool],
|
301 |
+
random_samples: torch.Tensor,
|
302 |
+
) -> List[Tuple[List[int], List[int]]]:
|
303 |
+
# Find the maximum best_of value of the prompt phase requests.
|
304 |
+
random_samples = random_samples.cpu()
|
305 |
+
sample_idx = 0
|
306 |
+
results = []
|
307 |
+
for seq_group, is_prompt in zip(selected_seq_groups, is_prompts):
|
308 |
+
seq_ids, sampling_params = seq_group
|
309 |
+
num_parent_seqs = len(seq_ids)
|
310 |
+
if is_prompt:
|
311 |
+
# Prompt phase.
|
312 |
+
parent_ids = [0] * sampling_params.best_of
|
313 |
+
next_token_ids = random_samples[
|
314 |
+
sample_idx, :sampling_params.best_of].tolist()
|
315 |
+
else:
|
316 |
+
# Generation phase.
|
317 |
+
parent_ids = list(range(num_parent_seqs))
|
318 |
+
next_token_ids = random_samples[sample_idx:sample_idx +
|
319 |
+
num_parent_seqs, 0].tolist()
|
320 |
+
results.append((next_token_ids, parent_ids))
|
321 |
+
sample_idx += num_parent_seqs
|
322 |
+
return results
|
323 |
+
|
324 |
+
|
325 |
+
def _beam_search_sample(
|
326 |
+
selected_seq_groups: List[Tuple[List[int], SamplingParams]],
|
327 |
+
is_prompts: List[bool],
|
328 |
+
seq_data: Dict[int, SequenceData],
|
329 |
+
logprobs: torch.Tensor,
|
330 |
+
) -> List[Tuple[List[int], List[int]]]:
|
331 |
+
# We sample 2 * beam_width candidates to make sure that with high
|
332 |
+
# probability we can get `beam_width` candidates in addition to
|
333 |
+
# the finished sequences for the next iteration. See
|
334 |
+
# https://github.com/tensorflow/tensor2tensor/blob/bafdc1b67730430d38d6ab802cbd51f9d053ba2e/tensor2tensor/utils/beam_search.py#L557-L563
|
335 |
+
# for details. See also HF reference:
|
336 |
+
# https://github.com/huggingface/transformers/blob/a4dd53d88e4852f023332d284ff07a01afcd5681/src/transformers/generation/utils.py#L3063-L3065
|
337 |
+
#
|
338 |
+
# NOTE: Beam search is not vectorized, so its speed can be slower than
|
339 |
+
# other sampling methods.
|
340 |
+
sample_idx = 0
|
341 |
+
results = []
|
342 |
+
for seq_group, is_prompt in zip(selected_seq_groups, is_prompts):
|
343 |
+
seq_ids, sampling_params = seq_group
|
344 |
+
num_parent_seqs = len(seq_ids)
|
345 |
+
beam_width = sampling_params.best_of
|
346 |
+
seq_group_logprobs = logprobs[sample_idx:sample_idx + num_parent_seqs]
|
347 |
+
if is_prompt:
|
348 |
+
# Prompt phase.
|
349 |
+
assert num_parent_seqs == 1, (
|
350 |
+
"Prompt input should have only one seq.")
|
351 |
+
parent_ids = [0] * (2 * beam_width)
|
352 |
+
_, next_token_ids = torch.topk(seq_group_logprobs[0],
|
353 |
+
2 * beam_width)
|
354 |
+
next_token_ids = next_token_ids.tolist()
|
355 |
+
else:
|
356 |
+
# Generation phase.
|
357 |
+
cumulative_logprobs = [
|
358 |
+
seq_data[seq_id].cumulative_logprob for seq_id in seq_ids
|
359 |
+
]
|
360 |
+
cumulative_logprobs = torch.tensor(
|
361 |
+
cumulative_logprobs,
|
362 |
+
dtype=torch.float,
|
363 |
+
device=seq_group_logprobs.device)
|
364 |
+
seq_group_logprobs = (seq_group_logprobs +
|
365 |
+
cumulative_logprobs.unsqueeze(dim=1))
|
366 |
+
_, topk_ids = torch.topk(seq_group_logprobs.flatten(),
|
367 |
+
2 * beam_width)
|
368 |
+
topk_ids = topk_ids.tolist()
|
369 |
+
vocab_size = seq_group_logprobs.size(-1)
|
370 |
+
parent_ids = [i // vocab_size for i in topk_ids]
|
371 |
+
next_token_ids = [i % vocab_size for i in topk_ids]
|
372 |
+
results.append((next_token_ids, parent_ids))
|
373 |
+
sample_idx += num_parent_seqs
|
374 |
+
assert sample_idx == logprobs.size(0)
|
375 |
+
return results
|
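In the generation phase above, the top-k runs over the flattened (num_parents, vocab_size) score matrix, and integer division and modulo recover which parent beam and which token each flat index refers to. A small sketch with hypothetical scores:

import torch

vocab_size = 5
scores = torch.tensor([[0.1, 0.9, 0.2, 0.0, 0.3],   # parent beam 0
                       [0.8, 0.4, 0.7, 0.6, 0.5]])  # parent beam 1
_, topk_ids = torch.topk(scores.flatten(), k=4)     # flat ids: [1, 5, 7, 8]
parent_ids = [i // vocab_size for i in topk_ids.tolist()]      # [0, 1, 1, 1]
next_token_ids = [i % vocab_size for i in topk_ids.tolist()]   # [1, 0, 2, 3]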
376 |
+
|
377 |
+
|
378 |
+
# torch.multinomial forces a GPU<->CPU sync.
|
379 |
+
# Therefore, we use an optimized implementation instead.
|
380 |
+
# Note that we always sample with replacement.
|
381 |
+
# probs will be modified in place, but this is fine, as we pass
|
382 |
+
# in a copy already.
|
383 |
+
def _multinomial(
|
384 |
+
probs: torch.Tensor,
|
385 |
+
num_samples: int,
|
386 |
+
seq_groups: Optional[List[Tuple[List[int], SamplingParams]]] = None,
|
387 |
+
generators: Optional[List[torch.Generator]] = None,
|
388 |
+
) -> torch.Tensor:
|
389 |
+
if num_samples > 1:
|
390 |
+
# This is equivalent to torch.repeat_interleave (which also
|
391 |
+
# forces a GPU<->CPU sync).
|
392 |
+
# This allows us to do sampling with replacement by creating
|
393 |
+
# num_samples copies of each row in the tensor, and then
|
394 |
+
# batch sampling the resulting tensor.
|
395 |
+
probs = probs[:, None, :].expand(probs.shape[0], num_samples,
|
396 |
+
probs.shape[1]).contiguous().view(
|
397 |
+
-1, probs.shape[1])
|
398 |
+
q = torch.empty_like(probs)
|
399 |
+
if seq_groups is None:
|
400 |
+
q.exponential_()
|
401 |
+
else:
|
402 |
+
sample_idx = 0
|
403 |
+
for (seq_ids, _), generator in zip(seq_groups, generators):
|
404 |
+
next_sample_idx = sample_idx + len(seq_ids) * num_samples
|
405 |
+
q[sample_idx:next_sample_idx].exponential_(generator=generator)
|
406 |
+
sample_idx = next_sample_idx
|
407 |
+
return probs.div_(q).argmax(dim=1).view(-1, num_samples)
|
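The final line is the exponential-race (Gumbel-max style) trick: with E_i drawn from Exp(1), argmax_i(p_i / E_i) selects index i with probability p_i, so sampling reduces to an elementwise division and an argmax that stay on the GPU. A quick empirical check with hypothetical probabilities:

import torch

torch.manual_seed(0)
probs = torch.tensor([0.1, 0.6, 0.3]).expand(100_000, 3).contiguous()
q = torch.empty_like(probs).exponential_()
samples = probs.div(q).argmax(dim=1)
# empirical frequencies should be close to [0.1, 0.6, 0.3]
print(torch.bincount(samples, minlength=3) / samples.numel())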
408 |
+
|
409 |
+
|
410 |
+
def _sample_with_torch(
|
411 |
+
probs: torch.Tensor,
|
412 |
+
logprobs: torch.Tensor,
|
413 |
+
sampling_metadata: SamplingMetadata,
|
414 |
+
include_gpu_probs_tensor: bool,
|
415 |
+
modify_greedy_probs: bool,
|
416 |
+
) -> Tuple[List[Tuple[List[int], List[int]]], Optional[torch.Tensor]]:
|
417 |
+
categorized_seq_group_ids = {t: [] for t in SamplingType}
|
418 |
+
categorized_sample_indices = sampling_metadata.categorized_sample_indices
|
419 |
+
for i, seq_group in enumerate(sampling_metadata.seq_groups):
|
420 |
+
_, sampling_params = seq_group
|
421 |
+
sampling_type = sampling_params.sampling_type
|
422 |
+
categorized_seq_group_ids[sampling_type].append(i)
|
423 |
+
|
424 |
+
sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
|
425 |
+
sample_metadata = {}
|
426 |
+
multinomial_samples = {}
|
427 |
+
|
428 |
+
# Create output tensor for sampled token ids.
|
429 |
+
if include_gpu_probs_tensor:
|
430 |
+
sampled_token_ids_tensor = torch.empty(logprobs.shape[0],
|
431 |
+
1,
|
432 |
+
dtype=torch.long,
|
433 |
+
device=logprobs.device)
|
434 |
+
else:
|
435 |
+
sampled_token_ids_tensor = None
|
436 |
+
|
437 |
+
# Counterintuitively, having two loops here is actually faster.
|
438 |
+
# The first loop can run without waiting on GPU<->CPU sync.
|
439 |
+
for sampling_type in SamplingType:
|
440 |
+
sample_indices = categorized_sample_indices[sampling_type][:, 0]
|
441 |
+
num_tokens = len(sample_indices)
|
442 |
+
if num_tokens == 0:
|
443 |
+
continue
|
444 |
+
seq_group_ids = categorized_seq_group_ids[sampling_type]
|
445 |
+
seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_ids]
|
446 |
+
is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids]
|
447 |
+
sample_metadata[sampling_type] = (seq_group_ids, seq_groups,
|
448 |
+
is_prompts, sample_indices)
|
449 |
+
long_sample_indices = sample_indices.long()
|
450 |
+
|
451 |
+
if sampling_type == SamplingType.GREEDY:
|
452 |
+
greedy_samples = torch.argmax(logprobs[long_sample_indices],
|
453 |
+
dim=-1)
|
454 |
+
|
455 |
+
if include_gpu_probs_tensor:
|
456 |
+
# Store sampled tokens in output tensor.
|
457 |
+
sampled_token_ids_tensor[
|
458 |
+
long_sample_indices] = greedy_samples.unsqueeze(-1)
|
459 |
+
|
460 |
+
if modify_greedy_probs:
|
461 |
+
# If required, modify the probabilities such that sampling from
|
462 |
+
# the modified distribution would always sample the argmax
|
463 |
+
# token id.
|
464 |
+
_modify_greedy_probs_inplace(logprobs, probs,
|
465 |
+
long_sample_indices,
|
466 |
+
greedy_samples)
|
467 |
+
|
468 |
+
elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED):
|
469 |
+
max_best_of_in_batch = 1
|
470 |
+
for seq_group, is_prompt in zip(seq_groups, is_prompts):
|
471 |
+
if is_prompt:
|
472 |
+
_, sampling_params = seq_group
|
473 |
+
max_best_of_in_batch = max(max_best_of_in_batch,
|
474 |
+
sampling_params.best_of)
|
475 |
+
seeded_args = {} if sampling_type == SamplingType.RANDOM else {
|
476 |
+
"seq_groups": seq_groups,
|
477 |
+
"generators": sampling_metadata.generators,
|
478 |
+
}
|
479 |
+
|
480 |
+
multinomial_samples[sampling_type] = _multinomial(
|
481 |
+
probs[long_sample_indices], max_best_of_in_batch,
|
482 |
+
**seeded_args)
|
483 |
+
|
484 |
+
if include_gpu_probs_tensor:
|
485 |
+
# Store sampled tokens in output tensor.
|
486 |
+
sampled_token_ids_tensor[
|
487 |
+
long_sample_indices] = multinomial_samples[sampling_type]
|
488 |
+
|
489 |
+
elif sampling_type == SamplingType.BEAM:
|
490 |
+
beam_search_logprobs = logprobs[sample_indices]
|
491 |
+
else:
|
492 |
+
raise ValueError(f"Unsupported sampling type: {sampling_type}")
|
493 |
+
|
494 |
+
# GPU<->CPU sync happens in the loop below.
|
495 |
+
# This also converts the sample output to Python objects.
|
496 |
+
|
497 |
+
for sampling_type in SamplingType:
|
498 |
+
if sampling_type not in sample_metadata:
|
499 |
+
continue
|
500 |
+
seq_group_ids, seq_groups, is_prompts, sample_indices = sample_metadata[
|
501 |
+
sampling_type]
|
502 |
+
if sampling_type == SamplingType.GREEDY:
|
503 |
+
sample_results = _greedy_sample(seq_groups, greedy_samples)
|
504 |
+
elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED):
|
505 |
+
sample_results = _random_sample(seq_groups, is_prompts,
|
506 |
+
multinomial_samples[sampling_type])
|
507 |
+
elif sampling_type == SamplingType.BEAM:
|
508 |
+
sample_results = _beam_search_sample(seq_groups, is_prompts,
|
509 |
+
sampling_metadata.seq_data,
|
510 |
+
beam_search_logprobs)
|
511 |
+
sample_results_dict.update(zip(seq_group_ids, sample_results))
|
512 |
+
|
513 |
+
sample_results = [
|
514 |
+
sample_results_dict[i]
|
515 |
+
for i in range(len(sampling_metadata.seq_groups))
|
516 |
+
]
|
517 |
+
return sample_results, sampled_token_ids_tensor
|
518 |
+
|
519 |
+
|
520 |
+
def _sample_with_triton_kernel(
|
521 |
+
probs: torch.Tensor,
|
522 |
+
logprobs: torch.Tensor,
|
523 |
+
sampling_metadata: SamplingMetadata,
|
524 |
+
sampling_tensors: SamplingTensors,
|
525 |
+
) -> List[Tuple[List[int], List[int]]]:
|
526 |
+
categorized_seq_group_ids = {t: [] for t in SamplingType}
|
527 |
+
categorized_sample_indices = sampling_metadata.categorized_sample_indices
|
528 |
+
for i, seq_group in enumerate(sampling_metadata.seq_groups):
|
529 |
+
_, sampling_params = seq_group
|
530 |
+
sampling_type = sampling_params.sampling_type
|
531 |
+
categorized_seq_group_ids[sampling_type].append(i)
|
532 |
+
|
533 |
+
sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
|
534 |
+
sample_metadata = {}
|
535 |
+
max_best_of_in_batch = 1
|
536 |
+
|
537 |
+
# Counterintuitively, having two loops here is actually faster.
|
538 |
+
# The first loop can run without waiting on GPU<->CPU sync.
|
539 |
+
for sampling_type in SamplingType:
|
540 |
+
sample_indices = categorized_sample_indices[sampling_type][:, 0]
|
541 |
+
sampled_token_indices = categorized_sample_indices[sampling_type][:, 1]
|
542 |
+
num_tokens = len(sample_indices)
|
543 |
+
if num_tokens == 0:
|
544 |
+
continue
|
545 |
+
seq_group_ids = categorized_seq_group_ids[sampling_type]
|
546 |
+
seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_ids]
|
547 |
+
is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids]
|
548 |
+
sample_metadata[sampling_type] = (seq_group_ids, seq_groups,
|
549 |
+
is_prompts, sample_indices,
|
550 |
+
sampled_token_indices)
|
551 |
+
if sampling_type in (SamplingType.GREEDY, SamplingType.RANDOM,
|
552 |
+
SamplingType.RANDOM_SEED):
|
553 |
+
for seq_group, is_prompt in zip(seq_groups, is_prompts):
|
554 |
+
if is_prompt:
|
555 |
+
_, sampling_params = seq_group
|
556 |
+
max_best_of_in_batch = max(max_best_of_in_batch,
|
557 |
+
sampling_params.best_of)
|
558 |
+
elif sampling_type == SamplingType.BEAM:
|
559 |
+
beam_search_logprobs = logprobs[sample_indices]
|
560 |
+
else:
|
561 |
+
raise ValueError(f"Unsupported sampling type: {sampling_type}")
|
562 |
+
|
563 |
+
sampled_tokens, _, _ = sample_triton(
|
564 |
+
probs=probs,
|
565 |
+
seeds=sampling_tensors.sampling_seeds,
|
566 |
+
max_best_of=max_best_of_in_batch,
|
567 |
+
sample_indices=sampling_tensors.sample_indices,
|
568 |
+
logprobs=logprobs,
|
569 |
+
# don't save logprobs because we have logic for that below
|
570 |
+
# TODO: use this instead of the CPU-based logic below
|
571 |
+
save_logprobs=False,
|
572 |
+
)
|
573 |
+
|
574 |
+
# GPU<->CPU sync happens in the loop below.
|
575 |
+
|
576 |
+
for sampling_type in SamplingType:
|
577 |
+
if sampling_type not in sample_metadata:
|
578 |
+
continue
|
579 |
+
(seq_group_ids, seq_groups, is_prompts, sample_indices,
|
580 |
+
sampled_token_indices) = sample_metadata[sampling_type]
|
581 |
+
if sampling_type == SamplingType.GREEDY:
|
582 |
+
sample_results = _greedy_sample(
|
583 |
+
seq_groups, sampled_tokens[sampled_token_indices][:, 0])
|
584 |
+
elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED):
|
585 |
+
sample_results = _random_sample(
|
586 |
+
seq_groups, is_prompts, sampled_tokens[sampled_token_indices])
|
587 |
+
elif sampling_type == SamplingType.BEAM:
|
588 |
+
sample_results = _beam_search_sample(seq_groups, is_prompts,
|
589 |
+
sampling_metadata.seq_data,
|
590 |
+
beam_search_logprobs)
|
591 |
+
sample_results_dict.update(zip(seq_group_ids, sample_results))
|
592 |
+
|
593 |
+
sample_results = [
|
594 |
+
sample_results_dict[i]
|
595 |
+
for i in range(len(sampling_metadata.seq_groups))
|
596 |
+
]
|
597 |
+
return sample_results
|
598 |
+
|
599 |
+
|
600 |
+
def _sample(
|
601 |
+
probs: torch.Tensor, logprobs: torch.Tensor,
|
602 |
+
sampling_metadata: SamplingMetadata, sampling_tensors: SamplingTensors,
|
603 |
+
include_gpu_probs_tensor: bool, modify_greedy_probs: bool
|
604 |
+
) -> Tuple[List[Tuple[List[int], List[int]]], Optional[torch.Tensor]]:
|
605 |
+
return _sample_with_torch(
|
606 |
+
probs,
|
607 |
+
logprobs,
|
608 |
+
sampling_metadata,
|
609 |
+
include_gpu_probs_tensor=include_gpu_probs_tensor,
|
610 |
+
modify_greedy_probs=modify_greedy_probs,
|
611 |
+
)
|
612 |
+
|
613 |
+
# TODO: Enable once Triton kernel & associated code is faster.
|
614 |
+
# return _sample_with_triton_kernel(probs, logprobs, sampling_metadata,
|
615 |
+
# sampling_tensors)
|
616 |
+
|
617 |
+
|
618 |
+
def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
|
619 |
+
"""
|
620 |
+
This function calculates the ranks of the chosen tokens in a logprob tensor.
|
621 |
+
|
622 |
+
Args:
|
623 |
+
x (torch.Tensor): 2D logprob tensor of shape (N, M)
|
624 |
+
where N is the no. of tokens and M is the vocab dim.
|
625 |
+
indices (torch.Tensor): List of chosen token indices.
|
626 |
+
|
627 |
+
Returns:
|
628 |
+
torch.Tensor: 1D tensor of shape (N,) where N is the no. of tokens.
|
629 |
+
Each element in the returned tensor represents the rank
|
630 |
+
of the chosen token in the input logprob tensor.
|
631 |
+
"""
|
632 |
+
vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype),
|
633 |
+
indices]
|
634 |
+
return (x > vals[:, None]).long().sum(1).add_(1)
|
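A tiny worked example of the rank computation (hypothetical logprobs): the rank of the chosen token is one plus the number of vocabulary entries with a strictly larger logprob, so the top token gets rank 1.

import torch

x = torch.tensor([[-0.1, -2.0, -0.5],
                  [-1.0, -0.2, -3.0]])
indices = torch.tensor([2, 1])                      # chosen token per row
vals = x[torch.arange(len(x)), indices]             # [-0.5, -0.2]
ranks = (x > vals[:, None]).long().sum(1).add_(1)   # tensor([2, 1])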
635 |
+
|
636 |
+
|
637 |
+
def _get_logprobs(
|
638 |
+
logprobs: torch.Tensor,
|
639 |
+
sampling_metadata: SamplingMetadata,
|
640 |
+
sample_results: List[Tuple[List[int], List[int]]],
|
641 |
+
) -> Tuple[List[Optional[List[Optional[Dict[int, float]]]]], List[List[Dict[
|
642 |
+
int, float]]]]:
|
643 |
+
# Prepare query indices
|
644 |
+
batched_logprobs_query_seq_indices: List[int] = []
|
645 |
+
batched_logprobs_query_token_indices: List[int] = []
|
646 |
+
# at least get one logprob for each token
|
647 |
+
largest_num_logprobs = 1
|
648 |
+
sample_idx = 0
|
649 |
+
for i, (seq_group, sample_result) in enumerate(
|
650 |
+
zip(sampling_metadata.seq_groups, sample_results)):
|
651 |
+
seq_ids, sampling_params = seq_group
|
652 |
+
next_token_ids, parent_ids = sample_result
|
653 |
+
num_parent_seqs = len(seq_ids)
|
654 |
+
if (i < sampling_metadata.num_prompts
|
655 |
+
and sampling_params.prompt_logprobs is not None):
|
656 |
+
largest_num_logprobs = max(largest_num_logprobs,
|
657 |
+
sampling_params.prompt_logprobs)
|
658 |
+
prompt_len = sampling_metadata.prompt_lens[i]
|
659 |
+
prompt_tokens = sampling_metadata.seq_data[
|
660 |
+
seq_ids[0]].prompt_token_ids
|
661 |
+
batched_logprobs_query_seq_indices.extend(
|
662 |
+
sample_idx + j for j in range(prompt_len - 1))
|
663 |
+
batched_logprobs_query_token_indices.extend(
|
664 |
+
token_id for token_id in prompt_tokens[1:])
|
665 |
+
sample_idx += prompt_len - 1
|
666 |
+
batched_logprobs_query_seq_indices.extend(
|
667 |
+
[sample_idx + parent_id for parent_id in parent_ids])
|
668 |
+
batched_logprobs_query_token_indices.extend(next_token_ids)
|
669 |
+
if sampling_params.logprobs is not None:
|
670 |
+
largest_num_logprobs = max(largest_num_logprobs,
|
671 |
+
sampling_params.logprobs)
|
672 |
+
sample_idx += num_parent_seqs
|
673 |
+
assert sample_idx == logprobs.size(0)
|
674 |
+
|
675 |
+
batched_logprobs_query_seq_indices_gpu = torch.tensor(
|
676 |
+
batched_logprobs_query_seq_indices, device=logprobs.device)
|
677 |
+
batched_logprobs_query_token_indices_gpu = torch.tensor(
|
678 |
+
batched_logprobs_query_token_indices, device=logprobs.device)
|
679 |
+
|
680 |
+
# Batched query for logprobs of selected token
|
681 |
+
batched_logprobs_query_result = logprobs[[
|
682 |
+
batched_logprobs_query_seq_indices_gpu,
|
683 |
+
batched_logprobs_query_token_indices_gpu
|
684 |
+
]]
|
685 |
+
|
686 |
+
batched_ranks_query_result = _get_ranks(
|
687 |
+
logprobs[batched_logprobs_query_seq_indices_gpu],
|
688 |
+
batched_logprobs_query_token_indices_gpu)
|
689 |
+
|
690 |
+
# Batched query for logprobs of topk tokens
|
691 |
+
if largest_num_logprobs > 0:
|
692 |
+
top_logprobs, top_token_ids = torch.topk(logprobs,
|
693 |
+
largest_num_logprobs,
|
694 |
+
dim=-1)
|
695 |
+
top_logprobs = top_logprobs.cpu()
|
696 |
+
top_token_ids = top_token_ids.cpu()
|
697 |
+
else:
|
698 |
+
top_logprobs, top_token_ids = None, None
|
699 |
+
|
700 |
+
batched_logprobs_query_result = batched_logprobs_query_result.cpu()
|
701 |
+
batched_ranks_query_result = batched_ranks_query_result.cpu()
|
702 |
+
|
703 |
+
# Gather results
|
704 |
+
result_prompt_logprobs: List[Optional[PromptLogprobs]] = []
|
705 |
+
result_sample_logprobs: List[SampleLogprobs] = []
|
706 |
+
sample_idx = 0
|
707 |
+
query_result_idx = 0
|
708 |
+
for i, (seq_group, sample_result) in enumerate(
|
709 |
+
zip(sampling_metadata.seq_groups, sample_results)):
|
710 |
+
seq_ids, sampling_params = seq_group
|
711 |
+
next_token_ids, parent_ids = sample_result
|
712 |
+
|
713 |
+
# Prompt logprobs
|
714 |
+
if (i < sampling_metadata.num_prompts
|
715 |
+
and sampling_params.prompt_logprobs is not None):
|
716 |
+
num_logprobs = sampling_params.prompt_logprobs
|
717 |
+
prompt_tokens = sampling_metadata.seq_data[
|
718 |
+
seq_ids[0]].prompt_token_ids
|
719 |
+
group_prompt_logprobs: PromptLogprobs = [None]
|
720 |
+
for token_id in prompt_tokens[1:]:
|
721 |
+
prompt_logprobs_dict = {
|
722 |
+
token_id:
|
723 |
+
(batched_logprobs_query_result[query_result_idx].item(),
|
724 |
+
batched_ranks_query_result[query_result_idx].item())
|
725 |
+
}
|
726 |
+
if num_logprobs > 0:
|
727 |
+
prompt_logprobs_dict.update(
|
728 |
+
zip(
|
729 |
+
top_token_ids[sample_idx, :num_logprobs].tolist(),
|
730 |
+
zip(
|
731 |
+
top_logprobs[
|
732 |
+
sample_idx, :num_logprobs].tolist(),
|
733 |
+
range(1, num_logprobs + 1))))
|
734 |
+
group_prompt_logprobs.append({
|
735 |
+
token_id: Logprob(*logprob_rank)
|
736 |
+
for token_id, logprob_rank in prompt_logprobs_dict.items()
|
737 |
+
})
|
738 |
+
sample_idx += 1
|
739 |
+
query_result_idx += 1
|
740 |
+
result_prompt_logprobs.append(group_prompt_logprobs)
|
741 |
+
else:
|
742 |
+
result_prompt_logprobs.append(None)
|
743 |
+
|
744 |
+
# Sample logprobs
|
745 |
+
num_logprobs = sampling_params.logprobs
|
746 |
+
if num_logprobs is None:
|
747 |
+
num_logprobs = 0
|
748 |
+
group_sample_logprobs: SampleLogprobs = []
|
749 |
+
for next_token_id, parent_id in zip(next_token_ids, parent_ids):
|
750 |
+
sample_logprobs_dict = {
|
751 |
+
next_token_id:
|
752 |
+
(batched_logprobs_query_result[query_result_idx].item(),
|
753 |
+
batched_ranks_query_result[query_result_idx].item())
|
754 |
+
}
|
755 |
+
query_result_idx += 1
|
756 |
+
if num_logprobs >= 0:
|
757 |
+
sample_logprobs_dict.update(
|
758 |
+
zip(
|
759 |
+
top_token_ids[sample_idx +
|
760 |
+
parent_id, :num_logprobs].tolist(),
|
761 |
+
zip(
|
762 |
+
top_logprobs[sample_idx +
|
763 |
+
parent_id, :num_logprobs].tolist(),
|
764 |
+
range(1, num_logprobs + 1))))
|
765 |
+
group_sample_logprobs.append({
|
766 |
+
token_id: Logprob(*logprob_rank)
|
767 |
+
for token_id, logprob_rank in sample_logprobs_dict.items()
|
768 |
+
})
|
769 |
+
result_sample_logprobs.append(group_sample_logprobs)
|
770 |
+
sample_idx += len(seq_ids)
|
771 |
+
|
772 |
+
return result_prompt_logprobs, result_sample_logprobs
|
773 |
+
|
774 |
+
|
775 |
+
def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor,
|
776 |
+
sample_indices: torch.Tensor,
|
777 |
+
greedy_samples: torch.Tensor) -> None:
|
778 |
+
"""Modify the probability distributions of the greedily-sampled tokens such
|
779 |
+
that each sampled token has a "probability" of 1.0. This is required by
|
780 |
+
speculative decoding, which depends on the sampling method being encoded
|
781 |
+
within the probability distribution for correctness.
|
782 |
+
|
783 |
+
# Why do we only need to do this for greedy sampling?
|
784 |
+
|
785 |
+
vLLM's sampler performs the following steps for greedy or multinomial
|
786 |
+
(random) sampling:
|
787 |
+
1. Get logits from model.
|
788 |
+
2. Modify logits according to per-sequence sampling parameters.
|
789 |
+
- Multiply by temperature, top-k and top-p masking, penalize tokens
|
790 |
+
according to their frequency, etc.
|
791 |
+
3. Sample a token.
|
792 |
+
- Random sampling simply samples from the modified probability
|
793 |
+
distribution.
|
794 |
+
- Greedy sampling performs `argmax` to obtain the token with the
|
795 |
+
highest likelihood.
|
796 |
+
|
797 |
+
Ignoring greedy sampling for a moment, we find that the computed probability
|
798 |
+
distribution has the following property: we can sample from it independently
|
799 |
+
and find that the token sampled by the Sampler has a frequency corresponding
|
800 |
+
to how often we see it in our sampling. In other words, for tokens sampled
|
801 |
+
with vLLM's random SamplingType, the computed probability distribution
|
802 |
+
encodes the sampling methodology completely.
|
803 |
+
|
804 |
+
Greedy sampling does not normally have this property. vLLM modifies logits
|
805 |
+
according to sampling params, then performs `argmax`, then returns the
|
806 |
+
sampled token and the computed probability distribution. If we sample from
|
807 |
+
the distribution, we'll find the likelihood of the greedily-sampled token
|
808 |
+
is not always 1.0.
|
809 |
+
|
810 |
+
Since lossless speculative decoding requires that the sampling methodology
|
811 |
+
be encoded within the probability distribution, we are motivated to modify
|
812 |
+
the probability distribution such that the sampled token has probability 1
|
813 |
+
when speculative decoding is used.
|
814 |
+
|
815 |
+
NOTE: Alternatively, we could use an extremely low temperature to achieve
|
816 |
+
greedy sampling using multinomial computation and unite the codepaths. This
|
817 |
+
has implications on the overall design of the sampler, e.g. how to record
|
818 |
+
accurate logprobs for the user, so this improvement is deferred to later.
|
819 |
+
"""
|
820 |
+
logprobs[sample_indices, :] = -float('inf')
|
821 |
+
logprobs[sample_indices, greedy_samples] = 0.0
|
822 |
+
probs[sample_indices, :] = 0
|
823 |
+
probs[sample_indices, greedy_samples] = 1.0
|
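A minimal before/after sketch of this rewrite with hypothetical 2x4 distributions: once the selected rows are forced to be one-hot on the argmax token, sampling from `probs` reproduces the greedy choice exactly, which is the property speculative decoding relies on.

import torch

logprobs = torch.log_softmax(torch.randn(2, 4), dim=-1)
probs = logprobs.exp()
sample_indices = torch.tensor([0, 1])
greedy_samples = torch.argmax(logprobs[sample_indices], dim=-1)
logprobs[sample_indices, :] = -float("inf")
logprobs[sample_indices, greedy_samples] = 0.0
probs[sample_indices, :] = 0
probs[sample_indices, greedy_samples] = 1.0
assert torch.equal(probs.argmax(dim=-1), greedy_samples)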
824 |
+
|
825 |
+
|
826 |
+
def _build_sampler_output(
|
827 |
+
sample_results: List[Tuple[List[int], List[int]]],
|
828 |
+
sampling_metadata: SamplingMetadata,
|
829 |
+
prompt_logprobs: List[Optional[PromptLogprobs]],
|
830 |
+
sample_logprobs: List[SampleLogprobs],
|
831 |
+
on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor]],
|
832 |
+
) -> SamplerOutput:
|
833 |
+
"""Construct Python objects with the output of sampling.
|
834 |
+
|
835 |
+
Args:
|
836 |
+
on_device_tensors: Tuple containing on-device tensors with the
|
837 |
+
probabilities used in sampling and the sampled token ids. This
|
838 |
+
allows post-processing without copies to CPU/serialization, e.g. in
|
839 |
+
speculative decoding rejection sampling.
|
840 |
+
"""
|
841 |
+
|
842 |
+
sampler_output = []
|
843 |
+
for (seq_group, sample_result, group_prompt_logprobs,
|
844 |
+
group_sample_logprobs) in zip(sampling_metadata.seq_groups,
|
845 |
+
sample_results, prompt_logprobs,
|
846 |
+
sample_logprobs):
|
847 |
+
seq_ids, _ = seq_group
|
848 |
+
next_token_ids, parent_ids = sample_result
|
849 |
+
seq_outputs = []
|
850 |
+
for parent_id, next_token_id, logprobs in zip(parent_ids,
|
851 |
+
next_token_ids,
|
852 |
+
group_sample_logprobs):
|
853 |
+
seq_outputs.append(
|
854 |
+
SequenceOutput(seq_ids[parent_id], next_token_id, logprobs))
|
855 |
+
sampler_output.append(
|
856 |
+
SequenceGroupOutput(seq_outputs, group_prompt_logprobs))
|
857 |
+
|
858 |
+
# If not specified, store None values in SamplerOutput.
|
859 |
+
if on_device_tensors is not None:
|
860 |
+
sampled_token_probs, sampled_token_ids = on_device_tensors
|
861 |
+
else:
|
862 |
+
sampled_token_probs, sampled_token_ids = (None, None)
|
863 |
+
|
864 |
+
return SamplerOutput(
|
865 |
+
outputs=sampler_output,
|
866 |
+
sampled_token_probs=sampled_token_probs,
|
867 |
+
sampled_token_ids=sampled_token_ids,
|
868 |
+
)
|
serve/worker.py
ADDED
@@ -0,0 +1,349 @@
1 |
+
"""A GPU worker class."""
|
2 |
+
import gc
|
3 |
+
import os
|
4 |
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
5 |
+
|
6 |
+
import torch
|
7 |
+
import torch.distributed
|
8 |
+
|
9 |
+
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
|
10 |
+
ModelConfig, ParallelConfig, SchedulerConfig,
|
11 |
+
VisionLanguageConfig)
|
12 |
+
from vllm.distributed import (broadcast_tensor_dict,
|
13 |
+
ensure_model_parallel_initialized,
|
14 |
+
init_distributed_environment)
|
15 |
+
from vllm.distributed.device_communicators import pynccl_utils
|
16 |
+
from vllm.distributed.device_communicators.custom_all_reduce import (
|
17 |
+
init_custom_ar)
|
18 |
+
from vllm.lora.request import LoRARequest
|
19 |
+
from vllm.model_executor import set_random_seed
|
20 |
+
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
21 |
+
from vllm.worker.cache_engine import CacheEngine
|
22 |
+
# from vllm.worker.model_runner import ModelRunner
|
23 |
+
from vllm.worker.worker_base import WorkerBase
|
24 |
+
from serve.model_runner import ModelRunner
|
25 |
+
|
26 |
+
|
27 |
+
class Worker(WorkerBase):
|
28 |
+
"""A worker class that executes (a partition of) the model on a GPU.
|
29 |
+
|
30 |
+
Each worker is associated with a single GPU. The worker is responsible for
|
31 |
+
maintaining the KV cache and executing the model on the GPU. In case of
|
32 |
+
distributed inference, each worker is assigned a partition of the model.
|
33 |
+
"""
|
34 |
+
|
35 |
+
def __init__(
|
36 |
+
self,
|
37 |
+
model_config: ModelConfig,
|
38 |
+
parallel_config: ParallelConfig,
|
39 |
+
scheduler_config: SchedulerConfig,
|
40 |
+
device_config: DeviceConfig,
|
41 |
+
cache_config: CacheConfig,
|
42 |
+
load_config: LoadConfig,
|
43 |
+
local_rank: int,
|
44 |
+
rank: int,
|
45 |
+
distributed_init_method: str,
|
46 |
+
lora_config: Optional[LoRAConfig] = None,
|
47 |
+
vision_language_config: Optional[VisionLanguageConfig] = None,
|
48 |
+
is_driver_worker: bool = False,
|
49 |
+
) -> None:
|
50 |
+
self.model_config = model_config
|
51 |
+
self.parallel_config = parallel_config
|
52 |
+
self.scheduler_config = scheduler_config
|
53 |
+
self.device_config = device_config
|
54 |
+
self.cache_config = cache_config
|
55 |
+
self.local_rank = local_rank
|
56 |
+
self.rank = rank
|
57 |
+
self.distributed_init_method = distributed_init_method
|
58 |
+
self.lora_config = lora_config
|
59 |
+
self.load_config = load_config
|
60 |
+
self.is_driver_worker = is_driver_worker
|
61 |
+
if self.is_driver_worker:
|
62 |
+
assert self.rank == 0, "The driver worker must have rank 0."
|
63 |
+
|
64 |
+
if self.model_config.trust_remote_code:
|
65 |
+
# note: lazy import to avoid importing torch before initializing
|
66 |
+
from vllm.utils import init_cached_hf_modules
|
67 |
+
init_cached_hf_modules()
|
68 |
+
self.vision_language_config = vision_language_config
|
69 |
+
if self.vision_language_config:
|
70 |
+
assert not self.lora_config, (
|
71 |
+
"To be tested: vision language model with LoRA settings.")
|
72 |
+
|
73 |
+
self.model_runner = ModelRunner(
|
74 |
+
model_config,
|
75 |
+
parallel_config,
|
76 |
+
scheduler_config,
|
77 |
+
device_config,
|
78 |
+
load_config=load_config,
|
79 |
+
lora_config=self.lora_config,
|
80 |
+
kv_cache_dtype=self.cache_config.cache_dtype,
|
81 |
+
is_driver_worker=is_driver_worker,
|
82 |
+
vision_language_config=vision_language_config,
|
83 |
+
)
|
84 |
+
# Uninitialized cache engine. Will be initialized by
|
85 |
+
# initialize_cache.
|
86 |
+
self.cache_engine: CacheEngine
|
87 |
+
self.gpu_cache: List[torch.Tensor]
|
88 |
+
|
89 |
+
def init_device(self) -> None:
|
90 |
+
if self.device_config.device.type == "cuda":
|
91 |
+
# torch.distributed.all_reduce does not free the input tensor until
|
92 |
+
# the synchronization point. This causes the memory usage to grow
|
93 |
+
# as the number of all_reduce calls increases. This env var disables
|
94 |
+
# this behavior.
|
95 |
+
# Related issue:
|
96 |
+
# https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
|
97 |
+
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
|
98 |
+
|
99 |
+
# This env var set by Ray causes exceptions with graph building.
|
100 |
+
os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
|
101 |
+
self.device = torch.device(f"cuda:{self.local_rank}")
|
102 |
+
torch.cuda.set_device(self.device)
|
103 |
+
|
104 |
+
_check_if_gpu_supports_dtype(self.model_config.dtype)
|
105 |
+
torch.cuda.empty_cache()
|
106 |
+
self.init_gpu_memory = torch.cuda.mem_get_info()[0]
|
107 |
+
else:
|
108 |
+
raise RuntimeError(
|
109 |
+
f"Not support device type: {self.device_config.device}")
|
110 |
+
# Initialize the distributed environment.
|
111 |
+
init_worker_distributed_environment(self.parallel_config, self.rank,
|
112 |
+
self.distributed_init_method,
|
113 |
+
self.local_rank)
|
114 |
+
# Set random seed.
|
115 |
+
set_random_seed(self.model_config.seed)
|
116 |
+
|
117 |
+
def load_model(self, args):
|
118 |
+
self.model_runner.load_model(args)
|
119 |
+
|
120 |
+
@torch.inference_mode()
|
121 |
+
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
122 |
+
"""Profiles the peak memory usage of the model to determine how many
|
123 |
+
KV blocks may be allocated without OOMs.
|
124 |
+
|
125 |
+
The engine will first conduct a profiling of the existing memory usage.
|
126 |
+
Then, it calculates the maximum possible number of GPU and CPU blocks
|
127 |
+
that can be allocated with the remaining free memory.
|
128 |
+
|
129 |
+
.. tip::
|
130 |
+
You may limit the usage of GPU memory
|
131 |
+
by adjusting the `gpu_memory_utilization` parameter.
|
132 |
+
"""
|
133 |
+
# Profile the memory usage of the model and get the maximum number of
|
134 |
+
# cache blocks that can be allocated with the remaining free memory.
|
135 |
+
torch.cuda.empty_cache()
|
136 |
+
|
137 |
+
# Execute a forward pass with dummy inputs to profile the memory usage
|
138 |
+
# of the model.
|
139 |
+
self.model_runner.profile_run()
|
140 |
+
|
141 |
+
# Calculate the number of blocks that can be allocated with the
|
142 |
+
# profiled peak memory.
|
143 |
+
torch.cuda.synchronize()
|
144 |
+
free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
|
145 |
+
# NOTE(woosuk): Here we assume that the other processes using the same
|
146 |
+
# GPU did not change their memory usage during the profiling.
|
147 |
+
peak_memory = self.init_gpu_memory - free_gpu_memory
|
148 |
+
assert peak_memory > 0, (
|
149 |
+
"Error in memory profiling. This happens when the GPU memory was "
|
150 |
+
"not properly cleaned up before initializing the vLLM instance.")
|
151 |
+
|
152 |
+
cache_block_size = self.get_cache_block_size_bytes()
|
153 |
+
num_gpu_blocks = int(
|
154 |
+
(total_gpu_memory * self.cache_config.gpu_memory_utilization -
|
155 |
+
peak_memory) // cache_block_size)
|
156 |
+
num_cpu_blocks = int(self.cache_config.swap_space_bytes //
|
157 |
+
cache_block_size)
|
158 |
+
num_gpu_blocks = max(num_gpu_blocks, 0)
|
159 |
+
num_cpu_blocks = max(num_cpu_blocks, 0)
|
160 |
+
if self.model_runner.lora_manager:
|
161 |
+
self.model_runner.remove_all_loras()
|
162 |
+
gc.collect()
|
163 |
+
torch.cuda.empty_cache()
|
164 |
+
return num_gpu_blocks, num_cpu_blocks
|
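Back-of-the-envelope version of the block arithmetic above, with hypothetical numbers (an 80 GiB GPU, gpu_memory_utilization of 0.9, a 6 GiB profiled peak, and a 2 MiB KV-cache block as returned by get_cache_block_size_bytes):

total_gpu_memory = 80 * 1024**3          # bytes reported by mem_get_info()
gpu_memory_utilization = 0.9
peak_memory = 6 * 1024**3                # profiled peak usage
cache_block_size = 2 * 1024**2           # bytes per KV-cache block
num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization - peak_memory)
                     // cache_block_size)
print(num_gpu_blocks)  # 33792 blocks left for the KV cache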
165 |
+
|
166 |
+
def initialize_cache(self, num_gpu_blocks: int,
|
167 |
+
num_cpu_blocks: int) -> None:
|
168 |
+
"""Allocate GPU and CPU KV cache with the specified number of blocks.
|
169 |
+
|
170 |
+
This also warms up the model, which may record CUDA graphs.
|
171 |
+
"""
|
172 |
+
raise_if_cache_size_invalid(num_gpu_blocks,
|
173 |
+
self.cache_config.block_size,
|
174 |
+
self.model_config.max_model_len)
|
175 |
+
|
176 |
+
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
177 |
+
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
178 |
+
|
179 |
+
self._init_cache_engine()
|
180 |
+
self._warm_up_model()
|
181 |
+
|
182 |
+
def _init_cache_engine(self):
|
183 |
+
assert self.cache_config.num_gpu_blocks is not None
|
184 |
+
self.cache_engine = CacheEngine(self.cache_config, self.model_config,
|
185 |
+
self.parallel_config)
|
186 |
+
self.gpu_cache = self.cache_engine.gpu_cache
|
187 |
+
self.model_runner.set_block_size(self.cache_engine.block_size)
|
188 |
+
|
189 |
+
def _warm_up_model(self) -> None:
|
190 |
+
if not self.model_config.enforce_eager:
|
191 |
+
self.model_runner.capture_model(self.gpu_cache)
|
192 |
+
# Reset the seed to ensure that the random state is not affected by
|
193 |
+
# the model initialization and profiling.
|
194 |
+
set_random_seed(self.model_config.seed)
|
195 |
+
|
196 |
+
def cache_swap(
|
197 |
+
self,
|
198 |
+
blocks_to_swap_in: Dict[int, int],
|
199 |
+
blocks_to_swap_out: Dict[int, int],
|
200 |
+
blocks_to_copy: Dict[int, List[int]],
|
201 |
+
) -> None:
|
202 |
+
# Issue cache operations.
|
203 |
+
# TODO(woosuk): Profile swapping overhead and optimize if needed.
|
204 |
+
if blocks_to_swap_in:
|
205 |
+
self.cache_engine.swap_in(blocks_to_swap_in)
|
206 |
+
if blocks_to_swap_out:
|
207 |
+
self.cache_engine.swap_out(blocks_to_swap_out)
|
208 |
+
if blocks_to_copy:
|
209 |
+
self.cache_engine.copy(blocks_to_copy)
|
210 |
+
|
211 |
+
@torch.inference_mode()
|
212 |
+
def execute_model(
|
213 |
+
self,
|
214 |
+
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None,
|
215 |
+
blocks_to_swap_in: Optional[Dict[int, int]] = None,
|
216 |
+
blocks_to_swap_out: Optional[Dict[int, int]] = None,
|
217 |
+
blocks_to_copy: Optional[Dict[int, List[int]]] = None,
|
218 |
+
num_lookahead_slots: int = 0,
|
219 |
+
) -> List[SamplerOutput]:
|
220 |
+
|
221 |
+
if self.is_driver_worker:
|
222 |
+
assert seq_group_metadata_list is not None
|
223 |
+
num_seq_groups = len(seq_group_metadata_list)
|
224 |
+
assert blocks_to_swap_in is not None
|
225 |
+
assert blocks_to_swap_out is not None
|
226 |
+
assert blocks_to_copy is not None
|
227 |
+
data: Dict[str, Any] = {
|
228 |
+
"num_seq_groups": num_seq_groups,
|
229 |
+
"blocks_to_swap_in": blocks_to_swap_in,
|
230 |
+
"blocks_to_swap_out": blocks_to_swap_out,
|
231 |
+
"blocks_to_copy": blocks_to_copy,
|
232 |
+
}
|
233 |
+
broadcast_tensor_dict(data, src=0)
|
234 |
+
else:
|
235 |
+
data = broadcast_tensor_dict(src=0)
|
236 |
+
num_seq_groups = data["num_seq_groups"]
|
237 |
+
blocks_to_swap_in = data["blocks_to_swap_in"]
|
238 |
+
blocks_to_swap_out = data["blocks_to_swap_out"]
|
239 |
+
blocks_to_copy = data["blocks_to_copy"]
|
240 |
+
|
241 |
+
assert blocks_to_swap_in is not None
|
242 |
+
assert blocks_to_swap_out is not None
|
243 |
+
assert blocks_to_copy is not None
|
244 |
+
self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
|
245 |
+
|
246 |
+
# If there is no input, we don't need to execute the model.
|
247 |
+
if num_seq_groups == 0:
|
248 |
+
return []
|
249 |
+
|
250 |
+
output = self.model_runner.execute_model(seq_group_metadata_list,
|
251 |
+
self.gpu_cache)
|
252 |
+
|
253 |
+
# Worker only supports single-step execution. Wrap the output in a list
|
254 |
+
# to conform to interface.
|
255 |
+
return [output]
|
256 |
+
|
257 |
+
def add_lora(self, lora_request: LoRARequest) -> bool:
|
258 |
+
return self.model_runner.add_lora(lora_request)
|
259 |
+
|
260 |
+
def remove_lora(self, lora_id: int) -> bool:
|
261 |
+
return self.model_runner.remove_lora(lora_id)
|
262 |
+
|
263 |
+
def list_loras(self) -> Set[int]:
|
264 |
+
return self.model_runner.list_loras()
|
265 |
+
|
266 |
+
@property
|
267 |
+
def max_model_len(self) -> int:
|
268 |
+
return self.model_config.max_model_len
|
269 |
+
|
270 |
+
@property
|
271 |
+
def vocab_size(self) -> int:
|
272 |
+
return self.model_runner.vocab_size
|
273 |
+
|
274 |
+
def get_cache_block_size_bytes(self) -> int:
|
275 |
+
"""Get the size of the KV cache block size in bytes.
|
276 |
+
"""
|
277 |
+
return CacheEngine.get_cache_block_size(self.cache_config,
|
278 |
+
self.model_config,
|
279 |
+
self.parallel_config)
|
280 |
+
|
281 |
+
|
282 |
+
def init_worker_distributed_environment(
|
283 |
+
parallel_config: ParallelConfig,
|
284 |
+
rank: int,
|
285 |
+
distributed_init_method: Optional[str] = None,
|
286 |
+
local_rank: int = -1,
|
287 |
+
) -> None:
|
288 |
+
"""Initialize the distributed environment."""
|
289 |
+
init_distributed_environment(parallel_config.world_size, rank,
|
290 |
+
distributed_init_method, local_rank)
|
291 |
+
|
292 |
+
if pynccl_utils.is_initialized():
|
293 |
+
pynccl_world_size = pynccl_utils.get_world_size()
|
294 |
+
if pynccl_world_size != parallel_config.world_size:
|
295 |
+
raise RuntimeError(
|
296 |
+
"pynccl is already initialized but the pynccl world "
|
297 |
+
"size does not match parallel_config.world_size "
|
298 |
+
f"({pynccl_world_size} vs. {parallel_config.world_size}).")
|
299 |
+
elif parallel_config.world_size > 1:
|
300 |
+
# NOTE(woosuk): We don't initialize pynccl process group when world size
|
301 |
+
# is 1.
|
302 |
+
pynccl_utils.init_process_group(
|
303 |
+
world_size=parallel_config.world_size,
|
304 |
+
local_rank=local_rank,
|
305 |
+
rank=rank,
|
306 |
+
init_method=distributed_init_method,
|
307 |
+
)
|
308 |
+
|
309 |
+
ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
|
310 |
+
parallel_config.pipeline_parallel_size)
|
311 |
+
|
312 |
+
# Initialize a custom fast all-reduce implementation.
|
313 |
+
if not parallel_config.disable_custom_all_reduce:
|
314 |
+
init_custom_ar()
|
315 |
+
|
316 |
+
# A small all_reduce for warmup.
|
317 |
+
torch.distributed.all_reduce(torch.zeros(1).cuda())
|
318 |
+
if pynccl_utils.is_initialized():
|
319 |
+
pynccl_utils.all_reduce(torch.zeros(1).cuda())
|
320 |
+
|
321 |
+
|
322 |
+
def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
|
323 |
+
# Check if the GPU supports the dtype.
|
324 |
+
if torch_dtype == torch.bfloat16:
|
325 |
+
compute_capability = torch.cuda.get_device_capability()
|
326 |
+
if compute_capability[0] < 8:
|
327 |
+
gpu_name = torch.cuda.get_device_name()
|
328 |
+
raise ValueError(
|
329 |
+
"Bfloat16 is only supported on GPUs with compute capability "
|
330 |
+
f"of at least 8.0. Your {gpu_name} GPU has compute capability "
|
331 |
+
f"{compute_capability[0]}.{compute_capability[1]}. "
|
332 |
+
"You can use float16 instead by explicitly setting the"
|
333 |
+
"`dtype` flag in CLI, for example: --dtype=half.")
|
334 |
+
|
335 |
+
|
336 |
+
def raise_if_cache_size_invalid(num_gpu_blocks, block_size,
|
337 |
+
max_model_len) -> None:
|
338 |
+
if num_gpu_blocks <= 0:
|
339 |
+
raise ValueError("No available memory for the cache blocks. "
|
340 |
+
"Try increasing `gpu_memory_utilization` when "
|
341 |
+
"initializing the engine.")
|
342 |
+
max_seq_len = block_size * num_gpu_blocks
|
343 |
+
if max_model_len > max_seq_len:
|
344 |
+
raise ValueError(
|
345 |
+
f"The model's max seq len ({max_model_len}) "
|
346 |
+
"is larger than the maximum number of tokens that can be "
|
347 |
+
f"stored in KV cache ({max_seq_len}). Try increasing "
|
348 |
+
"`gpu_memory_utilization` or decreasing `max_model_len` when "
|
349 |
+
"initializing the engine.")
|