import asyncio
from typing import Generator, Optional

import torch
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import Response, StreamingResponse

import tensorrt_llm
from tensorrt_llm.logger import logger
from tensorrt_llm.runtime import ModelRunner

from utils import load_tokenizer, read_model_name, throttle_generator

TOKENIZER_DIR = "./model/tokenizer"
ENGINE_DIR = "./model/engine"

# Generation settings.
STREAM_INTERVAL = 5
MAX_NEW_TOKENS = 1024
MAX_ATTENTION_WINDOW_SIZE = 4096
TEMPERATURE = 1.0
TOP_K = 40
TOP_P = 0.5
LENGTH_PENALTY = 1.0
REPETITION_PENALTY = 1.2

app = FastAPI()
config = uvicorn.Config(
    app,
    host="0.0.0.0",  # bind all interfaces (the original host=None is treated the same way by asyncio)
    port=4000,
    log_level="info",
    timeout_keep_alive=5,
)

# Populated once in main() before the server starts serving requests.
runner: Optional[ModelRunner] = None
tokenizer = None
pad_id: Optional[int] = None
end_id: Optional[int] = None


@app.get("/health")
async def health() -> Response:
    return Response(status_code=200)


@app.post("/summary")
async def generate(request: Request) -> Response:
    assert runner is not None
    assert tokenizer is not None
    assert pad_id is not None
    assert end_id is not None

    req_json: dict = await request.json()
    text = req_json.pop("text")
    instruction = (
        "[INST] You are a world-class expert summarizer tasked with providing a "
        "**high level** summary of webpages. Ensure your summarization is of the "
        "highest caliber, reflecting the vastness and depth of your expertise. "
        "Ignore messy portions of the page that might be junk text.\n"
        "### Webpage Text:\n"
        f"```{text}``` [/INST]"
    )
    print(f"\n\nInstruction: {instruction}")

    # return_tensors="pt" yields a (1, sequence_length) tensor, not a list.
    input_ids: torch.Tensor = tokenizer.encode(
        instruction,
        add_special_tokens=False,
        truncation=True,
        max_length=4096,
        return_tensors="pt",
    )
    # TODO: encode again here, but output text
    input_len = input_ids.size(1)

    with torch.no_grad():
        # ModelRunner.generate takes a list of 1-D token tensors, one per batch
        # entry, so pass input_ids[0] rather than the 2-D tensor. With
        # streaming=True and return_dict=True it returns a generator of dicts.
        outputs: Generator[dict, None, None] = runner.generate(
            [input_ids[0]],
            max_new_tokens=MAX_NEW_TOKENS,
            max_attention_window_size=MAX_ATTENTION_WINDOW_SIZE,
            end_id=end_id,
            pad_id=pad_id,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P,
            length_penalty=LENGTH_PENALTY,
            repetition_penalty=REPETITION_PENALTY,
            streaming=True,
            output_sequence_lengths=True,
            return_dict=True,
        )
        torch.cuda.synchronize()

    def stream_results() -> Generator[str, None, None]:
        # Emit a decoded snapshot every STREAM_INTERVAL generation steps.
        for output in throttle_generator(outputs, STREAM_INTERVAL):
            output_ids: torch.Tensor = output["output_ids"]
            # [batch_idx][beam_idx] -> scalar sequence length
            output_len = output["sequence_lengths"][0][0].item()
            output_txt: str = tokenizer.decode(
                # [batch_idx][beam_idx][slice]: drop the prompt, keep new tokens
                output_ids[0][0][input_len:output_len].tolist()
            )
            yield output_txt

    return StreamingResponse(stream_results(), media_type="text/plain")


async def main():
    global runner, tokenizer, pad_id, end_id

    runtime_rank = tensorrt_llm.mpi_rank()
    logger.set_level("info")

    model_name = read_model_name(ENGINE_DIR)
    tokenizer, pad_id, end_id = load_tokenizer(
        tokenizer_dir=TOKENIZER_DIR,
        model_name=model_name,
    )

    runner_kwargs = dict(engine_dir=ENGINE_DIR, rank=runtime_rank, debug_mode=True)
    runner = ModelRunner.from_dir(**runner_kwargs)

    await uvicorn.Server(config).serve()


if __name__ == "__main__":
    asyncio.run(main())
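
# Example request, as a rough sketch: assuming the server is running locally on
# port 4000 (per the uvicorn config above), a streaming summary can be fetched
# with curl; -N disables buffering so partial output appears as it arrives.
# Note that each streamed chunk is a cumulative snapshot of the decoded text so
# far, not a delta, since stream_results() re-decodes from the prompt boundary.
#
#   curl -N -X POST http://localhost:4000/summary \
#       -H "Content-Type: application/json" \
#       -d '{"text": "Some webpage text to summarize..."}'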