kishizaki-sci committed on
Commit
12c3c7e
1 Parent(s): 7e9555f

Upload inference_vLLM.ipynb

Files changed (1)
  1. inference_vLLM.ipynb +576 -0
inference_vLLM.ipynb ADDED
@@ -0,0 +1,576 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "97626fd6-6fcc-4d8f-a5af-40441e46f98b",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Collecting vllm\n",
14
+ " Downloading vllm-0.6.4.post1-cp38-abi3-manylinux1_x86_64.whl.metadata (10 kB)\n",
15
+ "Requirement already satisfied: autoawq in /usr/local/lib/python3.11/dist-packages (0.2.7.post2)\n",
16
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.11/dist-packages (from vllm) (6.0.0)\n",
17
+ "Collecting sentencepiece (from vllm)\n",
18
+ " Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n",
19
+ "Requirement already satisfied: numpy<2.0.0 in /usr/local/lib/python3.11/dist-packages (from vllm) (1.26.3)\n",
20
+ "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.11/dist-packages (from vllm) (2.32.3)\n",
21
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from vllm) (4.67.1)\n",
22
+ "Collecting py-cpuinfo (from vllm)\n",
23
+ " Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)\n",
24
+ "Requirement already satisfied: transformers>=4.45.2 in /usr/local/lib/python3.11/dist-packages (from vllm) (4.47.0.dev0)\n",
25
+ "Requirement already satisfied: tokenizers>=0.19.1 in /usr/local/lib/python3.11/dist-packages (from vllm) (0.20.3)\n",
26
+ "Collecting protobuf (from vllm)\n",
27
+ " Downloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)\n",
28
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.11/dist-packages (from vllm) (3.11.8)\n",
29
+ "Collecting openai>=1.45.0 (from vllm)\n",
30
+ " Downloading openai-1.57.0-py3-none-any.whl.metadata (24 kB)\n",
31
+ "Collecting uvicorn[standard] (from vllm)\n",
32
+ " Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)\n",
33
+ "Collecting pydantic>=2.9 (from vllm)\n",
34
+ " Downloading pydantic-2.10.3-py3-none-any.whl.metadata (172 kB)\n",
35
+ "Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages (from vllm) (10.2.0)\n",
36
+ "Requirement already satisfied: prometheus-client>=0.18.0 in /usr/local/lib/python3.11/dist-packages (from vllm) (0.21.0)\n",
37
+ "Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)\n",
38
+ " Downloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl.metadata (13 kB)\n",
39
+ "Collecting tiktoken>=0.6.0 (from vllm)\n",
40
+ " Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
41
+ "Collecting lm-format-enforcer<0.11,>=0.10.9 (from vllm)\n",
42
+ " Downloading lm_format_enforcer-0.10.9-py3-none-any.whl.metadata (17 kB)\n",
43
+ "Collecting outlines<0.1,>=0.0.43 (from vllm)\n",
44
+ " Downloading outlines-0.0.46-py3-none-any.whl.metadata (15 kB)\n",
45
+ "Collecting typing-extensions>=4.10 (from vllm)\n",
46
+ " Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)\n",
47
+ "Requirement already satisfied: filelock>=3.10.4 in /usr/local/lib/python3.11/dist-packages (from vllm) (3.13.1)\n",
48
+ "Collecting partial-json-parser (from vllm)\n",
49
+ " Downloading partial_json_parser-0.2.1.1.post4-py3-none-any.whl.metadata (6.2 kB)\n",
50
+ "Requirement already satisfied: pyzmq in /usr/local/lib/python3.11/dist-packages (from vllm) (24.0.1)\n",
51
+ "Collecting msgspec (from vllm)\n",
52
+ " Downloading msgspec-0.18.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)\n",
53
+ "Collecting gguf==0.10.0 (from vllm)\n",
54
+ " Downloading gguf-0.10.0-py3-none-any.whl.metadata (3.5 kB)\n",
55
+ "Requirement already satisfied: importlib-metadata in /usr/lib/python3/dist-packages (from vllm) (4.6.4)\n",
56
+ "Collecting mistral-common>=1.5.0 (from mistral-common[opencv]>=1.5.0->vllm)\n",
57
+ " Downloading mistral_common-1.5.1-py3-none-any.whl.metadata (4.6 kB)\n",
58
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.11/dist-packages (from vllm) (6.0.2)\n",
59
+ "Collecting einops (from vllm)\n",
60
+ " Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)\n",
61
+ "Collecting compressed-tensors==0.8.0 (from vllm)\n",
62
+ " Downloading compressed_tensors-0.8.0-py3-none-any.whl.metadata (6.8 kB)\n",
63
+ "Collecting ray>=2.9 (from vllm)\n",
64
+ " Downloading ray-2.40.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (17 kB)\n",
65
+ "Collecting nvidia-ml-py>=12.560.30 (from vllm)\n",
66
+ " Downloading nvidia_ml_py-12.560.30-py3-none-any.whl.metadata (8.6 kB)\n",
67
+ "Collecting torch==2.5.1 (from vllm)\n",
68
+ " Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)\n",
69
+ "Collecting torchvision==0.20.1 (from vllm)\n",
70
+ " Downloading torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)\n",
71
+ "Collecting xformers==0.0.28.post3 (from vllm)\n",
72
+ " Downloading xformers-0.0.28.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)\n",
73
+ "Collecting fastapi!=0.113.*,!=0.114.0,>=0.107.0 (from vllm)\n",
74
+ " Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)\n",
75
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (3.2.1)\n",
76
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (3.1.3)\n",
77
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (2024.2.0)\n",
78
+ "Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
79
+ " Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
80
+ "Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
81
+ " Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
82
+ "Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
83
+ " Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
84
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (9.1.0.70)\n",
85
+ "Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.1->vllm)\n",
86
+ " Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
87
+ "Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.5.1->vllm)\n",
88
+ " Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
89
+ "Collecting nvidia-curand-cu12==10.3.5.147 (from torch==2.5.1->vllm)\n",
90
+ " Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
91
+ "Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch==2.5.1->vllm)\n",
92
+ " Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
93
+ "Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch==2.5.1->vllm)\n",
94
+ " Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
95
+ "Collecting nvidia-nccl-cu12==2.21.5 (from torch==2.5.1->vllm)\n",
96
+ " Downloading nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)\n",
97
+ "Collecting nvidia-nvtx-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
98
+ " Downloading nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.7 kB)\n",
99
+ "Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
100
+ " Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
101
+ "Collecting triton==3.1.0 (from torch==2.5.1->vllm)\n",
102
+ " Downloading triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)\n",
103
+ "Collecting sympy==1.13.1 (from torch==2.5.1->vllm)\n",
104
+ " Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)\n",
105
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch==2.5.1->vllm) (1.3.0)\n",
106
+ "Requirement already satisfied: accelerate in /usr/local/lib/python3.11/dist-packages (from autoawq) (1.1.1)\n",
107
+ "Requirement already satisfied: datasets>=2.20 in /usr/local/lib/python3.11/dist-packages (from autoawq) (3.1.0)\n",
108
+ "Requirement already satisfied: zstandard in /usr/local/lib/python3.11/dist-packages (from autoawq) (0.23.0)\n",
109
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (18.1.0)\n",
110
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (0.3.8)\n",
111
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (2.2.3)\n",
112
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (3.5.0)\n",
113
+ "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (0.70.16)\n",
114
+ "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (0.26.3)\n",
115
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (24.1)\n",
116
+ "Collecting starlette<0.42.0,>=0.40.0 (from fastapi!=0.113.*,!=0.114.0,>=0.107.0->vllm)\n",
117
+ " Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n",
118
+ "Collecting interegular>=0.3.2 (from lm-format-enforcer<0.11,>=0.10.9->vllm)\n",
119
+ " Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)\n",
120
+ "Requirement already satisfied: jsonschema<5.0.0,>=4.21.1 in /usr/local/lib/python3.11/dist-packages (from mistral-common>=1.5.0->mistral-common[opencv]>=1.5.0->vllm) (4.23.0)\n",
121
+ "Collecting pillow (from vllm)\n",
122
+ " Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)\n",
123
+ "Collecting tiktoken>=0.6.0 (from vllm)\n",
124
+ " Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
125
+ "Collecting opencv-python-headless<5.0.0,>=4.0.0 (from mistral-common[opencv]>=1.5.0->vllm)\n",
126
+ " Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n",
127
+ "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from openai>=1.45.0->vllm) (4.6.0)\n",
128
+ "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.45.0->vllm) (1.7.0)\n",
129
+ "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from openai>=1.45.0->vllm) (0.27.2)\n",
130
+ "Collecting jiter<1,>=0.4.0 (from openai>=1.45.0->vllm)\n",
131
+ " Downloading jiter-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)\n",
132
+ "Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from openai>=1.45.0->vllm) (1.3.1)\n",
133
+ "Collecting lark (from outlines<0.1,>=0.0.43->vllm)\n",
134
+ " Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)\n",
135
+ "Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.11/dist-packages (from outlines<0.1,>=0.0.43->vllm) (1.6.0)\n",
136
+ "Collecting cloudpickle (from outlines<0.1,>=0.0.43->vllm)\n",
137
+ " Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)\n",
138
+ "Collecting diskcache (from outlines<0.1,>=0.0.43->vllm)\n",
139
+ " Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)\n",
140
+ "Collecting numba (from outlines<0.1,>=0.0.43->vllm)\n",
141
+ " Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)\n",
142
+ "Requirement already satisfied: referencing in /usr/local/lib/python3.11/dist-packages (from outlines<0.1,>=0.0.43->vllm) (0.35.1)\n",
143
+ "Collecting pycountry (from outlines<0.1,>=0.0.43->vllm)\n",
144
+ " Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)\n",
145
+ "Collecting pyairports (from outlines<0.1,>=0.0.43->vllm)\n",
146
+ " Downloading pyairports-2.1.1-py3-none-any.whl.metadata (1.7 kB)\n",
147
+ "Collecting annotated-types>=0.6.0 (from pydantic>=2.9->vllm)\n",
148
+ " Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n",
149
+ "Collecting pydantic-core==2.27.1 (from pydantic>=2.9->vllm)\n",
150
+ " Downloading pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
151
+ "Collecting click>=7.0 (from ray>=2.9->vllm)\n",
152
+ " Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)\n",
153
+ "Collecting msgpack<2.0.0,>=1.0.0 (from ray>=2.9->vllm)\n",
154
+ " Downloading msgpack-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)\n",
155
+ "Requirement already satisfied: aiosignal in /usr/local/lib/python3.11/dist-packages (from ray>=2.9->vllm) (1.3.1)\n",
156
+ "Requirement already satisfied: frozenlist in /usr/local/lib/python3.11/dist-packages (from ray>=2.9->vllm) (1.5.0)\n",
157
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (3.3.2)\n",
158
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (3.10)\n",
159
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (2.2.3)\n",
160
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (2024.8.30)\n",
161
+ "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.11/dist-packages (from tiktoken>=0.6.0->vllm) (2024.11.6)\n",
162
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers>=4.45.2->vllm) (0.4.5)\n",
163
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (2.4.3)\n",
164
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (24.2.0)\n",
165
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (6.1.0)\n",
166
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (0.2.0)\n",
167
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (1.18.0)\n",
168
+ "Requirement already satisfied: h11>=0.8 in /usr/local/lib/python3.11/dist-packages (from uvicorn[standard]->vllm) (0.14.0)\n",
169
+ "Collecting httptools>=0.6.3 (from uvicorn[standard]->vllm)\n",
170
+ " Downloading httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)\n",
171
+ "Collecting python-dotenv>=0.13 (from uvicorn[standard]->vllm)\n",
172
+ " Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n",
173
+ "Collecting uvloop!=0.15.0,!=0.15.1,>=0.14.0 (from uvicorn[standard]->vllm)\n",
174
+ " Downloading uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n",
175
+ "Collecting watchfiles>=0.13 (from uvicorn[standard]->vllm)\n",
176
+ " Downloading watchfiles-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n",
177
+ "Collecting websockets>=10.4 (from uvicorn[standard]->vllm)\n",
178
+ " Downloading websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
179
+ "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.23.0->openai>=1.45.0->vllm) (1.0.5)\n",
180
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.11/dist-packages (from jsonschema<5.0.0,>=4.21.1->mistral-common>=1.5.0->mistral-common[opencv]>=1.5.0->vllm) (2023.12.1)\n",
181
+ "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from jsonschema<5.0.0,>=4.21.1->mistral-common>=1.5.0->mistral-common[opencv]>=1.5.0->vllm) (0.20.0)\n",
182
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch==2.5.1->vllm) (2.1.5)\n",
183
+ "Collecting llvmlite<0.44,>=0.43.0dev0 (from numba->outlines<0.1,>=0.0.43->vllm)\n",
184
+ " Downloading llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)\n",
185
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets>=2.20->autoawq) (2.9.0.post0)\n",
186
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets>=2.20->autoawq) (2024.2)\n",
187
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets>=2.20->autoawq) (2024.2)\n",
188
+ "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas->datasets>=2.20->autoawq) (1.16.0)\n",
189
+ "Downloading vllm-0.6.4.post1-cp38-abi3-manylinux1_x86_64.whl (198.9 MB)\n",
190
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m198.9/198.9 MB\u001b[0m \u001b[31m104.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
191
+ "\u001b[?25hDownloading compressed_tensors-0.8.0-py3-none-any.whl (86 kB)\n",
192
+ "Downloading gguf-0.10.0-py3-none-any.whl (71 kB)\n",
193
+ "Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl (906.5 MB)\n",
194
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m906.5/906.5 MB\u001b[0m \u001b[31m107.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
195
+ "\u001b[?25hDownloading torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl (7.2 MB)\n",
196
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m164.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
197
+ "\u001b[?25hDownloading xformers-0.0.28.post3-cp311-cp311-manylinux_2_28_x86_64.whl (16.7 MB)\n",
198
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.7/16.7 MB\u001b[0m \u001b[31m150.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
199
+ "\u001b[?25hDownloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)\n",
200
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m138.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
201
+ "\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)\n",
202
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m132.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
203
+ "\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)\n",
204
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━��━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m138.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
205
+ "\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)\n",
206
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m117.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
207
+ "\u001b[?25hDownloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 MB)\n",
208
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m109.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
209
+ "\u001b[?25hDownloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)\n",
210
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m135.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
211
+ "\u001b[?25hDownloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)\n",
212
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m148.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
213
+ "\u001b[?25hDownloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)\n",
214
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m154.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
215
+ "\u001b[?25hDownloading nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl (188.7 MB)\n",
216
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m188.7/188.7 MB\u001b[0m \u001b[31m150.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
217
+ "\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n",
218
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m134.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
219
+ "\u001b[?25hDownloading nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (99 kB)\n",
220
+ "Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)\n",
221
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m165.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
222
+ "\u001b[?25hDownloading triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)\n",
223
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.5/209.5 MB\u001b[0m \u001b[31m142.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
224
+ "\u001b[?25hDownloading fastapi-0.115.6-py3-none-any.whl (94 kB)\n",
225
+ "Downloading lm_format_enforcer-0.10.9-py3-none-any.whl (43 kB)\n",
226
+ "Downloading mistral_common-1.5.1-py3-none-any.whl (6.5 MB)\n",
227
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.5/6.5 MB\u001b[0m \u001b[31m145.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
228
+ "\u001b[?25hDownloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
229
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m180.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
230
+ "\u001b[?25hDownloading nvidia_ml_py-12.560.30-py3-none-any.whl (40 kB)\n",
231
+ "Downloading openai-1.57.0-py3-none-any.whl (389 kB)\n",
232
+ "Downloading outlines-0.0.46-py3-none-any.whl (101 kB)\n",
233
+ "Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.5 MB)\n",
234
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m152.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
235
+ "\u001b[?25hDownloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl (19 kB)\n",
236
+ "Downloading pydantic-2.10.3-py3-none-any.whl (456 kB)\n",
237
+ "Downloading pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)\n",
238
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m212.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
239
+ "\u001b[?25hDownloading ray-2.40.0-cp311-cp311-manylinux2014_x86_64.whl (67.0 MB)\n",
240
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 MB\u001b[0m \u001b[31m172.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
241
+ "\u001b[?25hDownloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl (319 kB)\n",
242
+ "Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
243
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m153.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
244
+ "\u001b[?25hDownloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)\n",
245
+ "Downloading einops-0.8.0-py3-none-any.whl (43 kB)\n",
246
+ "Downloading msgspec-0.18.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209 kB)\n",
247
+ "Downloading partial_json_parser-0.2.1.1.post4-py3-none-any.whl (9.9 kB)\n",
248
+ "Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n",
249
+ "Downloading annotated_types-0.7.0-py3-none-any.whl (13 kB)\n",
250
+ "Downloading click-8.1.7-py3-none-any.whl (97 kB)\n",
251
+ "Downloading httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (459 kB)\n",
252
+ "Downloading interegular-0.3.3-py37-none-any.whl (23 kB)\n",
253
+ "Downloading jiter-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (343 kB)\n",
254
+ "Downloading msgpack-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (403 kB)\n",
255
+ "Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.9 MB)\n",
256
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.9/49.9 MB\u001b[0m \u001b[31m155.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
257
+ "\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n",
258
+ "Downloading starlette-0.41.3-py3-none-any.whl (73 kB)\n",
259
+ "Downloading uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)\n",
260
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.0/4.0 MB\u001b[0m \u001b[31m157.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
261
+ "\u001b[?25hDownloading watchfiles-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (442 kB)\n",
262
+ "Downloading websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (168 kB)\n",
263
+ "Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)\n",
264
+ "Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n",
265
+ "Downloading lark-1.2.2-py3-none-any.whl (111 kB)\n",
266
+ "Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)\n",
267
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.7/3.7 MB\u001b[0m \u001b[31m138.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
268
+ "\u001b[?25hDownloading pyairports-2.1.1-py3-none-any.whl (371 kB)\n",
269
+ "Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)\n",
270
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.3/6.3 MB\u001b[0m \u001b[31m137.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
271
+ "\u001b[?25hDownloading uvicorn-0.32.1-py3-none-any.whl (63 kB)\n",
272
+ "Downloading llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.9 MB)\n",
273
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.9/43.9 MB\u001b[0m \u001b[31m169.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
274
+ "\u001b[?25hInstalling collected packages: sentencepiece, pyairports, py-cpuinfo, nvidia-ml-py, websockets, uvloop, typing-extensions, triton, sympy, python-dotenv, pycountry, protobuf, pillow, partial-json-parser, opencv-python-headless, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, msgspec, msgpack, llvmlite, lark, jiter, interegular, httptools, gguf, einops, diskcache, cloudpickle, click, annotated-types, watchfiles, uvicorn, tiktoken, starlette, pydantic-core, nvidia-cusparse-cu12, numba, pydantic, prometheus-fastapi-instrumentator, nvidia-cusolver-cu12, torch, ray, openai, mistral-common, lm-format-enforcer, fastapi, xformers, torchvision, outlines, compressed-tensors, vllm\n",
275
+ " Attempting uninstall: typing-extensions\n",
276
+ " Found existing installation: typing_extensions 4.9.0\n",
277
+ " Uninstalling typing_extensions-4.9.0:\n",
278
+ " Successfully uninstalled typing_extensions-4.9.0\n",
279
+ " Attempting uninstall: triton\n",
280
+ " Found existing installation: triton 3.0.0\n",
281
+ " Uninstalling triton-3.0.0:\n",
282
+ " Successfully uninstalled triton-3.0.0\n",
283
+ " Attempting uninstall: sympy\n",
284
+ " Found existing installation: sympy 1.12\n",
285
+ " Uninstalling sympy-1.12:\n",
286
+ " Successfully uninstalled sympy-1.12\n",
287
+ " Attempting uninstall: pillow\n",
288
+ " Found existing installation: pillow 10.2.0\n",
289
+ " Uninstalling pillow-10.2.0:\n",
290
+ " Successfully uninstalled pillow-10.2.0\n",
291
+ " Attempting uninstall: nvidia-nvtx-cu12\n",
292
+ " Found existing installation: nvidia-nvtx-cu12 12.4.99\n",
293
+ " Uninstalling nvidia-nvtx-cu12-12.4.99:\n",
294
+ " Successfully uninstalled nvidia-nvtx-cu12-12.4.99\n",
295
+ " Attempting uninstall: nvidia-nvjitlink-cu12\n",
296
+ " Found existing installation: nvidia-nvjitlink-cu12 12.4.99\n",
297
+ " Uninstalling nvidia-nvjitlink-cu12-12.4.99:\n",
298
+ " Successfully uninstalled nvidia-nvjitlink-cu12-12.4.99\n",
299
+ " Attempting uninstall: nvidia-nccl-cu12\n",
300
+ " Found existing installation: nvidia-nccl-cu12 2.20.5\n",
301
+ " Uninstalling nvidia-nccl-cu12-2.20.5:\n",
302
+ " Successfully uninstalled nvidia-nccl-cu12-2.20.5\n",
303
+ " Attempting uninstall: nvidia-curand-cu12\n",
304
+ " Found existing installation: nvidia-curand-cu12 10.3.5.119\n",
305
+ " Uninstalling nvidia-curand-cu12-10.3.5.119:\n",
306
+ " Successfully uninstalled nvidia-curand-cu12-10.3.5.119\n",
307
+ " Attempting uninstall: nvidia-cufft-cu12\n",
308
+ " Found existing installation: nvidia-cufft-cu12 11.2.0.44\n",
309
+ " Uninstalling nvidia-cufft-cu12-11.2.0.44:\n",
310
+ " Successfully uninstalled nvidia-cufft-cu12-11.2.0.44\n",
311
+ " Attempting uninstall: nvidia-cuda-runtime-cu12\n",
312
+ " Found existing installation: nvidia-cuda-runtime-cu12 12.4.99\n",
313
+ " Uninstalling nvidia-cuda-runtime-cu12-12.4.99:\n",
314
+ " Successfully uninstalled nvidia-cuda-runtime-cu12-12.4.99\n",
315
+ " Attempting uninstall: nvidia-cuda-nvrtc-cu12\n",
316
+ " Found existing installation: nvidia-cuda-nvrtc-cu12 12.4.99\n",
317
+ " Uninstalling nvidia-cuda-nvrtc-cu12-12.4.99:\n",
318
+ " Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.4.99\n",
319
+ " Attempting uninstall: nvidia-cuda-cupti-cu12\n",
320
+ " Found existing installation: nvidia-cuda-cupti-cu12 12.4.99\n",
321
+ " Uninstalling nvidia-cuda-cupti-cu12-12.4.99:\n",
322
+ " Successfully uninstalled nvidia-cuda-cupti-cu12-12.4.99\n",
323
+ " Attempting uninstall: nvidia-cublas-cu12\n",
324
+ " Found existing installation: nvidia-cublas-cu12 12.4.2.65\n",
325
+ " Uninstalling nvidia-cublas-cu12-12.4.2.65:\n",
326
+ " Successfully uninstalled nvidia-cublas-cu12-12.4.2.65\n",
327
+ " Attempting uninstall: nvidia-cusparse-cu12\n",
328
+ " Found existing installation: nvidia-cusparse-cu12 12.3.0.142\n",
329
+ " Uninstalling nvidia-cusparse-cu12-12.3.0.142:\n",
330
+ " Successfully uninstalled nvidia-cusparse-cu12-12.3.0.142\n",
331
+ " Attempting uninstall: nvidia-cusolver-cu12\n",
332
+ " Found existing installation: nvidia-cusolver-cu12 11.6.0.99\n",
333
+ " Uninstalling nvidia-cusolver-cu12-11.6.0.99:\n",
334
+ " Successfully uninstalled nvidia-cusolver-cu12-11.6.0.99\n",
335
+ " Attempting uninstall: torch\n",
336
+ " Found existing installation: torch 2.4.1+cu124\n",
337
+ " Uninstalling torch-2.4.1+cu124:\n",
338
+ " Successfully uninstalled torch-2.4.1+cu124\n",
339
+ " Attempting uninstall: torchvision\n",
340
+ " Found existing installation: torchvision 0.19.1+cu124\n",
341
+ " Uninstalling torchvision-0.19.1+cu124:\n",
342
+ " Successfully uninstalled torchvision-0.19.1+cu124\n",
343
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
344
+ "torchaudio 2.4.1+cu124 requires torch==2.4.1, but you have torch 2.5.1 which is incompatible.\u001b[0m\u001b[31m\n",
345
+ "\u001b[0mSuccessfully installed annotated-types-0.7.0 click-8.1.7 cloudpickle-3.1.0 compressed-tensors-0.8.0 diskcache-5.6.3 einops-0.8.0 fastapi-0.115.6 gguf-0.10.0 httptools-0.6.4 interegular-0.3.3 jiter-0.8.0 lark-1.2.2 llvmlite-0.43.0 lm-format-enforcer-0.10.9 mistral-common-1.5.1 msgpack-1.1.0 msgspec-0.18.6 numba-0.60.0 nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-ml-py-12.560.30 nvidia-nccl-cu12-2.21.5 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.4.127 openai-1.57.0 opencv-python-headless-4.10.0.84 outlines-0.0.46 partial-json-parser-0.2.1.1.post4 pillow-10.4.0 prometheus-fastapi-instrumentator-7.0.0 protobuf-5.29.1 py-cpuinfo-9.0.0 pyairports-2.1.1 pycountry-24.6.1 pydantic-2.10.3 pydantic-core-2.27.1 python-dotenv-1.0.1 ray-2.40.0 sentencepiece-0.2.0 starlette-0.41.3 sympy-1.13.1 tiktoken-0.7.0 torch-2.5.1 torchvision-0.20.1 triton-3.1.0 typing-extensions-4.12.2 uvicorn-0.32.1 uvloop-0.21.0 vllm-0.6.4.post1 watchfiles-1.0.0 websockets-14.1 xformers-0.0.28.post3\n",
346
+ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
347
+ "\u001b[0m\n",
348
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
349
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n"
350
+ ]
351
+ }
352
+ ],
353
+ "source": [
354
+ "!pip install vllm autoawq"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": 1,
360
+ "id": "56a955b4-c65a-4146-9281-ebcb4ee81209",
361
+ "metadata": {},
362
+ "outputs": [
363
+ {
364
+ "name": "stdout",
365
+ "output_type": "stream",
366
+ "text": [
367
+ "Sun Dec 8 00:57:27 2024 \n",
368
+ "+-----------------------------------------------------------------------------------------+\n",
369
+ "| NVIDIA-SMI 565.57.01 Driver Version: 565.57.01 CUDA Version: 12.7 |\n",
370
+ "|-----------------------------------------+------------------------+----------------------+\n",
371
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
372
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
373
+ "| | | MIG M. |\n",
374
+ "|=========================================+========================+======================|\n",
375
+ "| 0 NVIDIA H100 NVL On | 00000000:AE:00.0 Off | 0 |\n",
376
+ "| N/A 32C P0 60W / 310W | 1MiB / 95830MiB | 0% Default |\n",
377
+ "| | | Disabled |\n",
378
+ "+-----------------------------------------+------------------------+----------------------+\n",
379
+ " \n",
380
+ "+-----------------------------------------------------------------------------------------+\n",
381
+ "| Processes: |\n",
382
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
383
+ "| ID ID Usage |\n",
384
+ "|=========================================================================================|\n",
385
+ "| No running processes found |\n",
386
+ "+-----------------------------------------------------------------------------------------+\n"
387
+ ]
388
+ }
389
+ ],
390
+ "source": [
391
+ "!nvidia-smi"
392
+ ]
393
+ },
394
+ {
395
+ "cell_type": "code",
396
+ "execution_count": 1,
397
+ "id": "8f40dc17-a05a-466f-85d9-5bfa473c133b",
398
+ "metadata": {},
399
+ "outputs": [],
400
+ "source": [
401
+ "from vllm import LLM, SamplingParams"
402
+ ]
403
+ },
404
+ {
405
+ "cell_type": "code",
406
+ "execution_count": 2,
407
+ "id": "ecf09ccb-b47b-40bc-a501-ded94239465d",
408
+ "metadata": {},
409
+ "outputs": [
410
+ {
411
+ "name": "stdout",
412
+ "output_type": "stream",
413
+ "text": [
414
+ "INFO 12-08 01:02:42 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n",
415
+ "INFO 12-08 01:02:42 awq_marlin.py:113] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference\n",
416
+ "WARNING 12-08 01:02:42 config.py:428] awq quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n",
417
+ "WARNING 12-08 01:02:42 arg_utils.py:1013] Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False.\n",
418
+ "INFO 12-08 01:02:42 config.py:1136] Chunked prefill is enabled with max_num_batched_tokens=512.\n",
419
+ "INFO 12-08 01:02:42 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN', speculative_config=None, tokenizer='kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN, num_scheduler_steps=1, chunked_prefill_enabled=True multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n",
420
+ "INFO 12-08 01:02:44 selector.py:135] Using Flash Attention backend.\n",
421
+ "INFO 12-08 01:02:44 model_runner.py:1072] Starting to load model kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN...\n",
422
+ "INFO 12-08 01:02:45 weight_utils.py:243] Using model weights format ['*.safetensors']\n"
423
+ ]
424
+ },
425
+ {
426
+ "data": {
427
+ "application/vnd.jupyter.widget-view+json": {
428
+ "model_id": "1b29444155504b5eaadb35d628775f18",
429
+ "version_major": 2,
430
+ "version_minor": 0
431
+ },
432
+ "text/plain": [
433
+ "Loading safetensors checkpoint shards: 0% Completed | 0/9 [00:00<?, ?it/s]\n"
434
+ ]
435
+ },
436
+ "metadata": {},
437
+ "output_type": "display_data"
438
+ },
439
+ {
440
+ "name": "stdout",
441
+ "output_type": "stream",
442
+ "text": [
443
+ "INFO 12-08 01:02:53 model_runner.py:1077] Loading model weights took 37.0786 GB\n",
444
+ "INFO 12-08 01:02:54 worker.py:232] Memory profiling results: total_gpu_memory=93.11GiB initial_memory_usage=37.70GiB peak_torch_memory=38.28GiB memory_usage_post_profile=37.80GiB non_torch_memory=0.69GiB kv_cache_size=51.35GiB gpu_memory_utilization=0.97\n",
445
+ "INFO 12-08 01:02:54 gpu_executor.py:113] # GPU blocks: 10516, # CPU blocks: 819\n",
446
+ "INFO 12-08 01:02:54 gpu_executor.py:117] Maximum concurrency for 131072 tokens per request: 1.28x\n",
447
+ "INFO 12-08 01:03:00 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
448
+ "INFO 12-08 01:03:00 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
449
+ "INFO 12-08 01:03:27 model_runner.py:1518] Graph capturing finished in 27 secs, took 1.58 GiB\n"
450
+ ]
451
+ }
452
+ ],
453
+ "source": [
454
+ "llm = LLM(\n",
455
+ " model=\"kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN\",\n",
456
+ " gpu_memory_utilization=0.97,\n",
457
+ " quantization=\"awq\"\n",
458
+ ")\n",
459
+ "tokenizer = llm.get_tokenizer()"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 3,
465
+ "id": "7ec0376a-e9da-4f47-82d4-8da03b9540e1",
466
+ "metadata": {},
467
+ "outputs": [],
468
+ "source": [
469
+ "messages = [\n",
470
+ " {\"role\": \"system\", \"content\": \"あなたは日本語で応答するAIチャットボットです。ユーザをサポートしてください。\"},\n",
471
+ " {\"role\": \"user\", \"content\": \"plotly.graph_objectsを使って散布図を作るサンプルコードを書いてください。\"},\n",
472
+ "]\n",
473
+ "\n",
474
+ "prompt = tokenizer.apply_chat_template(\n",
475
+ " messages,\n",
476
+ " tokenize=False,\n",
477
+ " add_generation_prompt=True\n",
478
+ ")\n",
479
+ "\n",
480
+ "sampling_params = SamplingParams(\n",
481
+ " temperature=0.6,\n",
482
+ " top_p=0.9,\n",
483
+ " max_tokens=1000\n",
484
+ ")"
485
+ ]
486
+ },
487
+ {
488
+ "cell_type": "code",
489
+ "execution_count": 4,
490
+ "id": "89debe47-9611-4ab3-bd32-4c2b3e546e0a",
491
+ "metadata": {},
492
+ "outputs": [
493
+ {
494
+ "name": "stderr",
495
+ "output_type": "stream",
496
+ "text": [
497
+ "Processed prompts: 100%|██████████| 1/1 [00:13<00:00, 13.02s/it, est. speed input: 6.22 toks/s, output: 20.82 toks/s]"
498
+ ]
499
+ },
500
+ {
501
+ "name": "stdout",
502
+ "output_type": "stream",
503
+ "text": [
504
+ "plotly.graph_objectsを使って散布図を作るサンプルコードは以下の通りです。\n",
505
+ "\n",
506
+ "```python\n",
507
+ "import plotly.graph_objects as go\n",
508
+ "\n",
509
+ "# サンプルデータ\n",
510
+ "x = [1, 2, 3, 4, 5]\n",
511
+ "y = [2, 4, 6, 8, 10]\n",
512
+ "\n",
513
+ "# 散布図を作成\n",
514
+ "fig = go.Figure(data=[go.Scatter(x=x, y=y, mode='markers')])\n",
515
+ "\n",
516
+ "# タイトルと軸ラベルを設定\n",
517
+ "fig.update_layout(\n",
518
+ " title='散布図',\n",
519
+ " xaxis_title='X軸',\n",
520
+ " yaxis_title='Y軸'\n",
521
+ ")\n",
522
+ "\n",
523
+ "# 散布図を表示\n",
524
+ "fig.show()\n",
525
+ "```\n",
526
+ "\n",
527
+ "このコードでは、`plotly.graph_objects`モジュールをインポートし、`go.Figure`クラスを使用して散布図を作成しています。`go.Scatter`クラスを使用して、X軸とY軸のデータを指定し、`mode='markers'`でマーカーだけを表示します。さらに、`update_layout`メソッドでタイトルと軸ラベルを設定しています。最後に、`show`メソッドで散布図を表示しています。\n",
528
+ "CPU times: user 13 s, sys: 126 ms, total: 13.2 s\n",
529
+ "Wall time: 13 s\n"
530
+ ]
531
+ },
532
+ {
533
+ "name": "stderr",
534
+ "output_type": "stream",
535
+ "text": [
536
+ "\n"
537
+ ]
538
+ }
539
+ ],
540
+ "source": [
541
+ "%%time\n",
542
+ "outputs = llm.generate(prompt, sampling_params)\n",
543
+ "print(outputs[0].outputs[0].text)"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "execution_count": null,
549
+ "id": "20303d20-d806-4836-ab0b-f2399fdddf2d",
550
+ "metadata": {},
551
+ "outputs": [],
552
+ "source": []
553
+ }
554
+ ],
555
+ "metadata": {
556
+ "kernelspec": {
557
+ "display_name": "Python 3 (ipykernel)",
558
+ "language": "python",
559
+ "name": "python3"
560
+ },
561
+ "language_info": {
562
+ "codemirror_mode": {
563
+ "name": "ipython",
564
+ "version": 3
565
+ },
566
+ "file_extension": ".py",
567
+ "mimetype": "text/x-python",
568
+ "name": "python",
569
+ "nbconvert_exporter": "python",
570
+ "pygments_lexer": "ipython3",
571
+ "version": "3.11.10"
572
+ }
573
+ },
574
+ "nbformat": 4,
575
+ "nbformat_minor": 5
576
+ }
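
The pip log in the first cell reports one dependency conflict: torchaudio 2.4.1+cu124 expects torch==2.4.1, while vllm pulls in torch 2.5.1. The notebook never imports torchaudio, so the warning is harmless here. If torchaudio were needed, one possible fix (an assumption, not something the notebook does; the 2.5.1 pairing should be checked against the PyTorch release notes) is to upgrade it together with torch:

```python
# Hypothetical variant of the install cell that also upgrades torchaudio so it matches
# the torch 2.5.1 that vllm installs; not required for this notebook, which never uses it.
!pip install vllm autoawq "torchaudio==2.5.1"
```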
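During engine construction the log notes that the checkpoint can run with the awq_marlin kernels, but that `quantization="awq"` was forced, and it suggests `awq_marlin` for faster inference. A minimal sketch of that variant with the same model and memory settings as above (the speed-up claim comes from the vLLM log message, not from a measurement in this notebook):

```python
from vllm import LLM

# Same checkpoint as above, but following the log's hint to use the awq_marlin kernels.
# Omitting the quantization argument entirely would also let vLLM auto-detect awq_marlin.
llm = LLM(
    model="kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN",
    gpu_memory_utilization=0.97,
    quantization="awq_marlin",
)
tokenizer = llm.get_tokenizer()
```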
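`llm.generate` also accepts a list of prompts, so several chat-templated requests can be processed in one batched call instead of the single-prompt call shown in the last cell. A short sketch, assuming `llm`, `tokenizer`, `sampling_params`, and `messages` from the cells above are still in scope (the second question is an illustrative placeholder, not part of the notebook):

```python
# Build a second, placeholder conversation alongside the one defined above.
second_messages = [
    {"role": "user", "content": "..."},  # illustrative placeholder question
]

# Apply the chat template to each conversation and batch them in a single generate call.
prompts = [
    tokenizer.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
    for m in (messages, second_messages)
]

outputs = llm.generate(prompts, sampling_params)
for out in outputs:
    print(out.outputs[0].text)
```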