import multiprocessing
import os
from typing import Optional, Dict, List, Union

import dotenv
from loguru import logger
from pydantic import BaseModel, Field

from api.utils.compat import model_json, disable_warnings

dotenv.load_dotenv()

disable_warnings(BaseModel)


def get_bool_env(key, default="false"):
    return os.environ.get(key, default).lower() == "true"


def get_env(key, default):
    val = os.environ.get(key, "")
    return val or default


class Settings(BaseModel):
    """ Server settings, populated from environment variables (and an optional .env file). """

    host: Optional[str] = Field(
        default=get_env("HOST", "0.0.0.0"),
        description="Listen address.",
    )
    port: Optional[int] = Field(
        default=int(get_env("PORT", 8000)),
        description="Listen port.",
    )
    api_prefix: Optional[str] = Field(
        default=get_env("API_PREFIX", "/v1"),
        description="API prefix.",
    )
    engine: Optional[str] = Field(
        default=get_env("ENGINE", "default"),
        description="Choices are ['default', 'vllm', 'llama.cpp', 'tgi'].",
    )

    # model related
    model_name: Optional[str] = Field(
        default=get_env("MODEL_NAME", None),
        description="The name of the model to use for generating completions.",
    )
    model_path: Optional[str] = Field(
        default=get_env("MODEL_PATH", None),
        description="The path to the model to use for generating completions.",
    )
    adapter_model_path: Optional[str] = Field(
        default=get_env("ADAPTER_MODEL_PATH", None),
        description="Path to a LoRA file to apply to the model.",
    )
    resize_embeddings: Optional[bool] = Field(
        default=get_bool_env("RESIZE_EMBEDDINGS"),
        description="Whether to resize embeddings.",
    )
    dtype: Optional[str] = Field(
        default=get_env("DTYPE", "half"),
        description="Precision dtype.",
    )

    # device related
    device: Optional[str] = Field(
        default=get_env("DEVICE", "cuda"),
        description="Device to load the model.",
    )
    device_map: Optional[Union[str, Dict]] = Field(
        default=get_env("DEVICE_MAP", None),
        description="Device map to load the model.",
    )
    gpus: Optional[str] = Field(
        default=get_env("GPUS", None),
        description="Which GPUs to load the model on (comma-separated indices).",
    )
    num_gpus: Optional[int] = Field(
        default=int(get_env("NUM_GPUS", 1)),
        ge=0,
        description="How many GPUs to load the model on.",
    )

    # embedding related
    only_embedding: Optional[bool] = Field(
        default=get_bool_env("ONLY_EMBEDDING"),
        description="Whether to launch the embedding server only.",
    )
    embedding_name: Optional[str] = Field(
        default=get_env("EMBEDDING_NAME", None),
        description="The name or path of the model to use for generating embeddings.",
    )
    embedding_size: Optional[int] = Field(
        default=int(get_env("EMBEDDING_SIZE", -1)),
        description="The embedding size to use for generating embeddings.",
    )
    embedding_device: Optional[str] = Field(
        default=get_env("EMBEDDING_DEVICE", "cuda"),
        description="Device to load the embedding model.",
    )

    # quantize related
    quantize: Optional[int] = Field(
        default=int(get_env("QUANTIZE", 16)),
        description="Quantize level for the model.",
    )
    load_in_8bit: Optional[bool] = Field(
        default=get_bool_env("LOAD_IN_8BIT"),
        description="Whether to load the model in 8 bit.",
    )
    load_in_4bit: Optional[bool] = Field(
        default=get_bool_env("LOAD_IN_4BIT"),
        description="Whether to load the model in 4 bit.",
    )
    using_ptuning_v2: Optional[bool] = Field(
        default=get_bool_env("USING_PTUNING_V2"),
        description="Whether to load the model using ptuning_v2.",
    )
    pre_seq_len: Optional[int] = Field(
        default=int(get_env("PRE_SEQ_LEN", 128)),
        ge=0,
        description="PRE_SEQ_LEN for ptuning_v2.",
    )

    # context related
    context_length: Optional[int] = Field(
        default=int(get_env("CONTEXT_LEN", -1)),
        ge=-1,
        description="Context length for generating completions.",
    )
    chat_template: Optional[str] = Field(
        default=get_env("PROMPT_NAME", None),
        description="Chat template for generating completions.",
    )
    patch_type: Optional[str] = Field(
        default=get_env("PATCH_TYPE", None),
        description="Patch type for generating completions.",
    )
    alpha: Optional[Union[str, float]] = Field(
        default=get_env("ALPHA", "auto"),
        description="Alpha for generating completions.",
    )
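    # All of the defaults in this class are resolved once, at import time, via
    # get_env/get_bool_env, so values may come from the process environment or
    # from a .env file picked up by dotenv.load_dotenv(). A hypothetical .env
    # (the values below are illustrative, not defaults of this project):
    #
    #     ENGINE=vllm
    #     MODEL_NAME=my-model
    #     MODEL_PATH=/models/my-model
    #     CONTEXT_LEN=8192
    #
    # would populate engine, model_name, model_path, and context_length
    # without any code changes.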
    # vllm related
    trust_remote_code: Optional[bool] = Field(
        default=get_bool_env("TRUST_REMOTE_CODE"),
        description="Whether to use remote code.",
    )
    tokenize_mode: Optional[str] = Field(
        default=get_env("TOKENIZE_MODE", "auto"),
        description="Tokenize mode for the vllm server.",
    )
    tensor_parallel_size: Optional[int] = Field(
        default=int(get_env("TENSOR_PARALLEL_SIZE", 1)),
        ge=1,
        description="Tensor parallel size for the vllm server.",
    )
    gpu_memory_utilization: Optional[float] = Field(
        default=float(get_env("GPU_MEMORY_UTILIZATION", 0.9)),
        description="GPU memory utilization for the vllm server.",
    )
    max_num_batched_tokens: Optional[int] = Field(
        default=int(get_env("MAX_NUM_BATCHED_TOKENS", -1)),
        ge=-1,
        description="Max num batched tokens for the vllm server.",
    )
    max_num_seqs: Optional[int] = Field(
        default=int(get_env("MAX_NUM_SEQS", 256)),
        ge=1,
        description="Max num seqs for the vllm server.",
    )
    quantization_method: Optional[str] = Field(
        default=get_env("QUANTIZATION_METHOD", None),
        description="Quantization method for the vllm server.",
    )

    # support for transformers.TextIteratorStreamer
    use_streamer_v2: Optional[bool] = Field(
        default=get_bool_env("USE_STREAMER_V2"),
        description="Support for transformers.TextIteratorStreamer.",
    )

    # support for api key check
    api_keys: Optional[List[str]] = Field(
        default=get_env("API_KEYS", "").split(",") if get_env("API_KEYS", "") else None,
        description="Comma-separated list of accepted api keys.",
    )

    activate_inference: Optional[bool] = Field(
        default=get_bool_env("ACTIVATE_INFERENCE", "true"),
        description="Whether to activate inference.",
    )
    interrupt_requests: Optional[bool] = Field(
        default=get_bool_env("INTERRUPT_REQUESTS", "true"),
        description="Whether to interrupt requests when a new request is received.",
    )

    # support for llama.cpp
    n_gpu_layers: Optional[int] = Field(
        default=int(get_env("N_GPU_LAYERS", 0)),
        ge=-1,
        description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
    )
    main_gpu: Optional[int] = Field(
        default=int(get_env("MAIN_GPU", 0)),
        ge=0,
        description="Main GPU to use.",
    )
    tensor_split: Optional[List[float]] = Field(
        # Parse a comma-separated proportion list such as "0.5,0.5" into [0.5, 0.5].
        default=[float(x) for x in get_env("TENSOR_SPLIT", "").split(",")] if get_env("TENSOR_SPLIT", None) else None,
        description="Split layers across multiple GPUs in proportion.",
    )
    n_batch: Optional[int] = Field(
        default=int(get_env("N_BATCH", 512)),
        ge=1,
        description="The batch size to use per eval.",
    )
    n_threads: Optional[int] = Field(
        default=int(get_env("N_THREADS", max(multiprocessing.cpu_count() // 2, 1))),
        ge=1,
        description="The number of threads to use.",
    )
    n_threads_batch: Optional[int] = Field(
        default=int(get_env("N_THREADS_BATCH", max(multiprocessing.cpu_count() // 2, 1))),
        ge=0,
        description="The number of threads to use when batch processing.",
    )
    rope_scaling_type: Optional[int] = Field(
        default=int(get_env("ROPE_SCALING_TYPE", -1)),
        description="RoPE scaling type.",
    )
    rope_freq_base: Optional[float] = Field(
        default=float(get_env("ROPE_FREQ_BASE", 0.0)),
        description="RoPE base frequency.",
    )
    rope_freq_scale: Optional[float] = Field(
        default=float(get_env("ROPE_FREQ_SCALE", 0.0)),
        description="RoPE frequency scaling factor.",
    )
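    # A rough sketch of how the llama.cpp block above could be fed to
    # llama-cpp-python (these keyword names exist on llama_cpp.Llama, but the
    # wiring shown here is an illustration, not this project's actual code):
    #
    #     from llama_cpp import Llama
    #
    #     llm = Llama(
    #         model_path=SETTINGS.model_path,
    #         n_gpu_layers=SETTINGS.n_gpu_layers,
    #         main_gpu=SETTINGS.main_gpu,
    #         tensor_split=SETTINGS.tensor_split,
    #         n_batch=SETTINGS.n_batch,
    #         n_threads=SETTINGS.n_threads,
    #         rope_freq_base=SETTINGS.rope_freq_base,
    #         rope_freq_scale=SETTINGS.rope_freq_scale,
    #     )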
    # support for tgi: https://github.com/huggingface/text-generation-inference
    tgi_endpoint: Optional[str] = Field(
        default=get_env("TGI_ENDPOINT", None),
        description="Text Generation Inference Endpoint.",
    )

    # support for tei: https://github.com/huggingface/text-embeddings-inference
    tei_endpoint: Optional[str] = Field(
        default=get_env("TEI_ENDPOINT", None),
        description="Text Embeddings Inference Endpoint.",
    )
    max_concurrent_requests: Optional[int] = Field(
        default=int(get_env("MAX_CONCURRENT_REQUESTS", 256)),
        description="The maximum number of concurrent requests for this particular deployment.",
    )
    max_client_batch_size: Optional[int] = Field(
        default=int(get_env("MAX_CLIENT_BATCH_SIZE", 32)),
        description="The maximum number of inputs that a client can send in a single request.",
    )


SETTINGS = Settings()
logger.debug(f"SETTINGS: {model_json(SETTINGS, indent=4)}")

# Restrict CUDA to the requested devices, after sanity-checking that GPUS
# lists at least NUM_GPUS device indices.
if SETTINGS.gpus:
    if len(SETTINGS.gpus.split(",")) < SETTINGS.num_gpus:
        raise ValueError(
            f"NUM_GPUS ({SETTINGS.num_gpus}) is larger than the number of GPUs in GPUS ({SETTINGS.gpus})!"
        )
    os.environ["CUDA_VISIBLE_DEVICES"] = SETTINGS.gpus
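# A minimal consumer sketch (module paths here are assumptions, not taken from
# this file): downstream code imports the SETTINGS singleton rather than
# re-reading environment variables, so the whole process sees one consistent
# configuration.
#
#     from api.config import SETTINGS  # assuming this module is api/config.py
#     import uvicorn
#
#     uvicorn.run("api.server:app", host=SETTINGS.host, port=SETTINGS.port)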