import os

# ! UI Markdown information
MODEL_TITLE = """
Multi-Purpose Chatbot
"""

MODEL_DESC = f"""
A multi-purpose helpful assistant with multiple functionalities (chat, text completion, RAG chat, batch inference).
This DEMO runs with `export BACKEND=debug` to demonstrate the UI - there is NO LLM running, and it always returns "Wow that's very very cool, please try again."
""".strip()

MODEL_INFO = """

Model Name: {model_path} (debug mode)

""" CITE_MARKDOWN = """ ## Citation If you find our project useful, hope you can star our repo and cite our repo as follows: ``` @article{multipurpose_chatbot_2024, author = {Xuan-Phi Nguyen, }, title = {Multipurpose Chatbot}, year = 2024, } ``` """ USE_PANEL = bool(int(os.environ.get("USE_PANEL", "1"))) CHATBOT_HEIGHT = int(os.environ.get("CHATBOT_HEIGHT", "500")) ALLOWED_PATHS = [] DEMOS = os.environ.get("DEMOS", "") DEMOS = DEMOS.split(",") if DEMOS.strip() != "" else [ "DocChatInterfaceDemo", "ChatInterfaceDemo", "TextCompletionDemo", # "RagChatInterfaceDemo", # "VisionChatInterfaceDemo", # "VisionDocChatInterfaceDemo", ] # DEMOS=DocChatInterfaceDemo,ChatInterfaceDemo,RagChatInterfaceDemo,TextCompletionDemo # ! server info PORT = int(os.environ.get("PORT", "7860")) PROXY = os.environ.get("PROXY", "").strip() # ! backend info BACKEND = os.environ.get("BACKEND", "debug") # ! model information # for RAG RAG_EMBED_MODEL_NAME = os.environ.get("RAG_EMBED_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2") CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1024")) CHUNK_OVERLAP = int(os.environ.get("CHUNK_SIZE", "50")) SYSTEM_PROMPT = os.environ.get("SYSTEM_PROMPT", """You are a helpful, respectful, honest and safe AI assistant.""") MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "2048")) TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.7")) # ! these values currently not used FREQUENCE_PENALTY = float(os.environ.get("FREQUENCE_PENALTY", "0.0")) PRESENCE_PENALTY = float(os.environ.get("PRESENCE_PENALTY", "0.0")) # Transformers or vllm MODEL_PATH = os.environ.get("MODEL_PATH", "teknium/OpenHermes-2.5-Mistral-7B") MODEL_NAME = os.environ.get("MODEL_NAME", "Cool-Chatbot") DTYPE = os.environ.get("DTYPE", "bfloat16") DEVICE = os.environ.get("DEVICE", "cuda") # VLLM GPU_MEMORY_UTILIZATION = float(os.environ.get("GPU_MEMORY_UTILIZATION", "0.9")) TENSOR_PARALLEL = int(os.environ.get("TENSOR_PARALLEL", "1")) QUANTIZATION = str(os.environ.get("QUANTIZATION", "")) STREAM_YIELD_MULTIPLE = int(os.environ.get("STREAM_YIELD_MULTIPLE", "1")) # how many iterations to perform safety check on response STREAM_CHECK_MULTIPLE = int(os.environ.get("STREAM_CHECK_MULTIPLE", "0")) # llama.cpp DEFAULT_CHAT_TEMPLATE = os.environ.get("DEFAULT_CHAT_TEMPLATE", "chatml") N_CTX = int(os.environ.get("N_CTX", "4096")) N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "-1")) # llava.llama.cpp # ! pending development # Multimodal # IMAGE_TOKEN = os.environ.get("IMAGE_TOKEN", "[IMAGE]<|image|>[/IMAGE]") IMAGE_TOKEN = os.environ.get("IMAGE_TOKEN", "") IMAGE_TOKEN_INTERACTIVE = bool(int(os.environ.get("IMAGE_TOKEN_INTERACTIVE", "0"))) # ! IMAGE_TOKEN_LENGTH expected embedding lengths of an image to calculate the actual tokens IMAGE_TOKEN_LENGTH = int(os.environ.get("IMAGE_TOKEN_LENGTH", "576")) # ! Llava1.6 to calculate the maximum number of patches in an image (max=5 for Llava1.6) MAX_PACHES = int(os.environ.get("MAX_PACHES", "1"))