---
# litellm proxy configuration — model routing plus module/proxy-level settings.
model_list:
  - model_name: ollama-mistral-7b
    litellm_params:
      model: ollama/mistral:7b
      api_base: https://zhengr-ollama.hf.space
      # NOTE(review): hard-coded credential committed to config — prefer an env
      # reference (e.g. api_key: os.environ/OLLAMA_API_KEY) or a secret store.
      api_key: sk-1234
      # Model-specific parameters (uncomment to override per-model behavior)
      # model: "huggingface/mistralai/Mistral-7B-Instruct-v0.1"
      # api_base: ""
      # api_key: ""  # [OPTIONAL] for hf inference endpoints
      # initial_prompt_value: "\n"
      # roles: {"system": {"pre_message": "<|im_start|>system\n", "post_message": "<|im_end|>"}, "assistant": {"pre_message": "<|im_start|>assistant\n", "post_message": "<|im_end|>"}, "user": {"pre_message": "<|im_start|>user\n", "post_message": "<|im_end|>"}}
      # final_prompt_value: "\n"
      # bos_token: ""
      # eos_token: ""
      # max_tokens: 4096
  - model_name: xinference-llama-3-instruct
    litellm_params:
      model: xinference/llama-3-instruct
      api_base: https://zhengr-xinference.hf.space/api
      # NOTE(review): same hard-coded credential concern as above.
      api_key: sk-1234

# Module-level litellm settings — https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
litellm_settings:
  drop_params: true  # canonical lowercase boolean (was "True")

# Explicit empty mapping — a bare "general_settings:" parses as null, not {}.
general_settings: {}
  # master_key: sk-1234  # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
  # alerting: ["slack"]  # [OPTIONAL] Slack alerts for hanging LLM requests, slow responses, budget alerts. Set `SLACK_WEBHOOK_URL` in your env