File size: 7,455 Bytes
1174dc5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# Sample YAML file for configuration.
# Comment and uncomment values as needed.
# Every value has a default within the application.
# This file serves as a drop-in replacement for config.yml.

# Unless specified in the comments, DO NOT put these options in quotes!
# You can use https://www.yamllint.com/ if you want to check your YAML formatting.

# Networking settings
network:
  # IP address to host on (default: 127.0.0.1).
  # 0.0.0.0 exposes the server on all network adapters.
  host: 0.0.0.0

  # Port to host on (default: 5000).
  port: 5000

  # Turn off HTTP token authentication for requests.
  # WARNING: Disabling authentication leaves your instance vulnerable!
  # Only set this to true if you connect exclusively from localhost.
  disable_auth: false

  # Return tracebacks through the API (default: False).
  # NOTE: Enable for debugging purposes only.
  send_tracebacks: false

  # API servers to enable (default: ["OAI"]).
  # Possible values: OAI, Kobold.
  api_servers:
    - oai

# Logging settings
logging:
  # Log prompts (default: False).
  log_prompt: false

  # Log generation parameters (default: False).
  log_generation_params: false

  # Log requests (default: False).
  # NOTE: Intended for debugging only!
  log_requests: false

# Model loading and override settings.
# Read the per-option comments to see how arguments are handled
# on the initial load versus loads requested through the API.
model:
  # Folder searched for models (default: models).
  # Windows users: do NOT wrap this path in quotes!
  model_dir: models

  # Permit loading a model directly from a completion or
  # chat completion request (default: False).
  inline_model_loading: false

  # Return dummy model names when the models endpoint is queried.
  # Turn this on if the client looks for specific OAI models.
  use_dummy_models: false

  # Model to load initially.
  # The model must be located inside the model directory!
  # REQUIRED: Must be filled out for a model to load on startup.
  model_name: Llama-3.1-Nemotron-70B-Instruct-HF_exl2_4.6bpw

  # Arg names to use as fallbacks for API load requests (default: []).
  # For example, to have cache_mode be Q4 on every load rather than only
  # on the initial model load, add "cache_mode" to this array.
  # Example: ['max_seq_len', 'cache_mode'].
  use_as_default: []

  # Maximum sequence length (default: Empty).
  # When unset, taken from the model's base sequence length in config.json.
  max_seq_len: 65536

  # Override for the base model context length (default: Empty).
  # WARNING: Leave this unset unless you know what you're doing!
  # Do NOT use this to configure context length; use max_seq_len above.
  override_base_seq_len: null

  # Load the model with tensor parallelism.
  # Falls back to autosplit when no GPU split is provided.
  # This option ignores the gpu_split_auto value.
  tensor_parallel: false

  # Allocate resources across GPUs automatically (default: True).
  # Not parsed for single GPU users.
  gpu_split_auto: true

  # VRAM to reserve during autosplit loading (default: 96 MB on GPU 0).
  # Given as an array of MB per GPU.
  autosplit_reserve:
    - 0

  # Integer array of GBs of VRAM to split between GPUs (default: []).
  # Used with tensor parallelism.
  gpu_split: []

  # Rope scale (default: 1.0).
  # Equivalent to compress_pos_emb.
  # Use if the model was trained on long context with rope.
  # Leave blank (null) to pull the value from the model.
  rope_scale: 1.0

  # Rope alpha (default: None).
  # Equivalent to alpha_value. Set to "auto" to auto-calculate.
  # A blank (null) value either pulls from the model or auto-calculates.
  rope_alpha: null

  # Cache mode, for VRAM savings (default: FP16).
  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
  cache_mode: Q4

  # Prompt cache size to allocate (default: max_seq_len).
  # Must be a multiple of 256 and no smaller than max_seq_len.
  # For CFG, set this to 2 * max_seq_len.
  cache_size: null

  # Chunk size used for prompt ingestion (default: 2048).
  # Lower values reduce VRAM usage but slow down ingestion.
  # NOTE: Effects vary depending on the model.
  # An ideal value lies between 512 and 4096.
  chunk_size: 2048

  # Maximum number of prompts processed at one time (default: None/Automatic).
  # Calculated automatically when left blank (null).
  # NOTE: Only available for Nvidia ampere (30 series) and above GPUs.
  max_batch_size: null

  # Prompt template for this model (default: None).
  # When empty, the model's own chat template is looked up.
  # If the model's tokenizer_config.json contains multiple templates,
  # set prompt_template to the name of the one you want to use.
  # NOTE: Only works with chat completion message lists!
  prompt_template: null

  # Number of experts to use per token.
  # Fetched from the model's config.json when empty.
  # NOTE: For MoE models only.
  # WARNING: Leave this unset unless you know what you're doing!
  num_experts_per_token: null

  # Enable fasttensors, which may speed up model loading (default: False).
  fasttensors: true

# Draft model settings (speculative decoding).
# Using a draft model consumes more VRAM!
draft_model:
  # Folder searched for draft models (default: models).
  draft_model_dir: models

  # Draft model to load initially.
  # Make sure the model is in the model directory.
  draft_model_name: null

  # Rope scale for draft models (default: 1.0).
  # Equivalent to compress_pos_emb.
  # Use if the draft model was trained on long context with rope.
  draft_rope_scale: 1.0

  # Rope alpha for draft models (default: None).
  # Equivalent to alpha_value. Set to "auto" to auto-calculate.
  # A blank (null) value either pulls from the model or auto-calculates.
  draft_rope_alpha: null

  # Cache mode for draft models, for VRAM savings (default: FP16).
  # Possible values: 'FP16', 'Q8', 'Q6', 'Q4'.
  draft_cache_mode: FP16

# LoRA settings
lora:
  # Folder searched for LoRAs (default: loras).
  lora_dir: loras

  # LoRAs to load with their scaling factors (default scale: 1.0).
  # In this YAML file, write each entry as an item of a YAML list:
  # - name: lora1
  #   scaling: 1.0
  loras: null

# Embedding model settings and loading.
# NOTE: Embeddings need the "extras" feature installed.
# Install it with "pip install .[extras]"
embeddings:
  # Folder searched for embedding models (default: models).
  embedding_model_dir: models

  # Device embedding models are loaded on (default: cpu).
  # Possible values: cpu, auto, cuda.
  # NOTE: Loading embedding models on the CPU is recommended.
  # With an AMD GPU, set this value to 'cuda'.
  embeddings_device: cpu

  # Embedding model to load initially on the infinity backend.
  embedding_model_name: null
# Sampling settings
# NOTE(review): no keys are set in this section, so it parses as null;
# presumably the application's built-in defaults apply — confirm.
sampling: null

# Development and experimentation settings
developer:
  # Bypass the Exllamav2 version check (default: False).
  # WARNING: Updating your dependencies is highly recommended over enabling this flag.
  unsafe_launch: false

  # Turn off API request streaming (default: False).
  disable_request_streaming: false

  # Use the torch CUDA malloc backend (default: False).
  cuda_malloc_backend: true

  # Run asyncio on Uvloop or Winloop, which can improve performance.
  # NOTE: Enabling this is recommended; turn it off if something breaks.
  uvloop: true

  # Give the process a higher priority.
  # Realtime process priority requires running as administrator or with sudo.
  # Otherwise, the priority is set to high.
  realtime_process_priority: true