fix(remove): use_cached_output is not an option
main.py CHANGED
@@ -64,7 +64,6 @@ engine_llama_3_2: LLM = LLM(
     # Your Tesla T4 GPU has compute capability 7.5.
     # You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.
     dtype='half', # Use 'half' for T4
-    use_cached_outputs=True, # Enable caching
 )

 # ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
@@ -80,7 +79,6 @@ engine_sailor_chat: LLM = LLM(
     max_model_len=32768,
     enforce_eager=True, # Disable CUDA graph
     dtype='half', # Use 'half' for T4
-    use_cached_outputs=True, # Enable caching
 )
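
For context, a minimal sketch of what one of these engine initializations looks like after this commit. The model ID below is a placeholder (the real one sits outside these hunks), and `max_num_batched_tokens` is an assumed addition shown only to illustrate clearing the quoted ValueError, since it must be at least `max_model_len`:

from vllm import LLM

# Post-commit sketch; the model ID is hypothetical, not taken from this diff.
engine_sailor_chat: LLM = LLM(
    model="sail/Sailor-1.8B-Chat",   # placeholder model ID, for illustration only
    max_model_len=32768,
    max_num_batched_tokens=32768,    # assumed fix: must be >= max_model_len
    enforce_eager=True,              # Disable CUDA graph
    dtype='half',                    # T4 (compute capability 7.5) lacks bfloat16
    # use_cached_outputs=True,       # removed: not an accepted LLM/EngineArgs option
)

Since `LLM` forwards unrecognized keyword arguments to `EngineArgs`, a field that does not exist there is rejected as an unexpected keyword, which is why the commit drops the argument outright rather than setting it to False.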