yusufs committed
Commit 6b1968a · 1 Parent(s): c41cdb4

fix(remove): use_cached_output is not an option

Files changed (1)
  main.py +0 -2
main.py CHANGED
@@ -64,7 +64,6 @@ engine_llama_3_2: LLM = LLM(
     # Your Tesla T4 GPU has compute capability 7.5.
     # You can use float16 instead by explicitly setting the`dtype` flag in CLI, for example: --dtype=half.
     dtype='half', # Use 'half' for T4
-    use_cached_outputs=True, # Enable caching
 )
 
     # ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
@@ -80,7 +79,6 @@ engine_sailor_chat: LLM = LLM(
     max_model_len=32768,
     enforce_eager=True, # Disable CUDA graph
     dtype='half', # Use 'half' for T4
-    use_cached_outputs=True, # Enable caching
 )
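For context, this is a minimal sketch of what one engine construction looks like after the fix. Only the argument lines shown in the diff are confirmed; the model name and the exact set of other arguments in main.py are assumptions.

from vllm import LLM

# Sketch of engine_llama_3_2 after removing the invalid kwarg.
# The model name below is a placeholder, not taken from main.py.
engine_llama_3_2: LLM = LLM(
    model="meta-llama/Llama-3.2-3B-Instruct",  # assumed model name
    dtype="half",        # Tesla T4 (compute capability 7.5) has no bfloat16 support
    enforce_eager=True,  # disable CUDA graph capture, as done for engine_sailor_chat
    max_model_len=32768,
    # use_cached_outputs=True  <- removed: not an accepted LLM()/EngineArgs argument
)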