dflehel committed
Commit d2612d8 · verified · 1 Parent(s): 23bf3db

Upload app.py

Files changed (1): app.py +10 -2
app.py CHANGED
@@ -78,8 +78,11 @@ ban_list = [
 ]
 bad_words_ids = [processor.tokenizer(b, add_special_tokens=False).input_ids for b in ban_list]
 
+# Limit the number of generated tokens to shorten inference time. A smaller
+# ``max_new_tokens`` helps ensure the call completes within the 180-second
+# runtime; see the ZeroGPU documentation for runtime guidance.
 gen_cfg = GenerationConfig(
-    max_new_tokens=120,
+    max_new_tokens=60,
     do_sample=False,
     repetition_penalty=1.12,
     no_repeat_ngram_size=6,
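The change above trades output length for latency: ``max_new_tokens`` is a hard upper bound on newly generated tokens, so halving it roughly halves worst-case decode time. A minimal sketch of how such a config is typically consumed, assuming a transformers model/processor pair like the one app.py builds; the ``model`` and ``inputs`` names here are illustrative, not from the commit:

# Sketch only: shows how max_new_tokens caps decode length (and thus latency).
# `model` and `processor` stand in for the MedGemma objects built elsewhere.
from transformers import GenerationConfig

gen_cfg = GenerationConfig(
    max_new_tokens=60,        # hard cap on newly generated tokens per call
    do_sample=False,          # greedy decoding: deterministic output
    repetition_penalty=1.12,  # discourage verbatim loops
    no_repeat_ngram_size=6,   # forbid repeating any 6-gram
)

# outputs = model.generate(**inputs, generation_config=gen_cfg)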
@@ -96,7 +99,12 @@ gen_cfg = GenerationConfig(
 # function serves as a persistent cache.
 
 
-@spaces.GPU(duration=120)
+# Increase the duration to 180 seconds and enable queueing. The ZeroGPU
+# documentation notes that the default runtime is 60 seconds and that you can
+# specify a longer duration via the ``duration`` parameter.
+# Enabling the queue prevents immediate failure when GPUs are busy. These
+# adjustments help mitigate intermittent ``GPU task aborted`` errors.
+@spaces.GPU(duration=180, enable_queue=True)
 def run_model(prompt: str, image: Optional[Image.Image], system_prompt: Optional[str]) -> str:
     """Execute the MedGemma model.
 
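For context, the decorator change follows the standard ZeroGPU pattern: the documented ``duration`` parameter raises the per-call GPU allocation above the 60-second default. A minimal sketch, assuming the ``spaces`` package available in Hugging Face ZeroGPU Spaces; note that ``enable_queue`` comes from the commit itself, and whether ``spaces.GPU`` accepts it depends on the installed ``spaces`` version, so only the documented keyword is shown here:

# Minimal ZeroGPU sketch, assuming the Hugging Face `spaces` package.
# Only the documented `duration` keyword is used; the commit's
# `enable_queue=True` may or may not be supported by a given version.
import spaces

@spaces.GPU(duration=180)  # request up to 180 s of GPU time per call
def run_model(prompt: str) -> str:
    # GPU-bound inference (e.g. model.generate) would run here; the GPU
    # is attached only for the duration of this call.
    return prompt  # placeholder for real model output

If queueing is needed independently of the decorator, it can also be enabled on the Gradio side via ``demo.queue()``, which serializes requests while a GPU is busy.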