Spaces: Running on Zero
Upload app.py
app.py CHANGED
@@ -78,8 +78,11 @@ ban_list = [
 ]
 bad_words_ids = [processor.tokenizer(b, add_special_tokens=False).input_ids for b in ban_list]
 
+# Limit the number of generated tokens to shorten inference time. A smaller
+# ``max_new_tokens`` helps ensure the call completes within the 180‑second
+# runtime. See the ZeroGPU documentation for runtime guidance【666268612876326†L200-L211】.
 gen_cfg = GenerationConfig(
-    max_new_tokens=
+    max_new_tokens=60,
     do_sample=False,
     repetition_penalty=1.12,
     no_repeat_ngram_size=6,
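
Note: this hunk only defines the banned-phrase IDs and the GenerationConfig; the code that consumes them is not shown. As a minimal sketch (assuming the model and inputs objects that the rest of app.py would provide, which are not part of this diff), they would typically be passed to the model's generate call like this:

    # Sketch only -- `model` and `inputs` are assumed, not part of this diff.
    outputs = model.generate(
        **inputs,                      # processor output: tokenized prompt (and image features)
        generation_config=gen_cfg,     # greedy decoding, capped at 60 new tokens
        bad_words_ids=bad_words_ids,   # suppress the phrases collected in ban_list
    )
    text = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

Capping max_new_tokens at 60 trades answer length for latency, which is the point of the change: shorter generations are far more likely to finish inside the 180-second ZeroGPU window.
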
@@ -96,7 +99,12 @@ gen_cfg = GenerationConfig(
 # function serves as a persistent cache.
 
 
-
+# Increase the duration to 180 seconds and enable queueing. The ZeroGPU
+# documentation notes that the default runtime is 60 seconds and that you can
+# specify a longer duration via the ``duration`` parameter【666268612876326†L200-L211】.
+# Enabling the queue prevents immediate failure when GPUs are busy. These
+# adjustments help mitigate intermittent ``GPU task aborted`` errors.
+@spaces.GPU(duration=180, enable_queue=True)
 def run_model(prompt: str, image: Optional[Image.Image], system_prompt: Optional[str]) -> str:
     """Execute the MedGemma model.
 
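
For readers unfamiliar with ZeroGPU, the decorator added above is the standard way to mark a GPU-bound function in a ZeroGPU Space: a GPU is attached only while the decorated call runs, and duration raises the per-call time budget above the 60-second default mentioned in the comment. A minimal sketch of the pattern, assuming a simplified answer function and a Gradio app named demo (neither appears in this diff):

    import gradio as gr
    import spaces

    @spaces.GPU(duration=180)              # request up to 180 s of GPU time per call
    def answer(prompt: str) -> str:
        # GPU-bound work (model inference) would run here.
        return f"echo: {prompt}"           # placeholder body for the sketch

    demo = gr.Interface(fn=answer, inputs="text", outputs="text")
    demo.queue()                           # queue requests instead of rejecting them when busy
    demo.launch()

Queueing is what keeps requests from failing outright while all ZeroGPU workers are busy; this commit opts into it through the decorator's enable_queue=True argument, while a Gradio-level demo.queue() call, as sketched here, is another place where queueing is typically configured.
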