Spaces:

lapa-llm
/

lapa

Running on Zero

iamthewalrus67 committed on Aug 23

Commit

d5dc5cf

1 Parent(s): f52933f

Try to make zero gpu work

Files changed (1) hide show

app.py CHANGED Viewed

@@ -10,19 +10,23 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
 MODEL_ID = "le-llm/gemma-3-12b-it-reasoning"
-# Load model & tokenizer
-device = "cuda" if torch.cuda.is_available() else "cpu"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-    device_map="auto" if device == "cuda" else None,   # helps if multiple GPUs
-)
 SYSTEM_PROMPT = (
     "You are a helpful, concise assistant. Only write replies as the Assistant. Do not invent or continue User messages."
 )
 def respond(
     message,
     history: list[dict[str, str]],
@@ -31,6 +35,9 @@ def respond(
     temperature,
     top_p,
 ):
     # Build conversation
     messages = [{"role": "system", "content": system_message}] + history + [
         {"role": "user", "content": message}
@@ -67,7 +74,7 @@ def respond(
     partial_output = ""
     for new_text in streamer:
         partial_output += new_text
-        yield partial_output  # <- streams to Gradio frontend
 chatbot = gr.ChatInterface(
@@ -87,4 +94,5 @@ chatbot = gr.ChatInterface(
     ],
 )
-chatbot.launch()

 MODEL_ID = "le-llm/gemma-3-12b-it-reasoning"
 SYSTEM_PROMPT = (
     "You are a helpful, concise assistant. Only write replies as the Assistant. Do not invent or continue User messages."
 )
+def load_model():
+    """Lazy-load model & tokenizer (for zeroGPU)."""
+    device = "cuda"# if torch.cuda.is_available() else "cpu"
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
+        device_map="auto" if device == "cuda" else None,
+    )
+    return model, tokenizer, device
 def respond(
     message,
     history: list[dict[str, str]],
     temperature,
     top_p,
 ):
+    # Load model/tokenizer each request → allows zeroGPU to cold start & then release
+    model, tokenizer, device = load_model()
     # Build conversation
     messages = [{"role": "system", "content": system_message}] + history + [
         {"role": "user", "content": message}
     partial_output = ""
     for new_text in streamer:
         partial_output += new_text
+        yield partial_output
 chatbot = gr.ChatInterface(
     ],
 )
+if __name__ == "__main__":
+    chatbot.launch()