Spaces:

shri171981
/

hack_doc_deployment

Sleeping

App Files Files Community

shri171981 committed on Dec 4, 2025

Commit

2195ca0

verified ·

1 Parent(s): 74e3539

Using InferenceClient API

Browse files

Files changed (1) hide show

app.py +35 -59

app.py CHANGED Viewed

@@ -1,39 +1,19 @@
 import gradio as gr
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
-# 1. Define your model ID
-# REPLACE THIS with your actual username/repo name
-ADAPTER_ID = "shri171981/medical_chat_generative"
-def load_model():
-    # Load Base Model (Llama-3-8B)
-    # We use "cpu" and float32 if you are on the Free Tier (Slow but works)
-    # If you have a GPU in your Space, change device_map to "auto"
-    base_model_name = "unsloth/llama-3-8b-instruct-bnb-4bit"
-    print("Loading base model...")
-    base_model = AutoModelForCausalLM.from_pretrained(
-        base_model_name,
-        device_map="cpu", # Change to "auto" if you have a GPU Space
-        torch_dtype=torch.float32,
-        low_cpu_mem_usage=True
-    )
-    print("Loading adapter...")
-    model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
-    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-    return model, tokenizer
-# Load the model once at startup
-model, tokenizer = load_model()
-def ask_doctor(message, history):
-    # 1. Format the input for Llama-3
-    # We strictly enforce the "HACK_DOC" format
     system_prompt = "You are a helpful and empathetic medical doctor. Answer the patient's question based on the input provided."
-    full_prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 ### Instruction:
 {system_prompt}
@@ -43,36 +23,32 @@ def ask_doctor(message, history):
 ### Response:
 """
-    # 2. Tokenize and Generate
-    inputs = tokenizer(full_prompt, return_tensors="pt")
-    # Generate response
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
             max_new_tokens=128,
-            temperature=0.7
         )
-    # 3. Decode output
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # 4. Clean up the text (Remove the prompt part)
-    # We split by "Response:" and take the last part
-    clean_answer = response.split("Response:")[-1].strip()
-    return clean_answer
-# 3. Build the UI
-interface = gr.ChatInterface(
-    fn=ask_doctor,
-    title="🚑 HACK_DOC AI",
-    description="I am a specialized medical assistant. Ask me about symptoms!",
-    examples=["I have a sharp pain in my chest.", "What should I take for a fever?", "My skin is itchy and red."],
-    # theme="soft"
 )
-# 4. Launch
 if __name__ == "__main__":
-    interface.launch()

 import gradio as gr
+from huggingface_hub import InferenceClient
+import os
+# 1. Setup the Client
+# We fetch the token you just added to Secrets
+client = InferenceClient(token=os.getenv("HF_TOKEN"))
+# 2. Your Model ID (The Adapter)
+# The API is smart enough to see it's an adapter and load the Base Model automatically.
+MODEL_ID = "shri171981/genai_hack_doc"
+def ask_api(message, history):
+    # 3. Format the prompt (Strict Llama-3 format)
     system_prompt = "You are a helpful and empathetic medical doctor. Answer the patient's question based on the input provided."
+    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 ### Instruction:
 {system_prompt}
 ### Response:
 """
+    try:
+        # 4. Send to the API
+        response = client.text_generation(
+            prompt,
+            model=MODEL_ID,
             max_new_tokens=128,
+            temperature=0.7,
+            return_full_text=False # We only want the new part
         )
+        return response
+    except Exception as e:
+        # 5. Handle "Model Loading" errors
+        # If the model is cold, the API returns a 503 error.
+        if "Model is loading" in str(e):
+            return "⚠️ The model is waking up (Cold Start). Please wait 30 seconds and try again!"
+        return f"Error: {str(e)}"
+# 6. Launch
+demo = gr.ChatInterface(
+    fn=ask_api,
+    title="🚑 HACK_DOC (API Powered)",
+    description="Running on Hugging Face Serverless GPU via API.",
+    examples=["I have a sharp pain in my chest.", "What is good for a fever?"],
 )
 if __name__ == "__main__":
+    demo.launch()