kaburia committed
Commit 1eead99 · 1 Parent(s): da4d8cf
Files changed (1)
  1. app.py +93 -140
app.py CHANGED
@@ -1,13 +1,10 @@
- # app.py
  import os
  import uuid
  import time
  import json
  import requests
  import gradio as gr
-
- # ========= Helpers & Context =========
- # Ensure your local utils module exposes: session_id, retrieve_context, log_interaction_hf, upload_log_to_hf
  import utils.helpers as helpers
  from utils.helpers import retrieve_context, log_interaction_hf, upload_log_to_hf

@@ -15,57 +12,36 @@ from utils.helpers import retrieve_context, log_interaction_hf, upload_log_to_hf
  with open("config.json") as f:
      config = json.load(f)

- DO_API_KEY = config["do_token"] # DigitalOcean Model Access Key (serverless inference)
- HF_TOKEN = "hf_" + config["token"] # Hugging Face token for dataset uploads
-
- # Stable session id for the whole app lifetime so logs land under a unique folder
  session_id = f"{int(time.time())}-{uuid.uuid4().hex[:8]}"
- helpers.session_id = session_id # used by your upload_log_to_hf implementation
-
  BASE_URL = "https://inference.do-ai.run/v1"
- UPLOAD_INTERVAL = 5 # upload logs to HF every N turns
- REQUEST_TIMEOUT = 60
- STREAM_TIMEOUT = 120

- # ========= Network Utils =========
  def _auth_headers():
-     return {
-         "Authorization": f"Bearer {DO_API_KEY}",
-         "Content-Type": "application/json",
-         "Accept": "application/json",
-     }

  def list_models():
-     """
-     Fetch live model IDs from DO; fall back to a deterministic default on failure.
-     Always return a non-empty list.
-     """
      try:
-         resp = requests.get(f"{BASE_URL}/models", headers=_auth_headers(), timeout=REQUEST_TIMEOUT)
-         resp.raise_for_status()
-         data = resp.json().get("data", [])
-         ids = [m.get("id") for m in data if m.get("id")]
          if ids:
              return ids
      except Exception as e:
          print(f"⚠️ list_models failed: {e}")
-     # Deterministic fallback
      return ["llama3.3-70b-instruct"]

- def _normalize_model_id(model_id: str | None) -> str:
-     if model_id:
-         return model_id
-     return list_models()[0]
-
- # ========= Inference (non-stream + stream) =========
  def gradient_request(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.95):
-     """
-     Non-streaming completion (used by lightweight tasks like intent detection).
-     Self-heals if model_id is not found by retrying with the first available model.
-     """
      url = f"{BASE_URL}/chat/completions"
      payload = {
-         "model": _normalize_model_id(model_id),
          "messages": [{"role": "user", "content": prompt}],
          "max_tokens": max_tokens,
          "temperature": temperature,
@@ -73,42 +49,39 @@ def gradient_request(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.
      }
      for attempt in range(3):
          try:
-             resp = requests.post(url, headers=_auth_headers(), json=payload, timeout=REQUEST_TIMEOUT)
              if resp.status_code == 404:
-                 # Model not found → pick first available model and retry once
                  ids = list_models()
-                 if ids and payload["model"] not in ids:
                      payload["model"] = ids[0]
                      continue
              resp.raise_for_status()
              j = resp.json()
              return j["choices"][0]["message"]["content"].strip()
          except requests.HTTPError as e:
-             body = getattr(e.response, "text", str(e))
-             raise RuntimeError(f"Inference error ({e.response.status_code}): {body}") from e
          except requests.RequestException as e:
              if attempt == 2:
                  raise
-             time.sleep(0.5)
      raise RuntimeError("Exhausted retries")

  def gradient_stream(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.95):
-     """
-     Streaming generator yielding content chunks.
-     Emits keepalives if the server is quiet for >3s.
-     """
      url = f"{BASE_URL}/chat/completions"
      payload = {
-         "model": _normalize_model_id(model_id),
          "messages": [{"role": "user", "content": prompt}],
          "max_tokens": max_tokens,
          "temperature": temperature,
          "top_p": top_p,
          "stream": True,
      }
-
      try:
-         with requests.post(url, headers=_auth_headers(), json=payload, stream=True, timeout=STREAM_TIMEOUT) as r:
              if r.status_code != 200:
                  try:
                      err_txt = r.text
@@ -116,40 +89,39 @@ def gradient_stream(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.9
                      err_txt = "<no body>"
                  raise RuntimeError(f"HTTP {r.status_code}: {err_txt}")

-             last_token_ts = time.time()
-             for raw in r.iter_lines(decode_unicode=True):
-                 if raw is None or raw == b"" or raw == "":
-                     if time.time() - last_token_ts > 3:
-                         last_token_ts = time.time()
-                         yield "" # visual keepalive (no-op for UI)
-                     continue
-                 if not raw.startswith("data: "):
-                     continue
-                 data = raw[6:].strip()
-                 if data == "[DONE]":
-                     break
-                 try:
-                     chunk = json.loads(data)
-                     delta = chunk["choices"][0]["delta"]
-                     content = delta.get("content", "")
-                     if content:
-                         last_token_ts = time.time()
-                         yield content
-                 except Exception:
-                     continue
      except Exception as e:
-         raise

  def gradient_complete(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.95):
      url = f"{BASE_URL}/chat/completions"
      payload = {
-         "model": _normalize_model_id(model_id),
          "messages": [{"role": "user", "content": prompt}],
          "max_tokens": max_tokens,
          "temperature": temperature,
          "top_p": top_p,
      }
-     r = requests.post(url, headers=_auth_headers(), json=payload, timeout=REQUEST_TIMEOUT)
      if r.status_code != 200:
          raise RuntimeError(f"HTTP {r.status_code}: {r.text}")
      j = r.json()
@@ -157,10 +129,6 @@ def gradient_complete(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0

  # ========= Lightweight Intent Detection =========
  def detect_intent(model_id, message: str) -> str:
-     """
-     Classify as 'small_talk' or 'info_query'.
-     Fail-open to 'info_query' on any issue.
-     """
      try:
          out = gradient_request(
              model_id,
@@ -174,26 +142,28 @@ def detect_intent(model_id, message: str) -> str:
          print(f"⚠️ detect_intent failed: {e}")
          return "info_query"

- # ========= Gradio App =========
  with gr.Blocks(title="Gradient AI Chat") as demo:
      turn_counter = gr.State(0)

      gr.Markdown("## Gradient AI Chat")
      gr.Markdown("Select a model and ask your question.")

      with gr.Row():
          model_drop = gr.Dropdown(choices=[], label="Select Model")
          system_msg = gr.Textbox(
-             value="You are a faithful assistant. Prefer provided context, but answer helpfully if none is available.",
              label="System message"
          )

      with gr.Row():
          max_tokens_slider = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
          temperature_slider = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.1, label="Temperature")
-         top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Topp")

-     # IMPORTANT: tuples mode we must pass and replace tuples, not mutate them
      chatbot = gr.Chatbot(height=500, type="tuples")
      msg = gr.Textbox(label="Your message")

@@ -213,8 +183,8 @@ with gr.Blocks(title="Gradient AI Chat") as demo:
      # --- Load models into dropdown at startup
      def load_models():
          ids = list_models()
-         # value must be in choices; guarantee both
-         return gr.Dropdown.update(choices=ids, value=ids[0])

      demo.load(load_models, outputs=[model_drop])

@@ -229,87 +199,70 @@ with gr.Blocks(title="Gradient AI Chat") as demo:

      # --- Event handlers
      def user(user_message, chat_history):
-         chat_history = chat_history or []
-         # Append a tuple and return
-         chat_history = list(chat_history) + [(user_message, "")]
-         return "", chat_history

      def bot(chat_history, current_turn_count, model_id, system_message, max_tokens, temperature, top_p):
-         """
-         Single, clean streaming pass. Replace tuples; never mutate in place.
-         """
-         if not chat_history:
-             # Shouldn't happen, but stay defensive
-             yield chat_history, (current_turn_count or 0)
-             return
-
          user_message = chat_history[-1][0]

-         # Intent (optional; keeps your original flow)
          intent = detect_intent(model_id, user_message)
-
-         # Build prompt with a safe fallback when RAG returns nothing
-         context = ""
-         if intent != "small_talk":
              try:
-                 context = retrieve_context(user_message, p=5, threshold=0.5) or ""
              except Exception as e:
                  print(f"⚠️ retrieve_context failed: {e}")
                  context = ""

-         if intent == "small_talk":
-             full_prompt = f"[System]: Friendly chat.\n[User]: {user_message}\n[Assistant]: "
-         else:
-             if context.strip():
-                 full_prompt = (
-                     f"[System]: {system_message}\n"
-                     "Use the provided context verbatim; if context is insufficient, answer directly.\n\n"
-                     f"Context:\n{context}\n\nQuestion: {user_message}\n"
-                 )
-             else:
-                 # No context → do not block the model
-                 full_prompt = f"[System]: {system_message}\nQuestion: {user_message}\n"
-
-         # Seed assistant bubble (replace tuple, don’t mutate)
-         chat_history = list(chat_history)
-         chat_history[-1] = (chat_history[-1][0], "")
-         yield chat_history, (current_turn_count or 0)

-         # Stream with fallback
          try:
              received_any = False
-             buffer = ""
-
              for token in gradient_stream(model_id, full_prompt, max_tokens, temperature, top_p):
-                 if token:
                      received_any = True
-                     buffer += token
-                     chat_history[-1] = (chat_history[-1][0], buffer)
-                     yield chat_history, (current_turn_count or 0)
-
              if not received_any:
-                 text = gradient_complete(model_id, full_prompt, max_tokens, temperature, top_p)
-                 chat_history[-1] = (chat_history[-1][0], text)
-                 yield chat_history, (current_turn_count or 0)
-
          except Exception as e:
-             chat_history[-1] = (chat_history[-1][0], f"⚠️ Inference failed: {e}")
-             yield chat_history, (current_turn_count or 0)
-             return
-
-         # Logging & periodic upload (once per turn)
          try:
              log_interaction_hf(user_message, chat_history[-1][1])
          except Exception as e:
              print(f"⚠️ log_interaction_hf failed: {e}")

          new_turn_count = (current_turn_count or 0) + 1
          if new_turn_count % UPLOAD_INTERVAL == 0:
              try:
-                 upload_log_to_hf(HF_TOKEN) # IMPORTANT: HF token, not DO
              except Exception as e:
                  print(f"❌ Log upload failed: {e}")

          yield chat_history, new_turn_count

      # Wiring (streaming generators supported)
@@ -339,4 +292,4 @@ with gr.Blocks(title="Gradient AI Chat") as demo:

  if __name__ == "__main__":
      # On HF Spaces, don't use share=True. Also disable API page to avoid schema churn.
-     demo.launch(show_api=False)

@@ -1,13 +1,10 @@
  import os
  import uuid
  import time
  import json
  import requests
  import gradio as gr
+ import time
  import utils.helpers as helpers
  from utils.helpers import retrieve_context, log_interaction_hf, upload_log_to_hf

@@ -15,57 +12,36 @@ from utils.helpers import retrieve_context, log_interaction_hf, upload_log_to_hf
  with open("config.json") as f:
      config = json.load(f)

+ DO_API_KEY = config["do_token"]
+ token_ = config['token']
+ HF_TOKEN = 'hf_' + token_
  session_id = f"{int(time.time())}-{uuid.uuid4().hex[:8]}"
+ helpers.session_id = session_id
  BASE_URL = "https://inference.do-ai.run/v1"
+ UPLOAD_INTERVAL = 5

+ # ========= Inference Utilities =========
  def _auth_headers():
+     return {"Authorization": f"Bearer {DO_API_KEY}", "Content-Type": "application/json"}

  def list_models():
      try:
+         r = requests.get(f"{BASE_URL}/models", headers=_auth_headers(), timeout=15)
+         r.raise_for_status()
+         data = r.json().get("data", [])
+         ids = [m["id"] for m in data]
          if ids:
              return ids
      except Exception as e:
          print(f"⚠️ list_models failed: {e}")
      return ["llama3.3-70b-instruct"]

  def gradient_request(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.95):
      url = f"{BASE_URL}/chat/completions"
+     if not model_id:
+         model_id = list_models()[0]
      payload = {
+         "model": model_id,
          "messages": [{"role": "user", "content": prompt}],
          "max_tokens": max_tokens,
          "temperature": temperature,
@@ -73,42 +49,39 @@ def gradient_request(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.
      }
      for attempt in range(3):
          try:
+             resp = requests.post(url, headers=_auth_headers(), json=payload, timeout=30)
              if resp.status_code == 404:
                  ids = list_models()
+                 if model_id not in ids and ids:
                      payload["model"] = ids[0]
                      continue
              resp.raise_for_status()
              j = resp.json()
              return j["choices"][0]["message"]["content"].strip()
          except requests.HTTPError as e:
+             msg = getattr(e.response, "text", str(e))
+             raise RuntimeError(f"Inference error ({e.response.status_code}): {msg}") from e
          except requests.RequestException as e:
              if attempt == 2:
                  raise
      raise RuntimeError("Exhausted retries")

  def gradient_stream(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.95):
      url = f"{BASE_URL}/chat/completions"
+     if not model_id:
+         model_id = list_models()[0]
      payload = {
+         "model": model_id,
          "messages": [{"role": "user", "content": prompt}],
          "max_tokens": max_tokens,
          "temperature": temperature,
          "top_p": top_p,
          "stream": True,
      }
+
+     # Create a generator that yields tokens
      try:
+         with requests.post(url, headers=_auth_headers(), json=payload, stream=True, timeout=120) as r:
              if r.status_code != 200:
                  try:
                      err_txt = r.text
@@ -116,40 +89,39 @@ def gradient_stream(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.9
                      err_txt = "<no body>"
                  raise RuntimeError(f"HTTP {r.status_code}: {err_txt}")

+             buffer = ""
+             for line in r.iter_lines():
+                 if line:
+                     decoded_line = line.decode('utf-8')
+                     if decoded_line.startswith('data:'):
+                         data = decoded_line[5:].strip()
+                         if data == '[DONE]':
+                             break
+                         try:
+                             json_data = json.loads(data)
+                             if 'choices' in json_data:
+                                 for choice in json_data['choices']:
+                                     if 'delta' in choice and 'content' in choice['delta']:
+                                         content = choice['delta']['content']
+                                         buffer += content
+                                         yield content
+                         except json.JSONDecodeError:
+                             continue
+             if not buffer:
+                 yield "No response received from the model."
      except Exception as e:
+         raise RuntimeError(f"Streaming error: {str(e)}")

  def gradient_complete(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0.95):
      url = f"{BASE_URL}/chat/completions"
      payload = {
+         "model": model_id,
          "messages": [{"role": "user", "content": prompt}],
          "max_tokens": max_tokens,
          "temperature": temperature,
          "top_p": top_p,
      }
+     r = requests.post(url, headers=_auth_headers(), json=payload, timeout=60)
      if r.status_code != 200:
          raise RuntimeError(f"HTTP {r.status_code}: {r.text}")
      j = r.json()
@@ -157,10 +129,6 @@ def gradient_complete(model_id, prompt, max_tokens=512, temperature=0.7, top_p=0

  # ========= Lightweight Intent Detection =========
  def detect_intent(model_id, message: str) -> str:
      try:
          out = gradient_request(
              model_id,
@@ -174,26 +142,28 @@ def detect_intent(model_id, message: str) -> str:
          print(f"⚠️ detect_intent failed: {e}")
          return "info_query"

+ # ========= App Logic (Gradio Blocks) =========
  with gr.Blocks(title="Gradient AI Chat") as demo:
+     # Keep a reactive turn counter in session state
      turn_counter = gr.State(0)

      gr.Markdown("## Gradient AI Chat")
      gr.Markdown("Select a model and ask your question.")

+     # Model dropdown will be populated at runtime with live IDs
      with gr.Row():
          model_drop = gr.Dropdown(choices=[], label="Select Model")
          system_msg = gr.Textbox(
+             value="You are a faithful assistant. Use only the provided context.",
              label="System message"
          )

      with gr.Row():
          max_tokens_slider = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
          temperature_slider = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.1, label="Temperature")
+         top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")

+     # Use tuples to silence deprecation warning in current Gradio
      chatbot = gr.Chatbot(height=500, type="tuples")
      msg = gr.Textbox(label="Your message")

@@ -213,8 +183,8 @@ with gr.Blocks(title="Gradient AI Chat") as demo:
      # --- Load models into dropdown at startup
      def load_models():
          ids = list_models()
+         default = ids[0] if ids else None
+         return gr.Dropdown.update(choices=ids, value=default)

      demo.load(load_models, outputs=[model_drop])

@@ -229,87 +199,70 @@ with gr.Blocks(title="Gradient AI Chat") as demo:

      # --- Event handlers
      def user(user_message, chat_history):
+         # Seed a new assistant message for streaming
+         return "", (chat_history + [[user_message, ""]])

      def bot(chat_history, current_turn_count, model_id, system_message, max_tokens, temperature, top_p):
          user_message = chat_history[-1][0]

+         # Build prompt
          intent = detect_intent(model_id, user_message)
+         if intent == "small_talk":
+             full_prompt = f"[System]: Friendly chat.\n[User]: {user_message}\n[Assistant]: "
+         else:
              try:
+                 context = retrieve_context(user_message, p=5, threshold=0.5)
              except Exception as e:
                  print(f"⚠️ retrieve_context failed: {e}")
                  context = ""
+             full_prompt = (
+                 f"[System]: {system_message}\n"
+                 "Use only the provided context. Quote verbatim; no inference.\n\n"
+                 f"Context:\n{context}\n\nQuestion: {user_message}\n"
+             )

+         # Initialize assistant message to empty string and update chat history
+         chat_history[-1][1] = ""
+         yield chat_history, current_turn_count

+         # Attempt to stream the response
          try:
              received_any = False
              for token in gradient_stream(model_id, full_prompt, max_tokens, temperature, top_p):
+                 if token: # Skip empty tokens
                      received_any = True
+                     chat_history[-1][1] += token
+                     yield chat_history, current_turn_count
+             # If we didn't receive any tokens, fall back to non-streaming
              if not received_any:
+                 raise RuntimeError("Streaming returned no tokens; falling back.")
          except Exception as e:
+             print(f"⚠️ Streaming failed: {e}")
+             try:
+                 # Fall back to non-streaming
+                 response = gradient_complete(model_id, full_prompt, max_tokens, temperature, top_p)
+                 chat_history[-1][1] = response
+                 yield chat_history, current_turn_count
+             except Exception as e2:
+                 chat_history[-1][1] = f"⚠️ Inference failed: {e2}"
+                 yield chat_history, current_turn_count
+                 return
+
+         # After successful response, log and update turn counter
          try:
              log_interaction_hf(user_message, chat_history[-1][1])
          except Exception as e:
              print(f"⚠️ log_interaction_hf failed: {e}")

          new_turn_count = (current_turn_count or 0) + 1
+         # Periodically upload logs
          if new_turn_count % UPLOAD_INTERVAL == 0:
              try:
+                 upload_log_to_hf(HF_TOKEN)
              except Exception as e:
                  print(f"❌ Log upload failed: {e}")

+         # Update the state with the new turn count
          yield chat_history, new_turn_count

      # Wiring (streaming generators supported)
@@ -339,4 +292,4 @@ with gr.Blocks(title="Gradient AI Chat") as demo:

  if __name__ == "__main__":
      # On HF Spaces, don't use share=True. Also disable API page to avoid schema churn.
+     demo.launch(show_api=False)
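For anyone reproducing this Space locally: the revised app.py reads its two secrets from a config.json file next to it, using config["do_token"] for the DigitalOcean inference key and config["token"] for the Hugging Face token body (the code prepends "hf_" itself). A minimal, hypothetical setup sketch with placeholder values only (not part of the commit) might look like:

# Hypothetical local setup sketch: writes a placeholder config.json with the two
# keys app.py reads. Replace the placeholder strings with real credentials.
import json

placeholder_config = {
    "do_token": "YOUR-DIGITALOCEAN-MODEL-ACCESS-KEY",   # used for Bearer auth against inference.do-ai.run
    "token": "YOUR-HF-TOKEN-WITHOUT-THE-hf_-PREFIX",    # app.py builds HF_TOKEN = "hf_" + token
}

with open("config.json", "w") as f:
    json.dump(placeholder_config, f, indent=2)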