Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Files Community

VEDAGI1 committed 3 days ago

Commit

d006a6e

verified ·

1 Parent(s): a7e0072

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -62

app.py CHANGED Viewed

@@ -5,7 +5,8 @@
 # - Triple-quoted progress strings (no unterminated literals)
 # - Sleek full-width UI and Voice-to-Text (browser Web Speech API)
 # - Optional HIPAA flags (fallback defaults if not present in settings.py)
-from future import annotations
 import io
 import json
 import os
@@ -13,97 +14,109 @@ import traceback
 from contextlib import redirect_stdout
 from datetime import datetime
 from typing import Any, Dict, List
 import gradio as gr
 import pandas as pd
 import regex as re2
 import re
 from langchain_cohere import ChatCohere  # noqa: F401
 from settings import (
-GENERAL_CONVERSATION_PROMPT,
-COHERE_MODEL_PRIMARY,
-COHERE_TIMEOUT_S,   # noqa: F401
-USE_OPEN_FALLBACKS  # noqa: F401
 )
 # Try to import optional HIPAA flags; fall back to safe defaults if not defined.
 try:
-from settings import PHI_MODE, PERSIST_HISTORY, HISTORY_TTL_DAYS, REDACT_BEFORE_LLM, ALLOW_EXTERNAL_PHI
 except Exception:
-PHI_MODE = False
-PERSIST_HISTORY = True
-HISTORY_TTL_DAYS = 365
-REDACT_BEFORE_LLM = False
-ALLOW_EXTERNAL_PHI = True
 from audit_log import log_event
 from privacy import safety_filter, refusal_reply
 from llm_router import cohere_chat, _co_client, cohere_embed
 # ---------------------- Helpers (analysis logic unchanged) ----------------------
 def load_markdown_text(filepath: str) -> str:
-try:
-with open(filepath, "r", encoding="utf-8") as f:
-return f.read()
-except FileNotFoundError:
-return f"Error: Document {os.path.basename(filepath)} not found."
 def _sanitize_text(s: str) -> str:
-if not isinstance(s, str):
-return s
-# Remove control characters (except newline and tab)
-return re2.sub(r"[\p{C}--[\n\t]]+", "", s)
-Conservative PHI redaction patterns (only applied if PHI_MODE & REDACT_BEFORE_LLM are enabled)
 PHI_PATTERNS = [
-(re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED_SSN]"),
-(re.compile(r"\b\d{9}\b"), "[REDACTED_MRN]"),
-(re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), "[REDACTED_PHONE]"),
-(re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+.[A-Za-z]{2,}"), "[REDACTED_EMAIL]"),
-(re.compile(r"\b(19|20)\d{2}-\d{2}-\d{2}\b"), "[REDACTED_DOB]"),
-(re.compile(r"\b\d{2}/\d{2}/(19|20)\d{2}\b"), "[REDACTED_DOB]"),
-(re.compile(r"\b\d{5}(-\d{4})?\b"), "[REDACTED_ZIP]"),
 ]
 def redact_phi(text: str) -> str:
-if not isinstance(text, str):
-return text
-t = text
-for pat, repl in PHI_PATTERNS:
-t = pat.sub(repl, t)
-return t
 def safe_log(event_name: str, meta: dict | None = None):
-# Avoid logging raw PHI or payloads
-try:
-meta = (meta or {}).copy()
-meta.pop("raw", None)
-log_event(event_name, None, meta)
-except Exception:
-# Never raise from logging
-pass
 def _create_python_script(user_scenario: str, schema_context: str) -> str:
-EXPERT_ANALYTICAL_GUIDELINES = """
 --- EXPERT ANALYTICAL GUIDELINES ---
 When writing your script, you MUST follow these expert business rules:
-Linking Datasets Rule: If you need to connect facilities to health zones when the 'zone' column is not in the facility list,
-you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
-and then assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
-Prioritization Rule: To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
-to create a multi-factor risk score.
-Capacity Calculation Rule: For capacity over a 3-month window, assume 60 working days.
-Cost Calculation Rule: Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
 """
-prompt_for_coder = f"""
 You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
-You have dataframes in a list dfs.
 {EXPERT_ANALYTICAL_GUIDELINES}
 --- DATA SCHEMA ---
 {schema_context}
 --- END DATA SCHEMA ---
 CRITICAL RULES:
-DO NOT READ FILES: You MUST NOT include pd.read_csv. The data is ALREADY loaded in the dfs variable. You MUST use this variable. Failure to do so will cause a fatal error.
-JSON OUTPUT ONLY: Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
-BE PRECISE: Use the exact, case-sensitive column names from the schema and robustly clean strings (re.sub()) before converting to numbers.
-JSON SERIALIZATION: Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like int64) to standard Python types using .item() for single values or .tolist() for lists.
 --- USER'S SCENARIO ---
 {user_scenario}
 --- PYTHON SCRIPT ---
 Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
-code
-Python
 """
     generated_text = cohere_chat(prompt_for_coder)
     match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
@@ -225,8 +238,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
                 )
             yield_update("""```
-✍️ Synthesizing final comprehensive report...
-```""")
             writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
             final_report = _generate_final_report(writer_input, raw_data_output)
             return _sanitize_text(final_report)
@@ -328,7 +340,7 @@ function rs_toggle_stt(elemId){
   __rs_rec.onresult = (ev) => {
     let t = "";
     for (let i = ev.resultIndex; i < ev.results.length; i++){
-      t += ev.results[i][0].transcript;
     }
     box.value = (base + " " + t).trim();
     box.dispatchEvent(new Event("input", { bubbles: true }));
@@ -469,7 +481,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=SLEEK_CSS, fill_width=True) as demo:
         if not selection or not history_state_list:
             return ""
         try:
-            selected_id = selection.split(" - ", 1)[0]
         except Exception:
             selected_id = selection

 # - Triple-quoted progress strings (no unterminated literals)
 # - Sleek full-width UI and Voice-to-Text (browser Web Speech API)
 # - Optional HIPAA flags (fallback defaults if not present in settings.py)
+from __future__ import annotations
 import io
 import json
 import os
 from contextlib import redirect_stdout
 from datetime import datetime
 from typing import Any, Dict, List
 import gradio as gr
 import pandas as pd
 import regex as re2
 import re
 from langchain_cohere import ChatCohere  # noqa: F401
 from settings import (
+    GENERAL_CONVERSATION_PROMPT,
+    COHERE_MODEL_PRIMARY,
+    COHERE_TIMEOUT_S,   # noqa: F401
+    USE_OPEN_FALLBACKS  # noqa: F401
 )
 # Try to import optional HIPAA flags; fall back to safe defaults if not defined.
 try:
+    from settings import PHI_MODE, PERSIST_HISTORY, HISTORY_TTL_DAYS, REDACT_BEFORE_LLM, ALLOW_EXTERNAL_PHI
 except Exception:
+    PHI_MODE = False
+    PERSIST_HISTORY = True
+    HISTORY_TTL_DAYS = 365
+    REDACT_BEFORE_LLM = False
+    ALLOW_EXTERNAL_PHI = True
 from audit_log import log_event
 from privacy import safety_filter, refusal_reply
 from llm_router import cohere_chat, _co_client, cohere_embed
 # ---------------------- Helpers (analysis logic unchanged) ----------------------
 def load_markdown_text(filepath: str) -> str:
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            return f.read()
+    except FileNotFoundError:
+        return f"**Error:** Document `{os.path.basename(filepath)}` not found."
 def _sanitize_text(s: str) -> str:
+    if not isinstance(s, str):
+        return s
+    # Remove control characters (except newline and tab)
+    return re2.sub(r"[\p{C}--[\n\t]]+", "", s)
+# Conservative PHI redaction patterns (only applied if PHI_MODE & REDACT_BEFORE_LLM are enabled)
 PHI_PATTERNS = [
+    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED_SSN]"),
+    (re.compile(r"\b\d{9}\b"), "[REDACTED_MRN]"),
+    (re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), "[REDACTED_PHONE]"),
+    (re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), "[REDACTED_EMAIL]"),
+    (re.compile(r"\b(19|20)\d{2}-\d{2}-\d{2}\b"), "[REDACTED_DOB]"),
+    (re.compile(r"\b\d{2}/\d{2}/(19|20)\d{2}\b"), "[REDACTED_DOB]"),
+    (re.compile(r"\b\d{5}(-\d{4})?\b"), "[REDACTED_ZIP]"),
 ]
 def redact_phi(text: str) -> str:
+    if not isinstance(text, str):
+        return text
+    t = text
+    for pat, repl in PHI_PATTERNS:
+        t = pat.sub(repl, t)
+    return t
 def safe_log(event_name: str, meta: dict | None = None):
+    # Avoid logging raw PHI or payloads
+    try:
+        meta = (meta or {}).copy()
+        meta.pop("raw", None)
+        log_event(event_name, None, meta)
+    except Exception:
+        # Never raise from logging
+        pass
 def _create_python_script(user_scenario: str, schema_context: str) -> str:
+    EXPERT_ANALYTICAL_GUIDELINES = """
 --- EXPERT ANALYTICAL GUIDELINES ---
 When writing your script, you MUST follow these expert business rules:
+1.  **Linking Datasets Rule:** If you need to connect facilities to health zones when the 'zone' column is not in the facility list,
+    you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
+    and *then* assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
+2.  **Prioritization Rule:** To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
+    to create a multi-factor risk score.
+3.  **Capacity Calculation Rule:** For capacity over a 3-month window, assume **60 working days**.
+4.  **Cost Calculation Rule:** Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
 """
+    prompt_for_coder = f"""\
 You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
+You have dataframes in a list `dfs`.
 {EXPERT_ANALYTICAL_GUIDELINES}
 --- DATA SCHEMA ---
 {schema_context}
 --- END DATA SCHEMA ---
 CRITICAL RULES:
+1.  **DO NOT READ FILES:** You MUST NOT include `pd.read_csv`. The data is ALREADY loaded in the `dfs` variable. You MUST use this variable. Failure to do so will cause a fatal error.
+2.  **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
+3.  **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
+4.  **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `.item()` for single values or `.tolist()` for lists.
 --- USER'S SCENARIO ---
 {user_scenario}
 --- PYTHON SCRIPT ---
 Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
+```python
 """
     generated_text = cohere_chat(prompt_for_coder)
     match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
                 )
             yield_update("""```
+✍️ Synthesizing final comprehensive report...```""")
             writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
             final_report = _generate_final_report(writer_input, raw_data_output)
             return _sanitize_text(final_report)
   __rs_rec.onresult = (ev) => {
     let t = "";
     for (let i = ev.resultIndex; i < ev.results.length; i++){
+      t += ev.results[i].transcript;
     }
     box.value = (base + " " + t).trim();
     box.dispatchEvent(new Event("input", { bubbles: true }));
         if not selection or not history_state_list:
             return ""
         try:
+            selected_id = selection.split(" - ", 1)
         except Exception:
             selected_id = selection