Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,8 @@
|
|
| 5 |
# - Triple-quoted progress strings (no unterminated literals)
|
| 6 |
# - Sleek full-width UI and Voice-to-Text (browser Web Speech API)
|
| 7 |
# - Optional HIPAA flags (fallback defaults if not present in settings.py)
|
| 8 |
-
from
|
|
|
|
| 9 |
import io
|
| 10 |
import json
|
| 11 |
import os
|
|
@@ -13,97 +14,109 @@ import traceback
|
|
| 13 |
from contextlib import redirect_stdout
|
| 14 |
from datetime import datetime
|
| 15 |
from typing import Any, Dict, List
|
|
|
|
| 16 |
import gradio as gr
|
| 17 |
import pandas as pd
|
| 18 |
import regex as re2
|
| 19 |
import re
|
| 20 |
from langchain_cohere import ChatCohere # noqa: F401
|
| 21 |
from settings import (
|
| 22 |
-
GENERAL_CONVERSATION_PROMPT,
|
| 23 |
-
COHERE_MODEL_PRIMARY,
|
| 24 |
-
COHERE_TIMEOUT_S, # noqa: F401
|
| 25 |
-
USE_OPEN_FALLBACKS # noqa: F401
|
| 26 |
)
|
| 27 |
# Try to import optional HIPAA flags; fall back to safe defaults if not defined.
|
| 28 |
try:
|
| 29 |
-
from settings import PHI_MODE, PERSIST_HISTORY, HISTORY_TTL_DAYS, REDACT_BEFORE_LLM, ALLOW_EXTERNAL_PHI
|
| 30 |
except Exception:
|
| 31 |
-
PHI_MODE = False
|
| 32 |
-
PERSIST_HISTORY = True
|
| 33 |
-
HISTORY_TTL_DAYS = 365
|
| 34 |
-
REDACT_BEFORE_LLM = False
|
| 35 |
-
ALLOW_EXTERNAL_PHI = True
|
|
|
|
| 36 |
from audit_log import log_event
|
| 37 |
from privacy import safety_filter, refusal_reply
|
| 38 |
from llm_router import cohere_chat, _co_client, cohere_embed
|
|
|
|
| 39 |
# ---------------------- Helpers (analysis logic unchanged) ----------------------
|
| 40 |
def load_markdown_text(filepath: str) -> str:
    """Return the UTF-8 text of the file at *filepath*.

    Args:
        filepath: Path of the document to read.

    Returns:
        The file contents, or a Markdown-formatted error message naming the
        file (base name only) when the file does not exist.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        # Only the base name is reported so the message does not expose the
        # directory layout. The literal was previously cut off after
        # `f"Error`, which left an unterminated string.
        return f"**Error:** Document `{os.path.basename(filepath)}` not found."
|
|
|
|
| 46 |
def _sanitize_text(s: str) -> str:
|
| 47 |
-
if not isinstance(s, str):
|
| 48 |
-
return s
|
| 49 |
-
# Remove control characters (except newline and tab)
|
| 50 |
-
return re2.sub(r"[\p{C}--[\n\t]]+", "", s)
|
| 51 |
-
|
|
|
|
| 52 |
# Ordered (compiled pattern, replacement label) pairs used for PHI redaction.
# The SSN rule is listed before the broader digit-run rules that follow it.
PHI_PATTERNS = [
    (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED_SSN]"),
    (re.compile(r"\b\d{9}\b"), "[REDACTED_MRN]"),
    (re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), "[REDACTED_PHONE]"),
    # This entry was cut off mid-pattern (unterminated raw string and no
    # replacement label); the full e-mail rule is restored here.
    (re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), "[REDACTED_EMAIL]"),
    (re.compile(r"\b(19|20)\d{2}-\d{2}-\d{2}\b"), "[REDACTED_DOB]"),
    (re.compile(r"\b\d{2}/\d{2}/(19|20)\d{2}\b"), "[REDACTED_DOB]"),
    (re.compile(r"\b\d{5}(-\d{4})?\b"), "[REDACTED_ZIP]"),
]
|
|
|
|
| 61 |
def redact_phi(text: str) -> str:
    """Replace PHI-like substrings of *text* with placeholder labels.

    Each ``(pattern, replacement)`` pair in ``PHI_PATTERNS`` is applied in
    list order, every rule operating on the output of the previous one.

    Args:
        text: Text to redact. Non-string values are returned unchanged.

    Returns:
        The redacted string, or *text* itself when it is not a ``str``.
    """
    if isinstance(text, str):
        redacted = text
        for pattern, placeholder in PHI_PATTERNS:
            redacted = pattern.sub(placeholder, redacted)
        return redacted
    return text
|
|
|
|
| 68 |
def safe_log(event_name: str, meta: dict | None = None):
|
| 69 |
-
# Avoid logging raw PHI or payloads
|
| 70 |
-
try:
|
| 71 |
-
meta = (meta or {}).copy()
|
| 72 |
-
meta.pop("raw", None)
|
| 73 |
-
log_event(event_name, None, meta)
|
| 74 |
-
except Exception:
|
| 75 |
-
# Never raise from logging
|
| 76 |
-
pass
|
|
|
|
| 77 |
def _create_python_script(user_scenario: str, schema_context: str) -> str:
|
| 78 |
-
EXPERT_ANALYTICAL_GUIDELINES = """
|
| 79 |
--- EXPERT ANALYTICAL GUIDELINES ---
|
| 80 |
When writing your script, you MUST follow these expert business rules:
|
| 81 |
-
Linking Datasets Rule
|
| 82 |
-
you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
|
| 83 |
-
and then assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
|
| 84 |
-
Prioritization Rule
|
| 85 |
-
to create a multi-factor risk score.
|
| 86 |
-
Capacity Calculation Rule
|
| 87 |
-
Cost Calculation Rule
|
| 88 |
"""
|
| 89 |
-
prompt_for_coder = f"""
|
| 90 |
You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
|
| 91 |
-
You have dataframes in a list dfs
|
|
|
|
| 92 |
{EXPERT_ANALYTICAL_GUIDELINES}
|
|
|
|
| 93 |
--- DATA SCHEMA ---
|
| 94 |
{schema_context}
|
| 95 |
--- END DATA SCHEMA ---
|
|
|
|
| 96 |
CRITICAL RULES:
|
| 97 |
-
DO NOT READ FILES
|
| 98 |
-
JSON OUTPUT ONLY
|
| 99 |
-
BE PRECISE
|
| 100 |
-
JSON SERIALIZATION
|
|
|
|
| 101 |
--- USER'S SCENARIO ---
|
| 102 |
{user_scenario}
|
|
|
|
| 103 |
--- PYTHON SCRIPT ---
|
| 104 |
Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
|
| 105 |
-
|
| 106 |
-
Python
|
| 107 |
"""
|
| 108 |
generated_text = cohere_chat(prompt_for_coder)
|
| 109 |
match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
|
|
@@ -225,8 +238,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 225 |
)
|
| 226 |
|
| 227 |
yield_update("""```
|
| 228 |
-
✍️ Synthesizing final comprehensive report
|
| 229 |
-
```""")
|
| 230 |
writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 231 |
final_report = _generate_final_report(writer_input, raw_data_output)
|
| 232 |
return _sanitize_text(final_report)
|
|
@@ -328,7 +340,7 @@ function rs_toggle_stt(elemId){
|
|
| 328 |
__rs_rec.onresult = (ev) => {
|
| 329 |
let t = "";
|
| 330 |
for (let i = ev.resultIndex; i < ev.results.length; i++){
|
| 331 |
-
t += ev.results[i]
|
| 332 |
}
|
| 333 |
box.value = (base + " " + t).trim();
|
| 334 |
box.dispatchEvent(new Event("input", { bubbles: true }));
|
|
@@ -469,7 +481,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=SLEEK_CSS, fill_width=True) as demo:
|
|
| 469 |
if not selection or not history_state_list:
|
| 470 |
return ""
|
| 471 |
try:
|
| 472 |
-
selected_id = selection.split(" - ", 1)
|
| 473 |
except Exception:
|
| 474 |
selected_id = selection
|
| 475 |
|
|
|
|
| 5 |
# - Triple-quoted progress strings (no unterminated literals)
|
| 6 |
# - Sleek full-width UI and Voice-to-Text (browser Web Speech API)
|
| 7 |
# - Optional HIPAA flags (fallback defaults if not present in settings.py)
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
import io
|
| 11 |
import json
|
| 12 |
import os
|
|
|
|
| 14 |
from contextlib import redirect_stdout
|
| 15 |
from datetime import datetime
|
| 16 |
from typing import Any, Dict, List
|
| 17 |
+
|
| 18 |
import gradio as gr
|
| 19 |
import pandas as pd
|
| 20 |
import regex as re2
|
| 21 |
import re
|
| 22 |
from langchain_cohere import ChatCohere # noqa: F401
|
| 23 |
from settings import (
|
| 24 |
+
GENERAL_CONVERSATION_PROMPT,
|
| 25 |
+
COHERE_MODEL_PRIMARY,
|
| 26 |
+
COHERE_TIMEOUT_S, # noqa: F401
|
| 27 |
+
USE_OPEN_FALLBACKS # noqa: F401
|
| 28 |
)
|
| 29 |
# Try to import optional HIPAA flags; fall back to safe defaults if not defined.
|
| 30 |
try:
|
| 31 |
+
from settings import PHI_MODE, PERSIST_HISTORY, HISTORY_TTL_DAYS, REDACT_BEFORE_LLM, ALLOW_EXTERNAL_PHI
|
| 32 |
except Exception:
|
| 33 |
+
PHI_MODE = False
|
| 34 |
+
PERSIST_HISTORY = True
|
| 35 |
+
HISTORY_TTL_DAYS = 365
|
| 36 |
+
REDACT_BEFORE_LLM = False
|
| 37 |
+
ALLOW_EXTERNAL_PHI = True
|
| 38 |
+
|
| 39 |
from audit_log import log_event
|
| 40 |
from privacy import safety_filter, refusal_reply
|
| 41 |
from llm_router import cohere_chat, _co_client, cohere_embed
|
| 42 |
+
|
| 43 |
# ---------------------- Helpers (analysis logic unchanged) ----------------------
|
| 44 |
def load_markdown_text(filepath: str) -> str:
    """Read a UTF-8 text document and return its contents.

    Args:
        filepath: Path of the document to read.

    Returns:
        The file contents, or a Markdown-formatted error message naming the
        file (base name only) when the file does not exist.
    """
    try:
        with open(filepath, encoding="utf-8") as source:
            contents = source.read()
    except FileNotFoundError:
        # Report only the base name, not the full path.
        doc_name = os.path.basename(filepath)
        return f"**Error:** Document `{doc_name}` not found."
    return contents
|
| 50 |
+
|
| 51 |
def _sanitize_text(s: str) -> str:
|
| 52 |
+
if not isinstance(s, str):
|
| 53 |
+
return s
|
| 54 |
+
# Remove control characters (except newline and tab)
|
| 55 |
+
return re2.sub(r"[\p{C}--[\n\t]]+", "", s)
|
| 56 |
+
|
| 57 |
+
# Conservative PHI redaction rules, applied only when PHI_MODE and
# REDACT_BEFORE_LLM are enabled. Each entry pairs a compiled pattern with the
# placeholder that replaces its matches; list order is the order of application.
PHI_PATTERNS = [
    (re.compile(expression), placeholder)
    for expression, placeholder in (
        (r"\b\d{3}-\d{2}-\d{4}\b", "[REDACTED_SSN]"),
        (r"\b\d{9}\b", "[REDACTED_MRN]"),
        (r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b", "[REDACTED_PHONE]"),
        (r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "[REDACTED_EMAIL]"),
        (r"\b(19|20)\d{2}-\d{2}-\d{2}\b", "[REDACTED_DOB]"),
        (r"\b\d{2}/\d{2}/(19|20)\d{2}\b", "[REDACTED_DOB]"),
        (r"\b\d{5}(-\d{4})?\b", "[REDACTED_ZIP]"),
    )
]
|
| 67 |
+
|
| 68 |
def redact_phi(text: str) -> str:
    """Replace PHI-like substrings of *text* with placeholder labels.

    Each ``(pattern, replacement)`` pair in ``PHI_PATTERNS`` is applied in
    list order, every rule operating on the output of the previous one.

    Args:
        text: Text to redact. Non-string values are returned unchanged.

    Returns:
        The redacted string, or *text* itself when it is not a ``str``.
    """
    if isinstance(text, str):
        redacted = text
        for pattern, placeholder in PHI_PATTERNS:
            redacted = pattern.sub(placeholder, redacted)
        return redacted
    return text
|
| 75 |
+
|
| 76 |
def safe_log(event_name: str, meta: dict | None = None):
|
| 77 |
+
# Avoid logging raw PHI or payloads
|
| 78 |
+
try:
|
| 79 |
+
meta = (meta or {}).copy()
|
| 80 |
+
meta.pop("raw", None)
|
| 81 |
+
log_event(event_name, None, meta)
|
| 82 |
+
except Exception:
|
| 83 |
+
# Never raise from logging
|
| 84 |
+
pass
|
| 85 |
+
|
| 86 |
def _create_python_script(user_scenario: str, schema_context: str) -> str:
|
| 87 |
+
EXPERT_ANALYTICAL_GUIDELINES = """
|
| 88 |
--- EXPERT ANALYTICAL GUIDELINES ---
|
| 89 |
When writing your script, you MUST follow these expert business rules:
|
| 90 |
+
1. **Linking Datasets Rule:** If you need to connect facilities to health zones when the 'zone' column is not in the facility list,
|
| 91 |
+
you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
|
| 92 |
+
and *then* assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
|
| 93 |
+
2. **Prioritization Rule:** To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
|
| 94 |
+
to create a multi-factor risk score.
|
| 95 |
+
3. **Capacity Calculation Rule:** For capacity over a 3-month window, assume **60 working days**.
|
| 96 |
+
4. **Cost Calculation Rule:** Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
|
| 97 |
"""
|
| 98 |
+
prompt_for_coder = f"""\
|
| 99 |
You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
|
| 100 |
+
You have dataframes in a list `dfs`.
|
| 101 |
+
|
| 102 |
{EXPERT_ANALYTICAL_GUIDELINES}
|
| 103 |
+
|
| 104 |
--- DATA SCHEMA ---
|
| 105 |
{schema_context}
|
| 106 |
--- END DATA SCHEMA ---
|
| 107 |
+
|
| 108 |
CRITICAL RULES:
|
| 109 |
+
1. **DO NOT READ FILES:** You MUST NOT include `pd.read_csv`. The data is ALREADY loaded in the `dfs` variable. You MUST use this variable. Failure to do so will cause a fatal error.
|
| 110 |
+
2. **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
|
| 111 |
+
3. **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
|
| 112 |
+
4. **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `.item()` for single values or `.tolist()` for lists.
|
| 113 |
+
|
| 114 |
--- USER'S SCENARIO ---
|
| 115 |
{user_scenario}
|
| 116 |
+
|
| 117 |
--- PYTHON SCRIPT ---
|
| 118 |
Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
|
| 119 |
+
```python
|
|
|
|
| 120 |
"""
|
| 121 |
generated_text = cohere_chat(prompt_for_coder)
|
| 122 |
match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
|
|
|
|
| 238 |
)
|
| 239 |
|
| 240 |
yield_update("""```
|
| 241 |
+
✍️ Synthesizing final comprehensive report...```""")
|
|
|
|
| 242 |
writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 243 |
final_report = _generate_final_report(writer_input, raw_data_output)
|
| 244 |
return _sanitize_text(final_report)
|
|
|
|
| 340 |
__rs_rec.onresult = (ev) => {
|
| 341 |
let t = "";
|
| 342 |
for (let i = ev.resultIndex; i < ev.results.length; i++){
|
| 343 |
+
t += ev.results[i].transcript;
|
| 344 |
}
|
| 345 |
box.value = (base + " " + t).trim();
|
| 346 |
box.dispatchEvent(new Event("input", { bubbles: true }));
|
|
|
|
| 481 |
if not selection or not history_state_list:
|
| 482 |
return ""
|
| 483 |
try:
|
| 484 |
+
selected_id = selection.split(" - ", 1)
|
| 485 |
except Exception:
|
| 486 |
selected_id = selection
|
| 487 |
|