VEDAGI1 committed on
Commit
d006a6e
·
verified ·
1 Parent(s): a7e0072

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -62
app.py CHANGED
@@ -5,7 +5,8 @@
5
  # - Triple-quoted progress strings (no unterminated literals)
6
  # - Sleek full-width UI and Voice-to-Text (browser Web Speech API)
7
  # - Optional HIPAA flags (fallback defaults if not present in settings.py)
8
- from future import annotations
 
9
  import io
10
  import json
11
  import os
@@ -13,97 +14,109 @@ import traceback
13
  from contextlib import redirect_stdout
14
  from datetime import datetime
15
  from typing import Any, Dict, List
 
16
  import gradio as gr
17
  import pandas as pd
18
  import regex as re2
19
  import re
20
  from langchain_cohere import ChatCohere # noqa: F401
21
  from settings import (
22
- GENERAL_CONVERSATION_PROMPT,
23
- COHERE_MODEL_PRIMARY,
24
- COHERE_TIMEOUT_S, # noqa: F401
25
- USE_OPEN_FALLBACKS # noqa: F401
26
  )
27
  # Try to import optional HIPAA flags; fall back to safe defaults if not defined.
28
  try:
29
- from settings import PHI_MODE, PERSIST_HISTORY, HISTORY_TTL_DAYS, REDACT_BEFORE_LLM, ALLOW_EXTERNAL_PHI
30
  except Exception:
31
- PHI_MODE = False
32
- PERSIST_HISTORY = True
33
- HISTORY_TTL_DAYS = 365
34
- REDACT_BEFORE_LLM = False
35
- ALLOW_EXTERNAL_PHI = True
 
36
  from audit_log import log_event
37
  from privacy import safety_filter, refusal_reply
38
  from llm_router import cohere_chat, _co_client, cohere_embed
 
39
  # ---------------------- Helpers (analysis logic unchanged) ----------------------
40
  def load_markdown_text(filepath: str) -> str:
41
- try:
42
- with open(filepath, "r", encoding="utf-8") as f:
43
- return f.read()
44
- except FileNotFoundError:
45
- return f"Error: Document {os.path.basename(filepath)} not found."
 
46
  def _sanitize_text(s: str) -> str:
47
- if not isinstance(s, str):
48
- return s
49
- # Remove control characters (except newline and tab)
50
- return re2.sub(r"[\p{C}--[\n\t]]+", "", s)
51
- Conservative PHI redaction patterns (only applied if PHI_MODE & REDACT_BEFORE_LLM are enabled)
 
52
  PHI_PATTERNS = [
53
- (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED_SSN]"),
54
- (re.compile(r"\b\d{9}\b"), "[REDACTED_MRN]"),
55
- (re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), "[REDACTED_PHONE]"),
56
- (re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+.[A-Za-z]{2,}"), "[REDACTED_EMAIL]"),
57
- (re.compile(r"\b(19|20)\d{2}-\d{2}-\d{2}\b"), "[REDACTED_DOB]"),
58
- (re.compile(r"\b\d{2}/\d{2}/(19|20)\d{2}\b"), "[REDACTED_DOB]"),
59
- (re.compile(r"\b\d{5}(-\d{4})?\b"), "[REDACTED_ZIP]"),
60
  ]
 
61
  def redact_phi(text: str) -> str:
62
- if not isinstance(text, str):
63
- return text
64
- t = text
65
- for pat, repl in PHI_PATTERNS:
66
- t = pat.sub(repl, t)
67
- return t
 
68
  def safe_log(event_name: str, meta: dict | None = None):
69
- # Avoid logging raw PHI or payloads
70
- try:
71
- meta = (meta or {}).copy()
72
- meta.pop("raw", None)
73
- log_event(event_name, None, meta)
74
- except Exception:
75
- # Never raise from logging
76
- pass
 
77
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
78
- EXPERT_ANALYTICAL_GUIDELINES = """
79
  --- EXPERT ANALYTICAL GUIDELINES ---
80
  When writing your script, you MUST follow these expert business rules:
81
- Linking Datasets Rule: If you need to connect facilities to health zones when the 'zone' column is not in the facility list,
82
- you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
83
- and then assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
84
- Prioritization Rule: To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
85
- to create a multi-factor risk score.
86
- Capacity Calculation Rule: For capacity over a 3-month window, assume 60 working days.
87
- Cost Calculation Rule: Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
88
  """
89
- prompt_for_coder = f"""
90
  You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
91
- You have dataframes in a list dfs.
 
92
  {EXPERT_ANALYTICAL_GUIDELINES}
 
93
  --- DATA SCHEMA ---
94
  {schema_context}
95
  --- END DATA SCHEMA ---
 
96
  CRITICAL RULES:
97
- DO NOT READ FILES: You MUST NOT include pd.read_csv. The data is ALREADY loaded in the dfs variable. You MUST use this variable. Failure to do so will cause a fatal error.
98
- JSON OUTPUT ONLY: Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
99
- BE PRECISE: Use the exact, case-sensitive column names from the schema and robustly clean strings (re.sub()) before converting to numbers.
100
- JSON SERIALIZATION: Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like int64) to standard Python types using .item() for single values or .tolist() for lists.
 
101
  --- USER'S SCENARIO ---
102
  {user_scenario}
 
103
  --- PYTHON SCRIPT ---
104
  Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
105
- code
106
- Python
107
  """
108
  generated_text = cohere_chat(prompt_for_coder)
109
  match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
@@ -225,8 +238,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
225
  )
226
 
227
  yield_update("""```
228
- ✍️ Synthesizing final comprehensive report...
229
- ```""")
230
  writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
231
  final_report = _generate_final_report(writer_input, raw_data_output)
232
  return _sanitize_text(final_report)
@@ -328,7 +340,7 @@ function rs_toggle_stt(elemId){
328
  __rs_rec.onresult = (ev) => {
329
  let t = "";
330
  for (let i = ev.resultIndex; i < ev.results.length; i++){
331
- t += ev.results[i][0].transcript;
332
  }
333
  box.value = (base + " " + t).trim();
334
  box.dispatchEvent(new Event("input", { bubbles: true }));
@@ -469,7 +481,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=SLEEK_CSS, fill_width=True) as demo:
469
  if not selection or not history_state_list:
470
  return ""
471
  try:
472
- selected_id = selection.split(" - ", 1)[0]
473
  except Exception:
474
  selected_id = selection
475
 
 
5
  # - Triple-quoted progress strings (no unterminated literals)
6
  # - Sleek full-width UI and Voice-to-Text (browser Web Speech API)
7
  # - Optional HIPAA flags (fallback defaults if not present in settings.py)
8
+ from __future__ import annotations
9
+
10
  import io
11
  import json
12
  import os
 
14
  from contextlib import redirect_stdout
15
  from datetime import datetime
16
  from typing import Any, Dict, List
17
+
18
  import gradio as gr
19
  import pandas as pd
20
  import regex as re2
21
  import re
22
  from langchain_cohere import ChatCohere # noqa: F401
23
  from settings import (
24
+ GENERAL_CONVERSATION_PROMPT,
25
+ COHERE_MODEL_PRIMARY,
26
+ COHERE_TIMEOUT_S, # noqa: F401
27
+ USE_OPEN_FALLBACKS # noqa: F401
28
  )
29
  # Try to import optional HIPAA flags; fall back to safe defaults if not defined.
30
  try:
31
+ from settings import PHI_MODE, PERSIST_HISTORY, HISTORY_TTL_DAYS, REDACT_BEFORE_LLM, ALLOW_EXTERNAL_PHI
32
  except Exception:
33
+ PHI_MODE = False
34
+ PERSIST_HISTORY = True
35
+ HISTORY_TTL_DAYS = 365
36
+ REDACT_BEFORE_LLM = False
37
+ ALLOW_EXTERNAL_PHI = True
38
+
39
  from audit_log import log_event
40
  from privacy import safety_filter, refusal_reply
41
  from llm_router import cohere_chat, _co_client, cohere_embed
42
+
43
  # ---------------------- Helpers (analysis logic unchanged) ----------------------
44
  def load_markdown_text(filepath: str) -> str:
45
+ try:
46
+ with open(filepath, "r", encoding="utf-8") as f:
47
+ return f.read()
48
+ except FileNotFoundError:
49
+ return f"**Error:** Document `{os.path.basename(filepath)}` not found."
50
+
51
  def _sanitize_text(s: str) -> str:
52
+ if not isinstance(s, str):
53
+ return s
54
+ # Remove control characters (except newline and tab)
55
+ return re2.sub(r"[\p{C}--[\n\t]]+", "", s)
56
+
57
+ # Conservative PHI redaction patterns (only applied if PHI_MODE & REDACT_BEFORE_LLM are enabled)
58
  PHI_PATTERNS = [
59
+ (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED_SSN]"),
60
+ (re.compile(r"\b\d{9}\b"), "[REDACTED_MRN]"),
61
+ (re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"), "[REDACTED_PHONE]"),
62
+ (re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"), "[REDACTED_EMAIL]"),
63
+ (re.compile(r"\b(19|20)\d{2}-\d{2}-\d{2}\b"), "[REDACTED_DOB]"),
64
+ (re.compile(r"\b\d{2}/\d{2}/(19|20)\d{2}\b"), "[REDACTED_DOB]"),
65
+ (re.compile(r"\b\d{5}(-\d{4})?\b"), "[REDACTED_ZIP]"),
66
  ]
67
+
68
  def redact_phi(text: str) -> str:
69
+ if not isinstance(text, str):
70
+ return text
71
+ t = text
72
+ for pat, repl in PHI_PATTERNS:
73
+ t = pat.sub(repl, t)
74
+ return t
75
+
76
  def safe_log(event_name: str, meta: dict | None = None):
77
+ # Avoid logging raw PHI or payloads
78
+ try:
79
+ meta = (meta or {}).copy()
80
+ meta.pop("raw", None)
81
+ log_event(event_name, None, meta)
82
+ except Exception:
83
+ # Never raise from logging
84
+ pass
85
+
86
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
87
+ EXPERT_ANALYTICAL_GUIDELINES = """
88
  --- EXPERT ANALYTICAL GUIDELINES ---
89
  When writing your script, you MUST follow these expert business rules:
90
+ 1. **Linking Datasets Rule:** If you need to connect facilities to health zones when the 'zone' column is not in the facility list,
91
+ you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
92
+ and *then* assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
93
+ 2. **Prioritization Rule:** To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
94
+ to create a multi-factor risk score.
95
+ 3. **Capacity Calculation Rule:** For capacity over a 3-month window, assume **60 working days**.
96
+ 4. **Cost Calculation Rule:** Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
97
  """
98
+ prompt_for_coder = f"""\
99
  You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
100
+ You have dataframes in a list `dfs`.
101
+
102
  {EXPERT_ANALYTICAL_GUIDELINES}
103
+
104
  --- DATA SCHEMA ---
105
  {schema_context}
106
  --- END DATA SCHEMA ---
107
+
108
  CRITICAL RULES:
109
+ 1. **DO NOT READ FILES:** You MUST NOT include `pd.read_csv`. The data is ALREADY loaded in the `dfs` variable. You MUST use this variable. Failure to do so will cause a fatal error.
110
+ 2. **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
111
+ 3. **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
112
+ 4. **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `.item()` for single values or `.tolist()` for lists.
113
+
114
  --- USER'S SCENARIO ---
115
  {user_scenario}
116
+
117
  --- PYTHON SCRIPT ---
118
  Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
119
+ ```python
 
120
  """
121
  generated_text = cohere_chat(prompt_for_coder)
122
  match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
 
238
  )
239
 
240
  yield_update("""```
241
+ ✍️ Synthesizing final comprehensive report...```""")
 
242
  writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
243
  final_report = _generate_final_report(writer_input, raw_data_output)
244
  return _sanitize_text(final_report)
 
340
  __rs_rec.onresult = (ev) => {
341
  let t = "";
342
  for (let i = ev.resultIndex; i < ev.results.length; i++){
343
+ t += ev.results[i].transcript;
344
  }
345
  box.value = (base + " " + t).trim();
346
  box.dispatchEvent(new Event("input", { bubbles: true }));
 
481
  if not selection or not history_state_list:
482
  return ""
483
  try:
484
+ selected_id = selection.split(" - ", 1)
485
  except Exception:
486
  selected_id = selection
487