Commit · 58db664
Parent(s): bf7e929
Adding HelpScout to UI

- .idea/vcs.xml +3 -1
- process_helpscout/README.md +339 -0
- process_helpscout/agents/README.md +310 -0
- process_helpscout/agents/__init__.py +0 -0
- process_helpscout/agents/base_agent.py +58 -0
- process_helpscout/agents/sentiment_analysis_agent.py +229 -0
- process_helpscout/agents/topic_extraction_agent.py +268 -0
- process_helpscout/config_files/processing_config.json +125 -0
- process_helpscout/config_files/topics.json +90 -0
- process_helpscout/data_fetcher.py +77 -0
- process_helpscout/fetch_and_export.py +183 -0
- process_helpscout/html_cleaner.py +169 -0
- process_helpscout/main.py +423 -0
- process_helpscout/snowflake_conn.py +106 -0
- process_helpscout/workflow/__init__.py +0 -0
- process_helpscout/workflow/conversation_processor.py +334 -0
- visualization/README.md +279 -140
- visualization/agents/helpscout_summary_agent.py +309 -0
- visualization/app.py +38 -10
- visualization/components/dashboard.py +55 -1
- visualization/components/helpscout_analysis.py +491 -0
- visualization/components/helpscout_dashboard.py +278 -0
- visualization/components/sentiment_analysis.py +38 -6
- visualization/config/viz_config.json +61 -1
- visualization/data/data_loader.py +25 -5
- visualization/data/helpscout_data_loader.py +382 -0
- visualization/utils/auth.py +0 -2
- visualization/utils/data_processor.py +46 -0
- visualization/utils/helpscout_pdf.py +471 -0
- visualization/utils/helpscout_utils.py +107 -0
- visualization/utils/pdf_exporter.py +80 -0
- visualization/visualizations/distribution_charts.py +131 -0
- visualization/visualizations/helpscout_charts.py +413 -0
.idea/vcs.xml
CHANGED
@@ -1,4 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="VcsDirectoryMappings" />
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
 </project>
process_helpscout/README.md
ADDED
@@ -0,0 +1,339 @@
# HelpScout Processing Pipeline

Extracts, cleans, and enriches customer support conversations from HelpScout.
The module has two distinct responsibilities:

1. **Data export** (`fetch_and_export.py`): fetches raw threads, cleans HTML, and exports CSVs for the Streamlit dashboard.
2. **AI processing pipeline** (`main.py`): fetches the same conversations, runs them through a two-step agentic workflow (sentiment + topic extraction), and writes enriched records to Snowflake.

---

## Folder Structure

```
process_helpscout/
│
├── main.py                          # Pipeline entry point (parallel processing)
├── data_fetcher.py                  # Fetches & aggregates conversations; deduplication check
├── fetch_and_export.py              # CSV export script (separate from the pipeline)
├── html_cleaner.py                  # HTML → clean plain text (shared by both workflows)
├── snowflake_conn.py                # Snowflake connection wrapper
│
├── agents/                          # LLM-based extraction agents
│   ├── README.md                    # Agent architecture docs (read this to extend)
│   ├── base_agent.py                # Abstract base class for all agents
│   ├── sentiment_analysis_agent.py  # Classifies sentiment polarity + emotions
│   └── topic_extraction_agent.py    # Assigns topic tags + billing flags
│
├── workflow/
│   └── conversation_processor.py    # LangGraph workflow: sentiment → topics → END
│
├── config_files/
│   ├── processing_config.json       # Agent models, batch settings, output table, sentiment categories
│   └── topics.json                  # HelpScout topic taxonomy (source of truth for topic extraction)
│
├── queries/
│   └── helpscout_conversations.sql  # SQL that fetches customer threads from Snowflake
│
├── sql/
│   └── create_features_table.sql    # DDL - run once before first pipeline execution
│
├── output/                          # Auto-created; holds CSV exports
│   ├── helpscout_threads.csv
│   └── helpscout_conversations.csv
│
└── visualization/                   # Streamlit dashboard (reads from CSV exports)
    ├── app.py
    ├── components/dashboard.py
    └── utils/data_processor.py
```

---

## Data Flow

### CSV Export (Dashboard)

```
Snowflake (STITCH.HELPSCOUT.CONVERSATION_THREADS)
   │  queries/helpscout_conversations.sql
   ▼
fetch_and_export.py
   │  process_threads()          → clean HTML, add word_count, date columns
   │  aggregate_conversations()  → one row per conversation_id
   ▼
output/helpscout_threads.csv        (one row per message thread)
output/helpscout_conversations.csv  (one row per conversation)
   │
   ▼
visualization/app.py → Streamlit dashboard
```

### AI Processing Pipeline

```
Snowflake (STITCH.HELPSCOUT.CONVERSATION_THREADS)
   │  Same SQL: customer threads only, Feb 17 2026+
   ▼
data_fetcher.fetch_conversations()
   │  Cleans HTML (html_cleaner.py)
   │  Aggregates to one row per conversation
   │  Checks HELPSCOUT_CONVERSATION_FEATURES for already-processed IDs
   ▼
main.py: splits into parallel batches
   │
   ├── Worker 1: ConversationProcessingWorkflow
   │     ├── Node 1: SentimentAnalysisAgent → polarity + emotions
   │     └── Node 2: TopicExtractionAgent   → topics + billing flags
   │
   ├── Worker 2: ... (same)
   └── Worker N: ... (same)
   │
   ▼
SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
```

---

## Setup

### 1. Environment variables

All credentials are read from the project root `.env` file (a sketch follows the table).

| Key | Description |
|-----|-------------|
| `SNOWFLAKE_USER` | Snowflake username |
| `SNOWFLAKE_PASSWORD` | Snowflake password |
| `SNOWFLAKE_ACCOUNT` | Snowflake account identifier |
| `SNOWFLAKE_ROLE` | Role with access to `STITCH`, `ESTUARY`, and `SOCIAL_MEDIA_DB` |
| `SNOWFLAKE_WAREHOUSE` | Compute warehouse |
| `OPENAI_API_KEY` | Required for the AI pipeline only |
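
A minimal `.env` sketch with placeholder values (every value below is illustrative; use your own account identifier, role, and warehouse):

```bash
# .env - project root (all values are placeholders)
SNOWFLAKE_USER=jane_doe
SNOWFLAKE_PASSWORD=********
SNOWFLAKE_ACCOUNT=xy12345.us-east-1
SNOWFLAKE_ROLE=ANALYTICS_ROLE
SNOWFLAKE_WAREHOUSE=COMPUTE_WH
OPENAI_API_KEY=sk-...
```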

### 2. Dependencies

All dependencies are in the project root `requirements.txt` (see the install command after this list):
- `snowflake-snowpark-python`
- `beautifulsoup4`
- `pandas`, `numpy`
- `langchain-openai`, `langgraph`
- `python-dotenv`
- `streamlit`, `plotly` (dashboard only)
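
To install them, a standard one-liner from the project root is enough:

```bash
pip install -r requirements.txt
```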

### 3. Create the output table (once)

Before running the pipeline for the first time, execute the DDL in Snowflake:

```sql
-- Run this in your Snowflake worksheet or via the Snowflake CLI
-- File: sql/create_features_table.sql
```

This creates `SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES` with a primary key on `CONVERSATION_ID`. The pipeline always appends; it never truncates the table.

---

## Usage

### Run the AI processing pipeline

```bash
cd process_helpscout

# Process all new conversations (parallel, recommended)
python main.py

# Limit to 100 conversations - useful for a first test run
python main.py --limit 100

# Sequential mode - single process, easier to read logs when debugging
python main.py --sequential

# Use a custom config file
python main.py --config /path/to/my_config.json
```

On every run the pipeline:
1. Fetches all conversations (from Feb 17 2026 to today)
2. Queries the output table for already-processed `CONVERSATION_ID`s
3. Skips those, so only new conversations are sent to the LLM (the deduplication lookup is sketched below)
4. Appends results to the Snowflake output table
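
A sketch of the step-2 deduplication lookup (the real query lives in `data_fetcher.py`; only the table name is taken from this README):

```sql
-- Fetch IDs that have already been processed, so they can be skipped
SELECT CONVERSATION_ID
FROM SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES;
```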

### Run the CSV export (dashboard data)

```bash
cd process_helpscout
python fetch_and_export.py
```

### Launch the Streamlit dashboard

```bash
cd process_helpscout
streamlit run visualization/app.py
```

---

## Output Table

**Table:** `SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES`

| Column | Type | Description |
|--------|------|-------------|
| `CONVERSATION_ID` | VARCHAR | HelpScout conversation ID (primary key) |
| `CUSTOMER_EMAIL` | VARCHAR | Customer email address |
| `CUSTOMER_FIRST` | VARCHAR | Customer first name |
| `CUSTOMER_LAST` | VARCHAR | Customer last name |
| `CUSTOMER_HS_ID` | NUMBER | HelpScout internal customer ID |
| `THREAD_COUNT` | NUMBER | Number of customer message threads |
| `FIRST_MESSAGE_AT` | TIMESTAMP_TZ | When the first customer message was sent |
| `LAST_MESSAGE_AT` | TIMESTAMP_TZ | When the last customer message was sent |
| `DURATION_HOURS` | FLOAT | Hours between first and last message |
| `STATUS` | VARCHAR | Last known HelpScout status |
| `STATE` | VARCHAR | Last known HelpScout state |
| `SOURCE_TYPE` | VARCHAR | e.g. `email`, `chat` |
| `SOURCE_VIA` | VARCHAR | e.g. `api`, `mailbox` |
| `COMBINED_TEXT` | TEXT | Raw aggregated customer messages |
| `CONVERSATION_TEXT_USED` | TEXT | Formatted + truncated text sent to the LLM |
| `SENTIMENT_POLARITY` | VARCHAR | `very_positive` / `positive` / `neutral` / `negative` / `very_negative` |
| `EMOTIONS` | VARCHAR | Comma-separated emotion values (NULL if none valid) |
| `SENTIMENT_CONFIDENCE` | VARCHAR | `high` / `medium` / `low` |
| `SENTIMENT_NOTES` | TEXT | 1-2 sentence LLM explanation of the sentiment |
| `TOPICS` | VARCHAR | Comma-separated topic IDs (multi-label) |
| `IS_REFUND_REQUEST` | BOOLEAN | Customer explicitly asked for a refund |
| `IS_CANCELLATION` | BOOLEAN | Customer explicitly wants to cancel |
| `IS_MEMBERSHIP` | BOOLEAN | Customer wants to join/rejoin and purchase membership |
| `TOPIC_CONFIDENCE` | VARCHAR | `high` / `medium` / `low` |
| `TOPIC_NOTES` | TEXT | 1-2 sentence LLM explanation of topics |
| `SUMMARY` | TEXT | 2-3 sentence neutral summary of the conversation |
| `PROCESSING_ERRORS` | TEXT | Semicolon-separated errors (NULL on full success) |
| `PROCESSED_AT` | TIMESTAMP_NTZ | When this record was written by the pipeline |
| `WORKFLOW_VERSION` | VARCHAR | Pipeline version for auditability |
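
An illustrative sanity-check query against this table (column names as documented above):

```sql
-- Sentiment mix of all processed conversations
SELECT SENTIMENT_POLARITY, COUNT(*) AS conversations
FROM SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
GROUP BY SENTIMENT_POLARITY
ORDER BY conversations DESC;
```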

---

## Configuration

All pipeline settings live in `config_files/processing_config.json`.

### Agent models

```json
"agents": {
  "sentiment_analysis": {
    "model": "gpt-4o-mini",
    "temperature": 0.2,
    "max_retries": 3
  },
  "topic_extraction": {
    "model": "gpt-4o-mini",
    "temperature": 0.2,
    "max_retries": 3
  }
}
```

Switch any agent to `gpt-4o` for higher accuracy (at higher cost) by changing the `"model"` value.

### Conversation length

```json
"processing": {
  "max_conversation_chars": 3000,
  "min_batch_size": 10,
  "max_batch_size": 50
}
```

`max_conversation_chars` controls how many characters of conversation text are sent to the LLM. Increasing this improves context for long conversations but raises token costs. The workflow formats messages as `[1] msg\n[2] msg…` and truncates at this limit.

### Output destination

```json
"output": {
  "database": "SOCIAL_MEDIA_DB",
  "schema": "ML_FEATURES",
  "table": "HELPSCOUT_CONVERSATION_FEATURES"
}
```

To write to a different table (e.g. a staging or test table), change these values and re-run the DDL in `sql/create_features_table.sql` for the new table name.

### Sentiment categories

The `sentiment_polarity` and `emotions` blocks in `processing_config.json` define the valid values for classification. Adding, removing, or renaming a category here is automatically reflected in both the LLM prompt and the output validation; no code changes are required. The sketch below shows the shape these blocks take.
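
A sketch of the expected shape (the `value`/`description` keys and the `soft_fail` flag match what `sentiment_analysis_agent.py` reads; the example entries are illustrative):

```json
"sentiment_polarity": {
  "categories": [
    { "value": "negative", "description": "..." }
  ]
},
"emotions": {
  "soft_fail": true,
  "categories": [
    { "value": "frustration", "description": "..." }
  ]
}
```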

### Topic taxonomy

Topic definitions live in `config_files/topics.json`. This file is the single source of truth: the `TopicExtractionAgent` builds its system prompt directly from it. To add a new topic:

1. Add an entry to the `"topics"` array with a unique `id`, `label`, and `description` (see the sketch after this list).
2. If the topic has boolean sub-flags (like billing), add a `"flags"` key, then update `topic_extraction_agent.py` to extract those flags.
3. Re-run the pipeline; the new topic will be available immediately.
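
A sketch of a new entry (the `id`, `label`, and `description` keys match step 1; the example topic itself is hypothetical):

```json
{
  "id": "practice_tools",
  "label": "Practice Tools",
  "description": "Questions about in-app practice features such as play-along or metronome tools."
}
```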

---

## SQL Query

**File:** `queries/helpscout_conversations.sql`

| Design decision | Detail |
|-----------------|--------|
| Date filter | `CREATED_AT >= '2026-02-17'` to current date |
| Team exclusion | Anti-join with `USORA_USERS WHERE access_level = 'team'`, so only customer messages reach the pipeline |
| Thread types | `TYPE IN ('customer', 'message')`; excludes notes, forwarded threads, system messages |
| JSON extraction | Snowflake semi-structured syntax: `COLUMN:field::VARCHAR` |

To change the date range, edit the `WHERE ct.CREATED_AT >= '...'` line in the SQL file. A simplified sketch of how these decisions combine follows.
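
Illustrative only: the join key, the schema of `USORA_USERS`, and the column names are assumptions; the authoritative query is the file above.

```sql
SELECT
    ct.CONVERSATION_ID,
    ct.CUSTOMER:email::VARCHAR AS customer_email   -- semi-structured extraction
FROM STITCH.HELPSCOUT.CONVERSATION_THREADS ct
LEFT JOIN USORA_USERS u                            -- schema and join key assumed
    ON u.email = ct.CUSTOMER:email::VARCHAR
   AND u.access_level = 'team'
WHERE ct.TYPE IN ('customer', 'message')
  AND ct.CREATED_AT >= '2026-02-17'
  AND u.email IS NULL;                             -- anti-join: drop team authors
```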

---

## HTML Cleaner

`html_cleaner.py` runs a four-stage pipeline on every message body:

| Stage | What it removes |
|-------|-----------------|
| `_remove_quoted_sections()` | `<blockquote>` tags and Gmail/Outlook/Yahoo quoted-reply CSS wrappers |
| `_remove_boilerplate()` | `<table>`, `<img>`, `<script>`, `<style>` tags and footer/unsubscribe blocks |
| `_extract_text()` | Extracts plain text while preserving line breaks |
| `_clean_text()` | Strips invisible Unicode, collapses whitespace, removes `>` quote lines, cuts off at "On … wrote:" markers |

To add a new boilerplate pattern, append a string to `footer_keywords` inside `_remove_boilerplate()`, or add a CSS class fragment to `_QUOTED_CLASS_PATTERNS` at the top of the file, as sketched below.
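
A sketch of both extension points (only the two names come from this README; the surrounding structure is assumed):

```python
# html_cleaner.py (sketch; the real lists contain more entries)
_QUOTED_CLASS_PATTERNS = [
    "gmail_quote",              # existing-style entry
    "my_quoted_wrapper",        # illustrative addition: a CSS class fragment
]

def _remove_boilerplate(soup):  # signature assumed
    footer_keywords = [
        "unsubscribe",          # existing-style entry
        "sent from my iphone",  # illustrative addition
    ]
    ...
```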

---

## Extending the Pipeline

### Add a third agentic step

1. Create `agents/your_new_agent.py` inheriting from `BaseAgent` (see `agents/README.md`).
2. Add a new node method `_your_node()` in `workflow/conversation_processor.py`.
3. Add the node and a new edge in `_build_workflow()`:
```python
graph.add_node("your_step", self._your_node)
graph.add_edge("topic_extraction", "your_step")
graph.add_edge("your_step", END)
```
4. Add the corresponding output fields to `ConversationState`.
5. Map new columns in `main.py`'s `column_map` dict and add them to the DDL.

### Change the date range

Edit `queries/helpscout_conversations.sql`:
```sql
ct.CREATED_AT >= '2026-02-17 00:00:00'  -- ← change start date
```

### Include team replies

Remove the anti-join in `helpscout_conversations.sql` and broaden `TYPE` to include `'note'` and `'message'`. Be sure to update the HTML cleaning and aggregation if team messages need different handling.

### Process a different HelpScout mailbox

Add a `WHERE` clause on a mailbox ID column if available, or filter by `source_via` / `status`.

### Automate daily runs

Schedule `main.py` with a cron job, Airflow DAG, or any task scheduler. Because the pipeline skips already-processed conversations, re-running it daily processes only new conversations; no manual bookkeeping is needed.
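
A minimal cron sketch for the daily run (paths, interpreter, and schedule are placeholders):

```bash
# crontab -e: run the pipeline every day at 06:00
0 6 * * * cd /path/to/repo/process_helpscout && python3 main.py >> /tmp/helpscout_pipeline.log 2>&1
```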
process_helpscout/agents/README.md
ADDED
@@ -0,0 +1,310 @@
# Agents

The agents package contains the LLM-based extraction components used in the HelpScout processing pipeline. Each agent is a self-contained class responsible for one well-defined task.

---

## Architecture

```
BaseAgent (base_agent.py)
│
├── SentimentAnalysisAgent (sentiment_analysis_agent.py)
│     Classifies overall sentiment polarity and emotions
│     from a customer support conversation.
│
└── TopicExtractionAgent (topic_extraction_agent.py)
      Assigns one or more topic tags and extracts
      billing-specific boolean flags.
```

All agents follow the same contract defined in `BaseAgent`:

| Method | Required | Description |
|--------|----------|-------------|
| `validate_input(input_data)` | Yes | Returns `True` if the input dict has the required fields |
| `process(input_data)` | Yes | Main entry point: validates, calls the LLM, returns a result dict |
| `log_processing(message, level)` | Inherited | Logs `[AgentName] message` at the given level |
| `handle_error(error, context)` | Inherited | Returns a standardised `{"success": False, "error": ...}` dict |

The workflow (`workflow/conversation_processor.py`) calls `agent.process(input_data)` for each node. Agents never call each other; they are orchestrated exclusively by the workflow.

---

## BaseAgent (`base_agent.py`)

Defines the interface every agent must implement. Contains no LLM logic.

### Key attributes set from config

```python
self.model        # LLM model name, e.g. "gpt-4o-mini"
self.temperature  # Sampling temperature (default: 0.2)
self.max_retries  # Reserved for retry logic in subclasses
```

These are read from the agent's block in `config_files/processing_config.json`:
```json
"agents": {
  "sentiment_analysis": { "model": "gpt-4o-mini", "temperature": 0.2, "max_retries": 3 }
}
```

### Return contract

Every `process()` implementation must return a dict with at minimum:
```python
{"success": True, ...}                   # on success - include extracted fields
{"success": False, "error": "<reason>"}  # on failure
```

The workflow checks `success` to decide whether to mark a conversation as failed.

---

## SentimentAnalysisAgent (`sentiment_analysis_agent.py`)

Classifies the overall **sentiment polarity** and **emotions** expressed across a customer's conversation messages.

### Input

```python
agent.process({
    "conversation_text": "<formatted, truncated customer messages>"
})
```

The `conversation_text` is prepared by the workflow before calling the agent: numbered, pipe-delimited messages truncated to `max_conversation_chars`.

### Output (on success)

```python
{
    "success": True,
    "sentiment_polarity": "negative",           # one of the 5 polarity values
    "emotions": "frustration, disappointment",  # comma-separated, or None (soft-fail)
    "sentiment_confidence": "high",
    "sentiment_notes": "Customer is frustrated by repeated login failures."
}
```

### Validation rules

| Field | Behaviour on invalid value |
|-------|----------------------------|
| `sentiment_polarity` | Hard fail: the conversation is not stored |
| `emotions` | Soft fail: `None` is stored, the conversation is still written |
| `confidence` | Silently corrected to `"medium"` |

### Where categories are defined

Polarity and emotion categories (their `value` and `description` strings) live in `config_files/processing_config.json` under `"sentiment_polarity"` and `"emotions"`. The system prompt is **built at init time from the config**, so updating the config is all you need to change what the LLM is instructed to classify.

### Modifying the sentiment prompt

The system prompt is assembled in `_build_system_prompt()`. To change the framing or add additional instructions, edit that method directly. The category lists are injected automatically from config; do not hardcode them in the prompt.

---

## TopicExtractionAgent (`topic_extraction_agent.py`)

Assigns one or more **topic tags** from the Musora HelpScout taxonomy, extracts three **billing/membership boolean flags**, and produces a brief **neutral summary** of the conversation.

### Input

```python
agent.process({
    "conversation_text": "<formatted, truncated customer messages>"
})
```

### Output (on success)

```python
{
    "success": True,
    "topics": "billing_and_subscription, account_and_access",  # comma-separated IDs
    "is_refund_request": True,   # customer explicitly asked for money back
    "is_cancellation": False,    # customer did NOT explicitly ask to cancel
    "is_membership": False,      # customer wants to join/rejoin and purchase membership
    "topic_confidence": "high",
    "topic_notes": "Customer was unexpectedly charged and is requesting a refund.",
    "summary": "The customer reports being charged after believing they had cancelled their subscription. They are requesting a full refund and confirmation that no further charges will occur."
}
```

### Validation rules

| Field | Behaviour on invalid value |
|-------|----------------------------|
| `topics` | Hard fail if no valid topic IDs remain after filtering |
| `is_refund_request` / `is_cancellation` / `is_membership` | Coerced to `bool`; defaults to `False` if missing |
| `confidence` | Silently corrected to `"medium"` |
| `summary` | Soft fail: `""` stored if missing; the conversation is still written |

### Where topics are defined

All topic definitions live in `config_files/topics.json`. The agent builds its system prompt directly from this file at init time: adding, removing, or rewriting a topic description requires only a config change.

### Billing and membership flags

`is_refund_request`, `is_cancellation`, and `is_membership` are extracted on every conversation regardless of which topics are assigned. They are defined in `topics.json` under `billing_and_subscription.flags` for documentation purposes, but the agent always asks the LLM to evaluate them independently.

### Summary

The `summary` field is a 2-3 sentence factual, third-person overview of the conversation: what the customer contacted support about, relevant context they provided, and their core request. It is designed to give a reader instant context without reading the full conversation, and can also be used as compact input when chaining LLM calls.

---

## How to Add a New Agent

Follow these steps to add a third extraction step (e.g. urgency scoring):

### Step 1 - Create the agent file

```python
# agents/urgency_agent.py
from agents.base_agent import BaseAgent
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
import json, logging

logger = logging.getLogger(__name__)

class UrgencyAgent(BaseAgent):

    def __init__(self, config, api_key):
        super().__init__("UrgencyAgent", config)
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=api_key,
            model_kwargs={"response_format": {"type": "json_object"}},
        )
        self._system_prompt = (
            "Classify the urgency of this customer support conversation.\n"
            'Return JSON: {"urgency": "high"|"medium"|"low", "urgency_notes": "<reason>"}'
        )

    def validate_input(self, input_data):
        return "conversation_text" in input_data and bool(input_data["conversation_text"])

    def process(self, input_data):
        if not self.validate_input(input_data):
            return {"success": False, "error": "Missing conversation_text"}
        try:
            response = self.llm.invoke([
                SystemMessage(content=self._system_prompt),
                HumanMessage(content=input_data["conversation_text"]),
            ])
            raw = json.loads(response.content)
            urgency = raw.get("urgency", "medium")
            if urgency not in {"high", "medium", "low"}:
                urgency = "medium"
            return {
                "success": True,
                "urgency": urgency,
                "urgency_notes": raw.get("urgency_notes", ""),
            }
        except Exception as e:
            return self.handle_error(e, "urgency_classification")
```

### Step 2 - Add config for the new agent

In `config_files/processing_config.json`:
```json
"agents": {
  "sentiment_analysis": { ... },
  "topic_extraction": { ... },
  "urgency": {
    "model": "gpt-4o-mini",
    "temperature": 0.1,
    "max_retries": 3
  }
}
```

### Step 3 - Add a node to the workflow

In `workflow/conversation_processor.py`:

```python
# 1. Import the new agent
from agents.urgency_agent import UrgencyAgent

# 2. Instantiate in __init__
self.urgency_agent = UrgencyAgent(config["agents"]["urgency"], api_key)

# 3. Add fields to ConversationState
urgency: str
urgency_notes: str

# 4. Add the node method
def _urgency_node(self, state):
    try:
        result = self.urgency_agent.process({"conversation_text": state["conversation_text"]})
        if result.get("success"):
            state["urgency"] = result.get("urgency")
            state["urgency_notes"] = result.get("urgency_notes", "")
        else:
            state["processing_errors"] = state.get("processing_errors", []) + [
                f"Urgency failed: {result.get('error')}"
            ]
            state["urgency"] = None
    except Exception as e:
        state["processing_errors"] = state.get("processing_errors", []) + [str(e)]
    return state

# 5. Wire into the graph in _build_workflow()
graph.add_node("urgency", self._urgency_node)
graph.add_edge("topic_extraction", "urgency")  # replaces the old edge to END
graph.add_edge("urgency", END)
```

### Step 4 - Add output columns

In `main.py`, add to the `column_map` dict:
```python
"urgency": "URGENCY",
"urgency_notes": "URGENCY_NOTES",
```

In `sql/create_features_table.sql`, add:
```sql
URGENCY VARCHAR(20),
URGENCY_NOTES TEXT,
```

Run `ALTER TABLE` or recreate the table for the new columns to appear; a sketch of the `ALTER TABLE` route follows.
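
A sketch of the `ALTER TABLE` route (Snowflake syntax; the table name comes from this repo and the columns from Step 4):

```sql
ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
    ADD COLUMN URGENCY VARCHAR(20), URGENCY_NOTES TEXT;
```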

---

## How to Modify an Existing Agent

### Change the LLM model or temperature

Edit `config_files/processing_config.json`; no code change is needed.

### Add or rename a sentiment category

In `config_files/processing_config.json`, update `sentiment_polarity.categories` or `emotions.categories`. The agent reads these at init and builds the prompt and validation set dynamically. The only code-level change is updating the output table column type/constraint if the new value is longer than the current `VARCHAR` size, for example:
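
```sql
-- Sketch: widen the column if a new category value would not fit (length is illustrative)
ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
    ALTER COLUMN SENTIMENT_POLARITY SET DATA TYPE VARCHAR(50);
```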

### Add or rename a topic

In `config_files/topics.json`, add or edit an entry in the `"topics"` array. The `TopicExtractionAgent` reads this file at init; the new topic appears in the prompt and validation automatically.

### Change the conversation truncation limit

In `config_files/processing_config.json`:
```json
"processing": {
  "max_conversation_chars": 3000
}
```

This is read by the workflow (`conversation_processor.py`) before formatting the conversation text; no agent code changes are needed.

### Modify the system prompt framing

Each agent builds its prompt in a `_build_system_prompt()` method. Edit that method directly. Category lists are always injected from config; avoid hardcoding values that already live in the JSON.
process_helpscout/agents/__init__.py
ADDED
File without changes
process_helpscout/agents/base_agent.py
ADDED
@@ -0,0 +1,58 @@
"""
Base Agent class for all agents in the HelpScout processing workflow.
Provides a common interface and consistent error handling.
"""

from abc import ABC, abstractmethod
from typing import Dict, Any
import logging

logger = logging.getLogger(__name__)


class BaseAgent(ABC):
    """
    Abstract base class for all agents in the agentic workflow.
    Enforces a consistent interface and provides shared utilities.
    """

    def __init__(self, name: str, config: Dict[str, Any]):
        self.name = name
        self.config = config
        self.model = config.get("model", "gpt-5-nano")
        self.temperature = config.get("temperature", 0.2)
        self.max_retries = config.get("max_retries", 3)
        logger.info(f"Initialized {self.name} with model {self.model}")

    @abstractmethod
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process input data and return results.
        Must be implemented by all concrete agent classes.
        """
        pass

    @abstractmethod
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate input data before processing.
        Returns True if input is valid, False otherwise.
        """
        pass

    def log_processing(self, message: str, level: str = "info"):
        log_method = getattr(logger, level, logger.info)
        log_method(f"[{self.name}] {message}")

    def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
        error_msg = f"Error in {self.name}"
        if context:
            error_msg += f" ({context})"
        error_msg += f": {str(error)}"
        logger.error(error_msg)
        return {
            "success": False,
            "error": str(error),
            "agent": self.name,
            "context": context,
        }
process_helpscout/agents/sentiment_analysis_agent.py
ADDED
@@ -0,0 +1,229 @@
"""
Sentiment Analysis Agent for HelpScout customer support conversations.

Classifies the overall sentiment polarity and emotions from a customer's
conversation with Musora support. Unlike the social media variant, this
agent operates on full conversations (multiple messages) rather than
individual comments, and does not extract intents or compute requires_reply
(all support tickets inherently require a response).
"""

from typing import Dict, Any, List, Optional
import json
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from agents.base_agent import BaseAgent
import logging

logger = logging.getLogger(__name__)


class SentimentAnalysisAgent(BaseAgent):
    """
    Classifies the sentiment polarity and emotions of a customer support
    conversation from HelpScout.

    Design decisions:
    - System prompt is built once at init from the config categories
    - Emotions are soft-fail: None stored when the field is missing or invalid
    - Input is the formatted conversation text (already truncated upstream)
    """

    def __init__(self, config: Dict[str, Any], api_key: str, processing_config: Dict[str, Any]):
        """
        Args:
            config: Agent-level config dict (model, temperature, max_retries)
            api_key: OpenAI API key
            processing_config: Full processing_config.json content (for categories)
        """
        super().__init__("SentimentAnalysisAgent", config)
        self.api_key = api_key

        # Pre-compute valid value sets from config for O(1) validation
        self._valid_polarities = {
            cat["value"] for cat in processing_config["sentiment_polarity"]["categories"]
        }
        self._valid_emotions = {
            cat["value"] for cat in processing_config["emotions"]["categories"]
        }
        self._emotions_soft_fail = processing_config["emotions"].get("soft_fail", True)

        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key,
            model_kwargs={"response_format": {"type": "json_object"}},
        )

        # Build system prompt once - reused for every LLM call
        self._system_prompt = self._build_system_prompt(processing_config)

    # ------------------------------------------------------------------
    # Prompt construction
    # ------------------------------------------------------------------

    def _build_system_prompt(self, processing_config: Dict[str, Any]) -> str:
        polarity_lines = "\n".join(
            f"- {cat['value']}: {cat['description']}"
            for cat in processing_config["sentiment_polarity"]["categories"]
        )
        emotion_lines = "\n".join(
            f"- {cat['value']}: {cat['description']}"
            for cat in processing_config["emotions"]["categories"]
        )

        return (
            "You are analyzing customer support conversations for Musora, a music education platform.\n\n"
            "You will receive one or more messages from a customer (team responses are excluded). "
            "Classify the overall sentiment and emotional tone of the CUSTOMER's messages as a whole.\n\n"
            "Return JSON only:\n"
            '{"sentiment_polarity": <value>, "emotions": [<values>], '
            '"confidence": "high"|"medium"|"low", "analysis_notes": "<1-2 sentences>"}\n\n'
            f"POLARITY (pick one):\n{polarity_lines}\n\n"
            f"EMOTIONS (multi-label, pick all that apply; use [\"neutral\"] if none detected):\n{emotion_lines}\n\n"
            "Guidelines:\n"
            "- Base your classification on the customer's overall tone, not isolated words\n"
            "- A customer reporting a technical issue with no emotional language → neutral\n"
            "- A customer expressing frustration alongside their issue → negative\n"
            "- analysis_notes: 1-2 sentences highlighting the key sentiment drivers"
        )

    def _build_user_prompt(self, conversation_text: str) -> str:
        return f"Customer conversation:\n\n{conversation_text}"

    # ------------------------------------------------------------------
    # Output validation
    # ------------------------------------------------------------------

    def _parse_emotions(self, raw_emotions: Any) -> Optional[List[str]]:
        """Soft-fail emotion parsing: returns None instead of raising."""
        if not raw_emotions:
            return None
        if isinstance(raw_emotions, str):
            raw_emotions = [e.strip() for e in raw_emotions.split(",")]
        if not isinstance(raw_emotions, list):
            return None
        valid = [e for e in raw_emotions if e in self._valid_emotions]
        return valid if valid else None

    def _validate_result(self, raw: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate LLM output against config-defined allowed values.
        - Invalid polarity → hard fail (conversation will not be stored)
        - Invalid emotions → soft fail (None; conversation still stored)
        - Invalid confidence → corrected to "medium"
        """
        polarity = raw.get("sentiment_polarity")
        if not polarity or polarity not in self._valid_polarities:
            return {
                "success": False,
                "error": (
                    f"Invalid sentiment_polarity '{polarity}'. "
                    f"Expected one of: {sorted(self._valid_polarities)}"
                ),
            }

        confidence = raw.get("confidence", "medium")
        if confidence not in {"high", "medium", "low"}:
            confidence = "medium"

        emotions = self._parse_emotions(raw.get("emotions"))

        return {
            "success": True,
            "sentiment_polarity": polarity,
            "emotions": emotions,
            "confidence": confidence,
            "analysis_notes": str(raw.get("analysis_notes", "")).strip(),
        }

    # ------------------------------------------------------------------
    # Core analysis
    # ------------------------------------------------------------------

    def analyze(self, conversation_text: str) -> Dict[str, Any]:
        """
        Call the LLM to classify sentiment of the customer conversation.

        Args:
            conversation_text: Pre-formatted, truncated conversation text

        Returns:
            Success dict with sentiment fields, or failure dict with error key.
        """
        user_prompt = self._build_user_prompt(conversation_text)

        try:
            messages = [
                SystemMessage(content=self._system_prompt),
                HumanMessage(content=user_prompt),
            ]
            response = self.llm.invoke(messages)
            raw = json.loads(response.content)

            validated = self._validate_result(raw)
            if not validated["success"]:
                self.log_processing(f"Validation failed: {validated['error']}", "warning")
                return validated

            emotions_list = validated.get("emotions")
            return {
                "success": True,
                "sentiment_polarity": validated["sentiment_polarity"],
                "emotions": ", ".join(emotions_list) if emotions_list else None,
                "sentiment_confidence": validated["confidence"],
                "sentiment_notes": validated["analysis_notes"],
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {e}", "warning")
            return {"success": False, "error": f"JSON parse error: {e}"}

        except Exception as e:
            self.log_processing(f"Sentiment analysis failed: {e}", "error")
            return {"success": False, "error": str(e)}

    # ------------------------------------------------------------------
    # Agent interface
    # ------------------------------------------------------------------

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        return "conversation_text" in input_data and bool(input_data["conversation_text"])

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            input_data: Must contain 'conversation_text' (formatted, truncated).

        Returns:
            Dict with sentiment fields merged on top of input_data.
        """
        try:
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: 'conversation_text' is required and must be non-empty",
                }

            self.log_processing("Analyzing conversation sentiment", "debug")
            result = self.analyze(input_data["conversation_text"])

            output = {
                "success": result.get("success", False),
                "sentiment_polarity": result.get("sentiment_polarity"),
                "emotions": result.get("emotions"),
                "sentiment_confidence": result.get("sentiment_confidence"),
                "sentiment_notes": result.get("sentiment_notes", ""),
            }
            if "error" in result:
                output["sentiment_error"] = result["error"]

            # Preserve all original input fields
            for key, value in input_data.items():
                if key not in output:
                    output[key] = value

            return output

        except Exception as e:
            return self.handle_error(e, "sentiment_analysis")
process_helpscout/agents/topic_extraction_agent.py
ADDED
@@ -0,0 +1,268 @@
| 1 |
+
"""
|
| 2 |
+
Topic Extraction Agent for HelpScout customer support conversations.
|
| 3 |
+
|
| 4 |
+
Assigns one or more topic tags from the Musora HelpScout taxonomy to a
|
| 5 |
+
customer conversation. Also extracts three boolean billing signals:
|
| 6 |
+
- is_refund_request: customer explicitly wants their money back
|
| 7 |
+
- is_cancellation: customer wants to cancel their subscription
|
| 8 |
+
- is_membership: customer wants to join/rejoin and purchase membership
|
| 9 |
+
|
| 10 |
+
Also produces a brief neutral summary (2-3 sentences) of the conversation.
|
| 11 |
+
|
| 12 |
+
Topic definitions are loaded from config_files/topics.json so any taxonomy
|
| 13 |
+
update is automatically reflected in the prompt without code changes.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from typing import Dict, Any, List, Optional
|
| 17 |
+
import json
|
| 18 |
+
from langchain_openai import ChatOpenAI
|
| 19 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 20 |
+
from agents.base_agent import BaseAgent
|
| 21 |
+
import logging
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class TopicExtractionAgent(BaseAgent):
|
| 27 |
+
"""
|
| 28 |
+
Extracts topic tags and billing flags from a customer support conversation.
|
| 29 |
+
|
| 30 |
+
Design decisions:
|
| 31 |
+
- Topics are multi-label: a conversation can receive multiple tags
|
| 32 |
+
- The 'uncategorized' topic is valid but discouraged (see topics.json notes)
|
| 33 |
+
- is_refund_request / is_cancellation are always extracted independently,
|
| 34 |
+
even when billing_and_subscription is not the primary topic
|
| 35 |
+
- System prompt is built once at init from topics.json
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
def __init__(self, config: Dict[str, Any], api_key: str, topics_config: Dict[str, Any]):
|
| 39 |
+
"""
|
| 40 |
+
Args:
|
| 41 |
+
config: Agent-level config dict (model, temperature, max_retries)
|
| 42 |
+
api_key: OpenAI API key
|
| 43 |
+
topics_config: Parsed topics.json content
|
| 44 |
+
"""
|
| 45 |
+
super().__init__("TopicExtractionAgent", config)
|
| 46 |
+
self.api_key = api_key
|
| 47 |
+
self.topics_config = topics_config
|
| 48 |
+
|
| 49 |
+
# Pre-compute valid topic ID set for O(1) validation
|
| 50 |
+
self._valid_topics = {topic["id"] for topic in topics_config["topics"]}
|
| 51 |
+
|
| 52 |
+
self.llm = ChatOpenAI(
|
| 53 |
+
model=self.model,
|
| 54 |
+
temperature=self.temperature,
|
| 55 |
+
api_key=self.api_key,
|
| 56 |
+
model_kwargs={"response_format": {"type": "json_object"}},
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# Build system prompt once β reused for every LLM call
|
| 60 |
+
self._system_prompt = self._build_system_prompt()
|
| 61 |
+
|
| 62 |
+
# ------------------------------------------------------------------
|
| 63 |
+
# Prompt construction
|
| 64 |
+
# ------------------------------------------------------------------
|
| 65 |
+
|
| 66 |
+
def _build_system_prompt(self) -> str:
|
| 67 |
+
topic_lines = "\n".join(
|
| 68 |
+
f"- {topic['id']}: {topic['description']}"
|
| 69 |
+
for topic in self.topics_config["topics"]
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
usage_notes = "\n".join(
|
| 73 |
+
f" β’ {note}"
|
| 74 |
+
for note in self.topics_config.get("_meta", {}).get("usage_notes", [])
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
return (
|
| 78 |
+
"You are classifying customer support conversations for Musora, a music education platform.\n\n"
|
| 79 |
+
"Assign one or more topic tags to the customer's conversation based on what they are "
|
| 80 |
+
"contacting support about.\n\n"
|
| 81 |
+
"Return JSON only:\n"
|
| 82 |
+
'{\n'
|
| 83 |
+
' "topics": [<topic_ids>],\n'
|
| 84 |
+
' "is_refund_request": true|false,\n'
|
| 85 |
+
' "is_cancellation": true|false,\n'
|
| 86 |
+
' "is_membership": true|false,\n'
|
| 87 |
+
' "confidence": "high"|"medium"|"low",\n'
|
| 88 |
+
' "topic_notes": "<1-2 sentences explaining the classification>",\n'
|
| 89 |
+
' "summary": "<2-3 sentence neutral summary of the conversation>"\n'
|
| 90 |
+
'}\n\n'
|
| 91 |
+
f"AVAILABLE TOPICS (use the id values exactly):\n{topic_lines}\n\n"
|
| 92 |
+
f"RULES:\n{usage_notes}\n\n"
|
| 93 |
+
"BILLING FLAGS (always extract, regardless of topic):\n"
|
| 94 |
+
" β’ is_refund_request: true ONLY when the customer explicitly asks for money back\n"
|
| 95 |
+
" β’ is_cancellation: true ONLY when the customer explicitly wants to cancel their subscription\n"
|
| 96 |
+
" β’ is_membership: true ONLY when the customer wants to join or rejoin and purchase a membership\n\n"
|
| 97 |
+
"SUMMARY GUIDELINES:\n"
|
| 98 |
+
" β’ Write 2-3 sentences maximum\n"
|
| 99 |
+
" β’ Be factual and neutral β do not repeat sentiment or topic labels\n"
|
| 100 |
+
" β’ Capture: what the customer contacted support about, any key context or history they provided, "
|
| 101 |
+
"and the core request or outcome they are seeking\n"
|
| 102 |
+
" β’ Write in third person (e.g. 'The customer reports...')\n\n"
|
| 103 |
+
"IMPORTANT:\n"
|
| 104 |
+
" - Focus on the customer's messages; ignore any team response context\n"
|
| 105 |
+
" - Use exact topic id strings from the list above\n"
|
| 106 |
+
" - topic_notes: briefly explain why you chose these topics"
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
def _build_user_prompt(self, conversation_text: str) -> str:
|
| 110 |
+
return f"Customer conversation:\n\n{conversation_text}"
|
| 111 |
+
|
| 112 |
+
# ------------------------------------------------------------------
|
| 113 |
+
# Output validation
|
| 114 |
+
# ------------------------------------------------------------------
|
| 115 |
+
|
| 116 |
+
def _validate_topics(self, raw_topics: Any) -> Optional[List[str]]:
|
| 117 |
+
"""
|
| 118 |
+
Validate and filter the topics list from LLM output.
|
| 119 |
+
Returns None if no valid topics remain (hard fail).
|
| 120 |
+
"""
|
| 121 |
+
if not raw_topics:
|
| 122 |
+
return None
|
| 123 |
+
if isinstance(raw_topics, str):
|
| 124 |
+
raw_topics = [t.strip() for t in raw_topics.split(",")]
|
| 125 |
+
if not isinstance(raw_topics, list):
|
| 126 |
+
return None
|
| 127 |
+
valid = [t for t in raw_topics if t in self._valid_topics]
|
| 128 |
+
return valid if valid else None
|
| 129 |
+
|
| 130 |
+
def _validate_result(self, raw: Dict[str, Any]) -> Dict[str, Any]:
|
| 131 |
+
"""
|
| 132 |
+
Validate LLM output.
|
| 133 |
+
- No valid topics β hard fail
|
| 134 |
+
- Invalid confidence β corrected to "medium"
|
| 135 |
+
- Boolean flags: default to False if missing or non-boolean
|
| 136 |
+
"""
|
| 137 |
+
topics = self._validate_topics(raw.get("topics"))
|
| 138 |
+
if not topics:
|
| 139 |
+
return {
|
| 140 |
+
"success": False,
|
| 141 |
+
"error": (
|
| 142 |
+
f"No valid topics in response: {raw.get('topics')}. "
|
| 143 |
+
f"Expected values from: {sorted(self._valid_topics)}"
|
| 144 |
+
),
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
confidence = raw.get("confidence", "medium")
|
| 148 |
+
if confidence not in {"high", "medium", "low"}:
|
| 149 |
+
confidence = "medium"
|
| 150 |
+
|
| 151 |
+
is_refund = raw.get("is_refund_request", False)
|
| 152 |
+
is_cancel = raw.get("is_cancellation", False)
|
| 153 |
+
is_membership = raw.get("is_membership", False)
|
| 154 |
+
|
| 155 |
+
# Coerce to bool in case LLM returns strings
|
| 156 |
+
if not isinstance(is_refund, bool):
|
| 157 |
+
is_refund = str(is_refund).lower() in ("true", "1", "yes")
|
| 158 |
+
if not isinstance(is_cancel, bool):
|
| 159 |
+
is_cancel = str(is_cancel).lower() in ("true", "1", "yes")
|
| 160 |
+
if not isinstance(is_membership, bool):
|
| 161 |
+
is_membership = str(is_membership).lower() in ("true", "1", "yes")
|
| 162 |
+
|
| 163 |
+
return {
|
| 164 |
+
"success": True,
|
| 165 |
+
"topics": topics,
|
| 166 |
+
"is_refund_request": is_refund,
|
| 167 |
+
"is_cancellation": is_cancel,
|
| 168 |
+
"is_membership": is_membership,
|
| 169 |
+
"confidence": confidence,
|
| 170 |
+
"topic_notes": str(raw.get("topic_notes", "")).strip(),
|
| 171 |
+
"summary": str(raw.get("summary", "")).strip(),
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
# ------------------------------------------------------------------
|
| 175 |
+
# Core extraction
|
| 176 |
+
# ------------------------------------------------------------------
|
| 177 |
+
|
| 178 |
+
def extract(self, conversation_text: str) -> Dict[str, Any]:
|
| 179 |
+
"""
|
| 180 |
+
Call the LLM to assign topics and billing flags.
|
| 181 |
+
|
| 182 |
+
Args:
|
| 183 |
+
conversation_text: Pre-formatted, truncated conversation text
|
| 184 |
+
|
| 185 |
+
Returns:
|
| 186 |
+
Success dict with topic fields, or failure dict with error key.
|
| 187 |
+
"""
|
| 188 |
+
user_prompt = self._build_user_prompt(conversation_text)
|
| 189 |
+
|
| 190 |
+
try:
|
| 191 |
+
messages = [
|
| 192 |
+
SystemMessage(content=self._system_prompt),
|
| 193 |
+
HumanMessage(content=user_prompt),
|
| 194 |
+
]
|
| 195 |
+
response = self.llm.invoke(messages)
|
| 196 |
+
raw = json.loads(response.content)
|
| 197 |
+
|
| 198 |
+
validated = self._validate_result(raw)
|
| 199 |
+
if not validated["success"]:
|
| 200 |
+
self.log_processing(f"Validation failed: {validated['error']}", "warning")
|
| 201 |
+
return validated
|
| 202 |
+
|
| 203 |
+
return {
|
| 204 |
+
"success": True,
|
| 205 |
+
"topics": ", ".join(validated["topics"]), # comma-separated for DB storage
|
| 206 |
+
"is_refund_request": validated["is_refund_request"],
|
| 207 |
+
"is_cancellation": validated["is_cancellation"],
|
| 208 |
+
"is_membership": validated["is_membership"],
|
| 209 |
+
"topic_confidence": validated["confidence"],
|
| 210 |
+
"topic_notes": validated["topic_notes"],
|
| 211 |
+
"summary": validated["summary"],
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
except json.JSONDecodeError as e:
|
| 215 |
+
self.log_processing(f"JSON decode error: {e}", "warning")
|
| 216 |
+
return {"success": False, "error": f"JSON parse error: {e}"}
|
| 217 |
+
|
| 218 |
+
except Exception as e:
|
| 219 |
+
self.log_processing(f"Topic extraction failed: {e}", "error")
|
| 220 |
+
return {"success": False, "error": str(e)}
|
| 221 |
+
|
| 222 |
+
# ------------------------------------------------------------------
|
| 223 |
+
# Agent interface
|
| 224 |
+
# ------------------------------------------------------------------
|
| 225 |
+
|
| 226 |
+
def validate_input(self, input_data: Dict[str, Any]) -> bool:
|
| 227 |
+
return "conversation_text" in input_data and bool(input_data["conversation_text"])
|
| 228 |
+
|
| 229 |
+
def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 230 |
+
"""
|
| 231 |
+
Args:
|
| 232 |
+
input_data: Must contain 'conversation_text'.
|
| 233 |
+
|
| 234 |
+
Returns:
|
| 235 |
+
Dict with topic fields merged on top of input_data.
|
| 236 |
+
"""
|
| 237 |
+
try:
|
| 238 |
+
if not self.validate_input(input_data):
|
| 239 |
+
return {
|
| 240 |
+
"success": False,
|
| 241 |
+
"error": "Invalid input: 'conversation_text' is required and must be non-empty",
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
self.log_processing("Extracting topics from conversation", "debug")
|
| 245 |
+
result = self.extract(input_data["conversation_text"])
|
| 246 |
+
|
| 247 |
+
output = {
|
| 248 |
+
"success": result.get("success", False),
|
| 249 |
+
"topics": result.get("topics"),
|
| 250 |
+
"is_refund_request": result.get("is_refund_request", False),
|
| 251 |
+
"is_cancellation": result.get("is_cancellation", False),
|
| 252 |
+
"is_membership": result.get("is_membership", False),
|
| 253 |
+
"topic_confidence": result.get("topic_confidence"),
|
| 254 |
+
"topic_notes": result.get("topic_notes", ""),
|
| 255 |
+
"summary": result.get("summary", ""),
|
| 256 |
+
}
|
| 257 |
+
if "error" in result:
|
| 258 |
+
output["topic_error"] = result["error"]
|
| 259 |
+
|
| 260 |
+
# Preserve all original input fields
|
| 261 |
+
for key, value in input_data.items():
|
| 262 |
+
if key not in output:
|
| 263 |
+
output[key] = value
|
| 264 |
+
|
| 265 |
+
return output
|
| 266 |
+
|
| 267 |
+
except Exception as e:
|
| 268 |
+
return self.handle_error(e, "topic_extraction")
|
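To make the agent's contract concrete, here is a minimal usage sketch. The import path, the config values (mirroring processing_config.json), and the sample conversation are illustrative assumptions; the output shape follows the `process()` return dict above.

```python
# Hedged usage sketch; module path and sample data are assumptions.
import json
import os

from agents.topic_extraction_agent import TopicExtractionAgent  # assumed path

with open("config_files/topics.json") as f:
    topics_config = json.load(f)

agent = TopicExtractionAgent(
    config={"model": "gpt-5-nano", "temperature": 0.2, "max_retries": 3},
    api_key=os.environ["OPENAI_API_KEY"],
    topics_config=topics_config,
)

result = agent.process({
    "conversation_id": "42",
    "conversation_text": "I was charged twice this month and want one charge refunded.",
})
# On success, result carries the topic fields plus the original input, e.g.:
# {"success": True, "topics": "billing_and_subscription",
#  "is_refund_request": True, "is_cancellation": False, "is_membership": False,
#  "topic_confidence": "high", "topic_notes": "...", "summary": "...", ...}
```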
process_helpscout/config_files/processing_config.json
ADDED
|
@@ -0,0 +1,125 @@
|
| 1 |
+
{
|
| 2 |
+
"_meta": {
|
| 3 |
+
"description": "Configuration for the HelpScout conversation processing pipeline. Controls agent models, processing limits, and output destination.",
|
| 4 |
+
"version": "1.0.0"
|
| 5 |
+
},
|
| 6 |
+
|
| 7 |
+
"agents": {
|
| 8 |
+
"sentiment_analysis": {
|
| 9 |
+
"model": "gpt-5-nano",
|
| 10 |
+
"temperature": 0.2,
|
| 11 |
+
"max_retries": 3
|
| 12 |
+
},
|
| 13 |
+
"topic_extraction": {
|
| 14 |
+
"model": "gpt-5-nano",
|
| 15 |
+
"temperature": 0.2,
|
| 16 |
+
"max_retries": 3
|
| 17 |
+
}
|
| 18 |
+
},
|
| 19 |
+
|
| 20 |
+
"sentiment_polarity": {
|
| 21 |
+
"categories": [
|
| 22 |
+
{
|
| 23 |
+
"value": "very_positive",
|
| 24 |
+
"label": "Very Positive",
|
| 25 |
+
"description": "Extremely enthusiastic, excited, deeply grateful, or highly satisfied"
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"value": "positive",
|
| 29 |
+
"label": "Positive",
|
| 30 |
+
"description": "Generally positive, appreciative, supportive, or encouraging"
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"value": "neutral",
|
| 34 |
+
"label": "Neutral",
|
| 35 |
+
"description": "Factual, informational, balanced, or lacking clear emotional tone"
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"value": "negative",
|
| 39 |
+
"label": "Negative",
|
| 40 |
+
"description": "Disappointed, critical, frustrated, or mildly dissatisfied"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"value": "very_negative",
|
| 44 |
+
"label": "Very Negative",
|
| 45 |
+
"description": "Highly critical, angry, abusive, or extremely dissatisfied"
|
| 46 |
+
}
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
|
| 50 |
+
"emotions": {
|
| 51 |
+
"soft_fail": true,
|
| 52 |
+
"multi_label": true,
|
| 53 |
+
"categories": [
|
| 54 |
+
{
|
| 55 |
+
"value": "joy",
|
| 56 |
+
"label": "Joy",
|
| 57 |
+
"description": "Happiness, delight, or elation"
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"value": "excitement",
|
| 61 |
+
"label": "Excitement",
|
| 62 |
+
"description": "Enthusiasm, energy, or eagerness"
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"value": "gratitude",
|
| 66 |
+
"label": "Gratitude",
|
| 67 |
+
"description": "Thankfulness or appreciation"
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"value": "admiration",
|
| 71 |
+
"label": "Admiration",
|
| 72 |
+
"description": "Deep respect or positive regard for the platform, team or products"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"value": "curiosity",
|
| 76 |
+
"label": "Curiosity",
|
| 77 |
+
"description": "Interest, eagerness to learn, or wondering about something"
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"value": "frustration",
|
| 81 |
+
"label": "Frustration",
|
| 82 |
+
"description": "Irritation, annoyance, or blocked goals"
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"value": "disappointment",
|
| 86 |
+
"label": "Disappointment",
|
| 87 |
+
"description": "Unmet expectations or a let-down feeling"
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"value": "sadness",
|
| 91 |
+
"label": "Sadness",
|
| 92 |
+
"description": "Sorrow, emotional heaviness, or distress"
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"value": "anger",
|
| 96 |
+
"label": "Anger",
|
| 97 |
+
"description": "Strong outrage or hostility"
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"value": "humor",
|
| 101 |
+
"label": "Humor",
|
| 102 |
+
"description": "Amusement, playfulness, or levity in tone"
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"value": "neutral",
|
| 106 |
+
"label": "Neutral",
|
| 107 |
+
"description": "No discernible emotion; use only when no other emotion applies"
|
| 108 |
+
}
|
| 109 |
+
]
|
| 110 |
+
},
|
| 111 |
+
|
| 112 |
+
"processing": {
|
| 113 |
+
"max_conversation_chars": 5000,
|
| 114 |
+
"min_batch_size": 10,
|
| 115 |
+
"max_batch_size": 50
|
| 116 |
+
},
|
| 117 |
+
|
| 118 |
+
"output": {
|
| 119 |
+
"database": "SOCIAL_MEDIA_DB",
|
| 120 |
+
"schema": "ML_FEATURES",
|
| 121 |
+
"table": "HELPSCOUT_CONVERSATION_FEATURES"
|
| 122 |
+
},
|
| 123 |
+
|
| 124 |
+
"sql_query_file": "queries/helpscout_conversations.sql"
|
| 125 |
+
}
|
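A short sketch of how this config is consumed at runtime; the relative path assumes the working directory is process_helpscout/, matching the default used by main.py.

```python
import json
from pathlib import Path

config = json.loads(Path("config_files/processing_config.json").read_text())

sentiment_cfg = config["agents"]["sentiment_analysis"]  # {"model": "gpt-5-nano", ...}
out = config["output"]
print(sentiment_cfg["model"], f'{out["database"]}.{out["schema"]}.{out["table"]}')
# gpt-5-nano SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
```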
process_helpscout/config_files/topics.json
ADDED
|
@@ -0,0 +1,90 @@
|
| 1 |
+
{
|
| 2 |
+
"_meta": {
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"last_updated": "2025-04-09",
|
| 5 |
+
"description": "Musora HelpScout auto-tagging taxonomy. Used as the source configuration for the LLM-based tagging pipeline. Topics are mutually exclusive at the top level; a conversation may receive multiple topic tags. Sub-categories are listed for reference and future use in a separate config. Special boolean flags are defined inline for high-signal billing events.",
|
| 6 |
+
"usage_notes": [
|
| 7 |
+
"Assign one or more topic tags per conversation.",
|
| 8 |
+
"Boolean flags under billing_and_subscription should be extracted independently even when the parent topic is detected.",
|
| 9 |
+
"Use the 'uncategorized' topic when no other topic clearly applies β never as a fallback for uncertain cases.",
|
| 10 |
+
"feedback_and_suggestions should be used as a supplementary tag alongside a primary topic when applicable."
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
|
| 14 |
+
"topics": [
|
| 15 |
+
|
| 16 |
+
{
|
| 17 |
+
"id": "video_and_playback",
|
| 18 |
+
"label": "Video & Playback",
|
| 19 |
+
"description": "The student is experiencing a problem with audio or video content during viewing. The issue is with how media plays, not with the surrounding app or UI. "
|
| 20 |
+
},
|
| 21 |
+
|
| 22 |
+
{
|
| 23 |
+
"id": "app_and_technical_errors",
|
| 24 |
+
"label": "App & Technical Errors",
|
| 25 |
+
"description": "A software bug, crash, or system failure that is NOT limited to video playback. The app, website, technology related, or a specific feature is broken, unresponsive, or showing an error message. Use this when the problem is with the platform itself rather than the content being watched."
|
| 26 |
+
},
|
| 27 |
+
|
| 28 |
+
{
|
| 29 |
+
"id": "navigation_and_ux",
|
| 30 |
+
"label": "Navigation & UX",
|
| 31 |
+
"description": "The student is confused by the interface or cannot find something, but is not technically blocked from accessing it. The issue is about discoverability, layout clarity, or unintuitive design rather than a bug or access restriction. Often triggered by redesigns or renamed features."
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"id": "account_and_access",
|
| 35 |
+
"label": "Account & Access",
|
| 36 |
+
"description": "The student cannot log in, is locked out, or cannot access content they are entitled to. Also covers profile and settings issues. Distinct from billing: use this when the problem is authentication or permissions, even if the underlying cause might be a billing state."
|
| 37 |
+
},
|
| 38 |
+
|
| 39 |
+
{
|
| 40 |
+
"id": "billing_and_subscription",
|
| 41 |
+
"label": "Billing & Subscription",
|
| 42 |
+
"description": "Any conversation involving money, charges, plan status, or membership. This includes unexpected charges, plan changes, promotions, and invoice requests. ",
|
| 43 |
+
"flags": {
|
| 44 |
+
"is_refund_request": {
|
| 45 |
+
"type": "boolean",
|
| 46 |
+
"description": "True when the student is explicitly asking for their money back, regardless of reason."
|
| 47 |
+
},
|
| 48 |
+
"is_cancellation": {
|
| 49 |
+
"type": "boolean",
|
| 50 |
+
"description": "True when the student wants to cancel their subscription or membership, even if they haven't asked for a refund."
|
| 51 |
+
},
|
| 52 |
+
"is_membership": {
|
| 53 |
+
"type": "boolean",
|
| 54 |
+
"description": "True when the student wants to join/rejoin and purchase membership."
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
},
|
| 58 |
+
|
| 59 |
+
{
|
| 60 |
+
"id": "learning_and_progress",
|
| 61 |
+
"label": "Learning & Progress",
|
| 62 |
+
"description": "Issues with how the student's learning journey, including asking for help or recommendations, is tracked or structured over time. Covers broken progress tracking, practice session logging, playlist management, curriculum navigation, and access to legacy or assigned content. The problem is with the learning system, not the content itself."
|
| 63 |
+
},
|
| 64 |
+
|
| 65 |
+
{
|
| 66 |
+
"id": "content_and_resources",
|
| 67 |
+
"label": "Content & Resources",
|
| 68 |
+
"description": "Problems with the lesson content itself or supplementary learning materials β not the video player. Covers missing PDFs, sheet music, backing tracks, incorrect lesson information, requests for new content, and missing assignment or review links."
|
| 69 |
+
},
|
| 70 |
+
|
| 71 |
+
{
|
| 72 |
+
"id": "community_and_notifications",
|
| 73 |
+
"label": "Community & Notifications",
|
| 74 |
+
"description": "Issues involving forums, comments, student profiles, social features, or the delivery of notifications. Use this when the problem is about communication and social interaction within the platform, not content access or playback."
|
| 75 |
+
},
|
| 76 |
+
|
| 77 |
+
{
|
| 78 |
+
"id": "feedback_and_suggestions",
|
| 79 |
+
"label": "Feedback & Suggestions",
|
| 80 |
+
"description": "The student is sharing an opinion, making a feature request, or expressing general satisfaction or dissatisfaction β not reporting a specific failure. This should typically be applied as a supplementary tag alongside a primary topic when a complaint conversation also carries strong sentiment or a request for new functionality."
|
| 81 |
+
},
|
| 82 |
+
|
| 83 |
+
{
|
| 84 |
+
"id": "uncategorized",
|
| 85 |
+
"label": "Uncategorized",
|
| 86 |
+
"description": "Assign ONLY when no other topic clearly applies after careful consideration. Do not use as a fallback for low-confidence cases where a topic still partially fits β prefer the closest matching topic. The primary purpose of this tag is to surface new conversation patterns that may warrant expanding the taxonomy."
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
]
|
| 90 |
+
}
|
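The sketch below restates the tagging agent's `_validate_topics` logic against this taxonomy, as a standalone illustration of how unknown ids are handled (not an import from the agent module).

```python
import json
from pathlib import Path

topics_config = json.loads(Path("config_files/topics.json").read_text())
valid = {t["id"] for t in topics_config["topics"]}

def validate_topics(raw):
    """Keep only ids defined in topics.json; None signals a hard fail."""
    if isinstance(raw, str):
        raw = [t.strip() for t in raw.split(",")]
    if not isinstance(raw, list):
        return None
    kept = [t for t in raw if t in valid]
    return kept or None

print(validate_topics("billing_and_subscription, made_up_topic"))
# ['billing_and_subscription']  (unknown ids are silently dropped)
```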
process_helpscout/data_fetcher.py
ADDED
|
@@ -0,0 +1,77 @@
|
| 1 |
+
"""
|
| 2 |
+
Data Fetcher for the HelpScout processing pipeline.
|
| 3 |
+
|
| 4 |
+
Responsible for:
|
| 5 |
+
1. Fetching raw customer threads from Snowflake (reusing fetch_and_export logic)
|
| 6 |
+
2. Cleaning HTML and aggregating to conversation level
|
| 7 |
+
3. Checking which conversations have already been processed (for deduplication)
|
| 8 |
+
|
| 9 |
+
Reuses fetch_raw(), process_threads(), and aggregate_conversations() from
|
| 10 |
+
fetch_and_export.py so the cleaning and aggregation logic stays in one place.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Set
|
| 16 |
+
|
| 17 |
+
import pandas as pd
|
| 18 |
+
|
| 19 |
+
from snowflake_conn import SnowflakeConn
|
| 20 |
+
from fetch_and_export import fetch_raw, process_threads, aggregate_conversations
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def fetch_conversations(conn: SnowflakeConn) -> pd.DataFrame:
|
| 26 |
+
"""
|
| 27 |
+
Fetch, clean, and aggregate all customer conversations from HelpScout.
|
| 28 |
+
|
| 29 |
+
Returns one row per conversation_id with the following key columns:
|
| 30 |
+
- conversation_id
|
| 31 |
+
- combined_text (all customer messages joined with ' | ')
|
| 32 |
+
- customer_email, customer_first, customer_last, customer_hs_id
|
| 33 |
+
- thread_count, first_message_at, last_message_at, duration_hours
|
| 34 |
+
- status, state, source_type, source_via
|
| 35 |
+
|
| 36 |
+
Returns an empty DataFrame if no data is available.
|
| 37 |
+
"""
|
| 38 |
+
raw_df = fetch_raw(conn)
|
| 39 |
+
if raw_df.empty:
|
| 40 |
+
logger.warning("No raw threads returned from Snowflake.")
|
| 41 |
+
return pd.DataFrame()
|
| 42 |
+
|
| 43 |
+
threads_df = process_threads(raw_df)
|
| 44 |
+
if threads_df.empty:
|
| 45 |
+
logger.warning("All threads were empty after HTML cleaning.")
|
| 46 |
+
return pd.DataFrame()
|
| 47 |
+
|
| 48 |
+
conversations_df = aggregate_conversations(threads_df)
|
| 49 |
+
logger.info(f"Ready to process: {len(conversations_df):,} conversations")
|
| 50 |
+
return conversations_df
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def fetch_processed_ids(
|
| 54 |
+
conn: SnowflakeConn,
|
| 55 |
+
database: str,
|
| 56 |
+
schema: str,
|
| 57 |
+
table: str,
|
| 58 |
+
) -> Set[str]:
|
| 59 |
+
"""
|
| 60 |
+
Return the set of conversation_ids already stored in the output table.
|
| 61 |
+
|
| 62 |
+
Returns an empty set if the table does not exist yet (first run) or if
|
| 63 |
+
the query fails for any other reason; the pipeline will then process
|
| 64 |
+
all conversations.
|
| 65 |
+
"""
|
| 66 |
+
try:
|
| 67 |
+
query = f"SELECT CONVERSATION_ID FROM {database}.{schema}.{table}"
|
| 68 |
+
df = conn.run_query(query, description="fetch_processed_ids")
|
| 69 |
+
ids = set(df["conversation_id"].dropna().astype(str).tolist())
|
| 70 |
+
logger.info(f"Found {len(ids):,} already-processed conversations in {table}")
|
| 71 |
+
return ids
|
| 72 |
+
except Exception as exc:
|
| 73 |
+
logger.warning(
|
| 74 |
+
f"Could not fetch processed IDs from {database}.{schema}.{table} "
|
| 75 |
+
f"(table may not exist yet): {exc}"
|
| 76 |
+
)
|
| 77 |
+
return set()
|
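Taken together, the two helpers support incremental runs along these lines (a sketch mirroring the filtering step in main.py; the output coordinates are the defaults from processing_config.json):

```python
from snowflake_conn import SnowflakeConn
from data_fetcher import fetch_conversations, fetch_processed_ids

conn = SnowflakeConn()
try:
    conversations = fetch_conversations(conn)
    done = fetch_processed_ids(
        conn, "SOCIAL_MEDIA_DB", "ML_FEATURES", "HELPSCOUT_CONVERSATION_FEATURES"
    )
    if conversations.empty:
        print("Nothing to process")
    else:
        new = conversations[~conversations["conversation_id"].astype(str).isin(done)]
        print(f"{len(new):,} conversations still to process")
finally:
    conn.close()
```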
process_helpscout/fetch_and_export.py
ADDED
|
@@ -0,0 +1,183 @@
|
| 1 |
+
"""
|
| 2 |
+
HelpScout Data Fetcher & Exporter
|
| 3 |
+
==================================
|
| 4 |
+
Fetches raw conversation data from Snowflake, cleans HTML bodies,
|
| 5 |
+
computes derived columns, and exports two CSV files:
|
| 6 |
+
|
| 7 |
+
output/helpscout_threads.csv → one row per message thread
|
| 8 |
+
output/helpscout_conversations.csv → one row per conversation (aggregated)
|
| 9 |
+
|
| 10 |
+
Run:
|
| 11 |
+
python fetch_and_export.py
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import logging
|
| 15 |
+
import sys
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
import pandas as pd
|
| 19 |
+
import numpy as np
|
| 20 |
+
|
| 21 |
+
# Local modules
|
| 22 |
+
from snowflake_conn import SnowflakeConn
|
| 23 |
+
from html_cleaner import clean_html_series
|
| 24 |
+
|
| 25 |
+
logging.basicConfig(
|
| 26 |
+
level=logging.INFO,
|
| 27 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 28 |
+
handlers=[logging.StreamHandler(sys.stdout)],
|
| 29 |
+
)
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
# Paths
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
BASE_DIR = Path(__file__).resolve().parent
|
| 36 |
+
SQL_FILE = BASE_DIR / "queries" / "helpscout_conversations.sql"
|
| 37 |
+
OUTPUT_DIR = BASE_DIR / "output"
|
| 38 |
+
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 39 |
+
|
| 40 |
+
THREADS_CSV = OUTPUT_DIR / "helpscout_threads.csv"
|
| 41 |
+
CONVERSATIONS_CSV = OUTPUT_DIR / "helpscout_conversations.csv"
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
# Fetch
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
def fetch_raw(conn: SnowflakeConn) -> pd.DataFrame:
|
| 48 |
+
logger.info("Fetching HelpScout threads from Snowflakeβ¦")
|
| 49 |
+
df = conn.run_query_from_file(SQL_FILE, description="helpscout_conversations")
|
| 50 |
+
logger.info(f"Fetched {len(df):,} raw thread rows.")
|
| 51 |
+
return df
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ---------------------------------------------------------------------------
|
| 55 |
+
# Clean & enrich threads
|
| 56 |
+
# ---------------------------------------------------------------------------
|
| 57 |
+
def process_threads(df: pd.DataFrame) -> pd.DataFrame:
|
| 58 |
+
logger.info("Cleaning HTML bodiesβ¦")
|
| 59 |
+
df = df.copy()
|
| 60 |
+
|
| 61 |
+
# Parse timestamps
|
| 62 |
+
for col in ("created_at", "opened_at"):
|
| 63 |
+
if col in df.columns:
|
| 64 |
+
df[col] = pd.to_datetime(df[col], utc=True, errors="coerce")
|
| 65 |
+
|
| 66 |
+
# Clean HTML → plain text
|
| 67 |
+
df["body_clean"] = clean_html_series(df["body"])
|
| 68 |
+
|
| 69 |
+
# Drop rows where cleaning produced empty text
|
| 70 |
+
before = len(df)
|
| 71 |
+
df = df[df["body_clean"].str.strip().str.len() > 0].copy()
|
| 72 |
+
logger.info(f"Dropped {before - len(df):,} rows with empty body after cleaning.")
|
| 73 |
+
|
| 74 |
+
# Derived columns
|
| 75 |
+
df["word_count"] = df["body_clean"].str.split().str.len().fillna(0).astype(int)
|
| 76 |
+
df["char_count"] = df["body_clean"].str.len().fillna(0).astype(int)
|
| 77 |
+
|
| 78 |
+
# Date helpers
|
| 79 |
+
df["date"] = df["created_at"].dt.date
|
| 80 |
+
df["week"] = df["created_at"].dt.to_period("W").dt.start_time
|
| 81 |
+
df["month"] = df["created_at"].dt.to_period("M").dt.start_time
|
| 82 |
+
df["hour_of_day"] = df["created_at"].dt.hour
|
| 83 |
+
df["day_of_week"] = df["created_at"].dt.day_name()
|
| 84 |
+
|
| 85 |
+
# Normalise free-text columns
|
| 86 |
+
for col in ("source_type", "source_via", "status", "state", "type"):
|
| 87 |
+
if col in df.columns:
|
| 88 |
+
df[col] = df[col].fillna("unknown").str.lower().str.strip()
|
| 89 |
+
|
| 90 |
+
# Identify the display name for the sender (Series defaults keep .fillna valid when a column is missing)
|
| 91 |
+
df["sender_name"] = (
|
| 92 |
+
(df.get("created_by_first", "").fillna("") + " " +
|
| 93 |
+
df.get("created_by_last", "").fillna("")).str.strip()
|
| 94 |
+
)
|
| 95 |
+
df["sender_name"] = df["sender_name"].replace("", "Unknown")
|
| 96 |
+
|
| 97 |
+
logger.info(f"Processed threads: {len(df):,} rows.")
|
| 98 |
+
return df
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
# Aggregate to conversation level
|
| 103 |
+
# ---------------------------------------------------------------------------
|
| 104 |
+
def aggregate_conversations(threads: pd.DataFrame) -> pd.DataFrame:
|
| 105 |
+
logger.info("Aggregating to conversation levelβ¦")
|
| 106 |
+
|
| 107 |
+
agg = (
|
| 108 |
+
threads.groupby("conversation_id")
|
| 109 |
+
.agg(
|
| 110 |
+
first_message_at=("created_at", "min"),
|
| 111 |
+
last_message_at=("created_at", "max"),
|
| 112 |
+
thread_count=("thread_id", "count"),
|
| 113 |
+
customer_email=("customer_email", "first"),
|
| 114 |
+
customer_first=("customer_first", "first"),
|
| 115 |
+
customer_last=("customer_last", "first"),
|
| 116 |
+
customer_hs_id=("customer_hs_id", "first"),
|
| 117 |
+
source_type=("source_type", "first"),
|
| 118 |
+
source_via=("source_via", "first"),
|
| 119 |
+
status=("status", "last"), # last known status
|
| 120 |
+
state=("state", "last"),
|
| 121 |
+
total_word_count=("word_count", "sum"),
|
| 122 |
+
avg_word_count=("word_count", "mean"),
|
| 123 |
+
combined_text=("body_clean", lambda x: " | ".join(x.dropna())),
|
| 124 |
+
)
|
| 125 |
+
.reset_index()
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
# Duration in hours from first to last message
|
| 129 |
+
agg["duration_hours"] = (
|
| 130 |
+
(agg["last_message_at"] - agg["first_message_at"])
|
| 131 |
+
.dt.total_seconds()
|
| 132 |
+
.div(3600)
|
| 133 |
+
.round(2)
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
agg["date"] = agg["first_message_at"].dt.date
|
| 137 |
+
agg["week"] = agg["first_message_at"].dt.to_period("W").dt.start_time
|
| 138 |
+
agg["month"] = agg["first_message_at"].dt.to_period("M").dt.start_time
|
| 139 |
+
|
| 140 |
+
logger.info(f"Aggregated {len(agg):,} unique conversations.")
|
| 141 |
+
return agg
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ---------------------------------------------------------------------------
|
| 145 |
+
# Export
|
| 146 |
+
# ---------------------------------------------------------------------------
|
| 147 |
+
def export(threads: pd.DataFrame, conversations: pd.DataFrame) -> None:
|
| 148 |
+
# Drop raw HTML before saving (keeps CSV manageable)
|
| 149 |
+
threads_export = threads.drop(columns=["body"], errors="ignore")
|
| 150 |
+
|
| 151 |
+
threads_export.to_csv(THREADS_CSV, index=False, encoding="utf-8-sig")
|
| 152 |
+
logger.info(f"Exported threads β {THREADS_CSV}")
|
| 153 |
+
|
| 154 |
+
conversations.to_csv(CONVERSATIONS_CSV, index=False, encoding="utf-8-sig")
|
| 155 |
+
logger.info(f"Exported conversations β {CONVERSATIONS_CSV}")
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# ---------------------------------------------------------------------------
|
| 159 |
+
# Main
|
| 160 |
+
# ---------------------------------------------------------------------------
|
| 161 |
+
def main():
|
| 162 |
+
conn = SnowflakeConn()
|
| 163 |
+
try:
|
| 164 |
+
raw_df = fetch_raw(conn)
|
| 165 |
+
if raw_df.empty:
|
| 166 |
+
logger.warning("No data returned. Check date range and table access.")
|
| 167 |
+
return
|
| 168 |
+
|
| 169 |
+
threads_df = process_threads(raw_df)
|
| 170 |
+
conversations_df = aggregate_conversations(threads_df)
|
| 171 |
+
export(threads_df, conversations_df)
|
| 172 |
+
|
| 173 |
+
logger.info("Done.")
|
| 174 |
+
logger.info(f" Threads: {len(threads_df):,}")
|
| 175 |
+
logger.info(f" Conversations: {len(conversations_df):,}")
|
| 176 |
+
logger.info(f" Unique customers: {conversations_df['customer_email'].nunique():,}")
|
| 177 |
+
|
| 178 |
+
finally:
|
| 179 |
+
conn.close()
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
if __name__ == "__main__":
|
| 183 |
+
main()
|
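As a quick sanity check on the duration_hours derivation in aggregate_conversations(), here is the same expression on made-up timestamps:

```python
import pandas as pd

agg = pd.DataFrame({
    "first_message_at": pd.to_datetime(["2025-01-01 08:00", "2025-01-02 09:30"], utc=True),
    "last_message_at":  pd.to_datetime(["2025-01-01 20:00", "2025-01-04 09:30"], utc=True),
})
agg["duration_hours"] = (
    (agg["last_message_at"] - agg["first_message_at"])
    .dt.total_seconds()
    .div(3600)
    .round(2)
)
print(agg["duration_hours"].tolist())  # [12.0, 48.0]
```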
process_helpscout/html_cleaner.py
ADDED
|
@@ -0,0 +1,169 @@
|
| 1 |
+
"""
|
| 2 |
+
HTML Cleaner for HelpScout message bodies.
|
| 3 |
+
|
| 4 |
+
Strategy:
|
| 5 |
+
1. Remove blockquotes (quoted previous email threads).
|
| 6 |
+
2. Remove Gmail/Outlook quoted-reply wrappers (ex-gmail_extra, gmail_quote, etc.).
|
| 7 |
+
3. Remove HelpScout / marketing email boilerplate sections.
|
| 8 |
+
4. Extract plain text from the remaining DOM.
|
| 9 |
+
5. Strip invisible Unicode spacers (\\u200c, \\u00ad, etc.) and collapse whitespace.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import re
|
| 13 |
+
import unicodedata
|
| 14 |
+
from bs4 import BeautifulSoup, Comment
|
| 15 |
+
|
| 16 |
+
# CSS class / id fragments that indicate quoted / boilerplate content
|
| 17 |
+
_QUOTED_CLASS_PATTERNS = [
|
| 18 |
+
"gmail_extra",
|
| 19 |
+
"gmail_quote",
|
| 20 |
+
"ex-gmail",
|
| 21 |
+
"yahoo_quoted",
|
| 22 |
+
"moz-cite-prefix",
|
| 23 |
+
"OutlookMessageHeader",
|
| 24 |
+
"protonmail_quote",
|
| 25 |
+
"apple-mail-previous",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
# Markers that indicate the start of a quoted section (text-based heuristics)
|
| 29 |
+
_QUOTE_TEXT_MARKERS = [
|
| 30 |
+
r"On .{5,80} wrote:", # "On Mar 2, 2026 ... wrote:"
|
| 31 |
+
r"From:\s",
|
| 32 |
+
r"Sent:\s",
|
| 33 |
+
r"To:\s.*\nCc:",
|
| 34 |
+
r">{1,}", # > quoted lines (plain text fallback)
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
_COMPILED_QUOTE_MARKERS = [re.compile(p, re.IGNORECASE) for p in _QUOTE_TEXT_MARKERS]
|
| 38 |
+
|
| 39 |
+
# Tags whose entire sub-tree we drop unconditionally
|
| 40 |
+
_DROP_TAGS = {"script", "style", "head", "meta", "link", "img", "table"}
|
| 41 |
+
|
| 42 |
+
# Invisible / spacer Unicode characters
|
| 43 |
+
_INVISIBLE_CHARS = re.compile(
|
| 44 |
+
r"[\u00ad\u200b\u200c\u200d\u2060\ufeff\u00a0\u034f]"
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Collapse multiple blank lines to one
|
| 48 |
+
_MULTI_BLANK = re.compile(r"\n{3,}")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _remove_quoted_sections(soup: BeautifulSoup) -> None:
|
| 52 |
+
"""Remove DOM nodes that represent quoted/threaded email history."""
|
| 53 |
+
|
| 54 |
+
# 1. All <blockquote> tags
|
| 55 |
+
for tag in soup.find_all("blockquote"):
|
| 56 |
+
tag.decompose()
|
| 57 |
+
|
| 58 |
+
# 2. Divs / spans with known quoted-reply class names
|
| 59 |
+
# Collect candidates first; decompose() empties child nodes that may
|
| 60 |
+
# still appear later in the iteration, so we skip decomposed tags.
|
| 61 |
+
candidates = soup.find_all(True)
|
| 62 |
+
for tag in candidates:
|
| 63 |
+
if tag.decomposed:
|
| 64 |
+
# Already decomposed (child of a previously decomposed parent)
|
| 65 |
+
continue
|
| 66 |
+
css_classes = " ".join(tag.get("class") or []).lower()
|
| 67 |
+
tag_id = (tag.get("id") or "").lower()
|
| 68 |
+
combined = css_classes + " " + tag_id
|
| 69 |
+
if any(pattern in combined for pattern in _QUOTED_CLASS_PATTERNS):
|
| 70 |
+
tag.decompose()
|
| 71 |
+
|
| 72 |
+
# 3. HTML comments (<!-- --> contain no user text)
|
| 73 |
+
for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
|
| 74 |
+
comment.extract()
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _remove_boilerplate(soup: BeautifulSoup) -> None:
|
| 78 |
+
"""Remove marketing / footer / unsubscribe sections."""
|
| 79 |
+
|
| 80 |
+
# Drop heavy layout tags entirely (tables, images carry no message text)
|
| 81 |
+
for tag in soup.find_all(_DROP_TAGS):
|
| 82 |
+
tag.decompose()
|
| 83 |
+
|
| 84 |
+
# Drop any element whose text is purely an unsubscribe / footer line
|
| 85 |
+
footer_keywords = ["unsubscribe", "musora media", "31265 wheel", "customeriomail"]
|
| 86 |
+
for tag in soup.find_all(True):
|
| 87 |
+
if tag.decomposed:  # skip nodes removed by an earlier decompose()
|
| 88 |
+
continue
|
| 89 |
+
text = tag.get_text(separator=" ", strip=True).lower()
|
| 90 |
+
if any(kw in text for kw in footer_keywords) and len(text) < 300:
|
| 91 |
+
tag.decompose()
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _extract_text(soup: BeautifulSoup) -> str:
|
| 95 |
+
"""Get plain text from the cleaned soup, preserving line breaks."""
|
| 96 |
+
lines = []
|
| 97 |
+
for element in soup.descendants:
|
| 98 |
+
if isinstance(element, str):
|
| 99 |
+
stripped = element.strip()
|
| 100 |
+
if stripped:
|
| 101 |
+
lines.append(stripped)
|
| 102 |
+
elif hasattr(element, "name") and element.name in {"br", "p", "div", "li", "h1", "h2", "h3"}:
|
| 103 |
+
lines.append("\n")
|
| 104 |
+
return " ".join(lines)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _clean_text(raw: str) -> str:
|
| 108 |
+
"""Final text cleanup: invisible chars, excessive whitespace, quote markers."""
|
| 109 |
+
|
| 110 |
+
# Remove invisible spacers
|
| 111 |
+
text = _INVISIBLE_CHARS.sub("", raw)
|
| 112 |
+
|
| 113 |
+
# Normalize unicode (e.g. soft-hyphen variants)
|
| 114 |
+
text = unicodedata.normalize("NFKC", text)
|
| 115 |
+
|
| 116 |
+
# Collapse whitespace sequences (keep single newlines intentional)
|
| 117 |
+
text = re.sub(r"[ \t]+", " ", text)
|
| 118 |
+
text = re.sub(r" \n", "\n", text)
|
| 119 |
+
text = re.sub(r"\n ", "\n", text)
|
| 120 |
+
text = _MULTI_BLANK.sub("\n\n", text)
|
| 121 |
+
|
| 122 |
+
# Remove lines that are purely quote markers ("> some text")
|
| 123 |
+
lines = text.split("\n")
|
| 124 |
+
lines = [ln for ln in lines if not ln.strip().startswith(">")]
|
| 125 |
+
text = "\n".join(lines)
|
| 126 |
+
|
| 127 |
+
# Cut off at first "On <date> wrote:" marker (inline quoted replies)
|
| 128 |
+
for pattern in _COMPILED_QUOTE_MARKERS:
|
| 129 |
+
match = pattern.search(text)
|
| 130 |
+
if match and match.start() > 20: # don't cut if marker is at very start
|
| 131 |
+
text = text[: match.start()].strip()
|
| 132 |
+
break
|
| 133 |
+
|
| 134 |
+
return text.strip()
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def clean_html(html_body: str) -> str:
|
| 138 |
+
"""
|
| 139 |
+
Full pipeline: HTML → clean plain text containing only the customer's message.
|
| 140 |
+
|
| 141 |
+
Args:
|
| 142 |
+
html_body: Raw HTML string from CONVERSATION_THREADS.BODY
|
| 143 |
+
|
| 144 |
+
Returns:
|
| 145 |
+
Clean UTF-8 plain text string.
|
| 146 |
+
"""
|
| 147 |
+
if not html_body or not html_body.strip():
|
| 148 |
+
return ""
|
| 149 |
+
|
| 150 |
+
soup = BeautifulSoup(html_body, "html.parser")
|
| 151 |
+
|
| 152 |
+
_remove_quoted_sections(soup)
|
| 153 |
+
_remove_boilerplate(soup)
|
| 154 |
+
|
| 155 |
+
raw_text = _extract_text(soup)
|
| 156 |
+
return _clean_text(raw_text)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def clean_html_series(series):
|
| 160 |
+
"""
|
| 161 |
+
Vectorized version for a pandas Series.
|
| 162 |
+
|
| 163 |
+
Args:
|
| 164 |
+
series: pd.Series of HTML strings
|
| 165 |
+
|
| 166 |
+
Returns:
|
| 167 |
+
pd.Series of cleaned plain text strings
|
| 168 |
+
"""
|
| 169 |
+
return series.fillna("").apply(clean_html)
|
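A quick check of the cleaner on a representative reply; the HTML sample is invented, and the expected output follows from the steps above (blockquote and gmail_quote removal, then text extraction).

```python
from html_cleaner import clean_html

sample = """
<div>Thanks, the drum lesson PDFs are missing again.</div>
<blockquote>On Mar 2, 2026 support wrote: have you tried refreshing?</blockquote>
<div class="gmail_quote">quoted history here</div>
"""
print(clean_html(sample))
# Thanks, the drum lesson PDFs are missing again.
```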
process_helpscout/main.py
ADDED
|
@@ -0,0 +1,423 @@
|
| 1 |
+
"""
|
| 2 |
+
Main execution script for the HelpScout conversation processing pipeline.
|
| 3 |
+
|
| 4 |
+
Steps:
|
| 5 |
+
1. Fetch all customer conversations from Snowflake (HTML cleaned + aggregated)
|
| 6 |
+
2. Filter out conversations already in the output table
|
| 7 |
+
3. Run sentiment analysis + topic extraction in parallel batches
|
| 8 |
+
4. Append results to SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
|
| 9 |
+
|
| 10 |
+
Run:
|
| 11 |
+
python main.py # process all new conversations, parallel
|
| 12 |
+
python main.py --limit 100 # process at most 100 conversations
|
| 13 |
+
python main.py --sequential # single-process mode (useful for debugging)
|
| 14 |
+
python main.py --config <path> # use a custom config file
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import logging
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import argparse
|
| 22 |
+
import traceback
|
| 23 |
+
from datetime import datetime
|
| 24 |
+
from multiprocessing import Pool, cpu_count
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
from typing import Any, Dict, List
|
| 27 |
+
|
| 28 |
+
import pandas as pd
|
| 29 |
+
from dotenv import load_dotenv
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Path setup: allows imports from the process_helpscout package directory
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
| 35 |
+
ROOT_DIR = SCRIPT_DIR.parent
|
| 36 |
+
|
| 37 |
+
load_dotenv(ROOT_DIR / ".env")
|
| 38 |
+
sys.path.insert(0, str(SCRIPT_DIR))
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# Logging: file + console; log directory is created on first run
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
_logs_dir = SCRIPT_DIR / "logs"
|
| 44 |
+
_logs_dir.mkdir(exist_ok=True)
|
| 45 |
+
|
| 46 |
+
logging.basicConfig(
|
| 47 |
+
level=logging.INFO,
|
| 48 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
| 49 |
+
handlers=[
|
| 50 |
+
logging.FileHandler(
|
| 51 |
+
_logs_dir / f"helpscout_processing_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
|
| 52 |
+
),
|
| 53 |
+
logging.StreamHandler(),
|
| 54 |
+
],
|
| 55 |
+
)
|
| 56 |
+
logger = logging.getLogger(__name__)
|
| 57 |
+
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
# Local imports (after sys.path is set)
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
from snowflake_conn import SnowflakeConn
|
| 62 |
+
from data_fetcher import fetch_conversations, fetch_processed_ids
|
| 63 |
+
from workflow.conversation_processor import ConversationProcessingWorkflow
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
# Batch size helper
|
| 68 |
+
# ---------------------------------------------------------------------------
|
| 69 |
+
|
| 70 |
+
def calculate_optimal_batch_size(
|
| 71 |
+
total: int,
|
| 72 |
+
num_workers: int,
|
| 73 |
+
min_batch: int = 10,
|
| 74 |
+
max_batch: int = 50,
|
| 75 |
+
) -> int:
|
| 76 |
+
"""
|
| 77 |
+
Distribute work evenly across workers within the configured min/max bounds.
|
| 78 |
+
|
| 79 |
+
Args:
|
| 80 |
+
total: Total number of conversations to process
|
| 81 |
+
num_workers: Number of parallel worker processes
|
| 82 |
+
min_batch: Minimum conversations per batch
|
| 83 |
+
max_batch: Maximum conversations per batch
|
| 84 |
+
|
| 85 |
+
Returns:
|
| 86 |
+
Optimal batch size
|
| 87 |
+
"""
|
| 88 |
+
if total <= min_batch:
|
| 89 |
+
return total
|
| 90 |
+
batch_size = total // num_workers
|
| 91 |
+
return max(min_batch, min(max_batch, batch_size))
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
# Batch worker: runs in a separate process (must be module-level for pickle)
|
| 96 |
+
# ---------------------------------------------------------------------------
|
| 97 |
+
|
| 98 |
+
def process_batch_worker(batch_data: tuple) -> dict:
|
| 99 |
+
"""
|
| 100 |
+
Worker function executed in a separate process for one batch of conversations.
|
| 101 |
+
|
| 102 |
+
Each worker creates its own Snowflake connection and workflow instance so
|
| 103 |
+
resources are not shared across processes.
|
| 104 |
+
|
| 105 |
+
Args:
|
| 106 |
+
batch_data: (batch_num, conversations, config, api_key)
|
| 107 |
+
|
| 108 |
+
Returns:
|
| 109 |
+
Statistics dict for this batch.
|
| 110 |
+
"""
|
| 111 |
+
batch_num, batch_conversations, config, api_key = batch_data
|
| 112 |
+
worker_logger = logging.getLogger(f"Worker-{batch_num}")
|
| 113 |
+
|
| 114 |
+
try:
|
| 115 |
+
worker_logger.info(f"Batch {batch_num}: Processing {len(batch_conversations)} conversations")
|
| 116 |
+
|
| 117 |
+
# Worker-local Snowflake connection and workflow
|
| 118 |
+
conn = SnowflakeConn()
|
| 119 |
+
workflow = ConversationProcessingWorkflow(config, api_key)
|
| 120 |
+
|
| 121 |
+
# Run the workflow
|
| 122 |
+
results = workflow.process_batch(batch_conversations)
|
| 123 |
+
results_df = pd.DataFrame(results)
|
| 124 |
+
|
| 125 |
+
# Separate successful results
|
| 126 |
+
initial_count = len(results_df)
|
| 127 |
+
df_ok = results_df[results_df["success"] == True].copy()
|
| 128 |
+
failed_count = initial_count - len(df_ok)
|
| 129 |
+
|
| 130 |
+
worker_logger.info(
|
| 131 |
+
f"Batch {batch_num}: {len(df_ok)} successful, {failed_count} failed"
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
# ----------------------------------------------------------------
|
| 135 |
+
# Build output DataFrame with Snowflake column names
|
| 136 |
+
# ----------------------------------------------------------------
|
| 137 |
+
column_map = {
|
| 138 |
+
"conversation_id": "CONVERSATION_ID",
|
| 139 |
+
"customer_email": "CUSTOMER_EMAIL",
|
| 140 |
+
"customer_first": "CUSTOMER_FIRST",
|
| 141 |
+
"customer_last": "CUSTOMER_LAST",
|
| 142 |
+
"customer_hs_id": "CUSTOMER_HS_ID",
|
| 143 |
+
"thread_count": "THREAD_COUNT",
|
| 144 |
+
"first_message_at": "FIRST_MESSAGE_AT",
|
| 145 |
+
"last_message_at": "LAST_MESSAGE_AT",
|
| 146 |
+
"duration_hours": "DURATION_HOURS",
|
| 147 |
+
"status": "STATUS",
|
| 148 |
+
"state": "STATE",
|
| 149 |
+
"source_type": "SOURCE_TYPE",
|
| 150 |
+
"source_via": "SOURCE_VIA",
|
| 151 |
+
"combined_text": "COMBINED_TEXT",
|
| 152 |
+
"conversation_text": "CONVERSATION_TEXT_USED",
|
| 153 |
+
"sentiment_polarity": "SENTIMENT_POLARITY",
|
| 154 |
+
"emotions": "EMOTIONS",
|
| 155 |
+
"sentiment_confidence": "SENTIMENT_CONFIDENCE",
|
| 156 |
+
"sentiment_notes": "SENTIMENT_NOTES",
|
| 157 |
+
"topics": "TOPICS",
|
| 158 |
+
"is_refund_request": "IS_REFUND_REQUEST",
|
| 159 |
+
"is_cancellation": "IS_CANCELLATION",
|
| 160 |
+
"is_membership": "IS_MEMBERSHIP",
|
| 161 |
+
"topic_confidence": "TOPIC_CONFIDENCE",
|
| 162 |
+
"topic_notes": "TOPIC_NOTES",
|
| 163 |
+
"summary": "SUMMARY",
|
| 164 |
+
"processing_errors": "PROCESSING_ERRORS",
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
output_df = pd.DataFrame()
|
| 168 |
+
for src_col, tgt_col in column_map.items():
|
| 169 |
+
output_df[tgt_col] = df_ok[src_col] if src_col in df_ok.columns else None
|
| 170 |
+
|
| 171 |
+
# Flatten processing_errors list to a semicolon-separated string
|
| 172 |
+
if "PROCESSING_ERRORS" in output_df.columns:
|
| 173 |
+
output_df["PROCESSING_ERRORS"] = output_df["PROCESSING_ERRORS"].apply(
|
| 174 |
+
lambda x: "; ".join(x) if isinstance(x, list) else (str(x) if x else None)
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
# Pipeline metadata
|
| 178 |
+
output_df["PROCESSED_AT"] = datetime.now()
|
| 179 |
+
output_df["WORKFLOW_VERSION"] = "1.0"
|
| 180 |
+
|
| 181 |
+
# ----------------------------------------------------------------
|
| 182 |
+
# Store to Snowflake
|
| 183 |
+
# ----------------------------------------------------------------
|
| 184 |
+
out_cfg = config["output"]
|
| 185 |
+
if not output_df.empty:
|
| 186 |
+
conn.store_df_to_snowflake(
|
| 187 |
+
table_name=out_cfg["table"],
|
| 188 |
+
dataframe=output_df,
|
| 189 |
+
database=out_cfg["database"],
|
| 190 |
+
schema=out_cfg["schema"],
|
| 191 |
+
overwrite=False, # Always append; deduplication is handled upstream
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
conn.close()
|
| 195 |
+
|
| 196 |
+
return {
|
| 197 |
+
"batch_num": batch_num,
|
| 198 |
+
"success": True,
|
| 199 |
+
"total_processed": initial_count,
|
| 200 |
+
"total_stored": len(output_df),
|
| 201 |
+
"failed_count": failed_count,
|
| 202 |
+
"error": None,
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
except Exception as exc:
|
| 206 |
+
error_msg = f"Batch {batch_num} failed: {exc}"
|
| 207 |
+
worker_logger.error(error_msg)
|
| 208 |
+
worker_logger.error(traceback.format_exc())
|
| 209 |
+
return {
|
| 210 |
+
"batch_num": batch_num,
|
| 211 |
+
"success": False,
|
| 212 |
+
"total_processed": len(batch_conversations),
|
| 213 |
+
"total_stored": 0,
|
| 214 |
+
"failed_count": len(batch_conversations),
|
| 215 |
+
"error": str(exc),
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# ---------------------------------------------------------------------------
|
| 220 |
+
# Main processor class
|
| 221 |
+
# ---------------------------------------------------------------------------
|
| 222 |
+
|
| 223 |
+
class HelpScoutProcessor:
|
| 224 |
+
"""
|
| 225 |
+
Orchestrates the end-to-end HelpScout conversation processing pipeline.
|
| 226 |
+
|
| 227 |
+
Typical usage:
|
| 228 |
+
processor = HelpScoutProcessor()
|
| 229 |
+
processor.run(limit=500)
|
| 230 |
+
"""
|
| 231 |
+
|
| 232 |
+
def __init__(self, config_path: str = None):
|
| 233 |
+
"""
|
| 234 |
+
Args:
|
| 235 |
+
config_path: Path to processing_config.json.
|
| 236 |
+
Defaults to config_files/processing_config.json
|
| 237 |
+
relative to this script.
|
| 238 |
+
"""
|
| 239 |
+
if config_path is None:
|
| 240 |
+
config_path = SCRIPT_DIR / "config_files" / "processing_config.json"
|
| 241 |
+
|
| 242 |
+
with open(config_path, "r") as f:
|
| 243 |
+
self.config = json.load(f)
|
| 244 |
+
|
| 245 |
+
self.conn = SnowflakeConn()
|
| 246 |
+
|
| 247 |
+
self.api_key = os.getenv("OPENAI_API_KEY")
|
| 248 |
+
if not self.api_key:
|
| 249 |
+
raise ValueError("OPENAI_API_KEY not found in environment variables")
|
| 250 |
+
|
| 251 |
+
logger.info("HelpScoutProcessor initialized")
|
| 252 |
+
|
| 253 |
+
def _calculate_num_workers(self) -> int:
|
| 254 |
+
"""CPU count minus 2, capped at 5 β mirrors the processing_comments pattern."""
|
| 255 |
+
num_cpus = cpu_count()
|
| 256 |
+
num_workers = max(1, min(5, num_cpus - 2))
|
| 257 |
+
logger.info(f"Using {num_workers} parallel workers (CPU count: {num_cpus})")
|
| 258 |
+
return num_workers
|
| 259 |
+
|
| 260 |
+
def run(self, limit: int = None, sequential: bool = False):
|
| 261 |
+
"""
|
| 262 |
+
Execute the full pipeline.
|
| 263 |
+
|
| 264 |
+
Args:
|
| 265 |
+
limit: Cap the number of conversations processed in this run.
|
| 266 |
+
Useful for incremental or test runs. Default: process all new.
|
| 267 |
+
sequential: If True, bypass multiprocessing (single-process debug mode).
|
| 268 |
+
"""
|
| 269 |
+
try:
|
| 270 |
+
logger.info("=" * 70)
|
| 271 |
+
logger.info("HelpScout Conversation Processing Pipeline")
|
| 272 |
+
logger.info(f"Mode: {'SEQUENTIAL (debug)' if sequential else 'PARALLEL'}")
|
| 273 |
+
logger.info("=" * 70)
|
| 274 |
+
|
| 275 |
+
# ------------------------------------------------------------------
|
| 276 |
+
# Step 1: Fetch + preprocess conversations
|
| 277 |
+
# ------------------------------------------------------------------
|
| 278 |
+
logger.info("Step 1: Fetching conversations from Snowflake...")
|
| 279 |
+
conversations_df = fetch_conversations(self.conn)
|
| 280 |
+
|
| 281 |
+
if conversations_df.empty:
|
| 282 |
+
logger.warning("No conversations returned. Exiting.")
|
| 283 |
+
return
|
| 284 |
+
|
| 285 |
+
logger.info(f"Fetched {len(conversations_df):,} total conversations")
|
| 286 |
+
|
| 287 |
+
# ------------------------------------------------------------------
|
| 288 |
+
# Step 2: Skip already-processed conversations
|
| 289 |
+
# ------------------------------------------------------------------
|
| 290 |
+
out_cfg = self.config["output"]
|
| 291 |
+
processed_ids = fetch_processed_ids(
|
| 292 |
+
self.conn,
|
| 293 |
+
out_cfg["database"],
|
| 294 |
+
out_cfg["schema"],
|
| 295 |
+
out_cfg["table"],
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
if processed_ids:
|
| 299 |
+
before = len(conversations_df)
|
| 300 |
+
conversations_df = conversations_df[
|
| 301 |
+
~conversations_df["conversation_id"].astype(str).isin(processed_ids)
|
| 302 |
+
].copy()
|
| 303 |
+
skipped = before - len(conversations_df)
|
| 304 |
+
logger.info(f"Skipped {skipped:,} already-processed conversations")
|
| 305 |
+
|
| 306 |
+
if conversations_df.empty:
|
| 307 |
+
logger.info("All conversations are already processed. Nothing to do.")
|
| 308 |
+
return
|
| 309 |
+
|
| 310 |
+
# ------------------------------------------------------------------
|
| 311 |
+
# Step 3: Apply optional limit
|
| 312 |
+
# ------------------------------------------------------------------
|
| 313 |
+
if limit:
|
| 314 |
+
conversations_df = conversations_df.head(limit)
|
| 315 |
+
logger.info(f"Limit applied: processing {len(conversations_df):,} conversations")
|
| 316 |
+
|
| 317 |
+
total = len(conversations_df)
|
| 318 |
+
logger.info(f"Processing {total:,} new conversations...")
|
| 319 |
+
|
| 320 |
+
# ------------------------------------------------------------------
|
| 321 |
+
# Step 4: Split into batches
|
| 322 |
+
# ------------------------------------------------------------------
|
| 323 |
+
num_workers = self._calculate_num_workers()
|
| 324 |
+
proc_cfg = self.config.get("processing", {})
|
| 325 |
+
batch_size = calculate_optimal_batch_size(
|
| 326 |
+
total,
|
| 327 |
+
num_workers,
|
| 328 |
+
min_batch=proc_cfg.get("min_batch_size", 10),
|
| 329 |
+
max_batch=proc_cfg.get("max_batch_size", 50),
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
conversations = conversations_df.to_dict("records")
|
| 333 |
+
batches = []
|
| 334 |
+
for i in range(0, total, batch_size):
|
| 335 |
+
batch = conversations[i : i + batch_size]
|
| 336 |
+
batch_num = (i // batch_size) + 1
|
| 337 |
+
batches.append((batch_num, batch, self.config, self.api_key))
|
| 338 |
+
|
| 339 |
+
logger.info(
|
| 340 |
+
f"Split into {len(batches)} batch(es) "
|
| 341 |
+
f"(batch size: {batch_size}, workers: {num_workers})"
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
# ------------------------------------------------------------------
|
| 345 |
+
# Step 5: Run batches
|
| 346 |
+
# ------------------------------------------------------------------
|
| 347 |
+
start_time = datetime.now()
|
| 348 |
+
|
| 349 |
+
if sequential:
|
| 350 |
+
results = [process_batch_worker(b) for b in batches]
|
| 351 |
+
else:
|
| 352 |
+
with Pool(processes=num_workers) as pool:
|
| 353 |
+
results = pool.map(process_batch_worker, batches)
|
| 354 |
+
|
| 355 |
+
elapsed = (datetime.now() - start_time).total_seconds()
|
| 356 |
+
|
| 357 |
+
# ------------------------------------------------------------------
|
| 358 |
+
# Step 6: Summary
|
| 359 |
+
# ------------------------------------------------------------------
|
| 360 |
+
total_processed = sum(r["total_processed"] for r in results)
|
| 361 |
+
total_stored = sum(r["total_stored"] for r in results)
|
| 362 |
+
total_failed = sum(r["failed_count"] for r in results)
|
| 363 |
+
failed_batches = [r for r in results if not r["success"]]
|
| 364 |
+
|
| 365 |
+
logger.info("=" * 70)
|
| 366 |
+
logger.info("Pipeline Summary")
|
| 367 |
+
logger.info(f" Output table : {out_cfg['database']}.{out_cfg['schema']}.{out_cfg['table']}")
|
| 368 |
+
logger.info(f" Processed : {total_processed:,}")
|
| 369 |
+
logger.info(f" Stored : {total_stored:,}")
|
| 370 |
+
logger.info(f" Failed : {total_failed:,}")
|
| 371 |
+
if failed_batches:
|
| 372 |
+
logger.error(f" Failed batches ({len(failed_batches)}):")
|
| 373 |
+
for fb in failed_batches:
|
| 374 |
+
logger.error(f" Batch {fb['batch_num']}: {fb['error']}")
|
| 375 |
+
logger.info(f" Elapsed : {elapsed:.1f}s")
|
| 376 |
+
logger.info(
|
| 377 |
+
f" Avg per conv : {elapsed / max(total_processed, 1):.2f}s"
|
| 378 |
+
)
|
| 379 |
+
logger.info("=" * 70)
|
| 380 |
+
|
| 381 |
+
except Exception as exc:
|
| 382 |
+
logger.error(f"Pipeline failed: {exc}", exc_info=True)
|
| 383 |
+
raise
|
| 384 |
+
|
| 385 |
+
finally:
|
| 386 |
+
self.conn.close()
|
| 387 |
+
logger.info("Snowflake connection closed")
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
# ---------------------------------------------------------------------------
|
| 391 |
+
# CLI entry point
|
| 392 |
+
# ---------------------------------------------------------------------------
|
| 393 |
+
|
| 394 |
+
def main():
|
| 395 |
+
parser = argparse.ArgumentParser(
|
| 396 |
+
description="Process HelpScout conversations: sentiment analysis + topic extraction"
|
| 397 |
+
)
|
| 398 |
+
parser.add_argument(
|
| 399 |
+
"--limit",
|
| 400 |
+
type=int,
|
| 401 |
+
default=None,
|
| 402 |
+
help="Maximum number of new conversations to process in this run (default: all)",
|
| 403 |
+
)
|
| 404 |
+
parser.add_argument(
|
| 405 |
+
"--sequential",
|
| 406 |
+
action="store_true",
|
| 407 |
+
default=False,
|
| 408 |
+
help="Single-process mode β useful for debugging (default: parallel)",
|
| 409 |
+
)
|
| 410 |
+
parser.add_argument(
|
| 411 |
+
"--config",
|
| 412 |
+
type=str,
|
| 413 |
+
default=None,
|
| 414 |
+
help="Path to processing_config.json (default: config_files/processing_config.json)",
|
| 415 |
+
)
|
| 416 |
+
args = parser.parse_args()
|
| 417 |
+
|
| 418 |
+
processor = HelpScoutProcessor(config_path=args.config)
|
| 419 |
+
processor.run(limit=args.limit, sequential=args.sequential)
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
if __name__ == "__main__":
|
| 423 |
+
main()
|
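The summary in Step 6 assumes a fixed result contract from `process_batch_worker`. A minimal sketch of that contract, with key names inferred from the aggregation code above (the real worker is defined earlier in `main.py`):

```python
# Hedged sketch: key names inferred from the Step 6 aggregation,
# not the authoritative worker implementation.
def process_batch_worker_stub(args):
    batch_num, batch, config, api_key = args
    return {
        "batch_num": batch_num,         # echoed back for failure reporting
        "success": True,                # False routes this result into failed_batches
        "error": None,                  # error message when success is False
        "total_processed": len(batch),  # conversations run through the workflow
        "total_stored": len(batch),     # rows actually written to Snowflake
        "failed_count": 0,              # conversations that errored inside the batch
    }
```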
process_helpscout/snowflake_conn.py
ADDED
@@ -0,0 +1,106 @@
"""
Snowflake connection layer for the HelpScout processing module.
Adapted from processing_comments/SnowFlakeConnection.py.
"""
import os
from pathlib import Path
from snowflake.snowpark import Session
from dotenv import load_dotenv
import logging

logger = logging.getLogger(__name__)

# Load .env from the project root (two levels up from this file)
_root_env = Path(__file__).resolve().parent.parent / ".env"
load_dotenv(dotenv_path=_root_env)


class SnowflakeConn:
    """Thin wrapper around Snowpark Session for running read queries."""

    def __init__(self):
        self.session = self._connect()

    # ------------------------------------------------------------------
    def _connect(self) -> Session:
        conn_params = dict(
            user=os.getenv("SNOWFLAKE_USER"),
            password=os.getenv("SNOWFLAKE_PASSWORD"),
            account=os.getenv("SNOWFLAKE_ACCOUNT"),
            role=os.getenv("SNOWFLAKE_ROLE"),
            warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
            # No default database/schema; queries use fully-qualified names
        )
        session = Session.builder.configs(conn_params).create()
        logger.info("Snowflake session created successfully.")
        return session

    # ------------------------------------------------------------------
    def run_query(self, query: str, description: str = "query"):
        """Execute a SELECT query and return a pandas DataFrame."""
        try:
            df = self.session.sql(query).to_pandas()
            df.columns = df.columns.str.lower()
            logger.info(f"Query '{description}' returned {len(df):,} rows.")
            return df
        except Exception as exc:
            logger.error(f"Error executing '{description}': {exc}")
            raise

    # ------------------------------------------------------------------
    def run_query_from_file(self, sql_path: str, description: str = ""):
        """Read a .sql file and execute it, returning a pandas DataFrame."""
        sql_path = Path(sql_path)
        if not sql_path.exists():
            raise FileNotFoundError(f"SQL file not found: {sql_path}")
        query = sql_path.read_text(encoding="utf-8")
        return self.run_query(query, description or sql_path.name)

    # ------------------------------------------------------------------
    def store_df_to_snowflake(
        self,
        table_name: str,
        dataframe,
        database: str,
        schema: str,
        overwrite: bool = False,
    ):
        """
        Write a pandas DataFrame to a Snowflake table.

        Args:
            table_name: Target table name (without database/schema prefix)
            dataframe: pandas DataFrame to write
            database: Target Snowflake database
            schema: Target Snowflake schema
            overwrite: If True, truncate the table before inserting;
                if False (default), append rows
        """
        if dataframe is None or len(dataframe) == 0:
            logger.warning(f"store_df_to_snowflake: empty DataFrame, skipping write to {table_name}")
            return

        mode = "overwrite" if overwrite else "append"
        try:
            self.session.write_pandas(
                df=dataframe,
                table_name=table_name,
                database=database,
                schema=schema,
                overwrite=overwrite,
                auto_create_table=False,  # Table must be created via SQL first
                quote_identifiers=False,
                use_logical_type=True,
            )
            logger.info(
                f"Stored {len(dataframe):,} rows to {database}.{schema}.{table_name} "
                f"(mode={mode})"
            )
        except Exception as exc:
            logger.error(f"Error storing to {database}.{schema}.{table_name}: {exc}")
            raise

    # ------------------------------------------------------------------
    def close(self):
        self.session.close()
        logger.info("Snowflake session closed.")
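For orientation, a minimal usage sketch of this wrapper; the query and target-table names below are illustrative placeholders, not values the module ships with:

```python
# Illustrative usage only - query/table names are placeholders.
conn = SnowflakeConn()
try:
    df = conn.run_query(
        "SELECT * FROM SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES LIMIT 10",
        description="sample fetch",
    )
    conn.store_df_to_snowflake(
        table_name="SOME_TARGET_TABLE",  # must already exist (auto_create_table=False)
        dataframe=df,
        database="SOCIAL_MEDIA_DB",
        schema="ML_FEATURES",
        overwrite=False,  # append mode
    )
finally:
    conn.close()
```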
process_helpscout/workflow/__init__.py
ADDED
File without changes
process_helpscout/workflow/conversation_processor.py
ADDED
@@ -0,0 +1,334 @@
"""
Conversation Processing Workflow using LangGraph.

Two-node linear graph:
    sentiment_analysis → topic_extraction → END

All conversations are assumed to be in English (no translation step).
The workflow operates on the full customer conversation text, pre-formatted
and truncated upstream before entering the graph.
"""

from typing import Dict, Any, List, TypedDict, Annotated
import operator
import json
import os
from pathlib import Path
from langgraph.graph import StateGraph, END
from agents.sentiment_analysis_agent import SentimentAnalysisAgent
from agents.topic_extraction_agent import TopicExtractionAgent
import logging

logger = logging.getLogger(__name__)

# Maximum characters to send to the LLM; balances context richness vs. cost
_MAX_CONVERSATION_CHARS = 5000


class ConversationState(TypedDict):
    """
    State flowing through the conversation processing workflow.

    Source fields come from the aggregated conversations DataFrame.
    Processing fields are added/updated by each workflow node.
    """
    # --- Source / aggregation fields ---
    conversation_id: str
    customer_email: str
    customer_first: str
    customer_last: str
    customer_hs_id: Any
    thread_count: int
    first_message_at: Any
    last_message_at: Any
    duration_hours: float
    status: str
    state: str
    source_type: str
    source_via: str
    combined_text: str  # Raw aggregated customer messages (pipe-separated)

    # --- Pipeline input ---
    conversation_text: str  # Formatted + truncated text sent to agents

    # --- Sentiment analysis outputs ---
    sentiment_polarity: str
    emotions: str  # Comma-separated emotion values, or None
    sentiment_confidence: str
    sentiment_notes: str

    # --- Topic extraction outputs ---
    topics: str  # Comma-separated topic IDs
    is_refund_request: bool
    is_cancellation: bool
    is_membership: bool
    topic_confidence: str
    topic_notes: str
    summary: str  # 2-3 sentence neutral conversation summary

    # --- Metadata ---
    processing_errors: Annotated[List[str], operator.add]
    success: bool


class ConversationProcessingWorkflow:
    """
    LangGraph-based workflow for processing HelpScout conversations.

    Graph structure:
        [START] → sentiment_analysis → topic_extraction → [END]

    Both nodes receive the same conversation_text. The workflow is
    intentionally linear (no conditional edges) because every
    conversation goes through both steps.
    """

    def __init__(self, config: Dict[str, Any], api_key: str):
        """
        Args:
            config: Full processing_config.json content
            api_key: OpenAI API key
        """
        self.config = config
        self.api_key = api_key

        # Agent-level configs
        sentiment_agent_config = config["agents"]["sentiment_analysis"]
        topic_agent_config = config["agents"]["topic_extraction"]

        # Load topics.json; path is relative to this file's parent directory
        workflow_dir = Path(__file__).resolve().parent
        module_dir = workflow_dir.parent
        topics_path = module_dir / "config_files" / "topics.json"

        with open(topics_path, "r") as f:
            topics_config = json.load(f)

        # Override max chars from config if provided
        proc = config.get("processing", {})
        self._max_chars = proc.get("max_conversation_chars", _MAX_CONVERSATION_CHARS)

        # Initialize agents
        self.sentiment_agent = SentimentAnalysisAgent(sentiment_agent_config, api_key, config)
        self.topic_agent = TopicExtractionAgent(topic_agent_config, api_key, topics_config)

        # Compile workflow graph
        self.workflow = self._build_workflow()
        logger.info("ConversationProcessingWorkflow initialized")

    # ------------------------------------------------------------------
    # Graph construction
    # ------------------------------------------------------------------

    def _build_workflow(self) -> StateGraph:
        graph = StateGraph(ConversationState)

        graph.add_node("sentiment_analysis", self._sentiment_node)
        graph.add_node("topic_extraction", self._topic_node)

        graph.set_entry_point("sentiment_analysis")
        graph.add_edge("sentiment_analysis", "topic_extraction")
        graph.add_edge("topic_extraction", END)

        return graph.compile()

    # ------------------------------------------------------------------
    # Preprocessing
    # ------------------------------------------------------------------

    def _format_conversation(self, combined_text: str) -> str:
        """
        Convert pipe-separated combined_text into a numbered message format
        suitable for the LLM, truncated to self._max_chars.

        Input:  "I can't log in | Still not working | Please help!"
        Output: "[1] I can't log in\n[2] Still not working\n[3] Please help!"
        """
        if not combined_text or not str(combined_text).strip():
            return ""

        messages = [m.strip() for m in str(combined_text).split("|") if m.strip()]
        total_messages = len(messages)

        parts = []
        char_count = 0

        for i, msg in enumerate(messages, 1):
            entry = f"[{i}] {msg}"
            if char_count + len(entry) + 1 > self._max_chars:
                parts.append(f"[...truncated after {i - 1} of {total_messages} messages]")
                break
            parts.append(entry)
            char_count += len(entry) + 1

        return "\n".join(parts)

    # ------------------------------------------------------------------
    # Workflow nodes
    # ------------------------------------------------------------------

    def _sentiment_node(self, state: ConversationState) -> ConversationState:
        """Node 1: Classify sentiment polarity and emotions."""
        try:
            # Format conversation text once; reused by both nodes
            state["conversation_text"] = self._format_conversation(state.get("combined_text", ""))

            if not state["conversation_text"]:
                state["processing_errors"] = state.get("processing_errors", []) + [
                    "Empty conversation text after formatting"
                ]
                state["success"] = False
                return state

            result = self.sentiment_agent.process({"conversation_text": state["conversation_text"]})

            if result.get("success", False):
                state["sentiment_polarity"] = result.get("sentiment_polarity")
                state["emotions"] = result.get("emotions")
                state["sentiment_confidence"] = result.get("sentiment_confidence")
                state["sentiment_notes"] = result.get("sentiment_notes", "")
                state["success"] = True
            else:
                error_msg = f"Sentiment analysis failed: {result.get('error', 'Unknown error')}"
                state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
                state["success"] = False
                state["sentiment_polarity"] = None
                state["emotions"] = None
                state["sentiment_confidence"] = None
                state["sentiment_notes"] = ""

            logger.debug(f"Sentiment: {state['sentiment_polarity']} | Conversation: {state['conversation_id']}")

        except Exception as e:
            error_msg = f"Sentiment node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            state["success"] = False

        return state

    def _topic_node(self, state: ConversationState) -> ConversationState:
        """Node 2: Extract topic tags and billing flags."""
        try:
            # Skip topic extraction if sentiment already failed; no point in a partial record
            if not state.get("success", False):
                state["topics"] = None
                state["is_refund_request"] = False
                state["is_cancellation"] = False
                state["is_membership"] = False
                state["topic_confidence"] = None
                state["topic_notes"] = ""
                state["summary"] = ""
                return state

            result = self.topic_agent.process({"conversation_text": state["conversation_text"]})

            if result.get("success", False):
                state["topics"] = result.get("topics")
                state["is_refund_request"] = result.get("is_refund_request", False)
                state["is_cancellation"] = result.get("is_cancellation", False)
                state["is_membership"] = result.get("is_membership", False)
                state["topic_confidence"] = result.get("topic_confidence")
                state["topic_notes"] = result.get("topic_notes", "")
                state["summary"] = result.get("summary", "")
                state["success"] = True
            else:
                error_msg = f"Topic extraction failed: {result.get('error', 'Unknown error')}"
                state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
                state["success"] = False
                state["topics"] = None
                state["is_refund_request"] = False
                state["is_cancellation"] = False
                state["is_membership"] = False
                state["topic_confidence"] = None
                state["topic_notes"] = ""
                state["summary"] = ""

            logger.debug(f"Topics: {state['topics']} | Conversation: {state['conversation_id']}")

        except Exception as e:
            error_msg = f"Topic node error: {str(e)}"
            logger.error(error_msg)
            state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
            state["success"] = False

        return state

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def process_conversation(self, conversation_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a single conversation through the full workflow.

        Args:
            conversation_data: Dict with aggregated conversation fields
                (conversation_id, combined_text, customer_*, etc.)

        Returns:
            Dict with all original fields plus extracted sentiment and topic fields.
        """
        combined_text = conversation_data.get("combined_text", "")

        if not combined_text or not str(combined_text).strip():
            logger.warning(f"Skipping conversation with empty text: {conversation_data.get('conversation_id')}")
            return {
                **conversation_data,
                "success": False,
                "processing_errors": ["combined_text is empty; nothing to analyze"],
                "conversation_text": "",
            }

        initial_state = {
            "conversation_id": str(conversation_data.get("conversation_id", "")),
            "customer_email": conversation_data.get("customer_email"),
            "customer_first": conversation_data.get("customer_first"),
            "customer_last": conversation_data.get("customer_last"),
            "customer_hs_id": conversation_data.get("customer_hs_id"),
            "thread_count": conversation_data.get("thread_count"),
            "first_message_at": conversation_data.get("first_message_at"),
            "last_message_at": conversation_data.get("last_message_at"),
            "duration_hours": conversation_data.get("duration_hours"),
            "status": conversation_data.get("status"),
            "state": conversation_data.get("state"),
            "source_type": conversation_data.get("source_type"),
            "source_via": conversation_data.get("source_via"),
            "combined_text": str(combined_text).strip(),
            "conversation_text": "",  # filled by the sentiment node
            "processing_errors": [],
            "success": True,
        }

        try:
            final_state = self.workflow.invoke(initial_state)

            # Merge any extra fields from the source that weren't in initial_state
            result = dict(final_state)
            for key, value in conversation_data.items():
                if key not in result:
                    result[key] = value

            return result

        except Exception as e:
            logger.error(f"Workflow execution error for {conversation_data.get('conversation_id')}: {e}")
            return {
                **conversation_data,
                "success": False,
                "processing_errors": [str(e)],
                "conversation_text": "",
            }

    def process_batch(self, conversations: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Process a list of conversations sequentially within the batch."""
        results = []
        total = len(conversations)

        for idx, conv in enumerate(conversations, 1):
            logger.info(f"Processing conversation {idx}/{total} (id={conv.get('conversation_id')})")
            result = self.process_conversation(conv)
            results.append(result)

        logger.info(f"Batch complete: {total} conversations processed")
        return results
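A minimal invocation sketch for this workflow (assumes the agent modules and `langgraph` are importable and `OPENAI_API_KEY` is set; the config path follows the file list in this commit, and the conversation values are placeholders):

```python
# Sketch only: requires langgraph, the agent modules, and a valid API key.
import json
import os

with open("process_helpscout/config_files/processing_config.json") as f:
    config = json.load(f)

workflow = ConversationProcessingWorkflow(config, api_key=os.environ["OPENAI_API_KEY"])
result = workflow.process_conversation({
    "conversation_id": "12345",  # placeholder id
    "combined_text": "I can't log in | Still not working | Please help!",
})
print(result["success"], result.get("sentiment_polarity"), result.get("topics"))
```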
visualization/README.md
CHANGED
# Musora Sentiment Analysis Dashboard

A Streamlit dashboard for visualising sentiment analysis results from **social media comments** (Facebook, Instagram, YouTube, Twitter), the **Musora internal app**, and **HelpScout customer support conversations** across brands (Drumeo, Pianote, Guitareo, Singeo, Musora).

---

4. [Pages](#pages)
5. [Global Filters & Session State](#global-filters--session-state)
6. [Snowflake Queries](#snowflake-queries)
7. [Authentication](#authentication)
8. [PDF Reports](#pdf-reports)
9. [AI Agents](#ai-agents)
10. [Adding or Changing Things](#adding-or-changing-things)
11. [Running the App](#running-the-app)
12. [Configuration Reference](#configuration-reference)

---

```
visualization/
├── app.py                          # Entry point - routing, sidebar, session state
├── config/
│   └── viz_config.json             # Colors, query strings, dashboard settings
├── data/
│   ├── data_loader.py              # Comment Snowflake queries and caching
│   └── helpscout_data_loader.py    # HelpScout Snowflake queries and caching
├── utils/
│   ├── auth.py                     # Login page, authentication helpers
│   ├── data_processor.py           # Pandas aggregations (intent dist, content summary, etc.)
│   ├── metrics.py                  # KPI calculations (sentiment score, urgency, etc.)
│   ├── pdf_exporter.py             # DashboardPDFExporter (comment dashboard PDF)
│   ├── helpscout_utils.py          # Pure helpers: parse_topics, explode_topics, boolean_flag_counts
│   └── helpscout_pdf.py            # HelpScoutDashboardPDF + HelpScoutAnalysisPDF
├── components/
│   ├── dashboard.py                # Comment Dashboard page renderer
│   ├── sentiment_analysis.py       # Sentiment Analysis page renderer
│   ├── reply_required.py           # Reply Required page renderer
│   ├── helpscout_dashboard.py      # HelpScout Dashboard page + compact summary widget
│   └── helpscout_analysis.py       # HelpScout Analysis page (filter → fetch → charts → LLM → PDF)
├── visualizations/
│   ├── sentiment_charts.py         # Plotly sentiment chart functions
│   ├── distribution_charts.py      # Plotly distribution / heatmap / scatter functions
│   ├── demographic_charts.py       # Plotly demographic chart functions
│   ├── content_cards.py            # Streamlit card components (comment + content cards)
│   └── helpscout_charts.py         # HelpScoutCharts Plotly factory (16 chart types)
├── agents/
│   ├── base_agent.py               # BaseVisualizationAgent (shared interface)
│   ├── content_summary_agent.py    # AI analysis for comment content summarisation
│   └── helpscout_summary_agent.py  # HelpScoutSummaryAgent - page-level LLM summary from SUMMARY fields
├── img/
│   └── musora.png                  # Sidebar logo
└── SnowFlakeConnection.py          # Snowflake connection wrapper (Snowpark session)
```

---

```
Snowflake
   │
   ├── data_loader.py (SentimentDataLoader)
   │     ├── load_dashboard_data() ──► st.session_state['dashboard_df']
   │     │                              ├─► sidebar (filter options, counts)
   │     │                              └─► dashboard.py (all charts)
   │     ├── load_sa_data() ──► st.session_state['sa_contents', 'sa_comments']
   │     │     (on-demand, Fetch button) ──► sentiment_analysis.py
   │     └── load_reply_required_data() ──► st.session_state['rr_df']
   │           (on-demand, Fetch button) ──► reply_required.py
   │
   └── helpscout_data_loader.py (HelpScoutDataLoader)
         ├── load_dashboard_data() ──► st.session_state['helpscout_df']
         │                              ├─► helpscout_dashboard.py
         │                              └─► dashboard.py (compact summary)
         └── load_analysis_data() ──► st.session_state['hs_analysis_df']
               (on-demand, Fetch button) ──► helpscout_analysis.py
```

**Key principle:** Data is loaded as little as possible, as late as possible.

- **Dashboard** queries are lightweight (no text columns, no content join) and cached 24 hours.
- **Sentiment Analysis**, **Reply Required**, and **HelpScout Analysis** pages wait for the user to click **Fetch Data**.
- All data lives in `st.session_state` so page navigation and widget interactions never re-trigger Snowflake queries.

---
## Data Loading Strategy

### Comment data (`data/data_loader.py` – `SentimentDataLoader`)

#### `load_dashboard_data()`
- Fetches only: `comment_sk, content_sk, platform, brand, sentiment_polarity, intent, requires_reply, detected_language, comment_timestamp, processed_at, author_id`.
- No text columns, no `DIM_CONTENT` join.
- Merges demographics data if `demographics_query` is configured.
- Cached **24 hours**. Called once at startup; the result is stored in `st.session_state['dashboard_df']`.

#### `load_sa_data(platform, brand, top_n, min_comments, sort_by, sentiments, intents, emotions, date_range)`
- Runs two Snowflake queries:
  1. **Content aggregation** – groups by `content_sk`, counts per sentiment, computes a severity score, returns the top N.
  2. **Sampled comments** – up to 50 per sentiment group per content (`QUALIFY ROW_NUMBER() <= 50`; see the sketch after this list). `display_text` is computed in SQL.
- Returns `(contents_df, comments_df)`. Cached **24 hours**.
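A simplified sketch of the sampling pattern behind the sampled-comments query; the column list and the `ORDER BY` choice here are illustrative assumptions, not the exact query the loader builds:

```python
# Illustrative only - the real query adds more columns, filters, and the
# SQL-computed display_text.
sample_sql = """
SELECT s.COMMENT_SK, s.CONTENT_SK, s.SENTIMENT_POLARITY, s.COMMENT_TEXT
FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY s.CONTENT_SK, s.SENTIMENT_POLARITY
    ORDER BY s.COMMENT_TIMESTAMP DESC
) <= 50
"""
```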
#### `load_reply_required_data(platforms, brands, date_range)`
- Filters `REQUIRES_REPLY = TRUE`. Conditionally includes the social media table and/or the musora table. Cached **24 hours**.

#### SQL column qualification note
The social media table and `DIM_CONTENT` share column names. Any `WHERE` clause inside a query that joins them **must** use the table alias prefix (e.g. `s.PLATFORM`, `s.COMMENT_TIMESTAMP`) to avoid Snowflake `ambiguous column name` errors.
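For example, a hypothetical filter fragment, with every column carrying the alias of the comments table:

```python
# Hypothetical fragment - an unqualified PLATFORM here would raise
# "ambiguous column name" because DIM_CONTENT also has that column.
where_clause = (
    "WHERE s.PLATFORM = 'youtube' "
    "AND s.COMMENT_TIMESTAMP >= '2025-01-01'"
)
```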
---

### HelpScout data (`data/helpscout_data_loader.py` – `HelpScoutDataLoader`)

#### `load_dashboard_data()`
- Lightweight query from `SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES`.
- Columns: `conversation_id, status, source, created_at, updated_at, duration_hours, sentiment_polarity, topics, is_refund_request, is_cancellation, is_membership, customer_email`.
- Merges demographics (age/timezone/experience) via an email join (`LOWER(customer_email) = LOWER(usora_users.email)`).
- Cached **24 hours**. Stored in `st.session_state['helpscout_df']`.

#### `load_analysis_data(date_start, date_end, topics, sentiments, statuses, sources, is_refund, is_cancellation, is_membership)`
- Adds the `summary, sentiment_notes, topic_notes, customer_first_name, customer_last_name` columns.
- SQL `WHERE` pushdown for all filters; the multi-label topic filter uses `ARRAY_CONTAINS('topic_id'::VARIANT, SPLIT(TOPICS, ','))` – see the sketch after this list.
- Cached **24 hours**, keyed on the filter tuple. Stored in `st.session_state['hs_analysis_df']`.
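A sketch of how that predicate can be assembled; the topic ids and the OR-combination are illustrative assumptions, not necessarily how the loader combines multiple selections:

```python
# Assumed combination: selected topics OR-ed together (illustrative topic ids).
selected_topics = ["billing", "login"]
topic_predicate = " OR ".join(
    f"ARRAY_CONTAINS('{t}'::VARIANT, SPLIT(TOPICS, ','))" for t in selected_topics
)
where_topics = f"({topic_predicate})"
```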
#### `get_filter_options(df)`
- Returns `sentiments`, `topics` (exploded and label-mapped from the taxonomy), `statuses`, `states`, `sources`. A sketch of the explode step follows.
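A minimal sketch of the explode step; the real helpers live in `utils/helpscout_utils.py` and this signature is assumed:

```python
import pandas as pd

def explode_topics(df: pd.DataFrame) -> pd.DataFrame:
    """One row per (conversation, topic) from the comma-separated topics column."""
    out = df.copy()
    out["topic"] = out["topics"].fillna("").str.split(",")
    out = out.explode("topic")
    out["topic"] = out["topic"].str.strip()
    return out[out["topic"] != ""]
```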
---

## Pages

The app has **5 pages**, navigated via the sidebar radio:

### 1. Sentiment Dashboard (`components/dashboard.py`)

**Receives:** `filtered_df` – the lightweight comment dataframe (after the optional global filter from `app.py`).

**Key sections:**
- Summary stats + health indicator
- Sentiment distribution (pie + gauge)
- Sentiment by brand and platform (stacked + percentage bar charts)
- Intent analysis (bar + pie)
- Emotion analysis (bar + pie) – only when the `emotions` column is non-null
- Brand × Platform heatmap
- Reply requirements + urgency breakdown
- Demographics (age, timezone, experience) – only when demographics were merged
- **HelpScout compact summary** – appended at the bottom; reads `st.session_state['helpscout_df']` directly (guarded by `try/except` so failures never break the main dashboard)

---

### 2. Custom Sentiment Queries (`components/sentiment_analysis.py`)

**Receives:** `data_loader` instance only.

**Flow:**
1. Reads `st.session_state['dashboard_df']` for filter option lists.
2. Pre-populates platform/brand dropdowns from `st.session_state['global_filters']`.
3. On **Fetch Data**: calls `data_loader.load_sa_data(...)` and stores the results in `st.session_state['sa_contents']` and `['sa_comments']`.
4. Renders content cards, per-content sentiment + intent + emotion charts, AI analysis buttons, and sampled comment expanders.

**Pagination:** `st.session_state['sentiment_page']` (5 contents per page). Reset on new fetch.

---

### 3. Reply Required (`components/reply_required.py`)

**Receives:** `data_loader` instance only.

**Flow:**
1. Pre-populates platform/brand/date from `st.session_state['global_filters']`.
2. On **Fetch Data**: calls `data_loader.load_reply_required_data(...)` and stores the result in `st.session_state['rr_df']`.
3. Shows the urgency breakdown, in-page filters (applied in Python, no extra query), paginated comment cards, and a "Reply by Content" summary.

**Pagination:** `st.session_state['reply_page']` (10 comments per page). Reset on new fetch.

---

### 4. HelpScout Dashboard (`components/helpscout_dashboard.py`)

**Receives:** `helpscout_loader` instance.

**Reads from:** `st.session_state['helpscout_df']` (loaded at app startup).

**Key sections:**
- PDF export button (HelpScout Dashboard PDF)
- 6 KPI metrics: total conversations, average duration, refund requests, cancellations, negative rate, membership joins
- Sentiment distribution (pie + bar)
- Topic distribution and sentiment heatmap (from the `process_helpscout/config_files/topics.json` taxonomy)
- Boolean flags (refund, cancellation, membership) breakdown
- Status and source breakdown
- Timelines expander (daily conversation volume, refund/cancel trend)
- Depth expander (topic co-occurrence, escalation funnel)
- Demographics (age, timezone, experience)

> **Note:** Global sidebar filters (brand, platform, sentiment, date) do **not** apply to the HelpScout pages – HelpScout is brand-agnostic and uses its own filter panel.

---

### 5. HelpScout Analysis (`components/helpscout_analysis.py`)

**Receives:** `helpscout_loader` instance.

**Flow:**
1. **Filter panel** – date range, top_n, topics (multi-select with human-readable labels), sentiments, statuses, sources, and 3 boolean checkboxes (refund / cancellation / membership).
2. **Fetch Data** button – calls `helpscout_loader.load_analysis_data(...)`, stale-checked via the `fetch_key` tuple (sketched after this section).
3. **KPI row** + distribution charts (sentiment, topics, flags, status).
4. **AI Summary section:**
   - "Generate AI Summary" button – calls `HelpScoutSummaryAgent` and stores the result in `st.session_state['hs_analysis_summary']`.
   - Renders: executive summary, top themes, top complaints, unexpected insights, notable quotes.
   - "Export Analysis PDF" button – generates `HelpScoutAnalysisPDF`.
5. **Paginated conversation cards** – 10 per page; each card shows customer name, status, topics (label-mapped), summary, and sentiment/topic notes.
6. **CSV export** button.

**Pagination:** `st.session_state['hs_analysis_page']`. Reset on new fetch.

**Date range default:** Clamps to `max(min_date, max_date - default_date_range_days)` so the default is always within the available data window.
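The `fetch_key` stale-check mentioned in the flow above works roughly like this; a sketch with simplified names and signature, not the component's exact code:

```python
import streamlit as st

def render_fetch_section(loader, date_start, date_end, topics, sentiments):
    # The key captures every filter value; changing any filter makes it stale.
    fetch_key = (date_start, date_end, tuple(topics), tuple(sentiments))

    if st.button("Fetch Data"):
        st.session_state["hs_analysis_df"] = loader.load_analysis_data(
            date_start, date_end, topics, sentiments  # simplified signature
        )
        st.session_state["hs_analysis_fetch_key"] = fetch_key
        st.session_state["hs_analysis_page"] = 0  # reset pagination on new fetch

    if st.session_state.get("hs_analysis_fetch_key") != fetch_key:
        st.info("Filters changed since the last fetch; click Fetch Data to refresh.")
```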
---

## Global Filters & Session State

Global filters apply **only to the comment pages** (Dashboard, Sentiment Analysis, Reply Required). They have no effect on the HelpScout pages.

```python
st.session_state['global_filters'] = {
    'platforms': ['facebook', 'instagram'],
    'brands': ['drumeo'],
    'sentiments': [],
    'date_range': (date(2025, 1, 1), date(2025, 12, 31)),  # or None
}
```

### Full session state key reference

| Key | Set by | Used by |
|-----|--------|---------|
| `dashboard_df` | `app.py` startup | sidebar, dashboard.py, SA + RR filter lists |
| `global_filters` | sidebar "Apply Filters" | app.py (dashboard filter), SA + RR pre-populate |
| `filters_applied` | sidebar buttons | app.py |
| `sa_contents` | SA fetch button | sentiment_analysis.py |
| `sa_comments` | SA fetch button | sentiment_analysis.py |
| `sa_fetch_key` | SA fetch button | SA stale-check |
| `rr_df` | RR fetch button | reply_required.py |
| `rr_fetch_key` | RR fetch button | RR stale-check |
| `sentiment_page` | SA page / fetch | SA pagination |
| `reply_page` | RR page / fetch | RR pagination |
| `content_summaries` | SA AI buttons | SA AI analysis display |
| `helpscout_df` | `app.py` startup | helpscout_dashboard.py, dashboard.py compact summary |
| `hs_analysis_df` | HS Analysis fetch | helpscout_analysis.py charts + cards |
| `hs_analysis_fetch_key` | HS Analysis fetch | HS Analysis stale-check |
| `hs_analysis_filter_desc` | HS Analysis fetch | human-readable filter string for PDF + agent |
| `hs_analysis_summary` | "Generate AI Summary" | HS Analysis summary renderer |
| `hs_analysis_summary_key` | "Generate AI Summary" | invalidated on re-fetch |
| `hs_analysis_page` | HS Analysis page / fetch | HS Analysis pagination |

---

## Snowflake Queries

### Comment tables

| Table | Platform | Notes |
|-------|----------|-------|
| `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES` | facebook, instagram, youtube, twitter | Needs `LEFT JOIN DIM_CONTENT` for `PERMALINK_URL` |
| `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES` | musora_app | Has `PERMALINK_URL` and `THUMBNAIL_URL` natively |

### HelpScout table

| Table | Notes |
|-------|-------|
| `SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES` | One row per conversation; multi-label topics in the comma-separated `TOPICS` column |

### Static queries (in `viz_config.json`)

| Key | Purpose |
|-----|---------|
| `dashboard_query` | Lightweight comment query – no text, no DIM_CONTENT join |
| `demographics_query` | Joins `usora_users` + `preprocessed.users` for age/timezone/experience |
| `helpscout.dashboard_query` | Lightweight HelpScout query (no SUMMARY/notes) |
| `helpscout.demographics_query` | Same demographics join, keyed on `customer_email` |

### Dynamic queries (built in `helpscout_data_loader.py`)

| Method | Description |
|--------|-------------|
| `_build_analysis_query()` | Full HelpScout query including SUMMARY/notes; multi-label topic filter via `ARRAY_CONTAINS` |

---

## Authentication

Module: `utils/auth.py`

- `AUTHORIZED_EMAILS` allowlist + `APP_TOKEN` env var.
- `render_login_page()` renders the login form and calls `st.stop()` when not authenticated.
- The gate is placed at the top of `app.py` (after `st.set_page_config`, before the data loaders).
- The current user and a logout button are shown in the sidebar.

**Required env vars:**
```
APP_TOKEN=<shared token>
```

---

## PDF Reports

### Comment Dashboard PDF (`utils/pdf_exporter.py` – `DashboardPDFExporter`)

Generated from the "Export PDF Report" expander at the top of the Dashboard page.

Sections: cover, executive summary, sentiment, brand, platform, intent, cross-dimensional, volume, reply requirements, demographics (optional), language (optional), HelpScout summary (if data loaded), data summary.

### HelpScout Dashboard PDF (`utils/helpscout_pdf.py` – `HelpScoutDashboardPDF`)

Generated from the HelpScout Dashboard page. Sections: cover, KPI summary, sentiment, topics, flags & escalation, status & source, timelines, demographics.

### HelpScout Analysis PDF (`utils/helpscout_pdf.py` – `HelpScoutAnalysisPDF`)

Generated from the "Export Analysis PDF" button on the HelpScout Analysis page (only available after an AI Summary has been generated).

Sections: cover, filter summary, KPI summary, chart snapshots, AI summary (executive summary, top themes, top complaints, unexpected insights, notable quotes), conversation cards sample, metadata.

**Dependencies:** `fpdf2`, `kaleido` (for Plotly PNG rendering at 3× scale).

---

## AI Agents

### `ContentSummaryAgent` (`agents/content_summary_agent.py`)

Summarises sampled comments for a single content item on the Sentiment Analysis page. Called per content item when the user clicks the AI analysis button. Results are cached in `st.session_state['content_summaries']`.

### `HelpScoutSummaryAgent` (`agents/helpscout_summary_agent.py`)

Produces a **page-level** executive report from the filtered HelpScout conversations by reading their pre-extracted `SUMMARY` fields through an LLM.

- Stratified sample by `sentiment_polarity`, capped at `max_conversations` (default 300); see the sketch after this list.
- Builds aggregate context: sentiment breakdown, top topics, flag counts, average duration, then per-conversation summaries (capped at 250 chars each).
- The prompt asks the LLM to surface patterns **beyond** the pre-tagged topics/sentiments.
- Output structure:

```json
{
  "executive_summary": "...",
  "top_themes": [{"theme": "...", "description": "...", "prevalence": "..."}],
  "top_complaints": ["..."],
  "unexpected_insights": ["..."],
  "notable_quotes": ["..."]
}
```

- Uses `LLMHelper.get_structured_completion()` with up to 3 retries.
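A sketch of the stratified cap; the agent's exact sampling code may differ, and the 300 default comes from `helpscout.max_summary_conversations`:

```python
import pandas as pd

def stratified_sample(df: pd.DataFrame, cap: int = 300) -> pd.DataFrame:
    # Spread the cap evenly across sentiment groups, then trim to the cap.
    n_groups = max(df["sentiment_polarity"].nunique(), 1)
    per_group = max(1, cap // n_groups)
    sampled = df.groupby("sentiment_polarity", group_keys=False).apply(
        lambda g: g.sample(min(len(g), per_group), random_state=0)
    )
    return sampled.head(cap)
```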
|
| 360 |
|
| 361 |
---
|
| 362 |
|
| 363 |
## Adding or Changing Things
|
| 364 |
|
| 365 |
+
### Add a new chart to the Comment Dashboard
|
| 366 |
1. Write the chart function in the appropriate `visualizations/` file.
|
| 367 |
+
2. Call it from `render_dashboard()` in `components/dashboard.py`.
|
|
|
|
| 368 |
|
| 369 |
+
### Add a new chart to the HelpScout Dashboard
|
| 370 |
+
1. Add the chart method to `HelpScoutCharts` in `visualizations/helpscout_charts.py`.
|
| 371 |
+
2. Call it from `render_helpscout_dashboard()` in `components/helpscout_dashboard.py`.
|
|
|
|
| 372 |
|
| 373 |
+
### Add a new HelpScout filter
|
| 374 |
+
1. Add the widget to the filter panel in `helpscout_analysis.py`.
|
| 375 |
+
2. Include the new value in the `fetch_key` tuple.
|
| 376 |
+
3. Add the corresponding `WHERE` clause condition to `_build_analysis_query()` in `helpscout_data_loader.py`.
### Add a new HelpScout topic
- Edit `process_helpscout/config_files/topics.json` (the taxonomy file).
- `helpscout_utils.load_topic_taxonomy()` reloads it on each app start; no other changes needed.

### Change the cache duration
`@st.cache_data(ttl=86400)` appears on `load_dashboard_data`, `_fetch_sa_data`, `_fetch_rr_data`, `load_demographics_data`, and their HelpScout equivalents. Change `86400` to the desired TTL in seconds. Users can always force a refresh with "Reload Data" in the sidebar.
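For example, halving the window to 12 hours would look like this (decorator only; shown as a bare function for brevity, with the body elided):

```python
import streamlit as st

@st.cache_data(ttl=43200)  # 12 h; the repo default is 86400 (24 h)
def load_dashboard_data():
    ...
```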
### Add a new page
1. Create `components/new_page.py` with a `render_new_page(...)` function.
2. Import it and add a radio option in `app.py`.
3. Add data loading to the appropriate loader class.
4. If the page should be excluded from global comment filters, extend the `_hs_page` guard in `app.py` (sketch below).
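Steps 2 and 4 in miniature (page labels are abridged without their emoji prefixes, and the new page is hypothetical):

```python
import streamlit as st

pages = [
    "Sentiment Dashboard",
    "Custom Sentiment Queries",
    "Reply Required",
    "HelpScout Dashboard",
    "HelpScout Analysis",
    "My New Page",  # step 2: the new radio option
]
page = st.radio("Select Page", pages, index=0)

# Step 4: pages listed in the guard bypass the global comment filters
_hs_page = page in ("HelpScout Dashboard", "HelpScout Analysis", "My New Page")
```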
### Change what the Sentiment Analysis page queries
- Edit `_build_sa_content_query()` and/or `_build_sa_comments_query()` in `data_loader.py`.
- Update `_process_sa_content_stats()` and/or `_process_sa_comments()` for new columns (sketch below).
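A sketch of the pattern for a hypothetical new `LANGUAGE` column; the table name and method bodies are illustrative, not the repo's actual SQL:

```python
# data/data_loader.py (illustrative shapes only)
import pandas as pd

def _build_sa_comments_query(content_id: str) -> str:
    # Hypothetical: LANGUAGE added to the select list
    return (
        "SELECT COMMENT_ID, COMMENT_TEXT, SENTIMENT_POLARITY, LANGUAGE "
        f"FROM COMMENTS WHERE CONTENT_ID = '{content_id}'"
    )

def _process_sa_comments(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = [c.lower() for c in df.columns]  # exposes the new 'language' column
    return df
```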
---

```
SNOWFLAKE_DATABASE
SNOWFLAKE_WAREHOUSE
SNOWFLAKE_SCHEMA
OPENAI_API_KEY
APP_TOKEN
```
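How the app might read them (variable names from the list above; the actual loading code may differ):

```python
import os

snowflake_cfg = {
    "database": os.environ.get("SNOWFLAKE_DATABASE"),
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE"),
    "schema": os.environ.get("SNOWFLAKE_SCHEMA"),
}
openai_api_key = os.environ.get("OPENAI_API_KEY")
app_token = os.environ.get("APP_TOKEN")  # presumably consumed by utils/auth
```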
---
| Section | What it configures |
|---------|-------------------|
| `color_schemes.sentiment_polarity` | Hex colors for each sentiment level |
| `color_schemes.intent` | Hex colors per intent label |
| `color_schemes.emotion` | Hex colors per emotion label |
| `color_schemes.platform` | Hex colors per platform |
| `color_schemes.brand` | Hex colors per brand |
| `color_schemes_helpscout.topics` | Hex colors for HelpScout topic bars |
| `color_schemes_helpscout.status` | Hex colors for conversation status values |
| `color_schemes_helpscout.boolean_flags` | Hex colors for refund/cancellation/membership flags |
| `sentiment_order` | Display order for sentiment categories |
| `intent_order` | Display order for intent categories |
| `emotion_order` | Display order for emotion categories |
| `negative_sentiments` | Which sentiment values count as "negative" |
| `dashboard.default_date_range_days` | Default date filter window for comment pages |
| `helpscout.default_date_range_days` | Default date filter window for HelpScout Analysis |
| `helpscout.max_summary_conversations` | Cap on conversations sent to LLM summary agent |
| `helpscout.escalation_sentiments` | Sentiment values that count as escalation |
| `snowflake.dashboard_query` | Lightweight comment dashboard query |
| `snowflake.demographics_query` | Demographics join query (comment pages) |
| `helpscout.dashboard_query` | Lightweight HelpScout dashboard query |
| `helpscout.demographics_query` | Demographics join query (HelpScout, keyed on email) |
| `demographics.age_groups` | Age bucket definitions (label → [min, max]) |
| `demographics.experience_groups` | Experience bucket definitions |
| `demographics.top_timezones_count` | How many timezones to show in the geographic chart |
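A hedged sketch of how these keys are consumed (the config path is from this commit's file list; the lookup pattern is illustrative):

```python
import json
from pathlib import Path

config = json.loads(Path("visualization/config/viz_config.json").read_text())

sentiment_colors = config["color_schemes"]["sentiment_polarity"]  # label -> hex
sentiment_order = config["sentiment_order"]                       # display order
hs_window_days = config["helpscout"]["default_date_range_days"]
```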
visualization/agents/helpscout_summary_agent.py
ADDED
@@ -0,0 +1,309 @@
"""
HelpScout Summary Agent
Generates a page-level summary report from filtered HelpScout conversations.
Analyses the already-extracted SUMMARY fields to surface patterns and insights
beyond the pre-tagged topics / sentiments.
"""
import json
import sys
from pathlib import Path
from typing import Any, Dict

import pandas as pd

# Ensure visualization/ is on sys.path so agents.*, utils.* imports resolve
_parent = Path(__file__).resolve().parent.parent
if str(_parent) not in sys.path:
    sys.path.insert(0, str(_parent))

from agents.base_agent import BaseVisualizationAgent
from utils.llm_helper import LLMHelper
from utils.helpscout_utils import topic_label, load_topic_taxonomy


class HelpScoutSummaryAgent(BaseVisualizationAgent):
    """
    Produces an executive summary report from a filtered set of HelpScout
    conversations by reading their SUMMARY fields through an LLM.
    """

    MAX_SUMMARY_CHARS = 250  # per conversation summary sent to LLM

    def __init__(self, model: str = "gpt-5-nano", temperature: float = 1,
                 max_conversations: int = 300):
        super().__init__(name="HelpScoutSummaryAgent", model=model, temperature=temperature)
        self.llm_helper = LLMHelper(model=model, temperature=temperature)
        self.max_conversations = max_conversations
        self.taxonomy = load_topic_taxonomy()

    # ─────────────────────────────────────────────────────────────
    # BaseVisualizationAgent interface
    # ─────────────────────────────────────────────────────────────

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        if "conversations" not in input_data:
            self.log_processing("Missing 'conversations' key", level="error")
            return False
        if not isinstance(input_data["conversations"], pd.DataFrame):
            self.log_processing("'conversations' must be a DataFrame", level="error")
            return False
        if "summary" not in input_data["conversations"].columns:
            self.log_processing("DataFrame must contain a 'summary' column", level="error")
            return False
        return True

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Generate an aggregate summary report from filtered HelpScout conversations.

        Args:
            input_data: {
                'conversations': pd.DataFrame (must have 'summary' column),
                'filter_description': str (human-readable applied filters),
                'max_conversations': int (optional; overrides instance default),
            }

        Returns:
            {
                'success': bool,
                'summary': {
                    'executive_summary': str,
                    'top_themes': [{'theme': str, 'description': str, 'prevalence': str}],
                    'top_complaints': [str],
                    'unexpected_insights': [str],
                    'recommended_actions': [{'priority': str, 'action': str, 'rationale': str}],
                    'notable_quotes': [str],
                },
                'metadata': {
                    'total_conversations_analyzed': int,
                    'model_used': str,
                    'tokens_used': int,
                    'filter_applied': str,
                },
                'error': str | None,
            }
        """
        try:
            if not self.validate_input(input_data):
                return {"success": False, "error": "Invalid input data", "summary": None}

            df = input_data["conversations"]
            filter_desc = input_data.get("filter_description", "No filters applied")
            max_convs = input_data.get("max_conversations", self.max_conversations)

            total_available = len(df)

            if total_available == 0:
                return self._empty_result(filter_desc)

            # Sample if over cap - stratified by sentiment to preserve signal
            df_sample = self._stratified_sample(df, max_convs)
            n_analyzed = len(df_sample)

            self.log_processing(
                f"Analysing {n_analyzed} of {total_available} conversations"
                f" (filter: {filter_desc[:60]})"
            )

            # Build aggregate context for the LLM
            agg_context = self._build_aggregate_context(df_sample, df)
            prompt = self._build_prompt(agg_context, filter_desc, n_analyzed)

            system_msg = (
                "You are an expert customer support analyst for Musora, "
                "a music education platform (Drumeo, Pianote, Guitareo, Singeo, PlayBass). "
                "Your role is to synthesize customer support conversation summaries "
                "and surface actionable insights that go beyond simple tagging."
            )

            response = self.llm_helper.get_structured_completion(
                prompt=prompt,
                system_message=system_msg,
                max_retries=3,
            )

            if not response["success"]:
                return self.handle_error(
                    Exception(response.get("error", "LLM call failed")),
                    context=f"filter={filter_desc[:60]}"
                )

            summary = response["content"]
            summary = self._ensure_defaults(summary)

            return {
                "success": True,
                "summary": summary,
                "metadata": {
                    "total_conversations_analyzed": n_analyzed,
                    "total_available": total_available,
                    "model_used": response["model"],
                    "tokens_used": response["usage"]["total_tokens"],
                    "filter_applied": filter_desc,
                },
                "error": None,
            }

        except Exception as e:
            return self.handle_error(e, context=input_data.get("filter_description", ""))

    # ─────────────────────────────────────────────────────────────
    # Private helpers
    # ─────────────────────────────────────────────────────────────

    def _stratified_sample(self, df: pd.DataFrame, cap: int) -> pd.DataFrame:
        """Stratified sample by sentiment to keep signal diversity."""
        if len(df) <= cap:
            return df
        try:
            strat_col = "sentiment_polarity"
            if strat_col in df.columns and df[strat_col].nunique() > 1:
                # Proportional allocation per sentiment group
                groups = df.groupby(strat_col, group_keys=False)
                sampled = groups.apply(
                    lambda g: g.sample(
                        n=max(1, int(cap * len(g) / len(df))),
                        random_state=42,
                    )
                )
                return sampled.head(cap)
        except Exception:
            pass
        return df.sample(n=cap, random_state=42)

    def _build_aggregate_context(self, df_sample: pd.DataFrame,
                                 df_full: pd.DataFrame) -> str:
        """Build a text block with aggregate stats + conversation summaries."""
        total = len(df_full)
        n_sample = len(df_sample)

        # Aggregate stats from the full filtered set
        stats = []
        if "sentiment_polarity" in df_full.columns:
            sent_counts = df_full["sentiment_polarity"].value_counts()
            sent_pct = (sent_counts / total * 100).round(1)
            stats.append("Sentiment breakdown: " +
                         ", ".join(f"{s} {pct}%" for s, pct in sent_pct.items()))

        if "topics" in df_full.columns:
            from utils.helpscout_utils import explode_topics
            exploded = explode_topics(df_full)
            if not exploded.empty:
                top_topics = exploded["topic_id"].value_counts().head(8)
                topic_strs = [f"{topic_label(t, self.taxonomy)} ({c})" for t, c in top_topics.items()]
                stats.append("Top topics: " + ", ".join(topic_strs))

        from utils.helpscout_utils import boolean_flag_counts
        flags = boolean_flag_counts(df_full)
        flag_parts = []
        if flags["is_refund_request"]:
            flag_parts.append(f"Refund requests: {flags['is_refund_request']}")
        if flags["is_cancellation"]:
            flag_parts.append(f"Cancellations: {flags['is_cancellation']}")
        if flags["is_membership"]:
            flag_parts.append(f"Membership joins: {flags['is_membership']}")
        if flag_parts:
            stats.append(", ".join(flag_parts))

        if "duration_hours" in df_full.columns:
            avg_dur = df_full["duration_hours"].mean()
            stats.append(f"Average conversation duration: {avg_dur:.1f} hours")

        stats_block = "\n".join(stats)

        # Individual summaries (capped per conversation)
        summaries = []
        for i, row in enumerate(df_sample.itertuples(), 1):
            s = getattr(row, "summary", None) or ""
            s = str(s).strip()
            if s:
                s = s[:self.MAX_SUMMARY_CHARS] + ("…" if len(s) > self.MAX_SUMMARY_CHARS else "")
                sent = getattr(row, "sentiment_polarity", "")
                summaries.append(f"[{i}] ({sent}) {s}")

        summaries_block = "\n".join(summaries) if summaries else "No summaries available."

        note = (f"Note: Showing {n_sample} of {total} matched conversations."
                if n_sample < total else f"Showing all {total} matched conversations.")

        return f"""=== AGGREGATE STATISTICS ===
{stats_block}
{note}

=== CONVERSATION SUMMARIES ===
{summaries_block}"""

    def _build_prompt(self, context: str, filter_desc: str,
                      n_analyzed: int) -> str:
        return f"""Analyze the following {n_analyzed} HelpScout customer support conversation summaries for Musora.

Applied filters: {filter_desc}

{context}

Your task: Synthesize these conversations and produce insights that go BEYOND the pre-extracted tags.
Look for underlying patterns, recurring pain points, emotional signals, product gaps, and operational issues
that would not be obvious from simple topic counts alone.

Respond in JSON with this exact structure:
{{
  "executive_summary": "3-5 sentence high-level synthesis of what customers are experiencing",
  "top_themes": [
    {{
      "theme": "Short theme name (not a topic tag)",
      "description": "What customers are actually saying and feeling about this",
      "prevalence": "Rough estimate: e.g. 'Appears in ~30% of conversations'"
    }}
  ],
  "top_complaints": [
    "Specific actionable complaint statement (not generic)"
  ],
  "unexpected_insights": [
    "A pattern, contradiction, or insight that would surprise a product manager"
  ],
  "notable_quotes": [
    "Paraphrased quote or representative statement from conversations (not verbatim)"
  ]
}}

Guidelines:
- Top themes: 5-8 items, each distinct from pre-extracted topics
- Top complaints: 5-8 bullet points, specific and actionable
- Unexpected insights: 3-5 items, must genuinely go beyond the tag taxonomy
- Notable quotes: 3-5 representative paraphrases
- If a section has fewer relevant items, use fewer - quality over quantity
"""

    @staticmethod
    def _ensure_defaults(summary: dict) -> dict:
        defaults = {
            "executive_summary": "",
            "top_themes": [],
            "top_complaints": [],
            "unexpected_insights": [],
            "notable_quotes": [],
        }
        for k, v in defaults.items():
            if k not in summary:
                summary[k] = v
        return summary

    def _empty_result(self, filter_desc: str) -> dict:
        return {
            "success": True,
            "summary": {
                "executive_summary": "No conversations matched the selected filters.",
                "top_themes": [],
                "top_complaints": [],
                "unexpected_insights": [],
                "notable_quotes": [],
            },
            "metadata": {
                "total_conversations_analyzed": 0,
                "total_available": 0,
                "model_used": self.model,
                "tokens_used": 0,
                "filter_applied": filter_desc,
            },
            "error": None,
        }
visualization/app.py
CHANGED
@@ -14,9 +14,12 @@ parent_dir = Path(__file__).resolve().parent
 sys.path.append(str(parent_dir))
 
 from data.data_loader import SentimentDataLoader
+from data.helpscout_data_loader import HelpScoutDataLoader
 from components.dashboard import render_dashboard
 from components.sentiment_analysis import render_sentiment_analysis
 from components.reply_required import render_reply_required
+from components.helpscout_dashboard import render_helpscout_dashboard
+from components.helpscout_analysis import render_helpscout_analysis
 from utils.auth import check_authentication, render_login_page, logout, get_current_user
 
 # ── Load configuration ────────────────────────────────────────────────────────
@@ -38,15 +41,13 @@ st.set_page_config(
 if not check_authentication():
     render_login_page()
 
-# ──
+# ── Data loader instances (cheap: just read config) ───────────────────────────
 data_loader = SentimentDataLoader()
+helpscout_loader = HelpScoutDataLoader()
 
 
 def _ensure_dashboard_data():
-    """
-    Load dashboard data once and store in session_state.
-    Subsequent calls within the same session (or until cache expires) are free.
-    """
+    """Load comment dashboard data once and store in session_state."""
     if 'dashboard_df' not in st.session_state or st.session_state['dashboard_df'] is None:
         with st.spinner("Loading dashboard data…"):
             df = data_loader.load_dashboard_data()
@@ -54,6 +55,15 @@ def _ensure_dashboard_data():
     return st.session_state['dashboard_df']
 
 
+def _ensure_helpscout_data():
+    """Load HelpScout dashboard data once and store in session_state."""
+    if 'helpscout_df' not in st.session_state or st.session_state['helpscout_df'] is None:
+        with st.spinner("Loading HelpScout data…"):
+            hs_df = helpscout_loader.load_dashboard_data()
+            st.session_state['helpscout_df'] = hs_df
+    return st.session_state['helpscout_df']
+
+
 def main():
     # ── Sidebar ────────────────────────────────────────────────────────────────
     with st.sidebar:
@@ -72,15 +82,22 @@ def main():
 
     page = st.radio(
         "Select Page",
-        [ … ]
+        [
+            "📊 Sentiment Dashboard",
+            "🔍 Custom Sentiment Queries",
+            "💬 Reply Required",
+            "📧 HelpScout Dashboard",
+            "💬 HelpScout Analysis",
+        ],
         index=0
    )
 
     st.markdown("---")
     st.markdown("### 🔍 Global Filters")
 
-    # Load
+    # Load both data sources at startup
     dashboard_df = _ensure_dashboard_data()
+    _ensure_helpscout_data()
 
     if dashboard_df.empty:
         st.error("No data available. Please check your Snowflake connection.")
@@ -148,22 +165,27 @@ def main():
     if st.button("♻️ Reload Data", use_container_width=True):
         st.cache_data.clear()
         st.session_state.pop('dashboard_df', None)
+        st.session_state.pop('helpscout_df', None)
         st.rerun()
 
     # Data info
     st.markdown("---")
     st.markdown("### ℹ️ Data Info")
-    st.info(f"**…")
+    st.info(f"**Comments:** {len(dashboard_df):,}")
+    hs_df_info = st.session_state.get('helpscout_df')
+    if hs_df_info is not None and not hs_df_info.empty:
+        st.info(f"**HelpScout:** {len(hs_df_info):,} conversations")
     if 'processed_at' in dashboard_df.columns and not dashboard_df.empty:
         last_update = dashboard_df['processed_at'].max()
         if hasattr(last_update, 'strftime'):
             st.info(f"**Last Updated:** {last_update.strftime('%Y-%m-%d %H:%M')}")
 
-    # ── Build filtered dashboard_df
+    # ── Build filtered dashboard_df (only applies to comment pages) ──────────
+    _hs_page = page in ("📧 HelpScout Dashboard", "💬 HelpScout Analysis")
     filters_applied = st.session_state.get('filters_applied', False)
     global_filters = st.session_state.get('global_filters', {})
 
-    if filters_applied and global_filters:
+    if not _hs_page and filters_applied and global_filters:
         filtered_df = data_loader.apply_filters(
             dashboard_df,
             platforms=global_filters.get('platforms') or None,
@@ -190,6 +212,12 @@ def main():
         # RR page fetches its own data on demand; receives only data_loader
        render_reply_required(data_loader)
 
+    elif page == "📧 HelpScout Dashboard":
+        render_helpscout_dashboard(helpscout_loader)
+
+    elif page == "💬 HelpScout Analysis":
+        render_helpscout_analysis(helpscout_loader)
+
     # ── Footer ─────────────────────────────────────────────────────────────────
     st.markdown("---")
     st.markdown(
visualization/components/dashboard.py
CHANGED
@@ -220,6 +220,51 @@ def render_dashboard(df):
 
     st.markdown("---")
 
+    # Emotion Analysis
+    st.markdown("## 🎭 Emotion Analysis")
+
+    if 'emotions' in df.columns and df['emotions'].notna().any():
+        col1, col2 = st.columns(2)
+
+        with col1:
+            emotion_bar = distribution_charts.create_emotion_bar_chart(
+                df, title="Emotion Distribution", orientation='h'
+            )
+            st.plotly_chart(emotion_bar, use_container_width=True)
+
+        with col2:
+            emotion_pie = distribution_charts.create_emotion_pie_chart(
+                df, title="Emotion Distribution"
+            )
+            st.plotly_chart(emotion_pie, use_container_width=True)
+
+        with st.expander("💡 Emotion Insights"):
+            emotion_dist = processor.get_emotion_distribution(df)
+            if not emotion_dist.empty:
+                top_emotion = emotion_dist.iloc[0]
+                st.write(f"**Most common emotion:** {top_emotion['emotions'].title()} "
+                         f"({int(top_emotion['count']):,} comments, {top_emotion['percentage']:.1f}%)")
+
+                negative_emotions = ['frustration', 'disappointment', 'sadness', 'anger']
+                neg_emotion_dist = emotion_dist[emotion_dist['emotions'].isin(negative_emotions)]
+                if not neg_emotion_dist.empty:
+                    total_neg = neg_emotion_dist['count'].sum()
+                    total = emotion_dist['count'].sum()
+                    st.write(f"**Negative emotions** (frustration, disappointment, sadness, anger): "
+                             f"{int(total_neg):,} occurrences ({total_neg / total * 100:.1f}%)")
+
+                positive_emotions = ['joy', 'excitement', 'gratitude', 'admiration']
+                pos_emotion_dist = emotion_dist[emotion_dist['emotions'].isin(positive_emotions)]
+                if not pos_emotion_dist.empty:
+                    total_pos = pos_emotion_dist['count'].sum()
+                    total = emotion_dist['count'].sum()
+                    st.write(f"**Positive emotions** (joy, excitement, gratitude, admiration): "
+                             f"{int(total_pos):,} occurrences ({total_pos / total * 100:.1f}%)")
+    else:
+        st.info("No emotion data available. Emotions are extracted for newly processed comments.")
+
+    st.markdown("---")
+
     # Brand-Platform Matrix
     st.markdown("## 🔀 Cross-Dimensional Analysis")
 
@@ -580,4 +625,13 @@
     sunburst = distribution_charts.create_combined_distribution_sunburst(
         df, title="Brand > Platform > Sentiment Distribution"
     )
-    st.plotly_chart(sunburst, use_container_width=True)
+    st.plotly_chart(sunburst, use_container_width=True)
+
+    # ── HelpScout compact summary (additive: no impact on existing charts) ───
+    hs_df = st.session_state.get("helpscout_df")
+    if hs_df is not None and not hs_df.empty:
+        try:
+            from components.helpscout_dashboard import render_helpscout_compact_summary
+            render_helpscout_compact_summary(hs_df)
+        except Exception:
+            pass  # never break the main dashboard if helpscout module fails
visualization/components/helpscout_analysis.py
ADDED
@@ -0,0 +1,491 @@
"""
HelpScout Analysis Page
Purpose-built analysis page for HelpScout conversations.
Mirrors the SA page architecture: filter → fetch → charts → LLM summary → export.
One page-level summary report for the entire filtered set.
"""
import sys
from datetime import date, timedelta
from pathlib import Path

import pandas as pd
import streamlit as st

parent_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(parent_dir))

from visualizations.helpscout_charts import HelpScoutCharts
from utils.helpscout_utils import (
    boolean_flag_counts, build_filter_description, topic_label, load_topic_taxonomy
)
from agents.helpscout_summary_agent import HelpScoutSummaryAgent


def render_helpscout_analysis(data_loader):
    """
    Render the HelpScout Analysis page.

    Args:
        data_loader: HelpScoutDataLoader instance
    """
    st.title("💬 HelpScout Analysis")
    st.markdown(
        "Deep-dive into customer support conversations. Apply filters, fetch the data, "
        "explore distributions, and generate an AI-powered summary report."
    )
    st.markdown("---")

    charts = HelpScoutCharts()
    taxonomy = load_topic_taxonomy()

    # ── Filter options from already-loaded dashboard df ────────────────────────
    hs_df = st.session_state.get("helpscout_df")
    if hs_df is None or hs_df.empty:
        st.warning("HelpScout dashboard data not loaded yet. Please wait for the app to initialise.")
        return

    filter_options = data_loader.get_filter_options(hs_df)

    # ── Filters ─────────────────────────────────────────────────────────────────
    st.markdown("### 🎯 Filters")

    row1_col1, row1_col2 = st.columns(2)
    with row1_col1:
        min_date = hs_df["first_message_at"].min().date() if "first_message_at" in hs_df.columns and not hs_df.empty else date.today() - timedelta(days=60)
        max_date = hs_df["first_message_at"].max().date() if "first_message_at" in hs_df.columns and not hs_df.empty else date.today()
        default_start = max(min_date, max_date - timedelta(days=data_loader.default_date_range_days))
        date_range = st.date_input(
            "Date Range (First Message At)",
            value=(default_start, max_date),
            min_value=min_date, max_value=max_date,
            key="hs_analysis_date_range",
        )
    with row1_col2:
        top_n_options = [("All", 0), ("50", 50), ("100", 100), ("200", 200), ("500", 500), ("1000", 1000)]
        top_n_label = st.selectbox(
            "Limit Results",
            options=[x[0] for x in top_n_options],
            index=0,
            help="Limit number of conversations fetched. 'All' fetches everything matching your filters.",
            key="hs_analysis_top_n",
        )
        top_n = dict(top_n_options)[top_n_label]

    row2_col1, row2_col2, row2_col3, row2_col4 = st.columns(4)
    with row2_col1:
        topic_options = filter_options.get("topics", [])
        topic_labels_map = {t: topic_label(t, taxonomy) for t in topic_options}
        selected_topic_labels = st.multiselect(
            "Topics",
            options=[topic_labels_map[t] for t in topic_options],
            default=[],
            key="hs_analysis_topics",
        )
        label_to_id = {v: k for k, v in topic_labels_map.items()}
        selected_topics = [label_to_id[l] for l in selected_topic_labels if l in label_to_id]

    with row2_col2:
        selected_sentiments = st.multiselect(
            "Sentiments",
            options=filter_options.get("sentiments", []),
            default=[],
            key="hs_analysis_sentiments",
        )

    with row2_col3:
        selected_statuses = st.multiselect(
            "Status",
            options=filter_options.get("statuses", []),
            default=[],
            key="hs_analysis_statuses",
        )

    with row2_col4:
        selected_sources = st.multiselect(
            "Source Type",
            options=filter_options.get("sources", []),
            default=[],
            key="hs_analysis_sources",
        )

    row3_col1, row3_col2, row3_col3 = st.columns(3)
    with row3_col1:
        refund_only = st.checkbox("Refund Requests Only", key="hs_analysis_refund")
    with row3_col2:
        cancel_only = st.checkbox("Cancellations Only", key="hs_analysis_cancel")
    with row3_col3:
        membership_only = st.checkbox("Membership Joins Only", key="hs_analysis_membership")

    st.markdown("---")

    # ── Fetch button ────────────────────────────────────────────────────────────
    dr_tuple = (str(date_range[0]), str(date_range[1])) if date_range and len(date_range) == 2 else None

    fetch_key = (
        dr_tuple,
        tuple(sorted(selected_sentiments)),
        tuple(sorted(selected_topics)),
        tuple(sorted(selected_statuses)),
        tuple(sorted(selected_sources)),
        bool(refund_only), bool(cancel_only), bool(membership_only),
        top_n,
    )

    has_data = (
        "hs_analysis_df" in st.session_state
        and st.session_state.get("hs_analysis_fetch_key") == fetch_key
        and not st.session_state["hs_analysis_df"].empty
    )

    fetch_col, info_col = st.columns([1, 3])
    with fetch_col:
        fetch_clicked = st.button("🔍 Fetch Data", type="primary",
                                  use_container_width=True, key="hs_fetch_btn")
    with info_col:
        if has_data:
            n = len(st.session_state["hs_analysis_df"])
            st.success(f"✅ Showing **{n:,}** conversations matching your filters")
        elif not fetch_clicked:
            st.info("👉 Set your filters and click **Fetch Data** to query Snowflake.")

    if fetch_clicked:
        with st.spinner("Fetching HelpScout data from Snowflake…"):
            result_df = data_loader.load_analysis_data(
                sentiments=selected_sentiments or None,
                topics=selected_topics or None,
                refund_only=refund_only,
                cancel_only=cancel_only,
                membership_only=membership_only,
                statuses=selected_statuses or None,
                sources=selected_sources or None,
                date_range=(date_range[0], date_range[1]) if dr_tuple else None,
                top_n=top_n or None,
            )
            applied_filters = {
                "date_range": (date_range[0], date_range[1]) if dr_tuple else None,
                "sentiments": selected_sentiments,
                "topics": selected_topics,
                "statuses": selected_statuses,
                "sources": selected_sources,
                "refund_only": refund_only,
                "cancel_only": cancel_only,
                "membership_only": membership_only,
            }
            st.session_state["hs_analysis_df"] = result_df
            st.session_state["hs_analysis_fetch_key"] = fetch_key
            st.session_state["hs_analysis_filter_desc"] = build_filter_description(applied_filters, taxonomy)
            # Invalidate any prior summary when filters change
            st.session_state.pop("hs_analysis_summary", None)
            st.session_state.pop("hs_analysis_summary_key", None)
            st.session_state["hs_analysis_page"] = 1
            st.rerun()

    if not has_data and not fetch_clicked:
        return

    analysis_df = st.session_state.get("hs_analysis_df", pd.DataFrame())
    filter_desc = st.session_state.get("hs_analysis_filter_desc", "No filters applied")

    if analysis_df.empty:
        st.warning("No conversations found for the selected filters. Try adjusting and re-fetching.")
        return

    total = len(analysis_df)
    flags = boolean_flag_counts(analysis_df)
    neg_pct = analysis_df["sentiment_polarity"].isin(["negative", "very_negative"]).sum() / total * 100
    avg_dur = float(analysis_df["duration_hours"].mean()) if "duration_hours" in analysis_df.columns else 0.0

    # ── KPI Row ─────────────────────────────────────────────────────────────────
    st.markdown("### 📊 Overview")
    k1, k2, k3, k4, k5 = st.columns(5)
    k1.metric("Conversations", f"{total:,}")
    k2.metric("Negative %", f"{neg_pct:.1f}%")
    k3.metric("Refund Requests", f"{flags['is_refund_request']:,}")
    k4.metric("Cancellations", f"{flags['is_cancellation']:,}")
    k5.metric("Avg Duration (h)", f"{avg_dur:.1f}")

    st.caption(f"**Active filters:** {filter_desc}")
    st.markdown("---")

    # ── Distributions ───────────────────────────────────────────────────────────
    st.markdown("### 📈 Distributions")

    col1, col2 = st.columns(2)
    with col1:
        st.plotly_chart(charts.create_sentiment_pie_chart(analysis_df, title="Sentiment Distribution"),
                        use_container_width=True, key="hs_analysis_sent_pie")
    with col2:
        st.plotly_chart(charts.create_topic_bar_chart(analysis_df, title="Topic Distribution"),
                        use_container_width=True, key="hs_analysis_topic_bar")

    col1, col2 = st.columns(2)
    with col1:
        st.plotly_chart(charts.create_topic_sentiment_heatmap(analysis_df),
                        use_container_width=True, key="hs_analysis_topic_heatmap")
    with col2:
        st.plotly_chart(charts.create_boolean_flags_chart(analysis_df),
                        use_container_width=True, key="hs_analysis_flags")

    if "emotions" in analysis_df.columns and analysis_df["emotions"].notna().any():
        col1, col2 = st.columns(2)
        with col1:
            st.plotly_chart(charts.create_emotion_bar_chart(analysis_df, title="Emotion Distribution"),
                            use_container_width=True, key="hs_analysis_emotion")
        with col2:
            st.plotly_chart(charts.create_volume_timeline(analysis_df, title="Volume Over Time"),
                            use_container_width=True, key="hs_analysis_vol_timeline")
    else:
        st.plotly_chart(charts.create_volume_timeline(analysis_df, title="Volume Over Time"),
                        use_container_width=True, key="hs_analysis_vol_timeline2")

    st.markdown("---")

    # ── AI Summary Report ───────────────────────────────────────────────────────
    st.markdown("### 🤖 AI Summary Report")
    st.markdown(
        "Generate an LLM-powered report from the conversation summaries matching your filters. "
        "The AI looks beyond the pre-extracted tags to surface patterns, pain points, "
        "and actionable insights."
    )

    summary_available = (
        "hs_analysis_summary" in st.session_state
        and st.session_state.get("hs_analysis_summary_key") == fetch_key
        and st.session_state["hs_analysis_summary"] is not None
    )

    gen_col, pdf_col = st.columns([1, 1])
    with gen_col:
        gen_clicked = st.button("🧠 Generate Summary Report", type="primary",
                                use_container_width=True, key="hs_gen_summary_btn")
    with pdf_col:
        export_pdf_clicked = st.button("📄 Export as PDF", use_container_width=True,
                                       key="hs_export_pdf_btn")

    if gen_clicked:
        with st.spinner("Analysing conversations with AI… this may take 20-40 seconds…"):
            agent = HelpScoutSummaryAgent()
            result = agent.process({
                "conversations": analysis_df,
                "filter_description": filter_desc,
            })
            st.session_state["hs_analysis_summary"] = result
            st.session_state["hs_analysis_summary_key"] = fetch_key
            st.rerun()

    if export_pdf_clicked:
        with st.spinner("Generating PDF…"):
            try:
                from utils.helpscout_pdf import HelpScoutAnalysisPDF
                import datetime
                summary_result = st.session_state.get("hs_analysis_summary")
                exporter = HelpScoutAnalysisPDF()
                pdf_bytes = exporter.generate_report(
                    analysis_df,
                    filter_info={"Filters": filter_desc, "Total Conversations": str(total)},
                    summary_result=summary_result,
                )
                filename = f"helpscout_analysis_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.pdf"
                st.success("Report generated!")
                st.download_button(
                    label="Download Analysis PDF",
                    data=pdf_bytes,
                    file_name=filename,
                    mime="application/pdf",
                    use_container_width=True,
                    key="hs_download_pdf_btn",
                )
            except Exception as e:
                st.error(f"Failed to generate PDF: {e}")
                st.exception(e)

    # Render the summary if available
    if summary_available:
        result = st.session_state["hs_analysis_summary"]
        _render_summary_report(result)

    st.markdown("---")

    # ── Conversation Cards ──────────────────────────────────────────────────────
    st.markdown("### 💬 Conversations")

    if "hs_analysis_page" not in st.session_state:
        st.session_state.hs_analysis_page = 1

    per_page = 10
    total_pages = max(1, (total + per_page - 1) // per_page)

    if total > per_page:
        st.info(f"Page {st.session_state.hs_analysis_page} of {total_pages} ({total:,} conversations)")
        pc1, pc2, pc3 = st.columns([1, 2, 1])
        with pc1:
            if st.button("⬅️ Previous", key="hs_prev_top",
                         disabled=st.session_state.hs_analysis_page == 1):
                st.session_state.hs_analysis_page -= 1
                st.rerun()
        with pc2:
            st.markdown(
                f"<div style='text-align:center;padding-top:8px;'>"
                f"Page {st.session_state.hs_analysis_page} / {total_pages}</div>",
                unsafe_allow_html=True,
            )
        with pc3:
            if st.button("Next ➡️", key="hs_next_top",
                         disabled=st.session_state.hs_analysis_page >= total_pages):
                st.session_state.hs_analysis_page += 1
                st.rerun()
        st.markdown("---")

    start = (st.session_state.hs_analysis_page - 1) * per_page
    end = min(start + per_page, total)
    page_df = analysis_df.iloc[start:end]

    for _, row in page_df.iterrows():
        _render_conversation_card(row, taxonomy)

    # Bottom pagination
    if total > per_page:
        pb1, pb2, pb3 = st.columns([1, 2, 1])
        with pb1:
            if st.button("⬅️ Previous", key="hs_prev_bot",
                         disabled=st.session_state.hs_analysis_page == 1):
                st.session_state.hs_analysis_page -= 1
                st.rerun()
        with pb2:
            st.markdown(
                f"<div style='text-align:center;padding-top:8px;'>"
                f"Page {st.session_state.hs_analysis_page} / {total_pages}</div>",
                unsafe_allow_html=True,
            )
        with pb3:
            if st.button("Next ➡️", key="hs_next_bot",
                         disabled=st.session_state.hs_analysis_page >= total_pages):
                st.session_state.hs_analysis_page += 1
                st.rerun()

    st.markdown("---")

    # ── Export CSV ──────────────────────────────────────────────────────────────
    st.markdown("### 💾 Export Data")
    export_cols = [c for c in ["conversation_id", "customer_email", "first_message_at",
                               "status", "sentiment_polarity", "topics", "summary",
                               "is_refund_request", "is_cancellation", "is_membership",
                               "duration_hours"] if c in analysis_df.columns]
    csv = analysis_df[export_cols].to_csv(index=False)
    st.download_button(
        label="📥 Download as CSV",
        data=csv,
        file_name=f"helpscout_analysis_{total}conversations.csv",
        mime="text/csv",
        key="hs_csv_download",
    )


# ─────────────────────────────────────────────────────────────────────────────
# Helper renderers
# ─────────────────────────────────────────────────────────────────────────────

def _render_summary_report(result: dict):
    """Render the LLM summary result with nice formatting."""
    if not result.get("success"):
        st.error(f"AI analysis failed: {result.get('error', 'Unknown error')}")
        return

    summary = result.get("summary", {})
    meta = result.get("metadata", {})

    with st.container():
        st.markdown("---")
        st.markdown("#### 📋 Executive Summary")
        st.info(summary.get("executive_summary", ""))

        col1, col2 = st.columns(2)

        with col1:
            themes = summary.get("top_themes", [])
            if themes:
                st.markdown("#### 🎯 Top Themes")
                for t in themes:
                    st.markdown(
                        f"**{t.get('theme', '')}** _{t.get('prevalence', '')}_ \n"
                        f"{t.get('description', '')}"
                    )
                    st.markdown("")

            insights = summary.get("unexpected_insights", [])
            if insights:
                st.markdown("#### 💡 Unexpected Insights")
                for ins in insights:
                    st.markdown(f"- {ins}")

        with col2:
            complaints = summary.get("top_complaints", [])
            if complaints:
                st.markdown("#### ⚠️ Top Complaints")
                for c in complaints:
                    st.markdown(f"- {c}")

            quotes = summary.get("notable_quotes", [])
            if quotes:
                st.markdown("#### 💬 Notable Quotes")
                for q in quotes:
                    st.markdown(f"> {q}")

    with st.expander("ℹ️ Analysis Metadata"):
        mc1, mc2, mc3 = st.columns(3)
        mc1.metric("Conversations Analysed", meta.get("total_conversations_analyzed", 0))
        mc2.metric("Model Used", meta.get("model_used", "N/A"))
        mc3.metric("Tokens Used", meta.get("tokens_used", 0))
        if meta.get("total_available", 0) > meta.get("total_conversations_analyzed", 0):
            st.caption(
                f"Sampled {meta['total_conversations_analyzed']} of "
                f"{meta['total_available']} conversations for this analysis."
            )


def _render_conversation_card(row, taxonomy: dict):
    """Render a single conversation card."""
    sent = str(row.get("sentiment_polarity", "unknown"))
    sent_emoji = {
        "very_positive": "🟢", "positive": "🟩", "neutral": "🟡",
        "negative": "🟠", "very_negative": "🔴",
    }.get(sent, "⚪")

    topics_list = row.get("topics_list") or []
    topic_labels_str = ", ".join(topic_label(t, taxonomy) for t in topics_list) if topics_list else "—"

    first_name = str(row.get("customer_first") or "").strip()
    last_name = str(row.get("customer_last") or "").strip()
    customer_str = f"{first_name} {last_name[:1]}." if first_name or last_name else "Anonymous"

    first_msg = row.get("first_message_at")
    date_str = first_msg.strftime("%Y-%m-%d") if hasattr(first_msg, "strftime") else str(first_msg or "")

    flags = []
    if row.get("is_refund_request"): flags.append("💰 Refund")
    if row.get("is_cancellation"): flags.append("🚫 Cancel")
    if row.get("is_membership"): flags.append("✅ Membership")
    flags_str = " | ".join(flags) if flags else ""

    with st.expander(
        f"{sent_emoji} {customer_str} — {topic_labels_str} | {sent.replace('_', ' ').title()} | {date_str}"
        + (f" [{flags_str}]" if flags_str else ""),
        expanded=False,
    ):
        info_col1, info_col2, info_col3 = st.columns(3)
        info_col1.markdown(f"**Status:** {row.get('status', '—')}")
        info_col2.markdown(f"**Source:** {row.get('source_type', '—')}")
        info_col3.markdown(f"**Duration:** {row.get('duration_hours', 0):.1f}h | **Threads:** {row.get('thread_count', 0)}")

        summary = str(row.get("summary") or "No summary available.")
        st.markdown(f"**Summary:** {summary}")

        notes_col1, notes_col2 = st.columns(2)
        with notes_col1:
            sent_note = str(row.get("sentiment_notes") or "")
            if sent_note:
                st.markdown(f"**Sentiment Note:** _{sent_note}_")
        with notes_col2:
            topic_note = str(row.get("topic_notes") or "")
            if topic_note:
                st.markdown(f"**Topic Note:** _{topic_note}_")
visualization/components/helpscout_dashboard.py
ADDED
@@ -0,0 +1,278 @@
"""
HelpScout Dashboard Page
Full dedicated dashboard for HelpScout customer support conversation analysis.
"""
import sys
from pathlib import Path

import pandas as pd
import streamlit as st

parent_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(parent_dir))

from utils.helpscout_utils import boolean_flag_counts, topic_label, load_topic_taxonomy
from visualizations.helpscout_charts import HelpScoutCharts
from visualizations.demographic_charts import DemographicCharts
from utils.data_processor import SentimentDataProcessor


def _sentiment_score(df) -> float:
    """Compute average sentiment score on a -2 to +2 scale."""
    score_map = {"very_positive": 2, "positive": 1, "neutral": 0,
                 "negative": -1, "very_negative": -2}
    if "sentiment_polarity" not in df.columns or df.empty:
        return 0.0
    scores = df["sentiment_polarity"].map(score_map).fillna(0)
    return float(scores.mean())


def render_helpscout_dashboard(data_loader):
    """
    Render the full HelpScout Dashboard page.

    Args:
        data_loader: HelpScoutDataLoader instance
    """
    st.title("🎧 HelpScout Support Dashboard")
    st.markdown("Customer support conversation analysis from HelpScout.")

    hs_df = st.session_state.get("helpscout_df")
    if hs_df is None or hs_df.empty:
        st.warning("No HelpScout data available. Please check your Snowflake connection.")
        return

    charts = HelpScoutCharts()
    taxonomy = load_topic_taxonomy()

    # ── PDF Export ────────────────────────────────────────────────────────────
    with st.expander("📄 Export PDF Report", expanded=False):
        st.markdown(
            "Generate a comprehensive HelpScout support report. "
            "Covers sentiment, topics, billing flags, timelines, and demographics."
        )
        if st.button("Generate HelpScout PDF Report", type="primary",
                     use_container_width=True, key="hs_dash_pdf_btn"):
            with st.spinner("Generating HelpScout PDF report…"):
                try:
                    from utils.helpscout_pdf import HelpScoutDashboardPDF
                    exporter = HelpScoutDashboardPDF()
                    pdf_bytes = exporter.generate_report(hs_df)
                    import datetime
                    filename = f"helpscout_dashboard_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.pdf"
                    st.success("Report generated successfully!")
                    st.download_button(
                        label="Download HelpScout Dashboard PDF",
                        data=pdf_bytes,
                        file_name=filename,
                        mime="application/pdf",
                        use_container_width=True,
                    )
                except Exception as e:
                    st.error(f"Failed to generate report: {e}")
                    st.exception(e)

    st.markdown("---")

    # ── KPI Row ───────────────────────────────────────────────────────────────
    total = len(hs_df)
    escalation_count = int(hs_df["is_escalation"].sum()) if "is_escalation" in hs_df.columns else 0
    flags = boolean_flag_counts(hs_df)
    neg_pct = (hs_df["sentiment_polarity"].isin(["negative", "very_negative"]).sum() / total * 100) if total else 0
    avg_duration = float(hs_df["duration_hours"].mean()) if "duration_hours" in hs_df.columns else 0.0

    k1, k2, k3, k4, k5, k6 = st.columns(6)
    k1.metric("Total Conversations", f"{total:,}")
    k2.metric("Avg Duration (h)", f"{avg_duration:.1f}")
    k3.metric("Escalations", f"{escalation_count:,}", delta=f"{escalation_count/total*100:.1f}% of total" if total else None, delta_color="inverse")
    k4.metric("Refund Requests", f"{flags['is_refund_request']:,}")
    k5.metric("Cancellations", f"{flags['is_cancellation']:,}")
    k6.metric("Membership Joins", f"{flags['is_membership']:,}")

    st.markdown("---")

    # ── Sentiment ─────────────────────────────────────────────────────────────
    st.markdown("## 🎯 Sentiment Distribution")
    col1, col2 = st.columns(2)
    with col1:
        st.plotly_chart(charts.create_sentiment_pie_chart(hs_df), use_container_width=True)
    with col2:
        avg_score = _sentiment_score(hs_df)
        st.plotly_chart(charts.create_sentiment_score_gauge(avg_score), use_container_width=True)
    m1, m2 = st.columns(2)
    pos_pct = hs_df["sentiment_polarity"].isin(["positive", "very_positive"]).sum() / total * 100 if total else 0
    m1.metric("Positive %", f"{pos_pct:.1f}%")
    m2.metric("Negative %", f"{neg_pct:.1f}%")

    st.markdown("---")

    # ── Topics ────────────────────────────────────────────────────────────────
    st.markdown("## 🏷️ Topic Analysis")
    col1, col2 = st.columns(2)
    with col1:
        st.plotly_chart(charts.create_topic_bar_chart(hs_df, title="Conversations by Topic"),
                        use_container_width=True)
    with col2:
        st.plotly_chart(charts.create_topic_pie_chart(hs_df, title="Topic Share"),
                        use_container_width=True)

    st.plotly_chart(charts.create_topic_sentiment_heatmap(hs_df), use_container_width=True)

    st.markdown("---")

    # ── Emotions ─────────────────────────────────────────────────────────────
    if "emotions" in hs_df.columns and hs_df["emotions"].notna().any():
        st.markdown("## 🎭 Emotion Analysis")
        col1, col2 = st.columns(2)
        with col1:
            st.plotly_chart(charts.create_emotion_bar_chart(hs_df, title="Emotion Distribution"),
                            use_container_width=True)
        with col2:
            # Reuse the existing DistributionCharts emotion pie (same df structure with emotions col)
            from visualizations.distribution_charts import DistributionCharts
            dist_charts = DistributionCharts()
            st.plotly_chart(dist_charts.create_emotion_pie_chart(hs_df, title="Emotion Share"),
                            use_container_width=True)
        st.markdown("---")

    # ── Billing Flags ─────────────────────────────────────────────────────────
    st.markdown("## 💳 Billing & Membership Flags")
    col1, col2 = st.columns(2)
    with col1:
        st.plotly_chart(charts.create_boolean_flags_chart(hs_df), use_container_width=True)
    with col2:
        st.plotly_chart(charts.create_escalation_breakdown(hs_df), use_container_width=True)

    st.markdown("---")

    # ── Status / Source ───────────────────────────────────────────────────────
    st.markdown("## 💬 Status & Source Distribution")
    col1, col2 = st.columns(2)
    with col1:
        st.plotly_chart(charts.create_status_distribution(hs_df), use_container_width=True)
    with col2:
        st.plotly_chart(charts.create_source_distribution(hs_df), use_container_width=True)

    st.markdown("---")

    # ── Volume & Timelines ────────────────────────────────────────────────────
    with st.expander("📈 Volume & Trends", expanded=False):
        freq_col, _ = st.columns([1, 3])
        with freq_col:
            freq = st.selectbox("Time Granularity", ["D", "W", "M"],
                                format_func=lambda x: {"D": "Daily", "W": "Weekly", "M": "Monthly"}[x],
                                index=1, key="hs_dash_freq")
        st.plotly_chart(charts.create_volume_timeline(hs_df, freq=freq), use_container_width=True)
        st.plotly_chart(charts.create_sentiment_timeline(hs_df, freq=freq), use_container_width=True)
        st.plotly_chart(charts.create_topic_timeline(hs_df, freq=freq), use_container_width=True)
        st.plotly_chart(charts.create_refund_cancel_timeline(hs_df, freq=freq), use_container_width=True)

    # ── Duration & Thread Count ───────────────────────────────────────────────
    with st.expander("📊 Conversation Depth", expanded=False):
        col1, col2 = st.columns(2)
        with col1:
            st.plotly_chart(charts.create_duration_histogram(hs_df), use_container_width=True)
        with col2:
            st.plotly_chart(charts.create_thread_count_histogram(hs_df), use_container_width=True)

    # ── Demographics ─────────────────────────────────────────────────────────
    has_demographics = (
        "age_group" in hs_df.columns
        and "timezone_region" in hs_df.columns
        and (hs_df["age_group"] != "Unknown").any()
    )
    if has_demographics:
        st.markdown("---")
        st.markdown("## 👥 Customer Demographics")
        st.info("Demographics available for customers whose email matched Musora user records.")

        processor = SentimentDataProcessor()
        demo_charts = DemographicCharts()

        demo_col1, demo_col2, demo_col3, demo_col4 = st.columns(4)
        known_demo = int((hs_df["age_group"] != "Unknown").sum())
        demo_col1.metric("With Demographics", f"{known_demo:,}", f"{known_demo/total*100:.1f}% matched")

        avg_age = hs_df["age"].mean() if "age" in hs_df.columns else None
        demo_col2.metric("Average Age", f"{avg_age:.1f}" if avg_age else "N/A")

        top_region = hs_df["timezone_region"].value_counts().index[0] if "timezone_region" in hs_df.columns and not hs_df.empty else "N/A"
        demo_col3.metric("Top Region", str(top_region))

        avg_exp = hs_df["experience_level"].mean() if "experience_level" in hs_df.columns else None
        demo_col4.metric("Avg Experience", f"{avg_exp:.1f}/10" if avg_exp else "N/A")

        st.markdown("---")
        age_dist = processor.get_demographics_distribution(hs_df, "age_group")
        if not age_dist.empty:
            st.markdown("### Age Distribution")
            col1, col2 = st.columns(2)
            with col1:
                st.plotly_chart(demo_charts.create_age_distribution_chart(age_dist), use_container_width=True)
            with col2:
                age_sent = processor.get_demographics_by_sentiment(hs_df, "age_group")
                if not age_sent.empty:
                    st.plotly_chart(demo_charts.create_age_sentiment_chart(age_sent), use_container_width=True)

        region_dist = processor.get_timezone_regions_distribution(hs_df)
        if not region_dist.empty:
            st.markdown("### Geographic Distribution")
            col1, col2 = st.columns(2)
            with col1:
                st.plotly_chart(demo_charts.create_region_distribution_chart(region_dist), use_container_width=True)
            with col2:
                region_sent = processor.get_demographics_by_sentiment(hs_df, "timezone_region")
                if not region_sent.empty:
                    st.plotly_chart(demo_charts.create_region_sentiment_chart(region_sent), use_container_width=True)

    st.markdown("---")
    st.caption(
        "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES | "
        f"Last processed: {hs_df['processed_at'].max().strftime('%Y-%m-%d %H:%M') if 'processed_at' in hs_df.columns and not hs_df.empty else 'Unknown'}"
    )


# ─────────────────────────────────────────────────────────────────────────────
# Compact summary for embedding in the main Sentiment Dashboard
# ─────────────────────────────────────────────────────────────────────────────

def render_helpscout_compact_summary(hs_df):
    """
    A one-screen HelpScout summary section embedded at the bottom of the
    main Sentiment Dashboard. Kept purposely brief.
    """
    st.markdown("---")
    st.markdown("## 🎧 HelpScout Support — Quick View")
    st.caption(f"{len(hs_df):,} processed customer conversations")

    total = len(hs_df)
    if total == 0:
        st.info("No HelpScout conversations available.")
        return

    charts = HelpScoutCharts()
    flags = boolean_flag_counts(hs_df)
    escalation_count = int(hs_df["is_escalation"].sum()) if "is_escalation" in hs_df.columns else 0
    avg_dur = float(hs_df["duration_hours"].mean()) if "duration_hours" in hs_df.columns else 0.0

    k1, k2, k3, k4 = st.columns(4)
    k1.metric("Conversations", f"{total:,}")
    k2.metric("Escalations", f"{escalation_count:,}", delta=f"{escalation_count/total*100:.1f}%", delta_color="inverse")
    k3.metric("Refund Requests", f"{flags['is_refund_request']:,}")
    k4.metric("Avg Duration (h)", f"{avg_dur:.1f}")

    col1, col2 = st.columns(2)
    with col1:
        st.plotly_chart(
            charts.create_sentiment_pie_chart(hs_df, title="HelpScout Sentiment"),
            use_container_width=True,
            key="hs_compact_sentiment_pie",
        )
    with col2:
        st.plotly_chart(
            charts.create_topic_bar_chart(hs_df, title="Top Topics", top_n=5),
            use_container_width=True,
            key="hs_compact_topic_bar",
        )

    st.info("👉 Navigate to **🎧 HelpScout Dashboard** for the full analysis.")
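The _sentiment_score helper at the top of this file collapses the five polarity labels onto a single -2 to +2 axis for the gauge chart. A quick worked example; unmapped labels such as "unknown" fall back to 0 via fillna:

import pandas as pd

toy = pd.DataFrame({"sentiment_polarity": ["very_positive", "neutral", "negative", "unknown"]})
# map -> [2, 0, -1, NaN] -> fillna(0) -> [2, 0, -1, 0]; mean = 0.25
print(_sentiment_score(toy))  # 0.25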
visualization/components/sentiment_analysis.py
CHANGED
@@ -116,7 +116,7 @@ def render_sentiment_analysis(data_loader):
     mask = (dashboard_df['platform'] == selected_platform) & (dashboard_df['brand'] == selected_brand)
     preview_df = dashboard_df[mask]
 
-    filter_col1, filter_col2, filter_col3, filter_col4 = st.columns(4)
+    filter_col1, filter_col2, filter_col3, filter_col4, filter_col5 = st.columns(5)
 
     with filter_col1:
         sentiment_options = sorted(preview_df['sentiment_polarity'].unique().tolist())
@@ -141,6 +141,20 @@ def render_sentiment_analysis(data_loader):
         )
 
     with filter_col3:
+        emotion_list = (
+            preview_df['emotions']
+            .str.split(',').explode().str.strip()
+            .dropna().unique().tolist()
+            if 'emotions' in preview_df.columns else []
+        )
+        selected_emotions = st.multiselect(
+            "Emotion",
+            options=sorted(e for e in emotion_list if e),
+            default=[],
+            help="Filter contents that have comments with these emotions"
+        )
+
+    with filter_col4:
         top_n = st.selectbox(
             "Top N Contents",
             options=[5, 10, 15, 20, 25],
@@ -148,12 +162,12 @@ def render_sentiment_analysis(data_loader):
             help="Number of contents to display"
         )
 
-    with filter_col4:
-        filter_active = bool(selected_sentiments or selected_intents)
+    with filter_col5:
+        filter_active = bool(selected_sentiments or selected_intents or selected_emotions)
         st.metric(
             "Filters Active",
             "✅ Yes" if filter_active else "❌ No",
-            help="Sentiment or intent filters applied" if filter_active else "Showing all sentiments"
+            help="Sentiment, intent, or emotion filters applied" if filter_active else "Showing all sentiments"
         )
 
     st.markdown("---")
@@ -200,6 +214,7 @@ def render_sentiment_analysis(data_loader):
     fetch_key = (
         selected_platform, selected_brand, top_n, min_comments, sort_by_value,
         tuple(sorted(selected_sentiments)), tuple(sorted(selected_intents)),
+        tuple(sorted(selected_emotions)),
         str(query_date_range)
     )
 
@@ -234,6 +249,7 @@ def render_sentiment_analysis(data_loader):
         sort_by=sort_by_value,
         sentiments=selected_sentiments or None,
         intents=selected_intents or None,
+        emotions=selected_emotions or None,
         date_range=query_date_range,
     )
     st.session_state['sa_contents'] = contents_df
@@ -332,7 +348,7 @@ def render_sentiment_analysis(data_loader):
     if content_comments.empty:
         st.info("No sampled comment details available for this content.")
     else:
-        viz_col1, viz_col2 = st.columns(2)
+        viz_col1, viz_col2, viz_col3 = st.columns(3)
         with viz_col1:
             pie = sentiment_charts.create_sentiment_pie_chart(
                 content_comments, title="Sentiment Distribution (sample)"
@@ -345,6 +361,12 @@ def render_sentiment_analysis(data_loader):
             )
             st.plotly_chart(bar, use_container_width=True,
                             key=f"intent_bar_{content_row['content_sk']}")
+        with viz_col3:
+            emotion_bar = distribution_charts.create_emotion_bar_chart(
+                content_comments, title="Emotion Distribution (sample)", orientation='h'
+            )
+            st.plotly_chart(emotion_bar, use_container_width=True,
+                            key=f"emotion_bar_{content_row['content_sk']}")
 
     # AI Analysis
     st.markdown("#### 🤖 AI-Powered Analysis")
@@ -500,7 +522,7 @@ def render_sentiment_analysis(data_loader):
         comments_df['content_sk'].isin(filtered_contents['content_sk'])
     ] if not comments_df.empty else pd.DataFrame()
 
-    insight_col1, insight_col2 = st.columns(2)
+    insight_col1, insight_col2, insight_col3 = st.columns(3)
     with insight_col1:
         st.markdown("#### 🎯 Common Intent Patterns")
         if not all_sampled.empty:
@@ -509,6 +531,16 @@ def render_sentiment_analysis(data_loader):
             st.markdown(f"- **{row['intent']}**: {row['count']} ({row['percentage']:.1f}%)")
 
     with insight_col2:
+        st.markdown("#### 😊 Top Emotions")
+        if not all_sampled.empty:
+            emotion_dist = processor.get_emotion_distribution(all_sampled)
+            if not emotion_dist.empty:
+                for _, row in emotion_dist.sort_values('count', ascending=False).head(5).iterrows():
+                    st.markdown(f"- **{row['emotions'].title()}**: {row['count']} ({row['percentage']:.1f}%)")
+            else:
+                st.info("No emotion data available.")
+
+    with insight_col3:
         st.markdown("#### 📊 Platform Breakdown")
        if not all_sampled.empty:
             for platform, count in all_sampled['platform'].value_counts().items():
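The new Emotion filter derives its option list by exploding the comma-separated emotions column. A standalone illustration of that pandas chain:

import pandas as pd

emotions = pd.Series(["joy, gratitude", "frustration", None, "joy"])
# split -> lists, explode -> one label per row, strip whitespace, drop NaN, dedupe
options = emotions.str.split(',').explode().str.strip().dropna().unique().tolist()
print(sorted(e for e in options if e))  # ['frustration', 'gratitude', 'joy']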
visualization/config/viz_config.json
CHANGED
@@ -17,6 +17,19 @@
     "off_topic": "#9E9E9E",
     "spam_selfpromo": "#795548"
   },
+  "emotion": {
+    "joy": "#FFD700",
+    "excitement": "#FF6B35",
+    "gratitude": "#4CAF50",
+    "admiration": "#2196F3",
+    "curiosity": "#00BCD4",
+    "humor": "#9C27B0",
+    "frustration": "#FF9800",
+    "disappointment": "#795548",
+    "sadness": "#607D8B",
+    "anger": "#D32F2F",
+    "neutral": "#9E9E9E"
+  },
   "platform": {
     "facebook": "#1877F2",
     "instagram": "#E4405F",
@@ -49,6 +62,19 @@
     "off_topic",
     "spam_selfpromo"
   ],
+  "emotion_order": [
+    "joy",
+    "excitement",
+    "gratitude",
+    "admiration",
+    "curiosity",
+    "humor",
+    "frustration",
+    "disappointment",
+    "sadness",
+    "anger",
+    "neutral"
+  ],
   "negative_sentiments": [
     "negative",
     "very_negative"
@@ -67,7 +93,7 @@
   },
   "snowflake": {
     "query": "SELECT s.COMMENT_SK, s.COMMENT_ID, s.ORIGINAL_TEXT, s.PLATFORM, s.COMMENT_TIMESTAMP, s.AUTHOR_NAME, s.AUTHOR_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_TEXT, s.CONTENT_SK, s.CONTENT_ID, s.CONTENT_DESCRIPTION, s.CHANNEL_SK, s.CHANNEL_NAME, s.CHANNEL_DISPLAY_NAME, s.DETECTED_LANGUAGE, s.LANGUAGE_CODE, s.IS_ENGLISH, s.LANGUAGE_CONFIDENCE, s.DETECTION_METHOD, s.HAS_TEXT, s.TRANSLATED_TEXT, s.TRANSLATION_PERFORMED, s.TRANSLATION_CONFIDENCE, s.TRANSLATION_NOTES, s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.ANALYSIS_NOTES, s.PROCESSING_SUCCESS, CAST(NULL AS VARCHAR(16777216)) as PROCESSING_ERRORS, s.PROCESSED_AT, s.WORKFLOW_VERSION, CAST(NULL AS TIMESTAMP_NTZ(9)) as CREATED_AT, CAST(NULL AS TIMESTAMP_NTZ(9)) as UPDATED_AT, s.CHANNEL_NAME as BRAND, c.PERMALINK_URL, CAST(NULL AS VARCHAR(16777216)) as THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK UNION ALL SELECT COMMENT_SK, COMMENT_ID, ORIGINAL_TEXT, CASE WHEN PLATFORM = 'musora' THEN 'musora_app' ELSE PLATFORM END as PLATFORM, COMMENT_TIMESTAMP, AUTHOR_NAME, AUTHOR_ID, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT, CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME, DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH, LANGUAGE_CONFIDENCE, DETECTION_METHOD, HAS_TEXT, TRANSLATED_TEXT, TRANSLATION_PERFORMED, TRANSLATION_CONFIDENCE, TRANSLATION_NOTES, SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, SENTIMENT_CONFIDENCE, ANALYSIS_NOTES, PROCESSING_SUCCESS, PROCESSING_ERRORS, PROCESSED_AT, WORKFLOW_VERSION, CREATED_AT, UPDATED_AT, CHANNEL_NAME as BRAND, PERMALINK_URL, THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES",
-    "dashboard_query": "SELECT s.COMMENT_SK, s.CONTENT_SK, LOWER(s.PLATFORM) AS PLATFORM, LOWER(s.CHANNEL_NAME) AS BRAND, s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.DETECTED_LANGUAGE, s.COMMENT_TIMESTAMP, s.PROCESSED_AT, s.AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s UNION ALL SELECT COMMENT_SK, CONTENT_SK, CASE WHEN LOWER(PLATFORM) = 'musora' THEN 'musora_app' ELSE LOWER(PLATFORM) END AS PLATFORM, LOWER(CHANNEL_NAME) AS BRAND, SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, DETECTED_LANGUAGE, COMMENT_TIMESTAMP, PROCESSED_AT, AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES",
+    "dashboard_query": "SELECT s.COMMENT_SK, s.CONTENT_SK, LOWER(s.PLATFORM) AS PLATFORM, LOWER(s.CHANNEL_NAME) AS BRAND, s.SENTIMENT_POLARITY, s.INTENT, s.EMOTIONS, s.REQUIRES_REPLY, s.DETECTED_LANGUAGE, s.COMMENT_TIMESTAMP, s.PROCESSED_AT, s.AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s UNION ALL SELECT COMMENT_SK, CONTENT_SK, CASE WHEN LOWER(PLATFORM) = 'musora' THEN 'musora_app' ELSE LOWER(PLATFORM) END AS PLATFORM, LOWER(CHANNEL_NAME) AS BRAND, SENTIMENT_POLARITY, INTENT, EMOTIONS, REQUIRES_REPLY, DETECTED_LANGUAGE, COMMENT_TIMESTAMP, PROCESSED_AT, AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES",
     "demographics_query": "SELECT u.id as USER_ID, u.birthday as BIRTHDAY, u.timezone as TIMEZONE, GREATEST(COALESCE(p.difficulty, 0), COALESCE(p.self_report_difficulty, 0), COALESCE(p.method_experience, 0)) AS EXPERIENCE_LEVEL FROM stitch.musora_ecom_db.usora_users u JOIN online_recsys.preprocessed.users p ON u.id = p.user_id"
   },
   "demographics": {
@@ -84,5 +110,39 @@
     "Advanced (8-10)": [8, 10]
   },
   "top_timezones_count": 15
+  },
+  "helpscout": {
+    "dashboard_query": "SELECT CONVERSATION_ID, LOWER(CUSTOMER_EMAIL) AS CUSTOMER_EMAIL, THREAD_COUNT, FIRST_MESSAGE_AT, LAST_MESSAGE_AT, DURATION_HOURS, STATUS, STATE, SOURCE_TYPE, SOURCE_VIA, SENTIMENT_POLARITY, EMOTIONS, TOPICS, IS_REFUND_REQUEST, IS_CANCELLATION, IS_MEMBERSHIP, SENTIMENT_CONFIDENCE, TOPIC_CONFIDENCE, PROCESSED_AT FROM SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES",
+    "demographics_query": "SELECT LOWER(u.email) AS CUSTOMER_EMAIL, TO_VARCHAR(u.birthday, 'YYYY-MM-DD HH24:MI:SS.FF6 TZHTZM') AS BIRTHDAY, u.timezone AS TIMEZONE, GREATEST(COALESCE(p.difficulty, 0), COALESCE(p.self_report_difficulty, 0), COALESCE(p.method_experience, 0)) AS EXPERIENCE_LEVEL FROM stitch.musora_ecom_db.usora_users u JOIN online_recsys.preprocessed.users p ON u.id = p.user_id WHERE u.email IS NOT NULL",
+    "default_top_n": 10,
+    "default_date_range_days": 60,
+    "escalation_sentiments": ["negative", "very_negative"],
+    "max_summary_conversations": 300
+  },
+  "color_schemes_helpscout": {
+    "topics": {
+      "video_and_playback": "#1982C4",
+      "app_and_technical_errors": "#D32F2F",
+      "navigation_and_ux": "#9C27B0",
+      "account_and_access": "#FF6F00",
+      "billing_and_subscription": "#00C851",
+      "learning_and_progress": "#2196F3",
+      "content_and_resources": "#4CAF50",
+      "community_and_notifications": "#FFB300",
+      "feedback_and_suggestions": "#00BCD4",
+      "uncategorized": "#9E9E9E"
+    },
+    "status": {
+      "active": "#FF6F00",
+      "pending": "#FFB300",
+      "closed": "#4CAF50",
+      "spam": "#9E9E9E",
+      "default": "#607D8B"
+    },
+    "boolean_flags": {
+      "is_refund_request": "#D32F2F",
+      "is_cancellation": "#FF6F00",
+      "is_membership": "#00C851"
+    }
   }
 }
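A hypothetical snippet showing how chart code might consume the new emotion palette. The "color_schemes" parent key and the relative path are assumptions, since only the changed fragments of the config appear in this diff:

import json
from pathlib import Path

# Assumed location of the palette within the config tree.
config = json.loads(Path("visualization/config/viz_config.json").read_text())
palette = config["color_schemes"]["emotion"]
bar_color = palette.get("frustration", palette["neutral"])  # "#FF9800", grey fallback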
visualization/data/data_loader.py
CHANGED
@@ -90,6 +90,10 @@ class SentimentDataLoader:
         df['platform'] = df['platform'].fillna('unknown').str.lower()
         df['brand'] = df['brand'].fillna('unknown').str.lower()
 
+        # emotions is optional (soft-fail); keep NaN as-is
+        if 'emotions' not in df.columns:
+            df['emotions'] = None
+
         if 'requires_reply' in df.columns:
             df['requires_reply'] = df['requires_reply'].astype(bool)
 
@@ -166,7 +170,7 @@ class SentimentDataLoader:
 
     def load_sa_data(self, platform, brand, top_n=10, min_comments=10,
                      sort_by='severity_score', sentiments=None, intents=None,
-                     date_range=None):
+                     emotions=None, date_range=None):
         """
         Load Sentiment Analysis page data:
         1. Content aggregation stats for top-N contents
@@ -180,6 +184,7 @@ class SentimentDataLoader:
             sort_by: 'severity_score' | 'sentiment_percentage' | 'sentiment_count' | 'total_comments'
             sentiments: List of sentiments to filter by (dominant_sentiment)
             intents: List of intents to filter by
+            emotions: List of emotions to filter by (content must have at least one comment with these emotions)
             date_range: Tuple (start_date, end_date) or None
 
         Returns:
@@ -187,16 +192,17 @@ class SentimentDataLoader:
         """
         sentiments_key = tuple(sorted(sentiments)) if sentiments else ()
         intents_key = tuple(sorted(intents)) if intents else ()
+        emotions_key = tuple(sorted(emotions)) if emotions else ()
         date_key = (str(date_range[0]), str(date_range[1])) if date_range and len(date_range) == 2 else ()
 
         return self._fetch_sa_data(
             platform, brand, top_n, min_comments, sort_by,
-            sentiments_key, intents_key, date_key
+            sentiments_key, intents_key, emotions_key, date_key
         )
 
     @st.cache_data(ttl=86400)
     def _fetch_sa_data(_self, platform, brand, top_n, min_comments, sort_by,
-                       sentiments, intents, date_range):
+                       sentiments, intents, emotions, date_range):
         """Cached SA data fetch — returns (contents_df, comments_df)."""
         try:
             conn = SnowFlakeConn()
@@ -245,6 +251,16 @@ class SentimentDataLoader:
                 ]['content_sk'].unique()
                 contents_df = contents_df[contents_df['content_sk'].isin(valid_sks)]
                 comments_df = comments_df[comments_df['content_sk'].isin(valid_sks)]
+
+                # Python-side emotion filter — keep only content_sks that have
+                # at least one comment matching any selected emotion
+                if emotions:
+                    pattern = '|'.join(re.escape(e) for e in emotions)
+                    valid_sks = comments_df[
+                        comments_df['emotions'].str.contains(pattern, na=False, case=False)
+                    ]['content_sk'].unique()
+                    contents_df = contents_df[contents_df['content_sk'].isin(valid_sks)]
+                    comments_df = comments_df[comments_df['content_sk'].isin(valid_sks)]
             else:
                 comments_df = pd.DataFrame()
 
@@ -387,7 +403,7 @@ class SentimentDataLoader:
                 LOWER(s.PLATFORM) AS PLATFORM,
                 LOWER(s.CHANNEL_NAME) AS BRAND,
                 s.COMMENT_TIMESTAMP, s.AUTHOR_NAME,
-                s.DETECTED_LANGUAGE, s.SENTIMENT_POLARITY, s.INTENT,
+                s.DETECTED_LANGUAGE, s.SENTIMENT_POLARITY, s.INTENT, s.EMOTIONS,
                 s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.IS_ENGLISH,
                 c.PERMALINK_URL
             FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s
@@ -407,7 +423,7 @@ class SentimentDataLoader:
                 'musora_app' AS PLATFORM,
                 LOWER(CHANNEL_NAME) AS BRAND,
                 COMMENT_TIMESTAMP, AUTHOR_NAME,
-                DETECTED_LANGUAGE, SENTIMENT_POLARITY, INTENT,
+                DETECTED_LANGUAGE, SENTIMENT_POLARITY, INTENT, EMOTIONS,
                 REQUIRES_REPLY, SENTIMENT_CONFIDENCE, IS_ENGLISH,
                 PERMALINK_URL
             FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES
@@ -448,6 +464,10 @@ class SentimentDataLoader:
         df['intent'] = df['intent'].fillna('unknown')
         df['platform'] = df['platform'].fillna('unknown').str.lower()
 
+        # emotions is optional (soft-fail); keep NaN as-is for chart filtering
+        if 'emotions' not in df.columns:
+            df['emotions'] = None
+
        if 'requires_reply' in df.columns:
             df['requires_reply'] = df['requires_reply'].astype(bool)
 
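The new Python-side filter matches any selected emotion with a case-insensitive alternation; note it relies on re being imported at the top of data_loader.py, which this hunk does not show. The matching logic in isolation:

import re
import pandas as pd

comments = pd.DataFrame({
    "content_sk": [1, 1, 2, 3],
    "emotions": ["joy, gratitude", "frustration", None, "anger"],
})
selected = ["joy", "anger"]

# re.escape guards labels containing regex metacharacters; na=False drops NaN rows.
pattern = "|".join(re.escape(e) for e in selected)
valid_sks = comments[
    comments["emotions"].str.contains(pattern, na=False, case=False)
]["content_sk"].unique()
print(valid_sks)  # [1 3]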
visualization/data/helpscout_data_loader.py
ADDED
@@ -0,0 +1,382 @@
| 1 |
+
"""
|
| 2 |
+
HelpScout data loader β mirrors SentimentDataLoader architecture.
|
| 3 |
+
|
| 4 |
+
Three loading modes:
|
| 5 |
+
- load_dashboard_data() : lightweight (no long text), cached 24 h
|
| 6 |
+
- load_analysis_data(...) : filtered with SUMMARY + notes, on-demand, cached 24 h
|
| 7 |
+
- load_demographics_data() : email-keyed user demographics, cached 24 h
|
| 8 |
+
"""
|
| 9 |
+
import re
|
| 10 |
+
import sys
|
| 11 |
+
from datetime import datetime, timedelta
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import streamlit as st
|
| 16 |
+
from dateutil.relativedelta import relativedelta
|
| 17 |
+
|
| 18 |
+
root_dir = Path(__file__).resolve().parent.parent.parent
|
| 19 |
+
sys.path.append(str(root_dir))
|
| 20 |
+
|
| 21 |
+
from visualization.SnowFlakeConnection import SnowFlakeConn
|
| 22 |
+
from visualization.utils.helpscout_utils import (
|
| 23 |
+
load_topic_taxonomy, parse_topics, compute_escalation_flag
|
| 24 |
+
)
|
| 25 |
+
import json
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class HelpScoutDataLoader:
|
| 29 |
+
"""
|
| 30 |
+
Loads HelpScout conversation features from Snowflake with caching.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
def __init__(self, config_path=None):
|
| 34 |
+
if config_path is None:
|
| 35 |
+
config_path = Path(__file__).parent.parent / "config" / "viz_config.json"
|
| 36 |
+
with open(config_path, "r") as f:
|
| 37 |
+
self.config = json.load(f)
|
| 38 |
+
|
| 39 |
+
self.hs_config = self.config.get("helpscout", {})
|
| 40 |
+
self.dashboard_query = self.hs_config.get("dashboard_query", "")
|
| 41 |
+
self.demographics_query = self.hs_config.get("demographics_query", "")
|
| 42 |
+
self.escalation_sentiments = self.hs_config.get("escalation_sentiments", ["negative", "very_negative"])
|
| 43 |
+
self.default_date_range_days = self.hs_config.get("default_date_range_days", 60)
|
| 44 |
+
self.max_summary_conversations = self.hs_config.get("max_summary_conversations", 300)
|
| 45 |
+
self.topic_colors = self.config.get("color_schemes_helpscout", {}).get("topics", {})
|
| 46 |
+
self.status_colors = self.config.get("color_schemes_helpscout", {}).get("status", {})
|
| 47 |
+
self.flag_colors = self.config.get("color_schemes_helpscout", {}).get("boolean_flags", {})
|
| 48 |
+
self.sentiment_colors = self.config.get("color_schemes", {}).get("sentiment_polarity", {})
|
| 49 |
+
self.demographics_config = self.config.get("demographics", {})
|
| 50 |
+
|
| 51 |
+
self.taxonomy = load_topic_taxonomy()
|
| 52 |
+
|
| 53 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 54 |
+
# Dashboard data (lightweight, 24-hour cache)
|
| 55 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 56 |
+
|
| 57 |
+
@st.cache_data(ttl=86400)
|
| 58 |
+
def load_dashboard_data(_self):
|
| 59 |
+
"""Load lightweight HelpScout dashboard data β no long-form text columns."""
|
| 60 |
+
try:
|
| 61 |
+
conn = SnowFlakeConn()
|
| 62 |
+
df = conn.run_read_query(_self.dashboard_query, "HelpScout dashboard data")
|
| 63 |
+
conn.close_connection()
|
| 64 |
+
|
| 65 |
+
if df is None or df.empty:
|
| 66 |
+
st.error("No HelpScout data returned from Snowflake")
|
| 67 |
+
return pd.DataFrame()
|
| 68 |
+
|
| 69 |
+
df = _self._process_dashboard_df(df)
|
| 70 |
+
|
| 71 |
+
if _self.demographics_query:
|
| 72 |
+
demo_df = _self.load_demographics_data()
|
| 73 |
+
if not demo_df.empty:
|
| 74 |
+
df = _self.merge_demographics(df, demo_df)
|
| 75 |
+
|
| 76 |
+
return df
|
| 77 |
+
except Exception as e:
|
| 78 |
+
st.error(f"Error loading HelpScout dashboard data: {e}")
|
| 79 |
+
return pd.DataFrame()
|
| 80 |
+
|
| 81 |
+
def _process_dashboard_df(self, df):
|
| 82 |
+
df.columns = df.columns.str.lower()
|
| 83 |
+
|
| 84 |
+
for ts_col in ("first_message_at", "last_message_at", "processed_at"):
|
| 85 |
+
if ts_col in df.columns:
|
| 86 |
+
df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce", utc=True).dt.tz_localize(None)
|
| 87 |
+
|
| 88 |
+
df["sentiment_polarity"] = df["sentiment_polarity"].fillna("unknown")
|
| 89 |
+
df["status"] = df["status"].fillna("unknown").str.lower()
|
| 90 |
+
df["state"] = df["state"].fillna("unknown").str.lower()
|
| 91 |
+
df["source_type"] = df["source_type"].fillna("unknown").str.lower()
|
| 92 |
+
|
| 93 |
+
for bool_col in ("is_refund_request", "is_cancellation", "is_membership"):
|
| 94 |
+
if bool_col in df.columns:
|
| 95 |
+
df[bool_col] = df[bool_col].fillna(False).astype(bool)
|
| 96 |
+
|
| 97 |
+
if "emotions" not in df.columns:
|
| 98 |
+
df["emotions"] = None
|
| 99 |
+
|
| 100 |
+
# topics_list for filter options
|
| 101 |
+
df["topics_list"] = df["topics"].apply(parse_topics)
|
| 102 |
+
|
| 103 |
+
# escalation flag
|
| 104 |
+
df["is_escalation"] = compute_escalation_flag(df, self.escalation_sentiments)
|
| 105 |
+
|
| 106 |
+
return df
|
| 107 |
+
|
| 108 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 109 |
+
# Analysis page data (on-demand, 24-hour cache)
|
| 110 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½ββ
|
| 111 |
+
|
| 112 |
+
def load_analysis_data(self, sentiments=None, topics=None,
|
| 113 |
+
refund_only=False, cancel_only=False,
|
| 114 |
+
membership_only=False, statuses=None,
|
| 115 |
+
sources=None, date_range=None, top_n=None):
|
| 116 |
+
"""
|
| 117 |
+
Load filtered HelpScout conversations with full text for the Analysis page.
|
| 118 |
+
Caches based on argument tuple.
|
| 119 |
+
"""
|
| 120 |
+
sentiments_key = tuple(sorted(sentiments)) if sentiments else ()
|
| 121 |
+
topics_key = tuple(sorted(topics)) if topics else ()
|
| 122 |
+
statuses_key = tuple(sorted(statuses)) if statuses else ()
|
| 123 |
+
sources_key = tuple(sorted(sources)) if sources else ()
|
| 124 |
+
date_key = (str(date_range[0]), str(date_range[1])) if date_range and len(date_range) == 2 else ()
|
| 125 |
+
return self._fetch_analysis_data(
|
| 126 |
+
sentiments_key, topics_key, bool(refund_only), bool(cancel_only),
|
| 127 |
+
bool(membership_only), statuses_key, sources_key, date_key, top_n or 0
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
@st.cache_data(ttl=86400)
|
| 131 |
+
def _fetch_analysis_data(_self, sentiments, topics, refund_only, cancel_only,
|
| 132 |
+
membership_only, statuses, sources, date_range, top_n):
|
| 133 |
+
"""Cached analysis data fetch β returns full-detail conversation df."""
|
| 134 |
+
try:
|
| 135 |
+
query = _self._build_analysis_query(
|
| 136 |
+
sentiments, topics, refund_only, cancel_only,
|
| 137 |
+
membership_only, statuses, sources, date_range, top_n
|
| 138 |
+
)
|
| 139 |
+
conn = SnowFlakeConn()
|
| 140 |
+
df = conn.run_read_query(query, "HelpScout analysis data")
|
| 141 |
+
conn.close_connection()
|
| 142 |
+
|
| 143 |
+
if df is None or df.empty:
|
| 144 |
+
return pd.DataFrame()
|
| 145 |
+
|
| 146 |
+
df = _self._process_analysis_df(df)
|
| 147 |
+
return df
|
| 148 |
+
except Exception as e:
|
| 149 |
+
st.error(f"Error loading HelpScout analysis data: {e}")
|
| 150 |
+
return pd.DataFrame()
|
| 151 |
+
|
| 152 |
+
def _build_analysis_query(self, sentiments, topics, refund_only, cancel_only,
|
| 153 |
+
membership_only, statuses, sources, date_range, top_n):
|
| 154 |
+
"""Build dynamic SQL for the analysis page with all filters pushed to Snowflake."""
|
| 155 |
+
where_clauses = []
|
| 156 |
+
|
| 157 |
+
if date_range and len(date_range) == 2:
|
| 158 |
+
where_clauses.append(f"FIRST_MESSAGE_AT >= '{date_range[0]}' AND FIRST_MESSAGE_AT <= '{date_range[1]}'")
|
| 159 |
+
|
| 160 |
+
if sentiments:
|
| 161 |
+
safe = "', '".join(self._sanitize(s) for s in sentiments)
|
| 162 |
+
where_clauses.append(f"SENTIMENT_POLARITY IN ('{safe}')")
|
| 163 |
+
|
| 164 |
+
if topics:
|
| 165 |
+
topic_conditions = []
|
| 166 |
+
for t in topics:
|
| 167 |
+
safe_t = self._sanitize(t)
|
| 168 |
+
topic_conditions.append(
|
| 169 |
+
f"ARRAY_CONTAINS('{safe_t}'::VARIANT, SPLIT(TOPICS, ','))"
|
| 170 |
+
)
|
| 171 |
+
where_clauses.append("(" + " OR ".join(topic_conditions) + ")")
|
| 172 |
+
|
| 173 |
+
if statuses:
|
| 174 |
+
safe = "', '".join(self._sanitize(s.lower()) for s in statuses)
|
| 175 |
+
where_clauses.append(f"LOWER(STATUS) IN ('{safe}')")
|
| 176 |
+
|
| 177 |
+
if sources:
|
| 178 |
+
safe = "', '".join(self._sanitize(s.lower()) for s in sources)
|
| 179 |
+
where_clauses.append(f"LOWER(SOURCE_TYPE) IN ('{safe}')")
|
| 180 |
+
|
| 181 |
+
if refund_only:
|
| 182 |
+
where_clauses.append("IS_REFUND_REQUEST = TRUE")
|
| 183 |
+
if cancel_only:
|
| 184 |
+
where_clauses.append("IS_CANCELLATION = TRUE")
|
| 185 |
+
if membership_only:
|
| 186 |
+
where_clauses.append("IS_MEMBERSHIP = TRUE")
|
| 187 |
+
|
| 188 |
+
where_sql = ("WHERE " + " AND ".join(where_clauses)) if where_clauses else ""
|
| 189 |
+
limit_sql = f"LIMIT {int(top_n)}" if top_n and top_n > 0 else ""
|
| 190 |
+
|
| 191 |
+
return f"""
|
| 192 |
+
SELECT
|
| 193 |
+
CONVERSATION_ID,
|
| 194 |
+
LOWER(CUSTOMER_EMAIL) AS CUSTOMER_EMAIL,
|
| 195 |
+
CUSTOMER_FIRST,
|
| 196 |
+
CUSTOMER_LAST,
|
| 197 |
+
THREAD_COUNT,
|
| 198 |
+
FIRST_MESSAGE_AT,
|
| 199 |
+
LAST_MESSAGE_AT,
|
| 200 |
+
DURATION_HOURS,
|
| 201 |
+
STATUS,
|
| 202 |
+
STATE,
|
| 203 |
+
SOURCE_TYPE,
|
| 204 |
+
SOURCE_VIA,
|
| 205 |
+
SENTIMENT_POLARITY,
|
| 206 |
+
EMOTIONS,
|
| 207 |
+
SENTIMENT_CONFIDENCE,
|
| 208 |
+
SENTIMENT_NOTES,
|
| 209 |
+
TOPICS,
|
| 210 |
+
IS_REFUND_REQUEST,
|
| 211 |
+
IS_CANCELLATION,
|
| 212 |
+
IS_MEMBERSHIP,
|
| 213 |
+
TOPIC_CONFIDENCE,
|
| 214 |
+
TOPIC_NOTES,
|
| 215 |
+
SUMMARY,
|
| 216 |
+
PROCESSED_AT
|
| 217 |
+
FROM SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
|
| 218 |
+
{where_sql}
|
| 219 |
+
ORDER BY FIRST_MESSAGE_AT DESC
|
| 220 |
+
{limit_sql}
|
| 221 |
+
"""
|
| 222 |
+
|
| 223 |
+
def _process_analysis_df(self, df):
|
| 224 |
+
df.columns = df.columns.str.lower()
|
| 225 |
+
|
| 226 |
+
for ts_col in ("first_message_at", "last_message_at", "processed_at"):
|
| 227 |
+
if ts_col in df.columns:
|
| 228 |
+
df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce", utc=True).dt.tz_localize(None)
|
| 229 |
+
|
| 230 |
+
df["sentiment_polarity"] = df["sentiment_polarity"].fillna("unknown")
|
| 231 |
+
df["status"] = df["status"].fillna("unknown").str.lower()
|
| 232 |
+
df["source_type"] = df["source_type"].fillna("unknown").str.lower()
|
| 233 |
+
|
| 234 |
+
for bool_col in ("is_refund_request", "is_cancellation", "is_membership"):
|
| 235 |
+
if bool_col in df.columns:
|
| 236 |
+
df[bool_col] = df[bool_col].fillna(False).astype(bool)
|
| 237 |
+
|
| 238 |
+
if "emotions" not in df.columns:
|
| 239 |
+
df["emotions"] = None
|
| 240 |
+
|
| 241 |
+
df["topics_list"] = df["topics"].apply(parse_topics)
|
| 242 |
+
df["is_escalation"] = compute_escalation_flag(df, self.escalation_sentiments)
|
| 243 |
+
|
| 244 |
+
# Short summary for cards (100 chars)
|
| 245 |
+
if "summary" in df.columns:
|
| 246 |
+
text = df["summary"].fillna("").astype(str)
|
| 247 |
+
df["summary_short"] = text.where(text.str.len() <= 120, text.str[:120] + "β¦")
|
| 248 |
+
|
| 249 |
+
return df
|
| 250 |
+
|
| 251 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 252 |
+
# Demographics (email-keyed, 24-hour cache)
|
| 253 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 254 |
+
|
| 255 |
+
@st.cache_data(ttl=86400)
|
| 256 |
+
def load_demographics_data(_self):
|
| 257 |
+
"""Load user demographics keyed by email."""
|
| 258 |
+
if not _self.demographics_query:
|
| 259 |
+
return pd.DataFrame()
|
| 260 |
+
try:
|
| 261 |
+
conn = SnowFlakeConn()
|
| 262 |
+
            df = conn.run_read_query(_self.demographics_query, "HelpScout user demographics")
            conn.close_connection()

            if df is None or df.empty:
                return pd.DataFrame()

            return _self._process_demographics_df(df)
        except Exception as e:
            st.warning(f"Could not load HelpScout demographics: {e}")
            return pd.DataFrame()

    def _process_demographics_df(self, df):
        df.columns = df.columns.str.lower()

        if "birthday" in df.columns:
            df["birthday"] = df["birthday"].astype(str)
            df["birthday"] = pd.to_datetime(df["birthday"], errors="coerce", utc=True)
            df["birthday"] = df["birthday"].dt.tz_localize(None)
            df["age"] = df["birthday"].apply(self._calculate_age)
            df["age_group"] = df["age"].apply(self._categorize_age)

        if "timezone" in df.columns:
            df["timezone_region"] = df["timezone"].apply(self._extract_timezone_region)

        if "experience_level" in df.columns:
            df["experience_group"] = df["experience_level"].apply(self._categorize_experience)

        if "customer_email" in df.columns:
            df = df[df["customer_email"].notna()]
            df["customer_email"] = df["customer_email"].str.lower()

        return df

    def merge_demographics(self, df, demo_df):
        """Merge demographic data with HelpScout conversations on customer_email."""
        if demo_df.empty or "customer_email" not in df.columns:
            for col, val in [("age", None), ("age_group", "Unknown"),
                             ("timezone", None), ("timezone_region", "Unknown"),
                             ("experience_level", None), ("experience_group", "Unknown")]:
                df[col] = val
            return df

        if "customer_email" not in demo_df.columns:
            return df

        merge_cols = ["customer_email"]
        for c in ["age", "age_group", "timezone", "timezone_region", "experience_level", "experience_group"]:
            if c in demo_df.columns:
                merge_cols.append(c)

        merged = df.merge(demo_df[merge_cols], on="customer_email", how="left")

        for col in ["age_group", "timezone_region", "experience_group"]:
            if col in merged.columns:
                merged[col] = merged[col].fillna("Unknown")

        return merged

    # ─────────────────────────────────────────────────────────────
    # Filter helpers
    # ─────────────────────────────────────────────────────────────

    def get_filter_options(self, df):
        """Return unique values for all in-page filters from the dashboard df."""
        topics_flat = df["topics_list"].explode().dropna().unique().tolist() if "topics_list" in df.columns else []
        return {
            "sentiments": sorted(df["sentiment_polarity"].dropna().unique().tolist()),
            "topics": sorted(t for t in topics_flat if t),
            "statuses": sorted(df["status"].dropna().unique().tolist()),
            "states": sorted(df["state"].dropna().unique().tolist()) if "state" in df.columns else [],
            "sources": sorted(df["source_type"].dropna().unique().tolist()),
        }

    # ─────────────────────────────────────────────────────────────
    # Demographics calculation helpers (mirrors SentimentDataLoader)
    # ─────────────────────────────────────────────────────────────

    @staticmethod
    def _calculate_age(birthday):
        if pd.isna(birthday):
            return None
        try:
            age = relativedelta(datetime.now(), birthday).years
            return age if 0 <= age <= 120 else None
        except Exception:
            return None

    def _categorize_age(self, age):
        if pd.isna(age) or age is None:
            return "Unknown"
        for group_name, (min_age, max_age) in self.demographics_config.get("age_groups", {}).items():
            if min_age <= age <= max_age:
                return group_name
        return "Unknown"

    @staticmethod
    def _extract_timezone_region(timezone):
        if pd.isna(timezone) or not isinstance(timezone, str):
            return "Unknown"
        parts = timezone.split("/")
        return parts[0] if parts else "Unknown"

    def _categorize_experience(self, experience_level):
        if pd.isna(experience_level):
            return "Unknown"
        try:
            exp_level = float(experience_level)
        except Exception:
            return "Unknown"
        for group_name, (min_exp, max_exp) in self.demographics_config.get("experience_groups", {}).items():
            if min_exp <= exp_level <= max_exp:
                return group_name
        return "Unknown"

    # ─────────────────────────────────────────────────────────────
    # Internal helpers
    # ─────────────────────────────────────────────────────────────

    @staticmethod
    def _sanitize(value: str) -> str:
        return re.sub(r"['\";\\]", "", str(value))

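The merge path above left-joins demographics onto conversations and backfills the grouped columns with "Unknown". A minimal sketch of that behavior on made-up data (illustrative only, not part of the commit):

# Illustrative sketch - not part of the commit. Shows the left-join +
# "Unknown" backfill that merge_demographics performs; all data is made up.
import pandas as pd

convs = pd.DataFrame({
    "conversation_id": [1, 2],
    "customer_email": ["a@example.com", "b@example.com"],
})
demo = pd.DataFrame({
    "customer_email": ["a@example.com"],
    "age": [34],
    "age_group": ["25-34"],
})

merged = convs.merge(demo[["customer_email", "age", "age_group"]],
                     on="customer_email", how="left")
merged["age_group"] = merged["age_group"].fillna("Unknown")
print(merged)
# conversation 2 has no demographic match: age stays NaN, age_group -> "Unknown"
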
visualization/utils/auth.py
CHANGED

@@ -24,8 +24,6 @@ AUTHORIZED_EMAILS = {
     "gabriel@musora.com",
     "jmilligan@musora.com",
     "dave@musora.com",
-    "amy@musora.com",
-    "karissa@musora.com"
 }

visualization/utils/data_processor.py
CHANGED

@@ -113,6 +113,52 @@ class SentimentDataProcessor:
 
         return intent_counts
 
+    @staticmethod
+    def get_emotion_distribution(df, group_by=None):
+        """
+        Calculate emotion distribution (handles multi-label).
+
+        Args:
+            df: Sentiment dataframe with 'emotions' column
+            group_by: Optional column(s) to group by
+
+        Returns:
+            pd.DataFrame: Emotion distribution with columns [emotion, count, percentage]
+        """
+        if 'emotions' not in df.columns:
+            return pd.DataFrame()
+
+        df_exploded = df.dropna(subset=['emotions']).copy()
+        df_exploded['emotions'] = df_exploded['emotions'].str.split(',')
+        df_exploded = df_exploded.explode('emotions')
+        df_exploded['emotions'] = df_exploded['emotions'].str.strip()
+        df_exploded = df_exploded[df_exploded['emotions'] != '']
+
+        if df_exploded.empty:
+            return pd.DataFrame()
+
+        if group_by:
+            if isinstance(group_by, str):
+                group_by = [group_by]
+
+            emotion_counts = df_exploded.groupby(
+                group_by + ['emotions'],
+                as_index=False
+            ).size().rename(columns={'size': 'count'})
+
+            emotion_counts['percentage'] = emotion_counts.groupby(group_by)['count'].transform(
+                lambda x: (x / x.sum() * 100).round(2)
+            )
+
+        else:
+            emotion_counts = df_exploded['emotions'].value_counts().reset_index()
+            emotion_counts.columns = ['emotions', 'count']
+            emotion_counts['percentage'] = (
+                emotion_counts['count'] / emotion_counts['count'].sum() * 100
+            ).round(2)
+
+        return emotion_counts
+
     @staticmethod
     def get_content_summary(df):
         """

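The new get_emotion_distribution handles multi-label values by splitting on commas before counting. A minimal sketch of the same explode-and-count steps on made-up data (illustrative only, not part of the commit):

# Illustrative sketch - not part of the commit. Made-up multi-label data.
import pandas as pd

df = pd.DataFrame({"emotions": ["joy, gratitude", "frustration", None]})

exploded = df.dropna(subset=["emotions"]).copy()
exploded["emotions"] = exploded["emotions"].str.split(",")
exploded = exploded.explode("emotions")
exploded["emotions"] = exploded["emotions"].str.strip()

counts = exploded["emotions"].value_counts().reset_index()
counts.columns = ["emotions", "count"]
counts["percentage"] = (counts["count"] / counts["count"].sum() * 100).round(2)
print(counts)  # joy, gratitude, frustration each at 33.33%
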
visualization/utils/helpscout_pdf.py
ADDED

@@ -0,0 +1,471 @@
"""
HelpScout PDF Exporters.

Two classes sharing the MusoraPDF base from pdf_exporter.py:
- HelpScoutDashboardPDF : full HelpScout dashboard report
- HelpScoutAnalysisPDF  : filtered analysis report + optional LLM summary
"""
import logging
import os
import sys
import tempfile
from datetime import datetime
from pathlib import Path

import plotly.io as pio

_parent = Path(__file__).resolve().parent.parent
if str(_parent) not in sys.path:
    sys.path.insert(0, str(_parent))

from utils.pdf_exporter import MusoraPDF  # reuse base class
from utils.helpscout_utils import boolean_flag_counts, topic_label, load_topic_taxonomy
from visualizations.helpscout_charts import HelpScoutCharts

logger = logging.getLogger(__name__)

_RENDER_SCALE = 3


# ---------------------------------------------------------------------------
# Shared rendering helpers (mixin-style functions)
# ---------------------------------------------------------------------------

def _prepare_fig(fig, is_side_by_side=False):
    base_fs = 13 if is_side_by_side else 14
    fig.update_layout(
        paper_bgcolor="white", plot_bgcolor="white",
        font=dict(color="black", size=base_fs),
        title_font_size=base_fs + 4,
        margin=(dict(l=60, r=40, t=60, b=60) if is_side_by_side else dict(l=80, r=40, t=60, b=80)),
    )
    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)


def _fig_to_tmp(fig, width=800, height=400, is_side_by_side=False) -> str:
    _prepare_fig(fig, is_side_by_side)
    img = pio.to_image(fig, format="png", width=width, height=height,
                       scale=_RENDER_SCALE, engine="kaleido")
    tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    tmp.write(img)
    tmp.close()
    return tmp.name


def _cleanup(paths):
    for p in paths:
        try:
            os.unlink(p)
        except OSError:
            pass


# ---------------------------------------------------------------------------
# HelpScoutDashboardPDF
# ---------------------------------------------------------------------------

class HelpScoutDashboardPDF:
    """
    Generates a comprehensive HelpScout dashboard PDF report.
    """

    def __init__(self):
        self.charts = HelpScoutCharts()
        self.taxonomy = load_topic_taxonomy()
        self._tmp: list = []

    def generate_report(self, df, filter_info: dict = None) -> bytes:
        """Build and return the full dashboard PDF."""
        self.pdf = MusoraPDF()
        self._tmp = []
        try:
            self._cover(df, filter_info)
            self._executive_summary(df)
            self._sentiment_section(df)
            self._topic_section(df)
            self._emotion_section(df)
            self._flags_section(df)
            self._status_source_section(df)
            self._timelines_section(df)
            self._depth_section(df)
            self._data_summary(df, filter_info)
            return bytes(self.pdf.output())
        finally:
            _cleanup(self._tmp)

    # ── Rendering helpers ──

    def _add_chart(self, fig, width=180, img_w=800, img_h=400):
        try:
            p = _fig_to_tmp(fig, img_w, img_h)
            self._tmp.append(p)
            h_mm = width * (img_h / img_w)
            self.pdf.check_page_break(h_mm + 5)
            self.pdf.image(p, x=10, w=width)
            self.pdf.ln(3)
        except Exception:
            logger.exception("Chart render failed")
            self.pdf.body_text("[Chart could not be rendered]")

    def _add_two_charts(self, fig1, fig2, width=92):
        try:
            p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True); self._tmp.append(p1)
            p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True); self._tmp.append(p2)
            h_mm = width * (450 / 700)
            self.pdf.check_page_break(h_mm + 5)
            y = self.pdf.get_y()
            self.pdf.image(p1, x=10, y=y, w=width)
            self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
            self.pdf.set_y(y + h_mm + 3)
        except Exception:
            logger.exception("Side-by-side render failed")
            self.pdf.body_text("[Charts could not be rendered]")

    # ── Sections ──

    def _cover(self, df, filter_info):
        self.pdf.add_page()
        self.pdf.ln(40)
        r, g, b = MusoraPDF.PRIMARY
        self.pdf.set_fill_color(r, g, b)
        self.pdf.rect(0, 60, 210, 4, style="F")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "B", 28)
        self.pdf.set_text_color(r, g, b)
        self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.set_font("Helvetica", "", 16)
        self.pdf.set_text_color(80, 80, 80)
        self.pdf.cell(0, 10, "HelpScout Support Dashboard Report",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(10)
        self.pdf.set_font("Helvetica", "", 12)
        self.pdf.set_text_color(100, 100, 100)
        self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(5)
        self.pdf.set_font("Helvetica", "", 10)
        self.pdf.cell(0, 7, f"Total Conversations: {len(df):,}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        if "first_message_at" in df.columns and not df.empty:
            valid = df["first_message_at"].dropna()
            if not valid.empty:
                dr = f"{valid.min().strftime('%b %d, %Y')} to {valid.max().strftime('%b %d, %Y')}"
                self.pdf.ln(3)
                self.pdf.set_font("Helvetica", "I", 9)
                self.pdf.set_text_color(120, 120, 120)
                self.pdf.cell(0, 6, MusoraPDF._sanitize(f"Data period: {dr}"),
                              align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "I", 8)
        self.pdf.set_text_color(150, 150, 150)
        self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
                      align="C", new_x="LMARGIN", new_y="NEXT")

    def _executive_summary(self, df):
        self.pdf.add_page()
        self.pdf.section_header("Executive Summary")
        total = len(df)
        flags = boolean_flag_counts(df)
        neg = df["sentiment_polarity"].isin(["negative", "very_negative"]).sum()
        pos = df["sentiment_polarity"].isin(["positive", "very_positive"]).sum()
        neg_pct = neg / total * 100 if total else 0
        pos_pct = pos / total * 100 if total else 0
        esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0
        avg_dur = float(df["duration_hours"].mean()) if "duration_hours" in df.columns else 0

        self.pdf.metric_row([
            ("Total Conversations", f"{total:,}"),
            ("Positive %", f"{pos_pct:.1f}%"),
            ("Negative %", f"{neg_pct:.1f}%"),
            ("Avg Duration (h)", f"{avg_dur:.1f}"),
        ])
        self.pdf.metric_row([
            ("Escalations", f"{esc:,}"),
            ("Refund Requests", f"{flags['is_refund_request']:,}"),
            ("Cancellations", f"{flags['is_cancellation']:,}"),
            ("Membership Joins", f"{flags['is_membership']:,}"),
        ])

    def _sentiment_section(self, df):
        self.pdf.add_page()
        self.pdf.section_header("Sentiment Distribution")
        pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
        gauge = self.charts.create_sentiment_score_gauge(self._avg_score(df))
        self._add_two_charts(pie, gauge)

    def _topic_section(self, df):
        self.pdf.add_page()
        self.pdf.section_header("Topic Analysis")
        bar = self.charts.create_topic_bar_chart(df, title="Conversations by Topic")
        pie = self.charts.create_topic_pie_chart(df, title="Topic Share")
        self._add_two_charts(bar, pie)
        self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)

    def _emotion_section(self, df):
        if "emotions" not in df.columns or df["emotions"].dropna().empty:
            return
        self.pdf.add_page()
        self.pdf.section_header("Emotion Analysis")
        self._add_chart(self.charts.create_emotion_bar_chart(df, title="Emotion Distribution"))

    def _flags_section(self, df):
        self.pdf.add_page()
        self.pdf.section_header("Billing & Membership Flags")
        flags_chart = self.charts.create_boolean_flags_chart(df)
        esc_chart = self.charts.create_escalation_breakdown(df)
        self._add_two_charts(flags_chart, esc_chart)

    def _status_source_section(self, df):
        self.pdf.add_page()
        self.pdf.section_header("Status & Source Distribution")
        status_chart = self.charts.create_status_distribution(df)
        source_chart = self.charts.create_source_distribution(df)
        self._add_two_charts(status_chart, source_chart)

    def _timelines_section(self, df):
        self.pdf.add_page()
        self.pdf.section_header("Volume & Trends (Weekly)")
        self._add_chart(self.charts.create_volume_timeline(df, freq="W"))
        self._add_chart(self.charts.create_sentiment_timeline(df, freq="W"))
        self._add_chart(self.charts.create_refund_cancel_timeline(df, freq="W"))

    def _depth_section(self, df):
        self.pdf.add_page()
        self.pdf.section_header("Conversation Depth")
        dur = self.charts.create_duration_histogram(df)
        thd = self.charts.create_thread_count_histogram(df)
        self._add_two_charts(dur, thd)

    def _data_summary(self, df, filter_info):
        self.pdf.add_page()
        self.pdf.section_header("Data Summary")
        self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.pdf.body_text(f"Total conversations: {len(df):,}")
        self.pdf.callout_box(
            "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
            "This report is confidential and intended for internal Musora team use only.",
            bg_color=(245, 245, 245),
        )

    @staticmethod
    def _avg_score(df) -> float:
        score_map = {"very_positive": 2, "positive": 1, "neutral": 0,
                     "negative": -1, "very_negative": -2}
        if "sentiment_polarity" not in df.columns or df.empty:
            return 0.0
        return float(df["sentiment_polarity"].map(score_map).fillna(0).mean())


# ---------------------------------------------------------------------------
# HelpScoutAnalysisPDF
# ---------------------------------------------------------------------------

class HelpScoutAnalysisPDF:
    """
    Generates a focused analysis PDF from the HelpScout Analysis page.
    Includes filter summary, distributions, and optionally the LLM summary report.
    """

    def __init__(self):
        self.charts = HelpScoutCharts()
        self.taxonomy = load_topic_taxonomy()
        self._tmp: list = []

    def generate_report(self, df, filter_info: dict = None,
                        summary_result: dict = None) -> bytes:
        """
        Build and return the analysis PDF.

        Args:
            df: Filtered HelpScout analysis DataFrame.
            filter_info: Dict of filter descriptions for the cover.
            summary_result: Output from HelpScoutSummaryAgent.process() or None.
        """
        self.pdf = MusoraPDF()
        self._tmp = []
        try:
            self._cover(df, filter_info)
            self._filter_summary_section(filter_info, df)
            self._kpi_section(df)
            self._distributions_section(df)
            self._summary_section(summary_result)
            self._data_summary(df, filter_info)
            return bytes(self.pdf.output())
        finally:
            _cleanup(self._tmp)

    # ── Rendering helpers ──

    def _add_chart(self, fig, width=180, img_w=800, img_h=400):
        try:
            p = _fig_to_tmp(fig, img_w, img_h)
            self._tmp.append(p)
            h_mm = width * (img_h / img_w)
            self.pdf.check_page_break(h_mm + 5)
            self.pdf.image(p, x=10, w=width)
            self.pdf.ln(3)
        except Exception:
            logger.exception("Chart render failed")
            self.pdf.body_text("[Chart could not be rendered]")

    def _add_two_charts(self, fig1, fig2, width=92):
        try:
            p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True); self._tmp.append(p1)
            p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True); self._tmp.append(p2)
            h_mm = width * (450 / 700)
            self.pdf.check_page_break(h_mm + 5)
            y = self.pdf.get_y()
            self.pdf.image(p1, x=10, y=y, w=width)
            self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
            self.pdf.set_y(y + h_mm + 3)
        except Exception:
            logger.exception("Side-by-side render failed")
            self.pdf.body_text("[Charts could not be rendered]")

    # ── Sections ──

    def _cover(self, df, filter_info):
        self.pdf.add_page()
        self.pdf.ln(40)
        r, g, b = MusoraPDF.PRIMARY
        self.pdf.set_fill_color(r, g, b)
        self.pdf.rect(0, 60, 210, 4, style="F")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "B", 28)
        self.pdf.set_text_color(r, g, b)
        self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.set_font("Helvetica", "", 16)
        self.pdf.set_text_color(80, 80, 80)
        self.pdf.cell(0, 10, "HelpScout Analysis Report",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(10)
        self.pdf.set_font("Helvetica", "", 12)
        self.pdf.set_text_color(100, 100, 100)
        self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(5)
        self.pdf.set_font("Helvetica", "", 10)
        self.pdf.cell(0, 7, f"Matched Conversations: {len(df):,}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        if filter_info:
            self.pdf.ln(8)
            self.pdf.set_font("Helvetica", "B", 9)
            self.pdf.set_text_color(80, 80, 80)
            self.pdf.cell(0, 6, "Applied Filters:", align="C", new_x="LMARGIN", new_y="NEXT")
            self.pdf.set_font("Helvetica", "", 9)
            for k, v in filter_info.items():
                if v:
                    self.pdf.cell(0, 5, MusoraPDF._sanitize(f"{k}: {v}"),
                                  align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "I", 8)
        self.pdf.set_text_color(150, 150, 150)
        self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
                      align="C", new_x="LMARGIN", new_y="NEXT")

    def _filter_summary_section(self, filter_info, df):
        self.pdf.add_page()
        self.pdf.section_header("Filter Set Summary")
        if filter_info:
            rows = [(k, MusoraPDF._sanitize(str(v))) for k, v in filter_info.items() if v]
            if rows:
                self.pdf.add_table(["Filter", "Value"], rows, col_widths=[80, 110])
        else:
            self.pdf.body_text("No filters applied - report covers all available conversations.")

    def _kpi_section(self, df):
        total = len(df)
        flags = boolean_flag_counts(df)
        neg_pct = df["sentiment_polarity"].isin(["negative", "very_negative"]).sum() / total * 100 if total else 0
        pos_pct = df["sentiment_polarity"].isin(["positive", "very_positive"]).sum() / total * 100 if total else 0
        avg_dur = float(df["duration_hours"].mean()) if "duration_hours" in df.columns else 0
        esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0

        self.pdf.section_header("Key Metrics")
        self.pdf.metric_row([
            ("Conversations", f"{total:,}"),
            ("Positive %", f"{pos_pct:.1f}%"),
            ("Negative %", f"{neg_pct:.1f}%"),
            ("Avg Duration (h)", f"{avg_dur:.1f}"),
        ])
        self.pdf.metric_row([
            ("Escalations", f"{esc:,}"),
            ("Refund Requests", f"{flags['is_refund_request']:,}"),
            ("Cancellations", f"{flags['is_cancellation']:,}"),
            ("Membership Joins", f"{flags['is_membership']:,}"),
        ])

    def _distributions_section(self, df):
        self.pdf.add_page()
        self.pdf.section_header("Distributions")
        pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
        tbar = self.charts.create_topic_bar_chart(df, title="Topic Distribution")
        self._add_two_charts(pie, tbar)
        self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)

    def _summary_section(self, result: dict):
        self.pdf.add_page()
        self.pdf.section_header("AI Summary Report")

        if result is None or not result.get("success"):
            self.pdf.callout_box(
                "AI summary not generated. To include it, click 'Generate Summary Report' "
                "in the app before exporting the PDF.",
                bg_color=(255, 250, 230),
            )
            return

        summary = result.get("summary", {})
        meta = result.get("metadata", {})

        exec_summary = MusoraPDF._sanitize(summary.get("executive_summary", ""))
        if exec_summary:
            self.pdf.subsection_header("Executive Summary")
            self.pdf.section_description(exec_summary)

        themes = summary.get("top_themes", [])
        if themes:
            self.pdf.subsection_header("Top Themes")
            for t in themes:
                theme_text = MusoraPDF._sanitize(
                    f"{t.get('theme', '')} - {t.get('prevalence', '')}: {t.get('description', '')}"
                )
                self.pdf.body_text(f"  * {theme_text}")

        complaints = summary.get("top_complaints", [])
        if complaints:
            self.pdf.subsection_header("Top Complaints")
            for c in complaints:
                self.pdf.body_text(f"  * {MusoraPDF._sanitize(c)}")

        insights = summary.get("unexpected_insights", [])
        if insights:
            self.pdf.subsection_header("Unexpected Insights")
            for ins in insights:
                self.pdf.body_text(f"  * {MusoraPDF._sanitize(ins)}")

        quotes = summary.get("notable_quotes", [])
        if quotes:
            self.pdf.subsection_header("Notable Quotes")
            for q in quotes:
                self.pdf.body_text(f'  "{MusoraPDF._sanitize(q)}"')

        self.pdf.ln(4)
        self.pdf.callout_box(
            f"Analysis based on {meta.get('total_conversations_analyzed', 0)} conversations "
            f"| Model: {meta.get('model_used', 'N/A')} "
            f"| Tokens: {meta.get('tokens_used', 0):,}",
            bg_color=(240, 248, 255),
        )

    def _data_summary(self, df, filter_info):
        self.pdf.add_page()
        self.pdf.section_header("Data Summary")
        self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.pdf.body_text(f"Total conversations in report: {len(df):,}")
        self.pdf.callout_box(
            "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
            "This report is confidential and intended for internal Musora team use only.",
            bg_color=(245, 245, 245),
        )

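How these exporters are meant to be driven from the app, as a hedged sketch: `filtered_df` and `summary_result` below are hypothetical stand-ins for whatever the analysis page produces; only the exporter API itself comes from this file.

# Illustrative sketch - not part of the commit. `filtered_df` and
# `summary_result` are hypothetical stand-ins; the exporter and its
# generate_report signature are taken from the file above.
import streamlit as st
from utils.helpscout_pdf import HelpScoutAnalysisPDF

exporter = HelpScoutAnalysisPDF()
pdf_bytes = exporter.generate_report(
    filtered_df,
    filter_info={"Sentiments": "negative, very_negative"},
    summary_result=summary_result,  # None also works; the PDF shows a hint box
)
st.download_button("Download PDF", data=pdf_bytes,
                   file_name="helpscout_analysis.pdf", mime="application/pdf")
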
visualization/utils/helpscout_utils.py
ADDED

@@ -0,0 +1,107 @@
"""
HelpScout utility helpers - pure functions, no Streamlit dependency.
"""
import json
from pathlib import Path

import pandas as pd


# ---------------------------------------------------------------------------
# Topic taxonomy helpers
# ---------------------------------------------------------------------------

def load_topic_taxonomy(path: str = None) -> dict:
    """
    Load topics.json and return {id: {'label': str, 'description': str}}.
    Default path resolves to process_helpscout/config_files/topics.json
    relative to the project root.
    """
    if path is None:
        root = Path(__file__).resolve().parent.parent.parent
        path = root / "process_helpscout" / "config_files" / "topics.json"
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    return {t["id"]: {"label": t["label"], "description": t.get("description", "")}
            for t in raw.get("topics", [])}


def topic_label(topic_id: str, taxonomy: dict) -> str:
    """Return human-readable label for a topic id. Falls back to title-cased id."""
    if topic_id in taxonomy:
        return taxonomy[topic_id]["label"]
    return topic_id.replace("_", " ").title()


def parse_topics(value) -> list:
    """Split a comma-separated TOPICS string into a list of stripped lowercase ids."""
    if pd.isna(value) or not isinstance(value, str) or not value.strip():
        return []
    return [t.strip().lower() for t in value.split(",") if t.strip()]


def explode_topics(df: pd.DataFrame, topics_col: str = "topics") -> pd.DataFrame:
    """
    Return a new dataframe with one row per (conversation_id, topic_id).
    Requires df to have a 'conversation_id' column and a topics_col column.
    """
    df = df.copy()
    df["_topic_list"] = df[topics_col].apply(parse_topics)
    exploded = df.explode("_topic_list").rename(columns={"_topic_list": "topic_id"})
    exploded = exploded[exploded["topic_id"].notna() & (exploded["topic_id"] != "")]
    return exploded.drop(columns=[topics_col], errors="ignore").reset_index(drop=True)


# ---------------------------------------------------------------------------
# Boolean flag helpers
# ---------------------------------------------------------------------------

def boolean_flag_counts(df: pd.DataFrame) -> dict:
    """Return counts for refund / cancellation / membership flags."""
    return {
        "is_refund_request": int(df["is_refund_request"].sum()) if "is_refund_request" in df.columns else 0,
        "is_cancellation": int(df["is_cancellation"].sum()) if "is_cancellation" in df.columns else 0,
        "is_membership": int(df["is_membership"].sum()) if "is_membership" in df.columns else 0,
    }


def compute_escalation_flag(df: pd.DataFrame, escalation_sentiments: list) -> pd.Series:
    """
    Boolean Series: True when conversation is negative-sentiment
    OR is a refund request OR is a cancellation.
    """
    is_neg = df["sentiment_polarity"].isin(escalation_sentiments)
    is_refund = df.get("is_refund_request", pd.Series(False, index=df.index)).fillna(False).astype(bool)
    is_cancel = df.get("is_cancellation", pd.Series(False, index=df.index)).fillna(False).astype(bool)
    return is_neg | is_refund | is_cancel


# ---------------------------------------------------------------------------
# Filter description builder
# ---------------------------------------------------------------------------

def build_filter_description(filters: dict, taxonomy: dict) -> str:
    """
    Convert the filter dict from the analysis page into a human-readable string
    suitable for the agent prompt and PDF cover.
    """
    parts = []
    if filters.get("date_range"):
        s, e = filters["date_range"]
        parts.append(f"Date: {s} to {e}")
    if filters.get("sentiments"):
        parts.append(f"Sentiments: {', '.join(filters['sentiments'])}")
    if filters.get("topics"):
        labels = [topic_label(t, taxonomy) for t in filters["topics"]]
        parts.append(f"Topics: {', '.join(labels)}")
    if filters.get("statuses"):
        parts.append(f"Status: {', '.join(filters['statuses'])}")
    if filters.get("sources"):
        parts.append(f"Source: {', '.join(filters['sources'])}")
    if filters.get("refund_only"):
        parts.append("Refund requests only")
    if filters.get("cancel_only"):
        parts.append("Cancellations only")
    if filters.get("membership_only"):
        parts.append("Membership requests only")
    return "; ".join(parts) if parts else "No filters applied - showing all conversations"

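A quick sketch of the topic helpers on made-up data (the topic ids are invented, and the import assumes visualization/ is on sys.path, as the consuming modules above arrange):

# Illustrative sketch - not part of the commit. Invented topic ids.
import pandas as pd
from utils.helpscout_utils import explode_topics, parse_topics

df = pd.DataFrame({
    "conversation_id": [101, 102],
    "topics": ["billing, refund", "technical_issue"],
})

print(parse_topics(" Billing, Refund "))            # ['billing', 'refund']
print(explode_topics(df)[["conversation_id", "topic_id"]])
# one row per (conversation, topic): 101/billing, 101/refund, 102/technical_issue
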
visualization/utils/pdf_exporter.py
CHANGED

@@ -79,6 +79,13 @@ _DESCRIPTIONS = {
         "Note: These charts reflect only users who have filled in their profile information - "
         "they do not represent all community members."
     ),
+    "emotion": (
+        "Beyond sentiment polarity, the AI identifies the underlying emotion in each comment: "
+        "joy, excitement, gratitude, admiration, curiosity, humor, frustration, "
+        "disappointment, sadness, anger, or neutral. "
+        "Comments can have multiple emotions (multi-label). "
+        "Emotions with no data are omitted from the charts."
+    ),
     "language": (
         "Language distribution shows what languages comments are written in. "
         "Non-English comments are automatically translated for analysis."

@@ -342,6 +349,7 @@ class DashboardPDFExporter:
             self._add_brand_section(df)
             self._add_platform_section(df)
             self._add_intent_section(df)
+            self._add_emotion_section(df)
             self._add_cross_dimensional_section(df)
             self._add_volume_section(df)
             self._add_reply_requirements_section(df)

@@ -350,6 +358,7 @@ class DashboardPDFExporter:
             if "detected_language" in df.columns:
                 self._add_language_section(df)
             self._add_data_summary(df, filter_info)
+            self._add_helpscout_summary_section()
 
             return bytes(self.pdf.output())
         finally:

@@ -782,6 +791,39 @@ class DashboardPDFExporter:
         )
         self._add_two_charts(intent_bar, intent_pie)
 
+    def _add_emotion_section(self, df) -> None:
+        if "emotions" not in df.columns or df["emotions"].dropna().empty:
+            return
+
+        self.pdf.add_page()
+        self.pdf.section_header("Emotion Analysis")
+        self.pdf.section_description(_DESCRIPTIONS["emotion"])
+
+        emotion_bar = self.distribution_charts.create_emotion_bar_chart(
+            df, title="Emotion Distribution", orientation="h"
+        )
+        emotion_pie = self.distribution_charts.create_emotion_pie_chart(
+            df, title="Emotion Distribution"
+        )
+        self._add_two_charts(emotion_bar, emotion_pie)
+
+        # Top emotions summary (up to 8 rows)
+        emotion_dist = self.processor.get_emotion_distribution(df)
+        if not emotion_dist.empty:
+            self.pdf.subsection_header("Top Emotions")
+            rows = []
+            for _, row in emotion_dist.sort_values('count', ascending=False).head(8).iterrows():
+                rows.append((
+                    str(row['emotions']).title(),
+                    f"{int(row['count']):,}",
+                    f"{row['percentage']:.1f}%",
+                ))
+            self.pdf.add_table(
+                headers=["Emotion", "Count", "Percentage"],
+                rows=rows,
+                col_widths=[80, 55, 55],
+            )
+
     def _add_cross_dimensional_section(self, df) -> None:
         if "brand" not in df.columns or "platform" not in df.columns:
             return

@@ -913,6 +955,44 @@ class DashboardPDFExporter:
             self.distribution_charts.create_language_distribution(df, top_n=10, title="Top 10 Languages")
         )
 
+    def _add_helpscout_summary_section(self) -> None:
+        """Short HelpScout overview appended to the combined dashboard PDF."""
+        try:
+            import streamlit as st
+            hs_df = st.session_state.get("helpscout_df")
+            if hs_df is None or hs_df.empty:
+                return
+
+            from utils.helpscout_utils import boolean_flag_counts
+            from visualizations.helpscout_charts import HelpScoutCharts
+
+            self.pdf.add_page()
+            self.pdf.section_header("HelpScout Support Overview")
+            self.pdf.section_description(
+                "Summary of customer support conversations processed through the "
+                "HelpScout sentiment pipeline."
+            )
+
+            total = len(hs_df)
+            flags = boolean_flag_counts(hs_df)
+            neg_pct = hs_df["sentiment_polarity"].isin(["negative", "very_negative"]).sum() / total * 100 if total else 0
+            esc = int(hs_df["is_escalation"].sum()) if "is_escalation" in hs_df.columns else 0
+
+            self.pdf.metric_row([
+                ("Conversations", f"{total:,}"),
+                ("Negative %", f"{neg_pct:.1f}%"),
+                ("Escalations", f"{esc:,}"),
+                ("Refund Requests", f"{flags['is_refund_request']:,}"),
+            ])
+
+            hs_charts = HelpScoutCharts()
+            pie = hs_charts.create_sentiment_pie_chart(hs_df, title="HelpScout Sentiment Distribution")
+            tbar = hs_charts.create_topic_bar_chart(hs_df, title="Top Topics", top_n=5)
+            self._add_two_charts(pie, tbar)
+
+        except Exception:
+            logger.exception("HelpScout summary section failed - skipping")
+
     def _add_data_summary(self, df, filter_info: dict) -> None:
         self.pdf.add_page()
         self.pdf.section_header("Data Summary")

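One wiring detail worth noting: `_add_helpscout_summary_section` reads the HelpScout frame from `st.session_state["helpscout_df"]`, so the combined export silently omits that section unless some page has stashed the key first. A sketch of the expected handshake; `load_helpscout_conversations()` is a hypothetical stand-in for the real loader:

# Illustrative sketch - not part of the commit. load_helpscout_conversations()
# is a hypothetical stand-in for however the HelpScout page loads its frame.
import streamlit as st

if "helpscout_df" not in st.session_state:
    st.session_state["helpscout_df"] = load_helpscout_conversations()

# DashboardPDFExporter.generate_report() later calls
# _add_helpscout_summary_section(), which reads this same key and silently
# skips the section when the key is missing or the frame is empty.
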
visualization/visualizations/distribution_charts.py
CHANGED

@@ -29,9 +29,11 @@ class DistributionCharts:
             self.config = json.load(f)
 
         self.intent_colors = self.config['color_schemes']['intent']
+        self.emotion_colors = self.config['color_schemes'].get('emotion', {})
         self.platform_colors = self.config['color_schemes']['platform']
         self.brand_colors = self.config['color_schemes']['brand']
         self.intent_order = self.config['intent_order']
+        self.emotion_order = self.config.get('emotion_order', [])
         self.chart_height = self.config['dashboard']['chart_height']
 
     def create_intent_bar_chart(self, df, title="Intent Distribution", orientation='h'):

@@ -141,6 +143,135 @@ class DistributionCharts:
 
         return fig
 
+    def create_emotion_bar_chart(self, df, title="Emotion Distribution", orientation='h'):
+        """
+        Create bar chart for emotion distribution (handles multi-label).
+
+        Args:
+            df: Sentiment dataframe with 'emotions' column
+            title: Chart title
+            orientation: 'h' for horizontal, 'v' for vertical
+
+        Returns:
+            plotly.graph_objects.Figure
+        """
+        if 'emotions' not in df.columns:
+            return go.Figure().add_annotation(
+                text="No emotion data available",
+                xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False
+            )
+
+        df_exploded = df.dropna(subset=['emotions']).copy()
+        df_exploded['emotions'] = df_exploded['emotions'].str.split(',')
+        df_exploded = df_exploded.explode('emotions')
+        df_exploded['emotions'] = df_exploded['emotions'].str.strip()
+        df_exploded = df_exploded[df_exploded['emotions'] != '']
+
+        if df_exploded.empty:
+            return go.Figure().add_annotation(
+                text="No emotion data available",
+                xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False
+            )
+
+        emotion_counts = df_exploded['emotions'].value_counts()
+        ordered_emotions = [e for e in self.emotion_order if e in emotion_counts.index]
+        # include any emotions not in the order list
+        remaining = [e for e in emotion_counts.index if e not in ordered_emotions]
+        ordered_emotions = ordered_emotions + remaining
+        emotion_counts = emotion_counts[ordered_emotions]
+
+        colors = [self.emotion_colors.get(e, '#CCCCCC') for e in emotion_counts.index]
+
+        if orientation == 'h':
+            fig = go.Figure(data=[go.Bar(
+                y=emotion_counts.index,
+                x=emotion_counts.values,
+                orientation='h',
+                marker=dict(color=colors),
+                text=emotion_counts.values,
+                textposition='auto',
+                hovertemplate='<b>%{y}</b><br>Count: %{x}<extra></extra>'
+            )])
+            fig.update_layout(
+                title=title,
+                xaxis_title="Number of Comments",
+                yaxis_title="Emotion",
+                height=self.chart_height,
+                yaxis={'categoryorder': 'total ascending'}
+            )
+        else:
+            fig = go.Figure(data=[go.Bar(
+                x=emotion_counts.index,
+                y=emotion_counts.values,
+                marker=dict(color=colors),
+                text=emotion_counts.values,
+                textposition='auto',
+                hovertemplate='<b>%{x}</b><br>Count: %{y}<extra></extra>'
+            )])
+            fig.update_layout(
+                title=title,
+                xaxis_title="Emotion",
+                yaxis_title="Number of Comments",
+                height=self.chart_height
+            )
+
+        return fig
+
+    def create_emotion_pie_chart(self, df, title="Emotion Distribution"):
+        """
+        Create pie chart for emotion distribution.
+
+        Args:
+            df: Sentiment dataframe with 'emotions' column
+            title: Chart title
+
+        Returns:
+            plotly.graph_objects.Figure
+        """
+        if 'emotions' not in df.columns:
+            return go.Figure().add_annotation(
+                text="No emotion data available",
+                xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False
+            )
+
+        df_exploded = df.dropna(subset=['emotions']).copy()
+        df_exploded['emotions'] = df_exploded['emotions'].str.split(',')
+        df_exploded = df_exploded.explode('emotions')
+        df_exploded['emotions'] = df_exploded['emotions'].str.strip()
+        df_exploded = df_exploded[df_exploded['emotions'] != '']
+
+        if df_exploded.empty:
+            return go.Figure().add_annotation(
+                text="No emotion data available",
+                xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False
+            )
+
+        emotion_counts = df_exploded['emotions'].value_counts()
+        ordered_emotions = [e for e in self.emotion_order if e in emotion_counts.index]
+        remaining = [e for e in emotion_counts.index if e not in ordered_emotions]
+        ordered_emotions = ordered_emotions + remaining
+        emotion_counts = emotion_counts[ordered_emotions]
+
+        colors = [self.emotion_colors.get(e, '#CCCCCC') for e in emotion_counts.index]
+
+        fig = go.Figure(data=[go.Pie(
+            labels=emotion_counts.index,
+            values=emotion_counts.values,
+            marker=dict(colors=colors),
+            textinfo='label+percent',
+            textposition='auto',
+            hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>'
+        )])
+
+        fig.update_layout(
+            title=title,
+            height=self.chart_height,
+            showlegend=True,
+            legend=dict(orientation="v", yanchor="middle", y=0.5, xanchor="left", x=1.05)
+        )
+
+        return fig
+
     def create_platform_distribution(self, df, title="Comments by Platform"):
         """
         Create bar chart for platform distribution

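Both emotion chart methods use the same ordering idiom: configured order first, then any labels the config does not know about. A minimal sketch on made-up counts (illustrative only, not part of the commit):

# Illustrative sketch - not part of the commit. Made-up counts showing the
# ordering idiom: configured order first, then unlisted labels appended.
import pandas as pd

emotion_order = ["joy", "frustration", "anger"]   # would come from viz_config.json
counts = pd.Series({"curiosity": 4, "joy": 10, "anger": 2})

ordered = [e for e in emotion_order if e in counts.index]
ordered += [e for e in counts.index if e not in ordered]
print(counts[ordered])  # joy 10, anger 2, curiosity 4
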
visualization/visualizations/helpscout_charts.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HelpScout-specific Plotly chart functions.
|
| 3 |
+
All functions accept a HelpScout conversations DataFrame and return a
|
| 4 |
+
plotly.graph_objects.Figure.
|
| 5 |
+
"""
|
| 6 |
+
import json
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import plotly.graph_objects as go
|
| 12 |
+
|
| 13 |
+
# Ensure project root is on sys.path so visualization.* imports resolve
|
| 14 |
+
_root = Path(__file__).resolve().parent.parent.parent
|
| 15 |
+
if str(_root) not in sys.path:
|
| 16 |
+
sys.path.insert(0, str(_root))
|
| 17 |
+
|
| 18 |
+
from visualization.utils.helpscout_utils import (
|
| 19 |
+
explode_topics, parse_topics, topic_label, load_topic_taxonomy
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class HelpScoutCharts:
|
| 24 |
+
"""Plotly chart factory for HelpScout conversation data."""
|
| 25 |
+
|
| 26 |
+
def __init__(self, config_path=None):
|
| 27 |
+
if config_path is None:
|
| 28 |
+
config_path = Path(__file__).parent.parent / "config" / "viz_config.json"
|
| 29 |
+
with open(config_path, "r") as f:
|
| 30 |
+
config = json.load(f)
|
| 31 |
+
|
| 32 |
+
hs_colors = config.get("color_schemes_helpscout", {})
|
| 33 |
+
self.topic_colors = hs_colors.get("topics", {})
|
| 34 |
+
self.status_colors = hs_colors.get("status", {})
|
| 35 |
+
self.flag_colors = hs_colors.get("boolean_flags", {})
|
| 36 |
+
self.sentiment_colors = config.get("color_schemes", {}).get("sentiment_polarity", {})
|
| 37 |
+
self.sentiment_order = config.get("sentiment_order", [])
|
| 38 |
+
self.chart_height = config.get("dashboard", {}).get("chart_height", 400)
|
| 39 |
+
self.taxonomy = load_topic_taxonomy()
|
| 40 |
+
|
| 41 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
+
# Sentiment charts
|
| 43 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
+
|
| 45 |
+
def create_sentiment_pie_chart(self, df, title="Sentiment Distribution"):
|
| 46 |
+
counts = df["sentiment_polarity"].value_counts()
|
| 47 |
+
ordered = [s for s in self.sentiment_order if s in counts.index]
|
| 48 |
+
counts = counts[ordered]
|
| 49 |
+
colors = [self.sentiment_colors.get(s, "#CCCCCC") for s in counts.index]
|
| 50 |
+
|
| 51 |
+
fig = go.Figure(go.Pie(
|
| 52 |
+
labels=counts.index,
|
| 53 |
+
values=counts.values,
|
| 54 |
+
marker=dict(colors=colors),
|
| 55 |
+
textinfo="label+percent",
|
| 56 |
+
hovertemplate="<b>%{label}</b><br>Count: %{value}<br>%{percent}<extra></extra>",
|
| 57 |
+
))
|
| 58 |
+
fig.update_layout(title=title, height=self.chart_height,
|
| 59 |
+
legend=dict(orientation="v", yanchor="middle", y=0.5))
|
| 60 |
+
return fig
|
| 61 |
+
|
| 62 |
+
def create_sentiment_score_gauge(self, avg_score, title="Sentiment Score"):
|
| 63 |
+
normalized = ((avg_score + 2) / 4) * 100
|
| 64 |
+
fig = go.Figure(go.Indicator(
|
| 65 |
+
mode="gauge+number",
|
| 66 |
+
value=normalized,
|
| 67 |
+
title={"text": title, "font": {"size": 18}},
|
| 68 |
+
number={"font": {"size": 36}},
|
| 69 |
+
gauge={
|
| 70 |
+
"axis": {"range": [0, 100]},
|
| 71 |
+
"bar": {"color": "darkblue"},
|
| 72 |
+
"steps": [
|
| 73 |
+
{"range": [0, 20], "color": "#D32F2F"},
|
| 74 |
+
{"range": [20, 40], "color": "#FF6F00"},
|
| 75 |
+
{"range": [40, 60], "color": "#FFB300"},
|
| 76 |
+
{"range": [60, 80], "color": "#7CB342"},
|
| 77 |
+
{"range": [80, 100],"color": "#00C851"},
|
| 78 |
+
],
|
| 79 |
+
},
|
| 80 |
+
))
|
| 81 |
+
fig.update_layout(height=300, margin=dict(l=20, r=20, t=60, b=20))
|
| 82 |
+
return fig
|
| 83 |
+
|
| 84 |
+
    def create_sentiment_timeline(self, df, title="Sentiment Over Time", freq="W"):
        if "first_message_at" not in df.columns:
            return self._empty_fig(title, "No timestamp data")
        df_t = df.copy()
        df_t["date"] = pd.to_datetime(df_t["first_message_at"]).dt.to_period(freq).dt.to_timestamp()
        agg = df_t.groupby(["date", "sentiment_polarity"]).size().reset_index(name="count")
        fig = go.Figure()
        for s in self.sentiment_order:
            d = agg[agg["sentiment_polarity"] == s]
            if not d.empty:
                fig.add_trace(go.Scatter(
                    x=d["date"], y=d["count"], name=s, mode="lines+markers",
                    line=dict(color=self.sentiment_colors.get(s, "#CCCCCC"), width=2),
                    hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
                ))
        fig.update_layout(title=title, xaxis_title="Date",
                          yaxis_title="Conversations", height=self.chart_height,
                          hovermode="x unified")
        return fig

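    # The to_period(freq).dt.to_timestamp() idiom used by the timelines buckets
    # each timestamp to the start of its period: with freq="W", for example,
    # 2024-01-02 and 2024-01-05 both collapse to the same week-start timestamp,
    # so groupby("date") yields one count per week. Passing "D" or "M" gives
    # daily or monthly rollups the same way.
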
    # ─────────────────────────────────────────────────────────────
    # Topic charts
    # ─────────────────────────────────────────────────────────────

    def create_topic_bar_chart(self, df, title="Topic Distribution",
                               orientation="h", top_n=None):
        exploded = explode_topics(df)
        if exploded.empty:
            return self._empty_fig(title, "No topic data")
        counts = exploded["topic_id"].value_counts()
        if top_n:
            counts = counts.head(top_n)
        labels = [topic_label(t, self.taxonomy) for t in counts.index]
        colors = [self.topic_colors.get(t, "#607D8B") for t in counts.index]

        if orientation == "h":
            fig = go.Figure(go.Bar(
                y=labels, x=counts.values, orientation="h",
                marker=dict(color=colors),
                text=counts.values, textposition="auto",
                hovertemplate="<b>%{y}</b><br>%{x} conversations<extra></extra>",
            ))
            fig.update_layout(title=title, xaxis_title="Conversations",
                              yaxis_title="Topic", height=self.chart_height,
                              yaxis={"categoryorder": "total ascending"})
        else:
            fig = go.Figure(go.Bar(
                x=labels, y=counts.values,
                marker=dict(color=colors),
                text=counts.values, textposition="auto",
                hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
            ))
            fig.update_layout(title=title, xaxis_title="Topic",
                              yaxis_title="Conversations", height=self.chart_height)
        return fig

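    # explode_topics and topic_label are imported helpers (presumably from
    # visualization/utils/helpscout_utils.py, added in this commit). From the
    # way the topic charts use them, the assumed contract is that
    # explode_topics(df) returns one row per (conversation, topic) pair with a
    # "topic_id" column, e.g.
    #
    #     topics="billing,refund"  ->  two rows with topic_id "billing" / "refund"
    #
    # and topic_label(topic_id, taxonomy) maps an id to its display name via
    # the taxonomy loaded in __init__.
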
    def create_topic_pie_chart(self, df, title="Topic Distribution"):
        exploded = explode_topics(df)
        if exploded.empty:
            return self._empty_fig(title, "No topic data")
        counts = exploded["topic_id"].value_counts()
        labels = [topic_label(t, self.taxonomy) for t in counts.index]
        colors = [self.topic_colors.get(t, "#607D8B") for t in counts.index]
        fig = go.Figure(go.Pie(
            labels=labels, values=counts.values,
            marker=dict(colors=colors),
            textinfo="label+percent",
            hovertemplate="<b>%{label}</b><br>%{value}<br>%{percent}<extra></extra>",
        ))
        fig.update_layout(title=title, height=self.chart_height)
        return fig

    def create_topic_sentiment_heatmap(self, df, title="Topic × Sentiment Heatmap"):
        exploded = explode_topics(df)
        if exploded.empty or "sentiment_polarity" not in exploded.columns:
            return self._empty_fig(title, "No data")
        pivot = pd.crosstab(exploded["topic_id"], exploded["sentiment_polarity"])
        pivot.index = [topic_label(t, self.taxonomy) for t in pivot.index]
        ordered_cols = [s for s in self.sentiment_order if s in pivot.columns]
        pivot = pivot[ordered_cols] if ordered_cols else pivot

        fig = go.Figure(go.Heatmap(
            z=pivot.values,
            x=pivot.columns.tolist(),
            y=pivot.index.tolist(),
            colorscale="Blues",
            text=pivot.values,
            texttemplate="%{text}",
            hovertemplate="<b>%{y} – %{x}</b><br>%{z}<extra></extra>",
            colorbar=dict(title="Conversations"),
        ))
        fig.update_layout(title=title, xaxis_title="Sentiment",
                          yaxis_title="Topic", height=self.chart_height + 100)
        return fig

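    # pd.crosstab builds the topic-by-sentiment count matrix in one call. As an
    # illustration, three exploded rows
    #     (billing, negative), (billing, negative), (refund, positive)
    # produce
    #                 negative  positive
    #     billing            2         0
    #     refund             0         1
    # which is exactly the z-matrix the heatmap renders.
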
    def create_topic_timeline(self, df, title="Topic Volume Over Time",
                              freq="W", top_n=5):
        if "first_message_at" not in df.columns:
            return self._empty_fig(title, "No timestamp data")
        exploded = explode_topics(df)
        if exploded.empty:
            return self._empty_fig(title, "No topic data")

        top_topics = exploded["topic_id"].value_counts().head(top_n).index.tolist()
        exploded = exploded[exploded["topic_id"].isin(top_topics)].copy()
        exploded["date"] = pd.to_datetime(exploded["first_message_at"]).dt.to_period(freq).dt.to_timestamp()
        agg = exploded.groupby(["date", "topic_id"]).size().reset_index(name="count")

        fig = go.Figure()
        for t in top_topics:
            d = agg[agg["topic_id"] == t]
            if not d.empty:
                fig.add_trace(go.Scatter(
                    x=d["date"], y=d["count"],
                    name=topic_label(t, self.taxonomy), mode="lines+markers",
                    line=dict(color=self.topic_colors.get(t, "#607D8B"), width=2),
                    hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
                ))
        fig.update_layout(title=title, xaxis_title="Date",
                          yaxis_title="Conversations", height=self.chart_height,
                          hovermode="x unified")
        return fig

    # ─────────────────────────────────────────────────────────────
    # Volume & timelines
    # ─────────────────────────────────────────────────────────────

    def create_volume_timeline(self, df, title="Conversation Volume Over Time",
                               freq="W"):
        if "first_message_at" not in df.columns:
            return self._empty_fig(title, "No timestamp data")
        df_t = df.copy()
        df_t["date"] = pd.to_datetime(df_t["first_message_at"]).dt.to_period(freq).dt.to_timestamp()
        agg = df_t.groupby("date").size().reset_index(name="count")
        fig = go.Figure(go.Bar(
            x=agg["date"], y=agg["count"],
            marker_color="#1982C4",
            hovertemplate="<b>%{x}</b><br>%{y} conversations<extra></extra>",
        ))
        fig.update_layout(title=title, xaxis_title="Date",
                          yaxis_title="Conversations", height=self.chart_height)
        return fig

    def create_refund_cancel_timeline(self, df, title="Refund & Cancellation Over Time",
                                      freq="W"):
        if "first_message_at" not in df.columns:
            return self._empty_fig(title, "No timestamp data")
        df_t = df.copy()
        df_t["date"] = pd.to_datetime(df_t["first_message_at"]).dt.to_period(freq).dt.to_timestamp()

        fig = go.Figure()
        for col, label, color in [
            ("is_refund_request", "Refund Requests", "#D32F2F"),
            ("is_cancellation", "Cancellations", "#FF6F00"),
            ("is_membership", "Membership Joins", "#00C851"),
        ]:
            if col in df_t.columns:
                agg = df_t[df_t[col] == True].groupby("date").size().reset_index(name="count")
                if not agg.empty:
                    fig.add_trace(go.Scatter(
                        x=agg["date"], y=agg["count"], name=label,
                        mode="lines+markers", line=dict(color=color, width=2),
                        hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
                    ))
        fig.update_layout(title=title, xaxis_title="Date",
                          yaxis_title="Conversations", height=self.chart_height,
                          hovermode="x unified")
        return fig

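    # Note on the `== True` mask above: linters usually flag the explicit
    # comparison, but it is defensible here if the flag columns can hold NaN,
    # since NaN == True evaluates to False and missing flags are simply
    # excluded from the count. A stricter equivalent would be
    # df_t[df_t[col].fillna(False).astype(bool)].
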
    # ─────────────────────────────────────────────────────────────
    # Status / source / flags
    # ─────────────────────────────────────────────────────────────

    def create_status_distribution(self, df, title="Conversations by Status"):
        if "status" not in df.columns:
            return self._empty_fig(title, "No status data")
        counts = df["status"].value_counts()
        colors = [self.status_colors.get(s, self.status_colors.get("default", "#607D8B"))
                  for s in counts.index]
        fig = go.Figure(go.Bar(
            x=counts.index, y=counts.values,
            marker=dict(color=colors),
            text=counts.values, textposition="auto",
            hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
        ))
        fig.update_layout(title=title, xaxis_title="Status",
                          yaxis_title="Conversations", height=self.chart_height)
        return fig

    def create_source_distribution(self, df, title="Conversations by Source Type"):
        if "source_type" not in df.columns:
            return self._empty_fig(title, "No source data")
        counts = df["source_type"].value_counts()
        fig = go.Figure(go.Bar(
            x=counts.index, y=counts.values,
            marker_color="#1982C4",
            text=counts.values, textposition="auto",
            hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
        ))
        fig.update_layout(title=title, xaxis_title="Source",
                          yaxis_title="Conversations", height=self.chart_height)
        return fig

    def create_boolean_flags_chart(self, df, title="Key Billing & Membership Flags"):
        labels, values, colors = [], [], []
        for col, label in [("is_refund_request", "Refund Requests"),
                           ("is_cancellation", "Cancellations"),
                           ("is_membership", "Membership Joins")]:
            if col in df.columns:
                labels.append(label)
                # Booleans sum as 1/0, so this is the count of True flags.
                values.append(int(df[col].sum()))
                colors.append(self.flag_colors.get(col, "#607D8B"))

        if not values:
            return self._empty_fig(title, "No flag data")

        fig = go.Figure(go.Bar(
            x=labels, y=values,
            marker=dict(color=colors),
            text=values, textposition="auto",
            hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
        ))
        fig.update_layout(title=title, xaxis_title="Flag",
                          yaxis_title="Conversations", height=self.chart_height)
        return fig

    def create_escalation_breakdown(self, df, title="Escalation Queue by Topic"):
        if "is_escalation" not in df.columns:
            return self._empty_fig(title, "No escalation data")

        exploded = explode_topics(df)
        if exploded.empty:
            return self._empty_fig(title, "No topic data")

        pivot = pd.crosstab(exploded["topic_id"], exploded["is_escalation"])
        pivot.index = [topic_label(t, self.taxonomy) for t in pivot.index]

        fig = go.Figure()
        for flag, label, color in [(False, "Normal", "#4CAF50"), (True, "Escalation", "#D32F2F")]:
            if flag in pivot.columns:
                fig.add_trace(go.Bar(
                    name=label, y=pivot.index, x=pivot[flag],
                    orientation="h", marker_color=color,
                    hovertemplate="<b>%{y}</b><br>%{x}<extra></extra>",
                ))
        fig.update_layout(title=title, barmode="stack", xaxis_title="Conversations",
                          yaxis_title="Topic", height=self.chart_height,
                          yaxis={"categoryorder": "total ascending"})
        return fig

    # ─────────────────────────────────────────────────────────────
    # Duration & thread count
    # ─────────────────────────────────────────────────────────────

    def create_duration_histogram(self, df, title="Conversation Duration Distribution"):
        if "duration_hours" not in df.columns:
            return self._empty_fig(title, "No duration data")
        d = df["duration_hours"].dropna()
        fig = go.Figure(go.Histogram(
            x=d, nbinsx=40, marker_color="#1982C4",
            hovertemplate="Duration: %{x:.1f}h<br>Count: %{y}<extra></extra>",
        ))
        fig.update_layout(title=title, xaxis_title="Duration (hours)",
                          yaxis_title="Conversations", height=self.chart_height)
        return fig

    def create_thread_count_histogram(self, df, title="Thread Count Distribution"):
        if "thread_count" not in df.columns:
            return self._empty_fig(title, "No thread data")
        t = df["thread_count"].dropna()
        fig = go.Figure(go.Histogram(
            x=t, nbinsx=30, marker_color="#9C27B0",
            hovertemplate="Threads: %{x}<br>Count: %{y}<extra></extra>",
        ))
        fig.update_layout(title=title, xaxis_title="Number of Threads",
                          yaxis_title="Conversations", height=self.chart_height)
        return fig

    # ─────────────────────────────────────────────────────────────
    # Emotion (same logic as DistributionCharts but with helpscout df)
    # ─────────────────────────────────────────────────────────────

    def create_emotion_bar_chart(self, df, title="Emotion Distribution",
                                 orientation="h"):
        if "emotions" not in df.columns or df["emotions"].isna().all():
            return self._empty_fig(title, "No emotion data")

        emotion_colors = {
            "joy": "#FFD700", "excitement": "#FF6B35", "gratitude": "#4CAF50",
            "admiration": "#2196F3", "curiosity": "#00BCD4", "humor": "#9C27B0",
            "frustration": "#FF9800", "disappointment": "#795548",
            "sadness": "#607D8B", "anger": "#D32F2F", "neutral": "#9E9E9E",
        }
        df_e = df.copy()
        df_e["emotions"] = df_e["emotions"].str.split(",")
        df_e = df_e.explode("emotions")
        df_e["emotions"] = df_e["emotions"].str.strip().str.lower()
        counts = df_e["emotions"].dropna().value_counts()
        colors = [emotion_colors.get(e, "#CCCCCC") for e in counts.index]

        if orientation == "h":
            fig = go.Figure(go.Bar(
                y=counts.index, x=counts.values, orientation="h",
                marker=dict(color=colors), text=counts.values, textposition="auto",
                hovertemplate="<b>%{y}</b><br>%{x}<extra></extra>",
            ))
            fig.update_layout(title=title, xaxis_title="Conversations",
                              yaxis_title="Emotion", height=self.chart_height,
                              yaxis={"categoryorder": "total ascending"})
        else:
            fig = go.Figure(go.Bar(
                x=counts.index, y=counts.values,
                marker=dict(color=colors), text=counts.values, textposition="auto",
                hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
            ))
            fig.update_layout(title=title, xaxis_title="Emotion",
                              yaxis_title="Conversations", height=self.chart_height)
        return fig

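    # The split/explode/strip pipeline above turns a comma-separated cell into
    # one row per emotion: e.g. emotions="Frustration, anger" becomes two rows
    # normalized to "frustration" and "anger", so value_counts() tallies each
    # emotion once per conversation it appears in.
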
    # ─────────────────────────────────────────────────────────────
    # Helpers
    # ─────────────────────────────────────────────────────────────

    @staticmethod
    def _empty_fig(title, message):
        fig = go.Figure()
        fig.add_annotation(text=message, xref="paper", yref="paper",
                           x=0.5, y=0.5, showarrow=False, font=dict(size=14))
        fig.update_layout(title=title, height=300)
        return fig
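

# Minimal usage sketch (hypothetical names throughout: the class name and the
# loader call are assumptions, since in this commit the DataFrame would come
# from visualization/data/helpscout_data_loader.py and the figures are rendered
# by the dashboard components rather than shown directly):
#
#     charts = HelpScoutCharts()           # assumed class name
#     df = load_helpscout_conversations()  # hypothetical loader call
#     charts.create_sentiment_pie_chart(df).show()
#     charts.create_topic_bar_chart(df, top_n=10).show()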