Danialebrat committed on
Commit 58db664 · 1 Parent(s): bf7e929

Adding HelpScout to UI

Files changed (33)
  1. .idea/vcs.xml +3 -1
  2. process_helpscout/README.md +339 -0
  3. process_helpscout/agents/README.md +310 -0
  4. process_helpscout/agents/__init__.py +0 -0
  5. process_helpscout/agents/base_agent.py +58 -0
  6. process_helpscout/agents/sentiment_analysis_agent.py +229 -0
  7. process_helpscout/agents/topic_extraction_agent.py +268 -0
  8. process_helpscout/config_files/processing_config.json +125 -0
  9. process_helpscout/config_files/topics.json +90 -0
  10. process_helpscout/data_fetcher.py +77 -0
  11. process_helpscout/fetch_and_export.py +183 -0
  12. process_helpscout/html_cleaner.py +169 -0
  13. process_helpscout/main.py +423 -0
  14. process_helpscout/snowflake_conn.py +106 -0
  15. process_helpscout/workflow/__init__.py +0 -0
  16. process_helpscout/workflow/conversation_processor.py +334 -0
  17. visualization/README.md +279 -140
  18. visualization/agents/helpscout_summary_agent.py +309 -0
  19. visualization/app.py +38 -10
  20. visualization/components/dashboard.py +55 -1
  21. visualization/components/helpscout_analysis.py +491 -0
  22. visualization/components/helpscout_dashboard.py +278 -0
  23. visualization/components/sentiment_analysis.py +38 -6
  24. visualization/config/viz_config.json +61 -1
  25. visualization/data/data_loader.py +25 -5
  26. visualization/data/helpscout_data_loader.py +382 -0
  27. visualization/utils/auth.py +0 -2
  28. visualization/utils/data_processor.py +46 -0
  29. visualization/utils/helpscout_pdf.py +471 -0
  30. visualization/utils/helpscout_utils.py +107 -0
  31. visualization/utils/pdf_exporter.py +80 -0
  32. visualization/visualizations/distribution_charts.py +131 -0
  33. visualization/visualizations/helpscout_charts.py +413 -0
.idea/vcs.xml CHANGED
@@ -1,4 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="VcsDirectoryMappings" defaultProject="true" />
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
 </project>
process_helpscout/README.md ADDED
@@ -0,0 +1,339 @@
# HelpScout Processing Pipeline

Extracts, cleans, and enriches customer support conversations from HelpScout.
The module has two distinct responsibilities:

1. **Data export** (`fetch_and_export.py`): fetches raw threads, cleans HTML, and exports CSVs for the Streamlit dashboard.
2. **AI processing pipeline** (`main.py`): fetches the same conversations, runs them through a two-step agentic workflow (sentiment + topic extraction), and writes enriched records to Snowflake.

---

## Folder Structure

```
process_helpscout/
│
├── main.py                          # Pipeline entry point (parallel processing)
├── data_fetcher.py                  # Fetches & aggregates conversations; deduplication check
├── fetch_and_export.py              # CSV export script (separate from the pipeline)
├── html_cleaner.py                  # HTML → clean plain text (shared by both workflows)
├── snowflake_conn.py                # Snowflake connection wrapper
│
├── agents/                          # LLM-based extraction agents
│   ├── README.md                    # Agent architecture docs (read this to extend)
│   ├── base_agent.py                # Abstract base class for all agents
│   ├── sentiment_analysis_agent.py  # Classifies sentiment polarity + emotions
│   └── topic_extraction_agent.py    # Assigns topic tags + billing flags
│
├── workflow/
│   └── conversation_processor.py    # LangGraph workflow: sentiment → topics → END
│
├── config_files/
│   ├── processing_config.json       # Agent models, batch settings, output table, sentiment categories
│   └── topics.json                  # HelpScout topic taxonomy (source of truth for topic extraction)
│
├── queries/
│   └── helpscout_conversations.sql  # SQL that fetches customer threads from Snowflake
│
├── sql/
│   └── create_features_table.sql    # DDL; run once before first pipeline execution
│
├── output/                          # Auto-created; holds CSV exports
│   ├── helpscout_threads.csv
│   └── helpscout_conversations.csv
│
└── visualization/                   # Streamlit dashboard (reads from CSV exports)
    ├── app.py
    ├── components/dashboard.py
    └── utils/data_processor.py
```

---

## Data Flow

### CSV Export (Dashboard)

```
Snowflake (STITCH.HELPSCOUT.CONVERSATION_THREADS)
    │  queries/helpscout_conversations.sql
    ▼
fetch_and_export.py
    │  process_threads(): clean HTML, add word_count, date columns
    │  aggregate_conversations(): one row per conversation_id
    ▼
output/helpscout_threads.csv        (one row per message thread)
output/helpscout_conversations.csv  (one row per conversation)
    │
    ▼
visualization/app.py → Streamlit dashboard
```

### AI Processing Pipeline

```
Snowflake (STITCH.HELPSCOUT.CONVERSATION_THREADS)
    │  Same SQL; customer threads only, Feb 17 2026+
    ▼
data_fetcher.fetch_conversations()
    │  Cleans HTML (html_cleaner.py)
    │  Aggregates to one row per conversation
    │  Checks HELPSCOUT_CONVERSATION_FEATURES for already-processed IDs
    ▼
main.py: splits into parallel batches
    │
    ├── Worker 1: ConversationProcessingWorkflow
    │     ├── Node 1: SentimentAnalysisAgent → polarity + emotions
    │     └── Node 2: TopicExtractionAgent → topics + billing flags
    │
    ├── Worker 2: ... (same)
    └── Worker N: ... (same)
    │
    ▼
SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
```

---

## Setup

### 1. Environment variables

All credentials are read from the project root `.env` file.

| Key | Description |
|-----|-------------|
| `SNOWFLAKE_USER` | Snowflake username |
| `SNOWFLAKE_PASSWORD` | Snowflake password |
| `SNOWFLAKE_ACCOUNT` | Snowflake account identifier |
| `SNOWFLAKE_ROLE` | Role with access to `STITCH`, `ESTUARY`, and `SOCIAL_MEDIA_DB` |
| `SNOWFLAKE_WAREHOUSE` | Compute warehouse |
| `OPENAI_API_KEY` | Required for the AI pipeline only |
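
A minimal `.env` sketch with the keys from the table above (all values here are placeholders, not real credentials):

```bash
SNOWFLAKE_USER=jane_doe
SNOWFLAKE_PASSWORD=change-me
SNOWFLAKE_ACCOUNT=abc12345.us-east-1
SNOWFLAKE_ROLE=ANALYTICS_ROLE
SNOWFLAKE_WAREHOUSE=COMPUTE_WH
OPENAI_API_KEY=sk-...
```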

### 2. Dependencies

All dependencies are in the project root `requirements.txt`:
- `snowflake-snowpark-python`
- `beautifulsoup4`
- `pandas`, `numpy`
- `langchain-openai`, `langgraph`
- `python-dotenv`
- `streamlit`, `plotly` (dashboard only)

### 3. Create the output table (once)

Before running the pipeline for the first time, execute the DDL in Snowflake:

```sql
-- Run this in your Snowflake worksheet or via the Snowflake CLI
-- File: sql/create_features_table.sql
```

This creates `SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES` with a primary key on `CONVERSATION_ID`. The pipeline always appends; it never truncates the table.
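
For orientation, the shape of that DDL is roughly the following sketch; the real file is the source of truth, and the full column list is documented in the Output Table section below:

```sql
CREATE TABLE IF NOT EXISTS SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES (
    CONVERSATION_ID    VARCHAR NOT NULL PRIMARY KEY,
    CUSTOMER_EMAIL     VARCHAR,
    SENTIMENT_POLARITY VARCHAR,
    EMOTIONS           VARCHAR,
    TOPICS             VARCHAR,
    IS_REFUND_REQUEST  BOOLEAN,
    IS_CANCELLATION    BOOLEAN,
    IS_MEMBERSHIP      BOOLEAN,
    SUMMARY            TEXT,
    PROCESSING_ERRORS  TEXT,
    PROCESSED_AT       TIMESTAMP_NTZ,
    WORKFLOW_VERSION   VARCHAR
    -- ...remaining columns as documented in the Output Table section
);
```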

---

## Usage

### Run the AI processing pipeline

```bash
cd process_helpscout

# Process all new conversations (parallel, recommended)
python main.py

# Limit to 100 conversations; useful for a first test run
python main.py --limit 100

# Sequential mode: single process, easier to read logs when debugging
python main.py --sequential

# Use a custom config file
python main.py --config /path/to/my_config.json
```

On every run the pipeline:
1. Fetches all conversations (from Feb 17 2026 to today)
2. Queries the output table for already-processed `CONVERSATION_ID`s
3. Skips those, so only new conversations are sent to the LLM (see the sketch below)
4. Appends results to the Snowflake output table
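
The skip logic in steps 2-3 boils down to a set difference. A sketch, assuming a Snowpark session; the real implementation lives in `data_fetcher.py` and the variable names here are illustrative:

```python
# Collect IDs that are already in the output table
processed_ids = {
    row["CONVERSATION_ID"]
    for row in session.sql(
        "SELECT CONVERSATION_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES"
    ).collect()
}

# Keep only conversations the pipeline has not seen before
new_conversations = [c for c in conversations if c["conversation_id"] not in processed_ids]
```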

### Run the CSV export (dashboard data)

```bash
cd process_helpscout
python fetch_and_export.py
```

### Launch the Streamlit dashboard

```bash
cd process_helpscout
streamlit run visualization/app.py
```

---

## Output Table

**Table:** `SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES`

| Column | Type | Description |
|--------|------|-------------|
| `CONVERSATION_ID` | VARCHAR | HelpScout conversation ID (primary key) |
| `CUSTOMER_EMAIL` | VARCHAR | Customer email address |
| `CUSTOMER_FIRST` | VARCHAR | Customer first name |
| `CUSTOMER_LAST` | VARCHAR | Customer last name |
| `CUSTOMER_HS_ID` | NUMBER | HelpScout internal customer ID |
| `THREAD_COUNT` | NUMBER | Number of customer message threads |
| `FIRST_MESSAGE_AT` | TIMESTAMP_TZ | When the first customer message was sent |
| `LAST_MESSAGE_AT` | TIMESTAMP_TZ | When the last customer message was sent |
| `DURATION_HOURS` | FLOAT | Hours between first and last message |
| `STATUS` | VARCHAR | Last known HelpScout status |
| `STATE` | VARCHAR | Last known HelpScout state |
| `SOURCE_TYPE` | VARCHAR | e.g. `email`, `chat` |
| `SOURCE_VIA` | VARCHAR | e.g. `api`, `mailbox` |
| `COMBINED_TEXT` | TEXT | Raw aggregated customer messages |
| `CONVERSATION_TEXT_USED` | TEXT | Formatted + truncated text sent to the LLM |
| `SENTIMENT_POLARITY` | VARCHAR | `very_positive` / `positive` / `neutral` / `negative` / `very_negative` |
| `EMOTIONS` | VARCHAR | Comma-separated emotion values (NULL if none valid) |
| `SENTIMENT_CONFIDENCE` | VARCHAR | `high` / `medium` / `low` |
| `SENTIMENT_NOTES` | TEXT | 1-2 sentence LLM explanation of the sentiment |
| `TOPICS` | VARCHAR | Comma-separated topic IDs (multi-label) |
| `IS_REFUND_REQUEST` | BOOLEAN | Customer explicitly asked for a refund |
| `IS_CANCELLATION` | BOOLEAN | Customer explicitly wants to cancel |
| `IS_MEMBERSHIP` | BOOLEAN | Customer wants to join/rejoin and purchase membership |
| `TOPIC_CONFIDENCE` | VARCHAR | `high` / `medium` / `low` |
| `TOPIC_NOTES` | TEXT | 1-2 sentence LLM explanation of topics |
| `SUMMARY` | TEXT | 2-3 sentence neutral summary of the conversation |
| `PROCESSING_ERRORS` | TEXT | Semicolon-separated errors (NULL on full success) |
| `PROCESSED_AT` | TIMESTAMP_NTZ | When this record was written by the pipeline |
| `WORKFLOW_VERSION` | VARCHAR | Pipeline version for auditability |

---

## Configuration

All pipeline settings live in `config_files/processing_config.json`.

### Agent models

```json
"agents": {
  "sentiment_analysis": {
    "model": "gpt-4o-mini",
    "temperature": 0.2,
    "max_retries": 3
  },
  "topic_extraction": {
    "model": "gpt-4o-mini",
    "temperature": 0.2,
    "max_retries": 3
  }
}
```

Switch any agent to `gpt-4o` for higher accuracy (at higher cost) by changing the `"model"` value.

### Conversation length

```json
"processing": {
  "max_conversation_chars": 3000,
  "min_batch_size": 10,
  "max_batch_size": 50
}
```

`max_conversation_chars` controls how many characters of conversation text are sent to the LLM. Increasing this improves context for long conversations but raises token costs. The workflow formats messages as `[1] msg\n[2] msg…` and truncates at this limit.
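
Conceptually, the formatting and truncation step is equivalent to this sketch (illustrative only, not the actual workflow code):

```python
def format_conversation(messages: list[str], max_chars: int = 3000) -> str:
    """Number each customer message, join with newlines, truncate at the limit."""
    text = "\n".join(f"[{i}] {msg}" for i, msg in enumerate(messages, start=1))
    return text[:max_chars]
```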

### Output destination

```json
"output": {
  "database": "SOCIAL_MEDIA_DB",
  "schema": "ML_FEATURES",
  "table": "HELPSCOUT_CONVERSATION_FEATURES"
}
```

To write to a different table (e.g. a staging or test table), change these values and re-run the DDL in `sql/create_features_table.sql` for the new table name.

### Sentiment categories

The `sentiment_polarity` and `emotions` blocks in `processing_config.json` define the valid values for classification. Adding, removing, or renaming a category there is automatically reflected in both the LLM prompt and the output validation; no code changes are required.

### Topic taxonomy

Topic definitions live in `config_files/topics.json`. This file is the single source of truth: the `TopicExtractionAgent` builds its system prompt directly from it. To add a new topic:

1. Add an entry to the `"topics"` array with a unique `id`, `label`, and `description` (see the example below).
2. If the topic has boolean sub-flags (like billing), add a `"flags"` key, then update `topic_extraction_agent.py` to extract those flags.
3. Re-run the pipeline; the new topic will be available immediately.
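
For illustration, a hypothetical new entry might look like:

```json
{
  "id": "instrument_and_gear",
  "label": "Instrument & Gear",
  "description": "Questions about instruments, hardware, or accessories rather than the platform itself."
}
```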

---

## SQL Query

**File:** `queries/helpscout_conversations.sql`

| Design decision | Detail |
|-----------------|--------|
| Date filter | `CREATED_AT >= '2026-02-17'` to current date |
| Team exclusion | Anti-join with `USORA_USERS WHERE access_level = 'team'`, so only customer messages reach the pipeline |
| Thread types | `TYPE IN ('customer', 'message')`; excludes notes, forwarded threads, system messages |
| JSON extraction | Snowflake semi-structured syntax: `COLUMN:field::VARCHAR` |

To change the date range, edit the `WHERE ct.CREATED_AT >= '...'` line in the SQL file.
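
The semi-structured extraction pattern looks like this in practice (the column and field names here are illustrative, not copied from the real query):

```sql
SELECT
    ct.CONVERSATION_ID,
    ct.CUSTOMER:email::VARCHAR AS customer_email,
    ct.CUSTOMER:first::VARCHAR AS customer_first
FROM STITCH.HELPSCOUT.CONVERSATION_THREADS ct
WHERE ct.CREATED_AT >= '2026-02-17';
```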

---

## HTML Cleaner

`html_cleaner.py` runs a four-stage pipeline on every message body:

| Stage | What it removes |
|-------|----------------|
| `_remove_quoted_sections()` | `<blockquote>` tags and Gmail/Outlook/Yahoo quoted-reply CSS wrappers |
| `_remove_boilerplate()` | `<table>`, `<img>`, `<script>`, `<style>` tags and footer/unsubscribe blocks |
| `_extract_text()` | Extracts plain text while preserving line breaks |
| `_clean_text()` | Strips invisible Unicode, collapses whitespace, removes `>` quote lines, cuts off at "On … wrote:" markers |

To add a new boilerplate pattern, append a string to `footer_keywords` inside `_remove_boilerplate()`, or add a CSS class fragment to `_QUOTED_CLASS_PATTERNS` at the top of the file.
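
A stripped-down sketch of the overall shape (the real module handles many more edge cases, such as client-specific quoted-reply wrappers and invisible Unicode):

```python
from bs4 import BeautifulSoup

def clean_html(html: str) -> str:
    """Minimal four-stage cleaner sketch; see html_cleaner.py for the real logic."""
    soup = BeautifulSoup(html, "html.parser")
    # Stages 1-2: drop quoted replies and boilerplate markup
    for tag in soup.find_all(["blockquote", "table", "img", "script", "style"]):
        tag.decompose()
    # Stage 3: extract plain text while preserving line breaks
    text = soup.get_text(separator="\n")
    # Stage 4: strip "> " quote lines and collapse whitespace
    lines = [line.strip() for line in text.splitlines()]
    return "\n".join(line for line in lines if line and not line.startswith(">"))
```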

---

## Extending the Pipeline

### Add a third agentic step

1. Create `agents/your_new_agent.py` inheriting from `BaseAgent` (see `agents/README.md`).
2. Add a new node method `_your_node()` in `workflow/conversation_processor.py`.
3. Add the node and a new edge in `_build_workflow()`:
   ```python
   graph.add_node("your_step", self._your_node)
   graph.add_edge("topic_extraction", "your_step")
   graph.add_edge("your_step", END)
   ```
4. Add the corresponding output fields to `ConversationState`.
5. Map new columns in `main.py`'s `column_map` dict and add them to the DDL.

### Change the date range

Edit `queries/helpscout_conversations.sql`:
```sql
ct.CREATED_AT >= '2026-02-17 00:00:00'  -- ← change start date
```

### Include team replies

Remove the anti-join in `helpscout_conversations.sql` and broaden the `TYPE` filter to also include `'note'`. Be sure to update the HTML cleaning and aggregation if team messages need different handling.

### Process a different HelpScout mailbox

Add a `WHERE` clause on a mailbox ID column if available, or filter by `source_via` / `status`.

### Automate daily runs

Schedule `main.py` with a cron job, Airflow DAG, or any task scheduler. Because the pipeline skips already-processed conversations, re-running it daily processes only new conversations; no manual bookkeeping is needed.
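
For example, a crontab entry that runs the pipeline every morning at 06:00 (paths are illustrative):

```bash
0 6 * * * cd /path/to/repo/process_helpscout && /path/to/venv/bin/python main.py >> /var/log/helpscout_pipeline.log 2>&1
```
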
process_helpscout/agents/README.md ADDED
@@ -0,0 +1,310 @@
# Agents

The agents package contains the LLM-based extraction components used in the HelpScout processing pipeline. Each agent is a self-contained class responsible for one well-defined task.

---

## Architecture

```
BaseAgent (base_agent.py)
│
├── SentimentAnalysisAgent (sentiment_analysis_agent.py)
│       Classifies overall sentiment polarity and emotions
│       from a customer support conversation.
│
└── TopicExtractionAgent (topic_extraction_agent.py)
        Assigns one or more topic tags and extracts
        billing-specific boolean flags.
```

All agents follow the same contract defined in `BaseAgent`:

| Method | Required | Description |
|--------|----------|-------------|
| `validate_input(input_data)` | Yes | Returns `True` if the input dict has the required fields |
| `process(input_data)` | Yes | Main entry point: validates, calls the LLM, returns a result dict |
| `log_processing(message, level)` | Inherited | Logs `[AgentName] message` at the given level |
| `handle_error(error, context)` | Inherited | Returns a standardised `{"success": False, "error": ...}` dict |

The workflow (`workflow/conversation_processor.py`) calls `agent.process(input_data)` for each node. Agents never call each other; they are orchestrated exclusively by the workflow.

---

## BaseAgent (`base_agent.py`)

Defines the interface every agent must implement. Contains no LLM logic.

### Key attributes set from config

```python
self.model        # LLM model name, e.g. "gpt-4o-mini"
self.temperature  # Sampling temperature (default: 0.2)
self.max_retries  # Reserved for retry logic in subclasses
```

These are read from the agent's block in `config_files/processing_config.json`:
```json
"agents": {
  "sentiment_analysis": { "model": "gpt-4o-mini", "temperature": 0.2, "max_retries": 3 }
}
```

### Return contract

Every `process()` implementation must return a dict with at minimum:
```python
{"success": True, ...}                    # on success; include extracted fields
{"success": False, "error": "<reason>"}   # on failure
```

The workflow checks `success` to decide whether to mark a conversation as failed.

---

## SentimentAnalysisAgent (`sentiment_analysis_agent.py`)

Classifies the overall **sentiment polarity** and **emotions** expressed across a customer's conversation messages.

### Input

```python
agent.process({
    "conversation_text": "<formatted, truncated customer messages>"
})
```

The `conversation_text` is prepared by the workflow before calling the agent: it consists of numbered, pipe-delimited messages truncated to `max_conversation_chars`.

### Output (on success)

```python
{
    "success": True,
    "sentiment_polarity": "negative",           # one of the 5 polarity values
    "emotions": "frustration, disappointment",  # comma-separated, or None (soft-fail)
    "sentiment_confidence": "high",
    "sentiment_notes": "Customer is frustrated by repeated login failures."
}
```

### Validation rules

| Field | Behaviour on invalid value |
|-------|---------------------------|
| `sentiment_polarity` | Hard fail: the conversation is not stored |
| `emotions` | Soft fail: `None` is stored and the conversation is still written |
| `confidence` | Silently corrected to `"medium"` |

### Where categories are defined

Polarity and emotion categories (their `value` and `description` strings) live in `config_files/processing_config.json` under `"sentiment_polarity"` and `"emotions"`. The system prompt is **built at init time from the config**, so updating the config is all you need to change what the LLM is instructed to classify.

### Modifying the sentiment prompt

The system prompt is assembled in `_build_system_prompt()`. To change the framing or add additional instructions, edit that method directly. The category lists are injected automatically from config; do not hardcode them in the prompt.

---

## TopicExtractionAgent (`topic_extraction_agent.py`)

Assigns one or more **topic tags** from the Musora HelpScout taxonomy, extracts three **billing/membership boolean flags**, and produces a brief **neutral summary** of the conversation.

### Input

```python
agent.process({
    "conversation_text": "<formatted, truncated customer messages>"
})
```

### Output (on success)

```python
{
    "success": True,
    "topics": "billing_and_subscription, account_and_access",  # comma-separated IDs
    "is_refund_request": True,   # customer explicitly asked for money back
    "is_cancellation": False,    # customer did NOT explicitly ask to cancel
    "is_membership": False,      # customer wants to join/rejoin and purchase membership
    "topic_confidence": "high",
    "topic_notes": "Customer was unexpectedly charged and is requesting a refund.",
    "summary": "The customer reports being charged after believing they had cancelled their subscription. They are requesting a full refund and confirmation that no further charges will occur."
}
```

### Validation rules

| Field | Behaviour on invalid value |
|-------|---------------------------|
| `topics` | Hard fail if no valid topic IDs remain after filtering |
| `is_refund_request` / `is_cancellation` / `is_membership` | Coerced to `bool`; defaults to `False` if missing |
| `confidence` | Silently corrected to `"medium"` |
| `summary` | Soft fail: `""` stored if missing; conversation still written |

### Where topics are defined

All topic definitions live in `config_files/topics.json`. The agent builds its system prompt directly from this file at init time, so adding, removing, or rewriting a topic description requires only a config change.

### Billing and membership flags

`is_refund_request`, `is_cancellation`, and `is_membership` are extracted on every conversation regardless of which topics are assigned. They are defined in `topics.json` under `billing_and_subscription.flags` for documentation purposes, but the agent always asks the LLM to evaluate them independently.

### Summary

The `summary` field is a 2-3 sentence factual, third-person overview of the conversation: what the customer contacted support about, relevant context they provided, and their core request. It is designed to give a reader instant context without reading the full conversation, and can also be used as compact input when chaining LLM calls.

---

## How to Add a New Agent

Follow these steps to add a third extraction step (e.g. urgency scoring):

### Step 1: Create the agent file

```python
# agents/urgency_agent.py
from agents.base_agent import BaseAgent
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
import json, logging

logger = logging.getLogger(__name__)

class UrgencyAgent(BaseAgent):

    def __init__(self, config, api_key):
        super().__init__("UrgencyAgent", config)
        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=api_key,
            model_kwargs={"response_format": {"type": "json_object"}},
        )
        self._system_prompt = (
            "Classify the urgency of this customer support conversation.\n"
            'Return JSON: {"urgency": "high"|"medium"|"low", "urgency_notes": "<reason>"}'
        )

    def validate_input(self, input_data):
        return "conversation_text" in input_data and bool(input_data["conversation_text"])

    def process(self, input_data):
        if not self.validate_input(input_data):
            return {"success": False, "error": "Missing conversation_text"}
        try:
            response = self.llm.invoke([
                SystemMessage(content=self._system_prompt),
                HumanMessage(content=input_data["conversation_text"]),
            ])
            raw = json.loads(response.content)
            urgency = raw.get("urgency", "medium")
            if urgency not in {"high", "medium", "low"}:
                urgency = "medium"
            return {
                "success": True,
                "urgency": urgency,
                "urgency_notes": raw.get("urgency_notes", ""),
            }
        except Exception as e:
            return self.handle_error(e, "urgency_classification")
```

### Step 2: Add config for the new agent

In `config_files/processing_config.json`:
```json
"agents": {
  "sentiment_analysis": { ... },
  "topic_extraction": { ... },
  "urgency": {
    "model": "gpt-4o-mini",
    "temperature": 0.1,
    "max_retries": 3
  }
}
```

### Step 3: Add a node to the workflow

In `workflow/conversation_processor.py`:

```python
# 1. Import the new agent
from agents.urgency_agent import UrgencyAgent

# 2. Instantiate in __init__
self.urgency_agent = UrgencyAgent(config["agents"]["urgency"], api_key)

# 3. Add fields to ConversationState
urgency: str
urgency_notes: str

# 4. Add the node method
def _urgency_node(self, state):
    try:
        result = self.urgency_agent.process({"conversation_text": state["conversation_text"]})
        if result.get("success"):
            state["urgency"] = result.get("urgency")
            state["urgency_notes"] = result.get("urgency_notes", "")
        else:
            state["processing_errors"] = state.get("processing_errors", []) + [
                f"Urgency failed: {result.get('error')}"
            ]
            state["urgency"] = None
    except Exception as e:
        state["processing_errors"] = state.get("processing_errors", []) + [str(e)]
    return state

# 5. Wire into the graph in _build_workflow()
graph.add_node("urgency", self._urgency_node)
graph.add_edge("topic_extraction", "urgency")  # replaces the old edge to END
graph.add_edge("urgency", END)
```

### Step 4: Add output columns

In `main.py`, add to the `column_map` dict:
```python
"urgency": "URGENCY",
"urgency_notes": "URGENCY_NOTES",
```

In `sql/create_features_table.sql`, add:
```sql
URGENCY VARCHAR(20),
URGENCY_NOTES TEXT,
```

Run `ALTER TABLE` or recreate the table for the new columns to appear.

---

## How to Modify an Existing Agent

### Change the LLM model or temperature

Edit `config_files/processing_config.json`; no code change is needed.

### Add or rename a sentiment category

In `config_files/processing_config.json`, update `sentiment_polarity.categories` or `emotions.categories`. The agent reads these at init and builds the prompt and validation set dynamically. The only code-level change is updating the output table column type/constraint if the new value is longer than the current `VARCHAR` size.
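
If a new value does outgrow the column, widening it in Snowflake is a one-liner (column name illustrative):

```sql
ALTER TABLE SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
    ALTER COLUMN SENTIMENT_POLARITY SET DATA TYPE VARCHAR(50);
```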

### Add or rename a topic

In `config_files/topics.json`, add or edit an entry in the `"topics"` array. The `TopicExtractionAgent` reads this file at init; the new topic appears in the prompt and validation automatically.

### Change the conversation truncation limit

In `config_files/processing_config.json`:
```json
"processing": {
  "max_conversation_chars": 3000
}
```

This is read by the workflow (`conversation_processor.py`) before formatting the conversation text; no agent code changes are needed.

### Modify the system prompt framing

Each agent builds its prompt in a `_build_system_prompt()` method. Edit that method directly. Category lists are always injected from config; avoid hardcoding values that already live in the JSON.
process_helpscout/agents/__init__.py ADDED
File without changes
process_helpscout/agents/base_agent.py ADDED
@@ -0,0 +1,58 @@
"""
Base Agent class for all agents in the HelpScout processing workflow.
Provides a common interface and consistent error handling.
"""

from abc import ABC, abstractmethod
from typing import Dict, Any
import logging

logger = logging.getLogger(__name__)


class BaseAgent(ABC):
    """
    Abstract base class for all agents in the agentic workflow.
    Enforces a consistent interface and provides shared utilities.
    """

    def __init__(self, name: str, config: Dict[str, Any]):
        self.name = name
        self.config = config
        self.model = config.get("model", "gpt-5-nano")
        self.temperature = config.get("temperature", 0.2)
        self.max_retries = config.get("max_retries", 3)
        logger.info(f"Initialized {self.name} with model {self.model}")

    @abstractmethod
    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process input data and return results.
        Must be implemented by all concrete agent classes.
        """
        pass

    @abstractmethod
    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        """
        Validate input data before processing.
        Returns True if input is valid, False otherwise.
        """
        pass

    def log_processing(self, message: str, level: str = "info"):
        log_method = getattr(logger, level, logger.info)
        log_method(f"[{self.name}] {message}")

    def handle_error(self, error: Exception, context: str = "") -> Dict[str, Any]:
        error_msg = f"Error in {self.name}"
        if context:
            error_msg += f" ({context})"
        error_msg += f": {str(error)}"
        logger.error(error_msg)
        return {
            "success": False,
            "error": str(error),
            "agent": self.name,
            "context": context,
        }
process_helpscout/agents/sentiment_analysis_agent.py ADDED
@@ -0,0 +1,229 @@
"""
Sentiment Analysis Agent for HelpScout customer support conversations.

Classifies the overall sentiment polarity and emotions from a customer's
conversation with Musora support. Unlike the social media variant, this
agent operates on full conversations (multiple messages) rather than
individual comments, and does not extract intents or compute requires_reply
(all support tickets inherently require a response).
"""

from typing import Dict, Any, List, Optional
import json
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from agents.base_agent import BaseAgent
import logging

logger = logging.getLogger(__name__)


class SentimentAnalysisAgent(BaseAgent):
    """
    Classifies the sentiment polarity and emotions of a customer support
    conversation from HelpScout.

    Design decisions:
    - System prompt is built once at init from the config categories
    - Emotions are soft-fail: None stored when the field is missing or invalid
    - Input is the formatted conversation text (already truncated upstream)
    """

    def __init__(self, config: Dict[str, Any], api_key: str, processing_config: Dict[str, Any]):
        """
        Args:
            config: Agent-level config dict (model, temperature, max_retries)
            api_key: OpenAI API key
            processing_config: Full processing_config.json content (for categories)
        """
        super().__init__("SentimentAnalysisAgent", config)
        self.api_key = api_key

        # Pre-compute valid value sets from config for O(1) validation
        self._valid_polarities = {
            cat["value"] for cat in processing_config["sentiment_polarity"]["categories"]
        }
        self._valid_emotions = {
            cat["value"] for cat in processing_config["emotions"]["categories"]
        }
        self._emotions_soft_fail = processing_config["emotions"].get("soft_fail", True)

        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key,
            model_kwargs={"response_format": {"type": "json_object"}},
        )

        # Build system prompt once; reused for every LLM call
        self._system_prompt = self._build_system_prompt(processing_config)

    # ------------------------------------------------------------------
    # Prompt construction
    # ------------------------------------------------------------------

    def _build_system_prompt(self, processing_config: Dict[str, Any]) -> str:
        polarity_lines = "\n".join(
            f"- {cat['value']}: {cat['description']}"
            for cat in processing_config["sentiment_polarity"]["categories"]
        )
        emotion_lines = "\n".join(
            f"- {cat['value']}: {cat['description']}"
            for cat in processing_config["emotions"]["categories"]
        )

        return (
            "You are analyzing customer support conversations for Musora, a music education platform.\n\n"
            "You will receive one or more messages from a customer (team responses are excluded). "
            "Classify the overall sentiment and emotional tone of the CUSTOMER's messages as a whole.\n\n"
            "Return JSON only:\n"
            '{"sentiment_polarity": <value>, "emotions": [<values>], '
            '"confidence": "high"|"medium"|"low", "analysis_notes": "<1-2 sentences>"}\n\n'
            f"POLARITY (pick one):\n{polarity_lines}\n\n"
            f"EMOTIONS (multi-label, pick all that apply; use [\"neutral\"] if none detected):\n{emotion_lines}\n\n"
            "Guidelines:\n"
            "- Base your classification on the customer's overall tone, not isolated words\n"
            "- A customer reporting a technical issue with no emotional language → neutral\n"
            "- A customer expressing frustration alongside their issue → negative\n"
            "- analysis_notes: 1-2 sentences highlighting the key sentiment drivers"
        )

    def _build_user_prompt(self, conversation_text: str) -> str:
        return f"Customer conversation:\n\n{conversation_text}"

    # ------------------------------------------------------------------
    # Output validation
    # ------------------------------------------------------------------

    def _parse_emotions(self, raw_emotions: Any) -> Optional[List[str]]:
        """Soft-fail emotion parsing; returns None instead of raising."""
        if not raw_emotions:
            return None
        if isinstance(raw_emotions, str):
            raw_emotions = [e.strip() for e in raw_emotions.split(",")]
        if not isinstance(raw_emotions, list):
            return None
        valid = [e for e in raw_emotions if e in self._valid_emotions]
        return valid if valid else None

    def _validate_result(self, raw: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate LLM output against config-defined allowed values.
        - Invalid polarity → hard fail (conversation will not be stored)
        - Invalid emotions → soft fail (None; conversation still stored)
        - Invalid confidence → corrected to "medium"
        """
        polarity = raw.get("sentiment_polarity")
        if not polarity or polarity not in self._valid_polarities:
            return {
                "success": False,
                "error": (
                    f"Invalid sentiment_polarity '{polarity}'. "
                    f"Expected one of: {sorted(self._valid_polarities)}"
                ),
            }

        confidence = raw.get("confidence", "medium")
        if confidence not in {"high", "medium", "low"}:
            confidence = "medium"

        emotions = self._parse_emotions(raw.get("emotions"))

        return {
            "success": True,
            "sentiment_polarity": polarity,
            "emotions": emotions,
            "confidence": confidence,
            "analysis_notes": str(raw.get("analysis_notes", "")).strip(),
        }

    # ------------------------------------------------------------------
    # Core analysis
    # ------------------------------------------------------------------

    def analyze(self, conversation_text: str) -> Dict[str, Any]:
        """
        Call the LLM to classify sentiment of the customer conversation.

        Args:
            conversation_text: Pre-formatted, truncated conversation text

        Returns:
            Success dict with sentiment fields, or failure dict with error key.
        """
        user_prompt = self._build_user_prompt(conversation_text)

        try:
            messages = [
                SystemMessage(content=self._system_prompt),
                HumanMessage(content=user_prompt),
            ]
            response = self.llm.invoke(messages)
            raw = json.loads(response.content)

            validated = self._validate_result(raw)
            if not validated["success"]:
                self.log_processing(f"Validation failed: {validated['error']}", "warning")
                return validated

            emotions_list = validated.get("emotions")
            return {
                "success": True,
                "sentiment_polarity": validated["sentiment_polarity"],
                "emotions": ", ".join(emotions_list) if emotions_list else None,
                "sentiment_confidence": validated["confidence"],
                "sentiment_notes": validated["analysis_notes"],
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {e}", "warning")
            return {"success": False, "error": f"JSON parse error: {e}"}

        except Exception as e:
            self.log_processing(f"Sentiment analysis failed: {e}", "error")
            return {"success": False, "error": str(e)}

    # ------------------------------------------------------------------
    # Agent interface
    # ------------------------------------------------------------------

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        return "conversation_text" in input_data and bool(input_data["conversation_text"])

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            input_data: Must contain 'conversation_text' (formatted, truncated).

        Returns:
            Dict with sentiment fields merged on top of input_data.
        """
        try:
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: 'conversation_text' is required and must be non-empty",
                }

            self.log_processing("Analyzing conversation sentiment", "debug")
            result = self.analyze(input_data["conversation_text"])

            output = {
                "success": result.get("success", False),
                "sentiment_polarity": result.get("sentiment_polarity"),
                "emotions": result.get("emotions"),
                "sentiment_confidence": result.get("sentiment_confidence"),
                "sentiment_notes": result.get("sentiment_notes", ""),
            }
            if "error" in result:
                output["sentiment_error"] = result["error"]

            # Preserve all original input fields
            for key, value in input_data.items():
                if key not in output:
                    output[key] = value

            return output

        except Exception as e:
            return self.handle_error(e, "sentiment_analysis")
process_helpscout/agents/topic_extraction_agent.py ADDED
@@ -0,0 +1,268 @@
"""
Topic Extraction Agent for HelpScout customer support conversations.

Assigns one or more topic tags from the Musora HelpScout taxonomy to a
customer conversation. Also extracts three boolean billing signals:
- is_refund_request: customer explicitly wants their money back
- is_cancellation: customer wants to cancel their subscription
- is_membership: customer wants to join/rejoin and purchase membership

Also produces a brief neutral summary (2-3 sentences) of the conversation.

Topic definitions are loaded from config_files/topics.json so any taxonomy
update is automatically reflected in the prompt without code changes.
"""

from typing import Dict, Any, List, Optional
import json
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from agents.base_agent import BaseAgent
import logging

logger = logging.getLogger(__name__)


class TopicExtractionAgent(BaseAgent):
    """
    Extracts topic tags and billing flags from a customer support conversation.

    Design decisions:
    - Topics are multi-label: a conversation can receive multiple tags
    - The 'uncategorized' topic is valid but discouraged (see topics.json notes)
    - is_refund_request / is_cancellation are always extracted independently,
      even when billing_and_subscription is not the primary topic
    - System prompt is built once at init from topics.json
    """

    def __init__(self, config: Dict[str, Any], api_key: str, topics_config: Dict[str, Any]):
        """
        Args:
            config: Agent-level config dict (model, temperature, max_retries)
            api_key: OpenAI API key
            topics_config: Parsed topics.json content
        """
        super().__init__("TopicExtractionAgent", config)
        self.api_key = api_key
        self.topics_config = topics_config

        # Pre-compute valid topic ID set for O(1) validation
        self._valid_topics = {topic["id"] for topic in topics_config["topics"]}

        self.llm = ChatOpenAI(
            model=self.model,
            temperature=self.temperature,
            api_key=self.api_key,
            model_kwargs={"response_format": {"type": "json_object"}},
        )

        # Build system prompt once; reused for every LLM call
        self._system_prompt = self._build_system_prompt()

    # ------------------------------------------------------------------
    # Prompt construction
    # ------------------------------------------------------------------

    def _build_system_prompt(self) -> str:
        topic_lines = "\n".join(
            f"- {topic['id']}: {topic['description']}"
            for topic in self.topics_config["topics"]
        )

        usage_notes = "\n".join(
            f"  • {note}"
            for note in self.topics_config.get("_meta", {}).get("usage_notes", [])
        )

        return (
            "You are classifying customer support conversations for Musora, a music education platform.\n\n"
            "Assign one or more topic tags to the customer's conversation based on what they are "
            "contacting support about.\n\n"
            "Return JSON only:\n"
            '{\n'
            '  "topics": [<topic_ids>],\n'
            '  "is_refund_request": true|false,\n'
            '  "is_cancellation": true|false,\n'
            '  "is_membership": true|false,\n'
            '  "confidence": "high"|"medium"|"low",\n'
            '  "topic_notes": "<1-2 sentences explaining the classification>",\n'
            '  "summary": "<2-3 sentence neutral summary of the conversation>"\n'
            '}\n\n'
            f"AVAILABLE TOPICS (use the id values exactly):\n{topic_lines}\n\n"
            f"RULES:\n{usage_notes}\n\n"
            "BILLING FLAGS (always extract, regardless of topic):\n"
            "  • is_refund_request: true ONLY when the customer explicitly asks for money back\n"
            "  • is_cancellation: true ONLY when the customer explicitly wants to cancel their subscription\n"
            "  • is_membership: true ONLY when the customer wants to join or rejoin and purchase a membership\n\n"
            "SUMMARY GUIDELINES:\n"
            "  • Write 2-3 sentences maximum\n"
            "  • Be factual and neutral; do not repeat sentiment or topic labels\n"
            "  • Capture: what the customer contacted support about, any key context or history they provided, "
            "and the core request or outcome they are seeking\n"
            "  • Write in third person (e.g. 'The customer reports...')\n\n"
            "IMPORTANT:\n"
            "  - Focus on the customer's messages; ignore any team response context\n"
            "  - Use exact topic id strings from the list above\n"
            "  - topic_notes: briefly explain why you chose these topics"
        )

    def _build_user_prompt(self, conversation_text: str) -> str:
        return f"Customer conversation:\n\n{conversation_text}"

    # ------------------------------------------------------------------
    # Output validation
    # ------------------------------------------------------------------

    def _validate_topics(self, raw_topics: Any) -> Optional[List[str]]:
        """
        Validate and filter the topics list from LLM output.
        Returns None if no valid topics remain (hard fail).
        """
        if not raw_topics:
            return None
        if isinstance(raw_topics, str):
            raw_topics = [t.strip() for t in raw_topics.split(",")]
        if not isinstance(raw_topics, list):
            return None
        valid = [t for t in raw_topics if t in self._valid_topics]
        return valid if valid else None

    def _validate_result(self, raw: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate LLM output.
        - No valid topics → hard fail
        - Invalid confidence → corrected to "medium"
        - Boolean flags: default to False if missing or non-boolean
        """
        topics = self._validate_topics(raw.get("topics"))
        if not topics:
            return {
                "success": False,
                "error": (
                    f"No valid topics in response: {raw.get('topics')}. "
                    f"Expected values from: {sorted(self._valid_topics)}"
                ),
            }

        confidence = raw.get("confidence", "medium")
        if confidence not in {"high", "medium", "low"}:
            confidence = "medium"

        is_refund = raw.get("is_refund_request", False)
        is_cancel = raw.get("is_cancellation", False)
        is_membership = raw.get("is_membership", False)

        # Coerce to bool in case LLM returns strings
        if not isinstance(is_refund, bool):
            is_refund = str(is_refund).lower() in ("true", "1", "yes")
        if not isinstance(is_cancel, bool):
            is_cancel = str(is_cancel).lower() in ("true", "1", "yes")
        if not isinstance(is_membership, bool):
            is_membership = str(is_membership).lower() in ("true", "1", "yes")

        return {
            "success": True,
            "topics": topics,
            "is_refund_request": is_refund,
            "is_cancellation": is_cancel,
            "is_membership": is_membership,
            "confidence": confidence,
            "topic_notes": str(raw.get("topic_notes", "")).strip(),
            "summary": str(raw.get("summary", "")).strip(),
        }

    # ------------------------------------------------------------------
    # Core extraction
    # ------------------------------------------------------------------

    def extract(self, conversation_text: str) -> Dict[str, Any]:
        """
        Call the LLM to assign topics and billing flags.

        Args:
            conversation_text: Pre-formatted, truncated conversation text

        Returns:
            Success dict with topic fields, or failure dict with error key.
        """
        user_prompt = self._build_user_prompt(conversation_text)

        try:
            messages = [
                SystemMessage(content=self._system_prompt),
                HumanMessage(content=user_prompt),
            ]
            response = self.llm.invoke(messages)
            raw = json.loads(response.content)

            validated = self._validate_result(raw)
            if not validated["success"]:
                self.log_processing(f"Validation failed: {validated['error']}", "warning")
                return validated

            return {
                "success": True,
                "topics": ", ".join(validated["topics"]),  # comma-separated for DB storage
                "is_refund_request": validated["is_refund_request"],
                "is_cancellation": validated["is_cancellation"],
                "is_membership": validated["is_membership"],
                "topic_confidence": validated["confidence"],
                "topic_notes": validated["topic_notes"],
                "summary": validated["summary"],
            }

        except json.JSONDecodeError as e:
            self.log_processing(f"JSON decode error: {e}", "warning")
            return {"success": False, "error": f"JSON parse error: {e}"}

        except Exception as e:
            self.log_processing(f"Topic extraction failed: {e}", "error")
            return {"success": False, "error": str(e)}

    # ------------------------------------------------------------------
    # Agent interface
    # ------------------------------------------------------------------

    def validate_input(self, input_data: Dict[str, Any]) -> bool:
        return "conversation_text" in input_data and bool(input_data["conversation_text"])

    def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            input_data: Must contain 'conversation_text'.

        Returns:
            Dict with topic fields merged on top of input_data.
        """
        try:
            if not self.validate_input(input_data):
                return {
                    "success": False,
                    "error": "Invalid input: 'conversation_text' is required and must be non-empty",
                }

            self.log_processing("Extracting topics from conversation", "debug")
            result = self.extract(input_data["conversation_text"])

            output = {
                "success": result.get("success", False),
                "topics": result.get("topics"),
                "is_refund_request": result.get("is_refund_request", False),
                "is_cancellation": result.get("is_cancellation", False),
                "is_membership": result.get("is_membership", False),
                "topic_confidence": result.get("topic_confidence"),
                "topic_notes": result.get("topic_notes", ""),
                "summary": result.get("summary", ""),
            }
            if "error" in result:
                output["topic_error"] = result["error"]

            # Preserve all original input fields
            for key, value in input_data.items():
                if key not in output:
                    output[key] = value

            return output

        except Exception as e:
            return self.handle_error(e, "topic_extraction")
process_helpscout/config_files/processing_config.json ADDED
@@ -0,0 +1,125 @@
{
  "_meta": {
    "description": "Configuration for the HelpScout conversation processing pipeline. Controls agent models, processing limits, and output destination.",
    "version": "1.0.0"
  },

  "agents": {
    "sentiment_analysis": {
      "model": "gpt-5-nano",
      "temperature": 0.2,
      "max_retries": 3
    },
    "topic_extraction": {
      "model": "gpt-5-nano",
      "temperature": 0.2,
      "max_retries": 3
    }
  },

  "sentiment_polarity": {
    "categories": [
      {
        "value": "very_positive",
        "label": "Very Positive",
        "description": "Extremely enthusiastic, excited, deeply grateful, or highly satisfied"
      },
      {
        "value": "positive",
        "label": "Positive",
        "description": "Generally positive, appreciative, supportive, or encouraging"
      },
      {
        "value": "neutral",
        "label": "Neutral",
        "description": "Factual, informational, balanced, or lacking clear emotional tone"
      },
      {
        "value": "negative",
        "label": "Negative",
        "description": "Disappointed, critical, frustrated, or mildly dissatisfied"
      },
      {
        "value": "very_negative",
        "label": "Very Negative",
        "description": "Highly critical, angry, abusive, or extremely dissatisfied"
      }
    ]
  },

  "emotions": {
    "soft_fail": true,
    "multi_label": true,
    "categories": [
      {
        "value": "joy",
        "label": "Joy",
        "description": "Happiness, delight, or elation"
      },
      {
        "value": "excitement",
        "label": "Excitement",
        "description": "Enthusiasm, energy, or eagerness"
      },
      {
        "value": "gratitude",
        "label": "Gratitude",
        "description": "Thankfulness or appreciation"
      },
      {
        "value": "admiration",
        "label": "Admiration",
        "description": "Deep respect or positive regard for the platform, team or products"
      },
      {
        "value": "curiosity",
        "label": "Curiosity",
        "description": "Interest, eagerness to learn, or wondering about something"
      },
      {
        "value": "frustration",
        "label": "Frustration",
        "description": "Irritation, annoyance, or blocked goals"
      },
      {
        "value": "disappointment",
        "label": "Disappointment",
        "description": "Unmet expectations or a let-down feeling"
      },
      {
        "value": "sadness",
        "label": "Sadness",
        "description": "Sorrow, emotional heaviness, or distress"
      },
      {
        "value": "anger",
        "label": "Anger",
        "description": "Strong outrage or hostility"
      },
      {
        "value": "humor",
        "label": "Humor",
        "description": "Amusement, playfulness, or levity in tone"
      },
      {
        "value": "neutral",
        "label": "Neutral",
        "description": "No discernible emotion; use only when no other emotion applies"
      }
    ]
  },

  "processing": {
    "max_conversation_chars": 5000,
    "min_batch_size": 10,
    "max_batch_size": 50
  },

  "output": {
    "database": "SOCIAL_MEDIA_DB",
    "schema": "ML_FEATURES",
    "table": "HELPSCOUT_CONVERSATION_FEATURES"
  },

  "sql_query_file": "queries/helpscout_conversations.sql"
}
process_helpscout/config_files/topics.json ADDED
@@ -0,0 +1,90 @@
1
+ {
2
+ "_meta": {
3
+ "version": "1.0.0",
4
+ "last_updated": "2025-04-09",
5
+ "description": "Musora HelpScout auto-tagging taxonomy. Used as the source configuration for the LLM-based tagging pipeline. Topics are mutually exclusive at the top level; a conversation may receive multiple topic tags. Sub-categories are listed for reference and future use in a separate config. Special boolean flags are defined inline for high-signal billing events.",
6
+ "usage_notes": [
7
+ "Assign one or more topic tags per conversation.",
8
+ "Boolean flags under billing_and_subscription should be extracted independently even when the parent topic is detected.",
9
+ "Use the 'uncategorized' topic when no other topic clearly applies β€” never as a fallback for uncertain cases.",
10
+ "feedback_and_suggestions should be used as a supplementary tag alongside a primary topic when applicable."
11
+ ]
12
+ },
13
+
14
+ "topics": [
15
+
16
+ {
17
+ "id": "video_and_playback",
18
+ "label": "Video & Playback",
19
+ "description": "The student is experiencing a problem with audio or video content during viewing. The issue is with how media plays, not with the surrounding app or UI. "
20
+ },
21
+
22
+ {
23
+ "id": "app_and_technical_errors",
24
+ "label": "App & Technical Errors",
25
+ "description": "A software bug, crash, or system failure that is NOT limited to video playback. The app, website, technology related, or a specific feature is broken, unresponsive, or showing an error message. Use this when the problem is with the platform itself rather than the content being watched."
26
+ },
27
+
28
+ {
29
+ "id": "navigation_and_ux",
30
+ "label": "Navigation & UX",
31
+ "description": "The student is confused by the interface or cannot find something, but is not technically blocked from accessing it. The issue is about discoverability, layout clarity, or unintuitive design rather than a bug or access restriction. Often triggered by redesigns or renamed features."
32
+ },
33
+ {
34
+ "id": "account_and_access",
35
+ "label": "Account & Access",
36
+ "description": "The student cannot log in, is locked out, or cannot access content they are entitled to. Also covers profile and settings issues. Distinct from billing: use this when the problem is authentication or permissions, even if the underlying cause might be a billing state."
37
+ },
38
+
39
+ {
40
+ "id": "billing_and_subscription",
41
+ "label": "Billing & Subscription",
42
+ "description": "Any conversation involving money, charges, plan status, or membership. This includes unexpected charges, plan changes, promotions, and invoice requests. ",
43
+ "flags": {
44
+ "is_refund_request": {
45
+ "type": "boolean",
46
+ "description": "True when the student is explicitly asking for their money back, regardless of reason."
47
+ },
48
+ "is_cancellation": {
49
+ "type": "boolean",
50
+ "description": "True when the student wants to cancel their subscription or membership, even if they haven't asked for a refund."
51
+ },
52
+ "is_membership": {
53
+ "type": "boolean",
54
+ "description": "True when the student wants to join/rejoin and purchase membership."
55
+ }
56
+ }
57
+ },
58
+
59
+ {
60
+ "id": "learning_and_progress",
61
+ "label": "Learning & Progress",
62
+ "description": "Issues with how the student's learning journey, including asking for help or recommendations, is tracked or structured over time. Covers broken progress tracking, practice session logging, playlist management, curriculum navigation, and access to legacy or assigned content. The problem is with the learning system, not the content itself."
63
+ },
64
+
65
+ {
66
+ "id": "content_and_resources",
67
+ "label": "Content & Resources",
68
+ "description": "Problems with the lesson content itself or supplementary learning materials β€” not the video player. Covers missing PDFs, sheet music, backing tracks, incorrect lesson information, requests for new content, and missing assignment or review links."
69
+ },
70
+
71
+ {
72
+ "id": "community_and_notifications",
73
+ "label": "Community & Notifications",
74
+ "description": "Issues involving forums, comments, student profiles, social features, or the delivery of notifications. Use this when the problem is about communication and social interaction within the platform, not content access or playback."
75
+ },
76
+
77
+ {
78
+ "id": "feedback_and_suggestions",
79
+ "label": "Feedback & Suggestions",
80
+ "description": "The student is sharing an opinion, making a feature request, or expressing general satisfaction or dissatisfaction β€” not reporting a specific failure. This should typically be applied as a supplementary tag alongside a primary topic when a complaint conversation also carries strong sentiment or a request for new functionality."
81
+ },
82
+
83
+ {
84
+ "id": "uncategorized",
85
+ "label": "Uncategorized",
86
+ "description": "Assign ONLY when no other topic clearly applies after careful consideration. Do not use as a fallback for low-confidence cases where a topic still partially fits β€” prefer the closest matching topic. The primary purpose of this tag is to surface new conversation patterns that may warrant expanding the taxonomy."
87
+ }
88
+
89
+ ]
90
+ }
process_helpscout/data_fetcher.py ADDED
@@ -0,0 +1,77 @@
1
+ """
2
+ Data Fetcher for the HelpScout processing pipeline.
3
+
4
+ Responsible for:
5
+ 1. Fetching raw customer threads from Snowflake (reusing fetch_and_export logic)
6
+ 2. Cleaning HTML and aggregating to conversation level
7
+ 3. Checking which conversations have already been processed (for deduplication)
8
+
9
+ Reuses fetch_raw(), process_threads(), and aggregate_conversations() from
10
+ fetch_and_export.py so the cleaning and aggregation logic stays in one place.
11
+ """
12
+
13
+ import logging
14
+ from pathlib import Path
15
+ from typing import Set
16
+
17
+ import pandas as pd
18
+
19
+ from snowflake_conn import SnowflakeConn
20
+ from fetch_and_export import fetch_raw, process_threads, aggregate_conversations
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ def fetch_conversations(conn: SnowflakeConn) -> pd.DataFrame:
26
+ """
27
+ Fetch, clean, and aggregate all customer conversations from HelpScout.
28
+
29
+ Returns one row per conversation_id with the following key columns:
30
+ - conversation_id
31
+ - combined_text (all customer messages joined with ' | ')
32
+ - customer_email, customer_first, customer_last, customer_hs_id
33
+ - thread_count, first_message_at, last_message_at, duration_hours
34
+ - status, state, source_type, source_via
35
+
36
+ Returns an empty DataFrame if no data is available.
37
+ """
38
+ raw_df = fetch_raw(conn)
39
+ if raw_df.empty:
40
+ logger.warning("No raw threads returned from Snowflake.")
41
+ return pd.DataFrame()
42
+
43
+ threads_df = process_threads(raw_df)
44
+ if threads_df.empty:
45
+ logger.warning("All threads were empty after HTML cleaning.")
46
+ return pd.DataFrame()
47
+
48
+ conversations_df = aggregate_conversations(threads_df)
49
+ logger.info(f"Ready to process: {len(conversations_df):,} conversations")
50
+ return conversations_df
51
+
52
+
53
+ def fetch_processed_ids(
54
+ conn: SnowflakeConn,
55
+ database: str,
56
+ schema: str,
57
+ table: str,
58
+ ) -> Set[str]:
59
+ """
60
+ Return the set of conversation_ids already stored in the output table.
61
+
62
+ Returns an empty set if the table does not exist yet (first run) or if
63
+ the query fails for any other reason β€” the pipeline will then process
64
+ all conversations.
65
+ """
66
+ try:
67
+ query = f"SELECT CONVERSATION_ID FROM {database}.{schema}.{table}"
68
+ df = conn.run_query(query, description="fetch_processed_ids")
69
+ ids = set(df["conversation_id"].dropna().astype(str).tolist())
70
+ logger.info(f"Found {len(ids):,} already-processed conversations in {table}")
71
+ return ids
72
+ except Exception as exc:
73
+ logger.warning(
74
+ f"Could not fetch processed IDs from {database}.{schema}.{table} "
75
+ f"(table may not exist yet): {exc}"
76
+ )
77
+ return set()
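+
+
+ # Usage sketch (illustrative; the output-table names come from processing_config.json):
+ # conn = SnowflakeConn()
+ # conversations = fetch_conversations(conn)
+ # done = fetch_processed_ids(conn, "SOCIAL_MEDIA_DB", "ML_FEATURES",
+ # "HELPSCOUT_CONVERSATION_FEATURES")
+ # new = conversations[~conversations["conversation_id"].astype(str).isin(done)]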
process_helpscout/fetch_and_export.py ADDED
@@ -0,0 +1,183 @@
1
+ """
2
+ HelpScout Data Fetcher & Exporter
3
+ ==================================
4
+ Fetches raw conversation data from Snowflake, cleans HTML bodies,
5
+ computes derived columns, and exports two CSV files:
6
+
7
+ output/helpscout_threads.csv β€” one row per message thread
8
+ output/helpscout_conversations.csv β€” one row per conversation (aggregated)
9
+
10
+ Run:
11
+ python fetch_and_export.py
12
+ """
13
+
14
+ import logging
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ import pandas as pd
19
+ import numpy as np
20
+
21
+ # Local modules
22
+ from snowflake_conn import SnowflakeConn
23
+ from html_cleaner import clean_html_series
24
+
25
+ logging.basicConfig(
26
+ level=logging.INFO,
27
+ format="%(asctime)s [%(levelname)s] %(message)s",
28
+ handlers=[logging.StreamHandler(sys.stdout)],
29
+ )
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Paths
34
+ # ---------------------------------------------------------------------------
35
+ BASE_DIR = Path(__file__).resolve().parent
36
+ SQL_FILE = BASE_DIR / "queries" / "helpscout_conversations.sql"
37
+ OUTPUT_DIR = BASE_DIR / "output"
38
+ OUTPUT_DIR.mkdir(exist_ok=True)
39
+
40
+ THREADS_CSV = OUTPUT_DIR / "helpscout_threads.csv"
41
+ CONVERSATIONS_CSV = OUTPUT_DIR / "helpscout_conversations.csv"
42
+
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Fetch
46
+ # ---------------------------------------------------------------------------
47
+ def fetch_raw(conn: SnowflakeConn) -> pd.DataFrame:
48
+ logger.info("Fetching HelpScout threads from Snowflake…")
49
+ df = conn.run_query_from_file(SQL_FILE, description="helpscout_conversations")
50
+ logger.info(f"Fetched {len(df):,} raw thread rows.")
51
+ return df
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Clean & enrich threads
56
+ # ---------------------------------------------------------------------------
57
+ def process_threads(df: pd.DataFrame) -> pd.DataFrame:
58
+ logger.info("Cleaning HTML bodies…")
59
+ df = df.copy()
60
+
61
+ # Parse timestamps
62
+ for col in ("created_at", "opened_at"):
63
+ if col in df.columns:
64
+ df[col] = pd.to_datetime(df[col], utc=True, errors="coerce")
65
+
66
+ # Clean HTML β†’ plain text
67
+ df["body_clean"] = clean_html_series(df["body"])
68
+
69
+ # Drop rows where cleaning produced empty text
70
+ before = len(df)
71
+ df = df[df["body_clean"].str.strip().str.len() > 0].copy()
72
+ logger.info(f"Dropped {before - len(df):,} rows with empty body after cleaning.")
73
+
74
+ # Derived columns
75
+ df["word_count"] = df["body_clean"].str.split().str.len().fillna(0).astype(int)
76
+ df["char_count"] = df["body_clean"].str.len().fillna(0).astype(int)
77
+
78
+ # Date helpers
79
+ df["date"] = df["created_at"].dt.date
80
+ df["week"] = df["created_at"].dt.to_period("W").dt.start_time
81
+ df["month"] = df["created_at"].dt.to_period("M").dt.start_time
82
+ df["hour_of_day"] = df["created_at"].dt.hour
83
+ df["day_of_week"] = df["created_at"].dt.day_name()
84
+
85
+ # Normalise free-text columns
86
+ for col in ("source_type", "source_via", "status", "state", "type"):
87
+ if col in df.columns:
88
+ df[col] = df[col].fillna("unknown").str.lower().str.strip()
89
+
90
+ # Identify the display name for the sender
91
+ df["sender_name"] = (
92
+ (df.get("created_by_first", "").fillna("") + " " +
93
+ df.get("created_by_last", "").fillna("")).str.strip()
94
+ )
95
+ df["sender_name"] = df["sender_name"].replace("", "Unknown")
96
+
97
+ logger.info(f"Processed threads: {len(df):,} rows.")
98
+ return df
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # Aggregate to conversation level
103
+ # ---------------------------------------------------------------------------
104
+ def aggregate_conversations(threads: pd.DataFrame) -> pd.DataFrame:
105
+ logger.info("Aggregating to conversation level…")
106
+
107
+ agg = (
108
+ threads.groupby("conversation_id")
109
+ .agg(
110
+ first_message_at=("created_at", "min"),
111
+ last_message_at=("created_at", "max"),
112
+ thread_count=("thread_id", "count"),
113
+ customer_email=("customer_email", "first"),
114
+ customer_first=("customer_first", "first"),
115
+ customer_last=("customer_last", "first"),
116
+ customer_hs_id=("customer_hs_id", "first"),
117
+ source_type=("source_type", "first"),
118
+ source_via=("source_via", "first"),
119
+ status=("status", "last"), # last known status
120
+ state=("state", "last"),
121
+ total_word_count=("word_count", "sum"),
122
+ avg_word_count=("word_count", "mean"),
123
+ combined_text=("body_clean", lambda x: " | ".join(x.dropna())),
124
+ )
125
+ .reset_index()
126
+ )
127
+
128
+ # Duration in hours from first to last message
129
+ agg["duration_hours"] = (
130
+ (agg["last_message_at"] - agg["first_message_at"])
131
+ .dt.total_seconds()
132
+ .div(3600)
133
+ .round(2)
134
+ )
135
+
136
+ agg["date"] = agg["first_message_at"].dt.date
137
+ agg["week"] = agg["first_message_at"].dt.to_period("W").dt.start_time
138
+ agg["month"] = agg["first_message_at"].dt.to_period("M").dt.start_time
139
+
140
+ logger.info(f"Aggregated {len(agg):,} unique conversations.")
141
+ return agg
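+ # Worked example (illustrative): three threads in one conversation ("Hi",
+ # "Still broken", "Thanks!") collapse to a single row with thread_count=3
+ # and combined_text "Hi | Still broken | Thanks!".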
142
+
143
+
144
+ # ---------------------------------------------------------------------------
145
+ # Export
146
+ # ---------------------------------------------------------------------------
147
+ def export(threads: pd.DataFrame, conversations: pd.DataFrame) -> None:
148
+ # Drop raw HTML before saving (keeps CSV manageable)
149
+ threads_export = threads.drop(columns=["body"], errors="ignore")
150
+
151
+ threads_export.to_csv(THREADS_CSV, index=False, encoding="utf-8-sig")
152
+ logger.info(f"Exported threads β†’ {THREADS_CSV}")
153
+
154
+ conversations.to_csv(CONVERSATIONS_CSV, index=False, encoding="utf-8-sig")
155
+ logger.info(f"Exported conversations β†’ {CONVERSATIONS_CSV}")
156
+
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # Main
160
+ # ---------------------------------------------------------------------------
161
+ def main():
162
+ conn = SnowflakeConn()
163
+ try:
164
+ raw_df = fetch_raw(conn)
165
+ if raw_df.empty:
166
+ logger.warning("No data returned. Check date range and table access.")
167
+ return
168
+
169
+ threads_df = process_threads(raw_df)
170
+ conversations_df = aggregate_conversations(threads_df)
171
+ export(threads_df, conversations_df)
172
+
173
+ logger.info("Done.")
174
+ logger.info(f" Threads: {len(threads_df):,}")
175
+ logger.info(f" Conversations: {len(conversations_df):,}")
176
+ logger.info(f" Unique customers: {conversations_df['customer_email'].nunique():,}")
177
+
178
+ finally:
179
+ conn.close()
180
+
181
+
182
+ if __name__ == "__main__":
183
+ main()
process_helpscout/html_cleaner.py ADDED
@@ -0,0 +1,169 @@
1
+ """
2
+ HTML Cleaner for HelpScout message bodies.
3
+
4
+ Strategy:
5
+ 1. Remove blockquotes (quoted previous email threads).
6
+ 2. Remove Gmail/Outlook quoted-reply wrappers (ex-gmail_extra, gmail_quote, etc.).
7
+ 3. Remove HelpScout / marketing email boilerplate sections.
8
+ 4. Extract plain text from the remaining DOM.
9
+ 5. Strip invisible Unicode spacers (\\u200c, \\u00ad, etc.) and collapse whitespace.
10
+ """
11
+
12
+ import re
13
+ import unicodedata
14
+ from bs4 import BeautifulSoup, Comment
15
+
16
+ # CSS class / id fragments that indicate quoted / boilerplate content
17
+ _QUOTED_CLASS_PATTERNS = [
18
+ "gmail_extra",
19
+ "gmail_quote",
20
+ "ex-gmail",
21
+ "yahoo_quoted",
22
+ "moz-cite-prefix",
23
+ "OutlookMessageHeader",
24
+ "protonmail_quote",
25
+ "apple-mail-previous",
26
+ ]
27
+
28
+ # Markers that indicate the start of a quoted section (text-based heuristics)
29
+ _QUOTE_TEXT_MARKERS = [
30
+ r"On .{5,80} wrote:", # "On Mar 2, 2026 ... wrote:"
31
+ r"From:\s",
32
+ r"Sent:\s",
33
+ r"To:\s.*\nCc:",
34
+ r">{1,}", # > quoted lines (plain text fallback)
35
+ ]
36
+
37
+ _COMPILED_QUOTE_MARKERS = [re.compile(p, re.IGNORECASE) for p in _QUOTE_TEXT_MARKERS]
38
+
39
+ # Tags whose entire sub-tree we drop unconditionally
40
+ _DROP_TAGS = {"script", "style", "head", "meta", "link", "img", "table"}
41
+
42
+ # Invisible / spacer Unicode characters
43
+ _INVISIBLE_CHARS = re.compile(
44
+ r"[\u00ad\u200b\u200c\u200d\u2060\ufeff\u00a0\u034f]"
45
+ )
46
+
47
+ # Collapse multiple blank lines to one
48
+ _MULTI_BLANK = re.compile(r"\n{3,}")
49
+
50
+
51
+ def _remove_quoted_sections(soup: BeautifulSoup) -> None:
52
+ """Remove DOM nodes that represent quoted/threaded email history."""
53
+
54
+ # 1. All <blockquote> tags
55
+ for tag in soup.find_all("blockquote"):
56
+ tag.decompose()
57
+
58
+ # 2. Divs / spans with known quoted-reply class names
59
+ # Collect candidates first; decompose() invalidates attrs on child nodes
60
+ # that may still appear later in the iteration, so we guard with a check.
61
+ candidates = soup.find_all(True)
62
+ for tag in candidates:
63
+ if tag.attrs is None:
64
+ # Already decomposed (child of a previously decomposed parent)
65
+ continue
66
+ css_classes = " ".join(tag.get("class") or []).lower()
67
+ tag_id = (tag.get("id") or "").lower()
68
+ combined = css_classes + " " + tag_id
69
+ if any(pattern in combined for pattern in _QUOTED_CLASS_PATTERNS):
70
+ tag.decompose()
71
+
72
+ # 3. HTML comments (<!-- --> contain no user text)
73
+ for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
74
+ comment.extract()
75
+
76
+
77
+ def _remove_boilerplate(soup: BeautifulSoup) -> None:
78
+ """Remove marketing / footer / unsubscribe sections."""
79
+
80
+ # Drop heavy layout tags entirely (tables, images carry no message text)
81
+ for tag in soup.find_all(_DROP_TAGS):
82
+ tag.decompose()
83
+
84
+ # Drop any element whose text is purely an unsubscribe / footer line
85
+ footer_keywords = ["unsubscribe", "musora media", "31265 wheel", "customeriomail"]
86
+ for tag in soup.find_all(True):
87
+ if tag.attrs is None:
88
+ continue
89
+ text = tag.get_text(separator=" ", strip=True).lower()
90
+ if any(kw in text for kw in footer_keywords) and len(text) < 300:
91
+ tag.decompose()
92
+
93
+
94
+ def _extract_text(soup: BeautifulSoup) -> str:
95
+ """Get plain text from the cleaned soup, preserving line breaks."""
96
+ lines = []
97
+ for element in soup.recursiveChildGenerator():
98
+ if isinstance(element, str):
99
+ stripped = element.strip()
100
+ if stripped:
101
+ lines.append(stripped)
102
+ elif hasattr(element, "name") and element.name in {"br", "p", "div", "li", "h1", "h2", "h3"}:
103
+ lines.append("\n")
104
+ return " ".join(lines)
105
+
106
+
107
+ def _clean_text(raw: str) -> str:
108
+ """Final text cleanup: invisible chars, excessive whitespace, quote markers."""
109
+
110
+ # Remove invisible spacers
111
+ text = _INVISIBLE_CHARS.sub("", raw)
112
+
113
+ # Normalize unicode (e.g. soft-hyphen variants)
114
+ text = unicodedata.normalize("NFKC", text)
115
+
116
+ # Collapse whitespace sequences (keep single newlines intentional)
117
+ text = re.sub(r"[ \t]+", " ", text)
118
+ text = re.sub(r" \n", "\n", text)
119
+ text = re.sub(r"\n ", "\n", text)
120
+ text = _MULTI_BLANK.sub("\n\n", text)
121
+
122
+ # Remove lines that are purely quote markers ("> some text")
123
+ lines = text.split("\n")
124
+ lines = [ln for ln in lines if not ln.strip().startswith(">")]
125
+ text = "\n".join(lines)
126
+
127
+ # Cut off at first "On <date> wrote:" marker (inline quoted replies)
128
+ for pattern in _COMPILED_QUOTE_MARKERS:
129
+ match = pattern.search(text)
130
+ if match and match.start() > 20: # don't cut if marker is at very start
131
+ text = text[: match.start()].strip()
132
+ break
133
+
134
+ return text.strip()
135
+
136
+
137
+ def clean_html(html_body: str) -> str:
138
+ """
139
+ Full pipeline: HTML β†’ clean plain text containing only the customer's message.
140
+
141
+ Args:
142
+ html_body: Raw HTML string from CONVERSATION_THREADS.BODY
143
+
144
+ Returns:
145
+ Clean UTF-8 plain text string.
146
+ """
147
+ if not html_body or not html_body.strip():
148
+ return ""
149
+
150
+ soup = BeautifulSoup(html_body, "html.parser")
151
+
152
+ _remove_quoted_sections(soup)
153
+ _remove_boilerplate(soup)
154
+
155
+ raw_text = _extract_text(soup)
156
+ return _clean_text(raw_text)
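+ # Illustrative example (hypothetical input, not real customer data):
+ # clean_html('<div>Thanks!<blockquote>On Jan 2, 2025 support wrote: ...</blockquote></div>')
+ # returns 'Thanks!' (the quoted reply history is dropped, keeping only the new message)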
157
+
158
+
159
+ def clean_html_series(series):
160
+ """
161
+ Vectorized version for a pandas Series.
162
+
163
+ Args:
164
+ series: pd.Series of HTML strings
165
+
166
+ Returns:
167
+ pd.Series of cleaned plain text strings
168
+ """
169
+ return series.fillna("").apply(clean_html)
process_helpscout/main.py ADDED
@@ -0,0 +1,423 @@
1
+ """
2
+ Main execution script for the HelpScout conversation processing pipeline.
3
+
4
+ Steps:
5
+ 1. Fetch all customer conversations from Snowflake (HTML cleaned + aggregated)
6
+ 2. Filter out conversations already in the output table
7
+ 3. Run sentiment analysis + topic extraction in parallel batches
8
+ 4. Append results to SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
9
+
10
+ Run:
11
+ python main.py # process all new conversations, parallel
12
+ python main.py --limit 100 # process at most 100 conversations
13
+ python main.py --sequential # single-process mode (useful for debugging)
14
+ python main.py --config <path> # use a custom config file
15
+ """
16
+
17
+ import json
18
+ import logging
19
+ import os
20
+ import sys
21
+ import argparse
22
+ import traceback
23
+ from datetime import datetime
24
+ from multiprocessing import Pool, cpu_count
25
+ from pathlib import Path
26
+ from typing import Any, Dict, List
27
+
28
+ import pandas as pd
29
+ from dotenv import load_dotenv
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Path setup β€” allows imports from the process_helpscout package directory
33
+ # ---------------------------------------------------------------------------
34
+ SCRIPT_DIR = Path(__file__).resolve().parent
35
+ ROOT_DIR = SCRIPT_DIR.parent
36
+
37
+ load_dotenv(ROOT_DIR / ".env")
38
+ sys.path.insert(0, str(SCRIPT_DIR))
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Logging β€” file + console; log directory is created on first run
42
+ # ---------------------------------------------------------------------------
43
+ _logs_dir = SCRIPT_DIR / "logs"
44
+ _logs_dir.mkdir(exist_ok=True)
45
+
46
+ logging.basicConfig(
47
+ level=logging.INFO,
48
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
49
+ handlers=[
50
+ logging.FileHandler(
51
+ _logs_dir / f"helpscout_processing_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
52
+ ),
53
+ logging.StreamHandler(),
54
+ ],
55
+ )
56
+ logger = logging.getLogger(__name__)
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Local imports (after sys.path is set)
60
+ # ---------------------------------------------------------------------------
61
+ from snowflake_conn import SnowflakeConn
62
+ from data_fetcher import fetch_conversations, fetch_processed_ids
63
+ from workflow.conversation_processor import ConversationProcessingWorkflow
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Batch size helper
68
+ # ---------------------------------------------------------------------------
69
+
70
+ def calculate_optimal_batch_size(
71
+ total: int,
72
+ num_workers: int,
73
+ min_batch: int = 10,
74
+ max_batch: int = 50,
75
+ ) -> int:
76
+ """
77
+ Distribute work evenly across workers within the configured min/max bounds.
78
+
79
+ Args:
80
+ total: Total number of conversations to process
81
+ num_workers: Number of parallel worker processes
82
+ min_batch: Minimum conversations per batch
83
+ max_batch: Maximum conversations per batch
84
+
85
+ Returns:
86
+ Optimal batch size
87
+ """
88
+ if total <= min_batch:
89
+ return total
90
+ batch_size = total // num_workers
91
+ return max(min_batch, min(max_batch, batch_size))
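+ # e.g. total=200, num_workers=5: 200 // 5 = 40, within [10, 50], so 5 batches of 40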
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Batch worker β€” runs in a separate process (must be module-level for pickle)
96
+ # ---------------------------------------------------------------------------
97
+
98
+ def process_batch_worker(batch_data: tuple) -> dict:
99
+ """
100
+ Worker function executed in a separate process for one batch of conversations.
101
+
102
+ Each worker creates its own Snowflake connection and workflow instance so
103
+ resources are not shared across processes.
104
+
105
+ Args:
106
+ batch_data: (batch_num, conversations, config, api_key)
107
+
108
+ Returns:
109
+ Statistics dict for this batch.
110
+ """
111
+ batch_num, batch_conversations, config, api_key = batch_data
112
+ worker_logger = logging.getLogger(f"Worker-{batch_num}")
113
+
114
+ try:
115
+ worker_logger.info(f"Batch {batch_num}: Processing {len(batch_conversations)} conversations")
116
+
117
+ # Worker-local Snowflake connection and workflow
118
+ conn = SnowflakeConn()
119
+ workflow = ConversationProcessingWorkflow(config, api_key)
120
+
121
+ # Run the workflow
122
+ results = workflow.process_batch(batch_conversations)
123
+ results_df = pd.DataFrame(results)
124
+
125
+ # Separate successful results
126
+ initial_count = len(results_df)
127
+ df_ok = results_df[results_df["success"] == True].copy()
128
+ failed_count = initial_count - len(df_ok)
129
+
130
+ worker_logger.info(
131
+ f"Batch {batch_num}: {len(df_ok)} successful, {failed_count} failed"
132
+ )
133
+
134
+ # ----------------------------------------------------------------
135
+ # Build output DataFrame with Snowflake column names
136
+ # ----------------------------------------------------------------
137
+ column_map = {
138
+ "conversation_id": "CONVERSATION_ID",
139
+ "customer_email": "CUSTOMER_EMAIL",
140
+ "customer_first": "CUSTOMER_FIRST",
141
+ "customer_last": "CUSTOMER_LAST",
142
+ "customer_hs_id": "CUSTOMER_HS_ID",
143
+ "thread_count": "THREAD_COUNT",
144
+ "first_message_at": "FIRST_MESSAGE_AT",
145
+ "last_message_at": "LAST_MESSAGE_AT",
146
+ "duration_hours": "DURATION_HOURS",
147
+ "status": "STATUS",
148
+ "state": "STATE",
149
+ "source_type": "SOURCE_TYPE",
150
+ "source_via": "SOURCE_VIA",
151
+ "combined_text": "COMBINED_TEXT",
152
+ "conversation_text": "CONVERSATION_TEXT_USED",
153
+ "sentiment_polarity": "SENTIMENT_POLARITY",
154
+ "emotions": "EMOTIONS",
155
+ "sentiment_confidence": "SENTIMENT_CONFIDENCE",
156
+ "sentiment_notes": "SENTIMENT_NOTES",
157
+ "topics": "TOPICS",
158
+ "is_refund_request": "IS_REFUND_REQUEST",
159
+ "is_cancellation": "IS_CANCELLATION",
160
+ "is_membership": "IS_MEMBERSHIP",
161
+ "topic_confidence": "TOPIC_CONFIDENCE",
162
+ "topic_notes": "TOPIC_NOTES",
163
+ "summary": "SUMMARY",
164
+ "processing_errors": "PROCESSING_ERRORS",
165
+ }
166
+
167
+ output_df = pd.DataFrame()
168
+ for src_col, tgt_col in column_map.items():
169
+ output_df[tgt_col] = df_ok[src_col] if src_col in df_ok.columns else None
170
+
171
+ # Flatten processing_errors list to a semicolon-separated string
172
+ if "PROCESSING_ERRORS" in output_df.columns:
173
+ output_df["PROCESSING_ERRORS"] = output_df["PROCESSING_ERRORS"].apply(
174
+ lambda x: "; ".join(x) if isinstance(x, list) else (str(x) if x else None)
175
+ )
176
+
177
+ # Pipeline metadata
178
+ output_df["PROCESSED_AT"] = datetime.now()
179
+ output_df["WORKFLOW_VERSION"] = "1.0"
180
+
181
+ # ----------------------------------------------------------------
182
+ # Store to Snowflake
183
+ # ----------------------------------------------------------------
184
+ out_cfg = config["output"]
185
+ if not output_df.empty:
186
+ conn.store_df_to_snowflake(
187
+ table_name=out_cfg["table"],
188
+ dataframe=output_df,
189
+ database=out_cfg["database"],
190
+ schema=out_cfg["schema"],
191
+ overwrite=False, # Always append; deduplication is handled upstream
192
+ )
193
+
194
+ conn.close()
195
+
196
+ return {
197
+ "batch_num": batch_num,
198
+ "success": True,
199
+ "total_processed": initial_count,
200
+ "total_stored": len(output_df),
201
+ "failed_count": failed_count,
202
+ "error": None,
203
+ }
204
+
205
+ except Exception as exc:
206
+ error_msg = f"Batch {batch_num} failed: {exc}"
207
+ worker_logger.error(error_msg)
208
+ worker_logger.error(traceback.format_exc())
209
+ return {
210
+ "batch_num": batch_num,
211
+ "success": False,
212
+ "total_processed": len(batch_conversations),
213
+ "total_stored": 0,
214
+ "failed_count": len(batch_conversations),
215
+ "error": str(exc),
216
+ }
217
+
218
+
219
+ # ---------------------------------------------------------------------------
220
+ # Main processor class
221
+ # ---------------------------------------------------------------------------
222
+
223
+ class HelpScoutProcessor:
224
+ """
225
+ Orchestrates the end-to-end HelpScout conversation processing pipeline.
226
+
227
+ Typical usage:
228
+ processor = HelpScoutProcessor()
229
+ processor.run(limit=500)
230
+ """
231
+
232
+ def __init__(self, config_path: str = None):
233
+ """
234
+ Args:
235
+ config_path: Path to processing_config.json.
236
+ Defaults to config_files/processing_config.json
237
+ relative to this script.
238
+ """
239
+ if config_path is None:
240
+ config_path = SCRIPT_DIR / "config_files" / "processing_config.json"
241
+
242
+ with open(config_path, "r") as f:
243
+ self.config = json.load(f)
244
+
245
+ self.conn = SnowflakeConn()
246
+
247
+ self.api_key = os.getenv("OPENAI_API_KEY")
248
+ if not self.api_key:
249
+ raise ValueError("OPENAI_API_KEY not found in environment variables")
250
+
251
+ logger.info("HelpScoutProcessor initialized")
252
+
253
+ def _calculate_num_workers(self) -> int:
254
+ """CPU count minus 2, capped at 5 β€” mirrors the processing_comments pattern."""
255
+ num_cpus = cpu_count()
256
+ num_workers = max(1, min(5, num_cpus - 2))
257
+ logger.info(f"Using {num_workers} parallel workers (CPU count: {num_cpus})")
258
+ return num_workers
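+ # e.g. 8 CPUs: min(5, 8 - 2) = 5 workers; 4 CPUs: 2 workers; 2 CPUs: max(1, 0) = 1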
259
+
260
+ def run(self, limit: int = None, sequential: bool = False):
261
+ """
262
+ Execute the full pipeline.
263
+
264
+ Args:
265
+ limit: Cap the number of conversations processed in this run.
266
+ Useful for incremental or test runs. Default: process all new.
267
+ sequential: If True, bypass multiprocessing (single-process debug mode).
268
+ """
269
+ try:
270
+ logger.info("=" * 70)
271
+ logger.info("HelpScout Conversation Processing Pipeline")
272
+ logger.info(f"Mode: {'SEQUENTIAL (debug)' if sequential else 'PARALLEL'}")
273
+ logger.info("=" * 70)
274
+
275
+ # ------------------------------------------------------------------
276
+ # Step 1: Fetch + preprocess conversations
277
+ # ------------------------------------------------------------------
278
+ logger.info("Step 1: Fetching conversations from Snowflake...")
279
+ conversations_df = fetch_conversations(self.conn)
280
+
281
+ if conversations_df.empty:
282
+ logger.warning("No conversations returned. Exiting.")
283
+ return
284
+
285
+ logger.info(f"Fetched {len(conversations_df):,} total conversations")
286
+
287
+ # ------------------------------------------------------------------
288
+ # Step 2: Skip already-processed conversations
289
+ # ------------------------------------------------------------------
290
+ out_cfg = self.config["output"]
291
+ processed_ids = fetch_processed_ids(
292
+ self.conn,
293
+ out_cfg["database"],
294
+ out_cfg["schema"],
295
+ out_cfg["table"],
296
+ )
297
+
298
+ if processed_ids:
299
+ before = len(conversations_df)
300
+ conversations_df = conversations_df[
301
+ ~conversations_df["conversation_id"].astype(str).isin(processed_ids)
302
+ ].copy()
303
+ skipped = before - len(conversations_df)
304
+ logger.info(f"Skipped {skipped:,} already-processed conversations")
305
+
306
+ if conversations_df.empty:
307
+ logger.info("All conversations are already processed. Nothing to do.")
308
+ return
309
+
310
+ # ------------------------------------------------------------------
311
+ # Step 3: Apply optional limit
312
+ # ------------------------------------------------------------------
313
+ if limit:
314
+ conversations_df = conversations_df.head(limit)
315
+ logger.info(f"Limit applied: processing {len(conversations_df):,} conversations")
316
+
317
+ total = len(conversations_df)
318
+ logger.info(f"Processing {total:,} new conversations...")
319
+
320
+ # ------------------------------------------------------------------
321
+ # Step 4: Split into batches
322
+ # ------------------------------------------------------------------
323
+ num_workers = self._calculate_num_workers()
324
+ proc_cfg = self.config.get("processing", {})
325
+ batch_size = calculate_optimal_batch_size(
326
+ total,
327
+ num_workers,
328
+ min_batch=proc_cfg.get("min_batch_size", 10),
329
+ max_batch=proc_cfg.get("max_batch_size", 50),
330
+ )
331
+
332
+ conversations = conversations_df.to_dict("records")
333
+ batches = []
334
+ for i in range(0, total, batch_size):
335
+ batch = conversations[i : i + batch_size]
336
+ batch_num = (i // batch_size) + 1
337
+ batches.append((batch_num, batch, self.config, self.api_key))
338
+
339
+ logger.info(
340
+ f"Split into {len(batches)} batch(es) "
341
+ f"(batch size: {batch_size}, workers: {num_workers})"
342
+ )
343
+
344
+ # ------------------------------------------------------------------
345
+ # Step 5: Run batches
346
+ # ------------------------------------------------------------------
347
+ start_time = datetime.now()
348
+
349
+ if sequential:
350
+ results = [process_batch_worker(b) for b in batches]
351
+ else:
352
+ with Pool(processes=num_workers) as pool:
353
+ results = pool.map(process_batch_worker, batches)
354
+
355
+ elapsed = (datetime.now() - start_time).total_seconds()
356
+
357
+ # ------------------------------------------------------------------
358
+ # Step 6: Summary
359
+ # ------------------------------------------------------------------
360
+ total_processed = sum(r["total_processed"] for r in results)
361
+ total_stored = sum(r["total_stored"] for r in results)
362
+ total_failed = sum(r["failed_count"] for r in results)
363
+ failed_batches = [r for r in results if not r["success"]]
364
+
365
+ logger.info("=" * 70)
366
+ logger.info("Pipeline Summary")
367
+ logger.info(f" Output table : {out_cfg['database']}.{out_cfg['schema']}.{out_cfg['table']}")
368
+ logger.info(f" Processed : {total_processed:,}")
369
+ logger.info(f" Stored : {total_stored:,}")
370
+ logger.info(f" Failed : {total_failed:,}")
371
+ if failed_batches:
372
+ logger.error(f" Failed batches ({len(failed_batches)}):")
373
+ for fb in failed_batches:
374
+ logger.error(f" Batch {fb['batch_num']}: {fb['error']}")
375
+ logger.info(f" Elapsed : {elapsed:.1f}s")
376
+ logger.info(
377
+ f" Avg per conv : {elapsed / max(total_processed, 1):.2f}s"
378
+ )
379
+ logger.info("=" * 70)
380
+
381
+ except Exception as exc:
382
+ logger.error(f"Pipeline failed: {exc}", exc_info=True)
383
+ raise
384
+
385
+ finally:
386
+ self.conn.close()
387
+ logger.info("Snowflake connection closed")
388
+
389
+
390
+ # ---------------------------------------------------------------------------
391
+ # CLI entry point
392
+ # ---------------------------------------------------------------------------
393
+
394
+ def main():
395
+ parser = argparse.ArgumentParser(
396
+ description="Process HelpScout conversations: sentiment analysis + topic extraction"
397
+ )
398
+ parser.add_argument(
399
+ "--limit",
400
+ type=int,
401
+ default=None,
402
+ help="Maximum number of new conversations to process in this run (default: all)",
403
+ )
404
+ parser.add_argument(
405
+ "--sequential",
406
+ action="store_true",
407
+ default=False,
408
+ help="Single-process mode β€” useful for debugging (default: parallel)",
409
+ )
410
+ parser.add_argument(
411
+ "--config",
412
+ type=str,
413
+ default=None,
414
+ help="Path to processing_config.json (default: config_files/processing_config.json)",
415
+ )
416
+ args = parser.parse_args()
417
+
418
+ processor = HelpScoutProcessor(config_path=args.config)
419
+ processor.run(limit=args.limit, sequential=args.sequential)
420
+
421
+
422
+ if __name__ == "__main__":
423
+ main()
process_helpscout/snowflake_conn.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ Snowflake connection layer for the HelpScout processing module.
3
+ Adapted from processing_comments/SnowFlakeConnection.py.
4
+ """
5
+ import os
6
+ from pathlib import Path
7
+ from snowflake.snowpark import Session
8
+ from dotenv import load_dotenv
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Load .env from the project root (two levels up from this file)
14
+ _root_env = Path(__file__).resolve().parent.parent / ".env"
15
+ load_dotenv(dotenv_path=_root_env)
16
+
17
+
18
+ class SnowflakeConn:
19
+ """Thin wrapper around Snowpark Session for running read queries."""
20
+
21
+ def __init__(self):
22
+ self.session = self._connect()
23
+
24
+ # ------------------------------------------------------------------
25
+ def _connect(self) -> Session:
26
+ conn_params = dict(
27
+ user=os.getenv("SNOWFLAKE_USER"),
28
+ password=os.getenv("SNOWFLAKE_PASSWORD"),
29
+ account=os.getenv("SNOWFLAKE_ACCOUNT"),
30
+ role=os.getenv("SNOWFLAKE_ROLE"),
31
+ warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
32
+ # No default database/schema β€” queries use fully-qualified names
33
+ )
34
+ session = Session.builder.configs(conn_params).create()
35
+ logger.info("Snowflake session created successfully.")
36
+ return session
37
+
38
+ # ------------------------------------------------------------------
39
+ def run_query(self, query: str, description: str = "query"):
40
+ """Execute a SELECT query and return a pandas DataFrame."""
41
+ try:
42
+ df = self.session.sql(query).to_pandas()
43
+ df.columns = df.columns.str.lower()
44
+ logger.info(f"Query '{description}' returned {len(df):,} rows.")
45
+ return df
46
+ except Exception as exc:
47
+ logger.error(f"Error executing '{description}': {exc}")
48
+ raise
49
+
50
+ # ------------------------------------------------------------------
51
+ def run_query_from_file(self, sql_path: str, description: str = ""):
52
+ """Read a .sql file and execute it, returning a pandas DataFrame."""
53
+ sql_path = Path(sql_path)
54
+ if not sql_path.exists():
55
+ raise FileNotFoundError(f"SQL file not found: {sql_path}")
56
+ query = sql_path.read_text(encoding="utf-8")
57
+ return self.run_query(query, description or sql_path.name)
58
+
59
+ # ------------------------------------------------------------------
60
+ def store_df_to_snowflake(
61
+ self,
62
+ table_name: str,
63
+ dataframe,
64
+ database: str,
65
+ schema: str,
66
+ overwrite: bool = False,
67
+ ):
68
+ """
69
+ Write a pandas DataFrame to a Snowflake table.
70
+
71
+ Args:
72
+ table_name: Target table name (without database/schema prefix)
73
+ dataframe: pandas DataFrame to write
74
+ database: Target Snowflake database
75
+ schema: Target Snowflake schema
76
+ overwrite: If True, truncate the table before inserting;
77
+ if False (default), append rows
78
+ """
79
+ if dataframe is None or len(dataframe) == 0:
80
+ logger.warning(f"store_df_to_snowflake: empty DataFrame, skipping write to {table_name}")
81
+ return
82
+
83
+ mode = "overwrite" if overwrite else "append"
84
+ try:
85
+ self.session.write_pandas(
86
+ df=dataframe,
87
+ table_name=table_name,
88
+ database=database,
89
+ schema=schema,
90
+ overwrite=overwrite,
91
+ auto_create_table=False, # Table must be created via SQL first
92
+ quote_identifiers=False,
93
+ use_logical_type=True,
94
+ )
95
+ logger.info(
96
+ f"Stored {len(dataframe):,} rows to {database}.{schema}.{table_name} "
97
+ f"(mode={mode})"
98
+ )
99
+ except Exception as exc:
100
+ logger.error(f"Error storing to {database}.{schema}.{table_name}: {exc}")
101
+ raise
102
+
103
+ # ------------------------------------------------------------------
104
+ def close(self):
105
+ self.session.close()
106
+ logger.info("Snowflake session closed.")
process_helpscout/workflow/__init__.py ADDED
File without changes
process_helpscout/workflow/conversation_processor.py ADDED
@@ -0,0 +1,334 @@
1
+ """
2
+ Conversation Processing Workflow using LangGraph.
3
+
4
+ Two-node linear graph:
5
+ sentiment_analysis β†’ topic_extraction β†’ END
6
+
7
+ All conversations are assumed to be in English (no translation step).
8
+ The workflow operates on the full customer conversation text, pre-formatted
9
+ and truncated upstream before entering the graph.
10
+ """
11
+
12
+ from typing import Dict, Any, List, TypedDict, Annotated
13
+ import operator
14
+ import json
15
+ import os
16
+ from pathlib import Path
17
+ from langgraph.graph import StateGraph, END
18
+ from agents.sentiment_analysis_agent import SentimentAnalysisAgent
19
+ from agents.topic_extraction_agent import TopicExtractionAgent
20
+ import logging
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Maximum characters to send to the LLM β€” balances context richness vs. cost
25
+ _MAX_CONVERSATION_CHARS = 5000
26
+
27
+
28
+ class ConversationState(TypedDict):
29
+ """
30
+ State flowing through the conversation processing workflow.
31
+
32
+ Source fields come from the aggregated conversations DataFrame.
33
+ Processing fields are added/updated by each workflow node.
34
+ """
35
+ # --- Source / aggregation fields ---
36
+ conversation_id: str
37
+ customer_email: str
38
+ customer_first: str
39
+ customer_last: str
40
+ customer_hs_id: Any
41
+ thread_count: int
42
+ first_message_at: Any
43
+ last_message_at: Any
44
+ duration_hours: float
45
+ status: str
46
+ state: str
47
+ source_type: str
48
+ source_via: str
49
+ combined_text: str # Raw aggregated customer messages (pipe-separated)
50
+
51
+ # --- Pipeline input ---
52
+ conversation_text: str # Formatted + truncated text sent to agents
53
+
54
+ # --- Sentiment analysis outputs ---
55
+ sentiment_polarity: str
56
+ emotions: str # Comma-separated emotion values, or None
57
+ sentiment_confidence: str
58
+ sentiment_notes: str
59
+
60
+ # --- Topic extraction outputs ---
61
+ topics: str # Comma-separated topic IDs
62
+ is_refund_request: bool
63
+ is_cancellation: bool
64
+ is_membership: bool
65
+ topic_confidence: str
66
+ topic_notes: str
67
+ summary: str # 2-3 sentence neutral conversation summary
68
+
69
+ # --- Metadata ---
70
+ processing_errors: List[str] # accumulated manually by the nodes; an operator.add reducer would double-count entries because each node returns the full state
71
+ success: bool
72
+
73
+
74
+ class ConversationProcessingWorkflow:
75
+ """
76
+ LangGraph-based workflow for processing HelpScout conversations.
77
+
78
+ Graph structure:
79
+ [START] β†’ sentiment_analysis β†’ topic_extraction β†’ [END]
80
+
81
+ Both nodes receive the same conversation_text. The workflow is
82
+ intentionally linear β€” no conditional edges β€” because every
83
+ conversation goes through both steps.
84
+ """
85
+
86
+ def __init__(self, config: Dict[str, Any], api_key: str):
87
+ """
88
+ Args:
89
+ config: Full processing_config.json content
90
+ api_key: OpenAI API key
91
+ """
92
+ self.config = config
93
+ self.api_key = api_key
94
+
95
+ # Agent-level configs
96
+ sentiment_agent_config = config["agents"]["sentiment_analysis"]
97
+ topic_agent_config = config["agents"]["topic_extraction"]
98
+
99
+ # Load topics.json (resolved relative to the package directory, one level above workflow/)
100
+ workflow_dir = Path(__file__).resolve().parent
101
+ module_dir = workflow_dir.parent
102
+ topics_path = module_dir / "config_files" / "topics.json"
103
+
104
+ with open(topics_path, "r") as f:
105
+ topics_config = json.load(f)
106
+
107
+ # Override max chars from config if provided
108
+ proc = config.get("processing", {})
109
+ self._max_chars = proc.get("max_conversation_chars", _MAX_CONVERSATION_CHARS)
110
+
111
+ # Initialize agents
112
+ self.sentiment_agent = SentimentAnalysisAgent(sentiment_agent_config, api_key, config)
113
+ self.topic_agent = TopicExtractionAgent(topic_agent_config, api_key, topics_config)
114
+
115
+ # Compile workflow graph
116
+ self.workflow = self._build_workflow()
117
+ logger.info("ConversationProcessingWorkflow initialized")
118
+
119
+ # ------------------------------------------------------------------
120
+ # Graph construction
121
+ # ------------------------------------------------------------------
122
+
123
+ def _build_workflow(self) -> StateGraph:
124
+ graph = StateGraph(ConversationState)
125
+
126
+ graph.add_node("sentiment_analysis", self._sentiment_node)
127
+ graph.add_node("topic_extraction", self._topic_node)
128
+
129
+ graph.set_entry_point("sentiment_analysis")
130
+ graph.add_edge("sentiment_analysis", "topic_extraction")
131
+ graph.add_edge("topic_extraction", END)
132
+
133
+ return graph.compile()
134
+
135
+ # ------------------------------------------------------------------
136
+ # Preprocessing
137
+ # ------------------------------------------------------------------
138
+
139
+ def _format_conversation(self, combined_text: str) -> str:
140
+ """
141
+ Convert pipe-separated combined_text into a numbered message format
142
+ suitable for the LLM, truncated to self._max_chars.
143
+
144
+ Input: "I can't log in | Still not working | Please help!"
145
+ Output: "[1] I can't log in\n[2] Still not working\n[3] Please help!"
146
+ """
147
+ if not combined_text or not str(combined_text).strip():
148
+ return ""
149
+
150
+ messages = [m.strip() for m in str(combined_text).split("|") if m.strip()]
151
+ total_messages = len(messages)
152
+
153
+ parts = []
154
+ char_count = 0
155
+
156
+ for i, msg in enumerate(messages, 1):
157
+ entry = f"[{i}] {msg}"
158
+ if char_count + len(entry) + 1 > self._max_chars:
159
+ parts.append(f"[...truncated after {i - 1} of {total_messages} messages]")
160
+ break
161
+ parts.append(entry)
162
+ char_count += len(entry) + 1
163
+
164
+ return "\n".join(parts)
165
+
166
+ # ------------------------------------------------------------------
167
+ # Workflow nodes
168
+ # ------------------------------------------------------------------
169
+
170
+ def _sentiment_node(self, state: ConversationState) -> ConversationState:
171
+ """Node 1: Classify sentiment polarity and emotions."""
172
+ try:
173
+ # Format conversation text once β€” reused by both nodes
174
+ state["conversation_text"] = self._format_conversation(state.get("combined_text", ""))
175
+
176
+ if not state["conversation_text"]:
177
+ state["processing_errors"] = state.get("processing_errors", []) + [
178
+ "Empty conversation text after formatting"
179
+ ]
180
+ state["success"] = False
181
+ return state
182
+
183
+ result = self.sentiment_agent.process({"conversation_text": state["conversation_text"]})
184
+
185
+ if result.get("success", False):
186
+ state["sentiment_polarity"] = result.get("sentiment_polarity")
187
+ state["emotions"] = result.get("emotions")
188
+ state["sentiment_confidence"] = result.get("sentiment_confidence")
189
+ state["sentiment_notes"] = result.get("sentiment_notes", "")
190
+ state["success"] = True
191
+ else:
192
+ error_msg = f"Sentiment analysis failed: {result.get('error', 'Unknown error')}"
193
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
194
+ state["success"] = False
195
+ state["sentiment_polarity"] = None
196
+ state["emotions"] = None
197
+ state["sentiment_confidence"] = None
198
+ state["sentiment_notes"] = ""
199
+
200
+ logger.debug(f"Sentiment: {state['sentiment_polarity']} | Conversation: {state['conversation_id']}")
201
+
202
+ except Exception as e:
203
+ error_msg = f"Sentiment node error: {str(e)}"
204
+ logger.error(error_msg)
205
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
206
+ state["success"] = False
207
+
208
+ return state
209
+
210
+ def _topic_node(self, state: ConversationState) -> ConversationState:
211
+ """Node 2: Extract topic tags and billing flags."""
212
+ try:
213
+ # Skip topic extraction if sentiment already failed β€” no point in a partial record
214
+ if not state.get("success", False):
215
+ state["topics"] = None
216
+ state["is_refund_request"] = False
217
+ state["is_cancellation"] = False
218
+ state["is_membership"] = False
219
+ state["topic_confidence"] = None
220
+ state["topic_notes"] = ""
221
+ state["summary"] = ""
222
+ return state
223
+
224
+ result = self.topic_agent.process({"conversation_text": state["conversation_text"]})
225
+
226
+ if result.get("success", False):
227
+ state["topics"] = result.get("topics")
228
+ state["is_refund_request"] = result.get("is_refund_request", False)
229
+ state["is_cancellation"] = result.get("is_cancellation", False)
230
+ state["is_membership"] = result.get("is_membership", False)
231
+ state["topic_confidence"] = result.get("topic_confidence")
232
+ state["topic_notes"] = result.get("topic_notes", "")
233
+ state["summary"] = result.get("summary", "")
234
+ state["success"] = True
235
+ else:
236
+ error_msg = f"Topic extraction failed: {result.get('error', 'Unknown error')}"
237
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
238
+ state["success"] = False
239
+ state["topics"] = None
240
+ state["is_refund_request"] = False
241
+ state["is_cancellation"] = False
242
+ state["is_membership"] = False
243
+ state["topic_confidence"] = None
244
+ state["topic_notes"] = ""
245
+ state["summary"] = ""
246
+
247
+ logger.debug(f"Topics: {state['topics']} | Conversation: {state['conversation_id']}")
248
+
249
+ except Exception as e:
250
+ error_msg = f"Topic node error: {str(e)}"
251
+ logger.error(error_msg)
252
+ state["processing_errors"] = state.get("processing_errors", []) + [error_msg]
253
+ state["success"] = False
254
+
255
+ return state
256
+
257
+ # ------------------------------------------------------------------
258
+ # Public API
259
+ # ------------------------------------------------------------------
260
+
261
+ def process_conversation(self, conversation_data: Dict[str, Any]) -> Dict[str, Any]:
262
+ """
263
+ Process a single conversation through the full workflow.
264
+
265
+ Args:
266
+ conversation_data: Dict with aggregated conversation fields
267
+ (conversation_id, combined_text, customer_*, etc.)
268
+
269
+ Returns:
270
+ Dict with all original fields plus extracted sentiment and topic fields.
271
+ """
272
+ combined_text = conversation_data.get("combined_text", "")
273
+
274
+ if not combined_text or not str(combined_text).strip():
275
+ logger.warning(f"Skipping conversation with empty text: {conversation_data.get('conversation_id')}")
276
+ return {
277
+ **conversation_data,
278
+ "success": False,
279
+ "processing_errors": ["combined_text is empty β€” nothing to analyze"],
280
+ "conversation_text": "",
281
+ }
282
+
283
+ initial_state = {
284
+ "conversation_id": str(conversation_data.get("conversation_id", "")),
285
+ "customer_email": conversation_data.get("customer_email"),
286
+ "customer_first": conversation_data.get("customer_first"),
287
+ "customer_last": conversation_data.get("customer_last"),
288
+ "customer_hs_id": conversation_data.get("customer_hs_id"),
289
+ "thread_count": conversation_data.get("thread_count"),
290
+ "first_message_at": conversation_data.get("first_message_at"),
291
+ "last_message_at": conversation_data.get("last_message_at"),
292
+ "duration_hours": conversation_data.get("duration_hours"),
293
+ "status": conversation_data.get("status"),
294
+ "state": conversation_data.get("state"),
295
+ "source_type": conversation_data.get("source_type"),
296
+ "source_via": conversation_data.get("source_via"),
297
+ "combined_text": str(combined_text).strip(),
298
+ "conversation_text": "", # filled by sentiment node
299
+ "processing_errors": [],
300
+ "success": True,
301
+ }
302
+
303
+ try:
304
+ final_state = self.workflow.invoke(initial_state)
305
+
306
+ # Merge any extra fields from the source that weren't in initial_state
307
+ result = dict(final_state)
308
+ for key, value in conversation_data.items():
309
+ if key not in result:
310
+ result[key] = value
311
+
312
+ return result
313
+
314
+ except Exception as e:
315
+ logger.error(f"Workflow execution error for {conversation_data.get('conversation_id')}: {e}")
316
+ return {
317
+ **conversation_data,
318
+ "success": False,
319
+ "processing_errors": [str(e)],
320
+ "conversation_text": "",
321
+ }
322
+
323
+ def process_batch(self, conversations: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
324
+ """Process a list of conversations sequentially within the batch."""
325
+ results = []
326
+ total = len(conversations)
327
+
328
+ for idx, conv in enumerate(conversations, 1):
329
+ logger.info(f"Processing conversation {idx}/{total} (id={conv.get('conversation_id')})")
330
+ result = self.process_conversation(conv)
331
+ results.append(result)
332
+
333
+ logger.info(f"Batch complete: {total} conversations processed")
334
+ return results
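+
+
+ # Usage sketch (config and api_key as loaded by main.py):
+ # workflow = ConversationProcessingWorkflow(config, api_key)
+ # results = workflow.process_batch(conversations_df.to_dict("records"))
+ # successful = [r for r in results if r["success"]]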
visualization/README.md CHANGED
@@ -1,6 +1,6 @@
  # Musora Sentiment Analysis Dashboard
 
- A Streamlit dashboard for visualising sentiment analysis results from **social media comments** (Facebook, Instagram, YouTube, Twitter) and the **Musora internal app** across brands (Drumeo, Pianote, Guitareo, Singeo, Musora).
 
  ---
 
@@ -12,9 +12,12 @@ A Streamlit dashboard for visualising sentiment analysis results from **social m
  4. [Pages](#pages)
  5. [Global Filters & Session State](#global-filters--session-state)
  6. [Snowflake Queries](#snowflake-queries)
- 7. [Adding or Changing Things](#adding-or-changing-things)
- 8. [Running the App](#running-the-app)
- 9. [Configuration Reference](#configuration-reference)
 
  ---
 
@@ -22,28 +25,38 @@ A Streamlit dashboard for visualising sentiment analysis results from **social m
 
  ```
  visualization/
- β”œβ”€β”€ app.py                       # Entry point β€” routing, sidebar, session state
  β”œβ”€β”€ config/
- β”‚   └── viz_config.json          # Colors, query strings, dashboard settings
  β”œβ”€β”€ data/
- β”‚   └── data_loader.py           # All Snowflake queries and caching logic
  β”œβ”€β”€ utils/
- β”‚   β”œβ”€β”€ data_processor.py        # Pandas aggregations (intent dist, content summary, etc.)
- β”‚   └── metrics.py               # KPI calculations (sentiment score, urgency, etc.)
  β”œβ”€β”€ components/
- β”‚   β”œβ”€β”€ dashboard.py             # Dashboard page renderer
- β”‚   β”œβ”€β”€ sentiment_analysis.py    # Sentiment Analysis page renderer
- β”‚   └── reply_required.py        # Reply Required page renderer
  β”œβ”€β”€ visualizations/
- β”‚   β”œβ”€β”€ sentiment_charts.py      # Plotly sentiment chart functions
- β”‚   β”œβ”€β”€ distribution_charts.py   # Plotly distribution / heatmap / scatter functions
- β”‚   β”œβ”€β”€ demographic_charts.py    # Plotly demographic chart functions
- β”‚   └── content_cards.py         # Streamlit card components (comment cards, content cards)
  β”œβ”€β”€ agents/
- β”‚   └── content_summary_agent.py # AI analysis agent (OpenAI) for comment summarisation
  β”œβ”€β”€ img/
- β”‚   └── musora.png               # Sidebar logo
- └── SnowFlakeConnection.py       # Snowflake connection wrapper (Snowpark session)
  ```
 
  ---
@@ -53,213 +66,331 @@ visualization/
 
  ```
  Snowflake
  β”‚
- β–Ό
- data_loader.py  ← Three separate loading modes (see below)
  β”‚
- β”œβ”€β”€ load_dashboard_data() ──► st.session_state['dashboard_df']
- β”‚         └─► app.py sidebar (filter options, counts)
- β”‚         └─► dashboard.py (all charts)
- β”‚
- β”œβ”€β”€ load_sa_data() ──► st.session_state['sa_contents']
- β”‚   (on-demand, button)  st.session_state['sa_comments']
- β”‚         └─► sentiment_analysis.py
- β”‚
- └── load_reply_required_data() β–Ί st.session_state['rr_df']
-     (on-demand, button)  └─► reply_required.py
  ```
 
  **Key principle:** Data is loaded as little as possible, as late as possible.
 
- - The **Dashboard** uses a lightweight query (no text columns, no content join) cached for 24 hours.
- - The **Sentiment Analysis** and **Reply Required** pages never load data automatically β€” they wait for the user to click **Fetch Data**.
- - All data is stored in `st.session_state` so page navigation and widget interactions do not re-trigger Snowflake queries.
 
  ---
 
  ## Data Loading Strategy
 
- All loading logic lives in **`data/data_loader.py`** (`SentimentDataLoader` class).
 
- ### `load_dashboard_data()`
- - Uses `dashboard_query` from `viz_config.json`.
  - Fetches only: `comment_sk, content_sk, platform, brand, sentiment_polarity, intent, requires_reply, detected_language, comment_timestamp, processed_at, author_id`.
- - No text columns, no `DIM_CONTENT` join β€” significantly faster than the full query.
- - Also merges demographics data if `demographics_query` is configured.
- - Cached for **24 hours** (`@st.cache_data(ttl=86400)`).
- - Called once by `app.py` at startup; result stored in `st.session_state['dashboard_df']`.
 
- ### `load_sa_data(platform, brand, top_n, min_comments, sort_by, sentiments, intents, date_range)`
- - Runs **two** sequential Snowflake queries:
  1. **Content aggregation** β€” groups by `content_sk`, counts per sentiment, computes severity score, returns top N.
- 2. **Sampled comments** β€” for the top N `content_sk`s only, fetches up to 50 comments per sentiment group per content (negative, positive, other), using Snowflake `QUALIFY ROW_NUMBER()`. `display_text` is computed in SQL (`CASE WHEN IS_ENGLISH = FALSE AND TRANSLATED_TEXT IS NOT NULL THEN TRANSLATED_TEXT ELSE ORIGINAL_TEXT END`).
- - Returns a tuple `(contents_df, comments_df)`.
- - Cached for **24 hours**.
- - Called only when the user clicks **Fetch Data** on the Sentiment Analysis page.
 
- ### `load_reply_required_data(platforms, brands, date_range)`
- - Runs a single query filtering `REQUIRES_REPLY = TRUE`.
- - Dynamically includes/excludes the social media table and musora table based on selected platforms.
- - `display_text` computed in SQL.
- - Cached for **24 hours**.
- - Called only when the user clicks **Fetch Data** on the Reply Required page.
 
- ### Important: SQL Column Qualification
- Both the social media table (`COMMENT_SENTIMENT_FEATURES`) and the content dimension table (`DIM_CONTENT`) share column names. Any `WHERE` clause inside a query that joins these two tables **must** use the table alias prefix (e.g. `s.PLATFORM`, `s.COMMENT_TIMESTAMP`, `s.CHANNEL_NAME`) to avoid Snowflake `ambiguous column name` errors. The musora table (`MUSORA_COMMENT_SENTIMENT_FEATURES`) has no joins so unqualified column names are fine there.
 
  ---
 
  ## Pages
 
- ### Dashboard (`components/dashboard.py`)
 
- **Receives:** `filtered_df` β€” the lightweight dashboard dataframe (after optional global filter applied by `app.py`).
 
- **Does not need:** text, translations, content URLs. All charts work purely on aggregated columns (sentiment_polarity, brand, platform, intent, requires_reply, comment_timestamp).
 
  **Key sections:**
  - Summary stats + health indicator
  - Sentiment distribution (pie + gauge)
  - Sentiment by brand and platform (stacked + percentage bar charts)
- - Intent analysis
- - Brand-Platform heatmap
  - Reply requirements + urgency breakdown
- - Demographics (age, timezone, experience level) β€” only rendered if `author_id` is present and demographics were merged
-
- **To add a new chart:** create the chart function in `visualizations/` and call it from `render_dashboard()`. The function receives `filtered_df`.
 
  ---
 
- ### Sentiment Analysis (`components/sentiment_analysis.py`)
 
- **Receives:** `data_loader` instance only (no dataframe).
 
  **Flow:**
- 1. Reads `st.session_state['dashboard_df']` for filter option lists (platforms, brands, sentiments, intents).
  2. Pre-populates platform/brand dropdowns from `st.session_state['global_filters']`.
- 3. Shows filter controls (platform, brand, sentiment, intent, top_n, min_comments, sort_by).
- 4. On **Fetch Data** click: calls `data_loader.load_sa_data(...)` and stores results in `st.session_state['sa_contents']` and `['sa_comments']`.
- 5. Renders content cards, per-content sentiment + intent charts, AI analysis buttons, and sampled comment expanders.
 
  **Pagination:** `st.session_state['sentiment_page']` (5 contents per page). Reset on new fetch.
 
- **Comments:** Sampled (up to 50 negative + 50 positive + 50 neutral per content). These are already in memory after the fetch β€” no extra query is needed when the user expands a comment section.
-
- **AI Analysis:** Uses `ContentSummaryAgent` (see `agents/`). Results cached in `st.session_state['content_summaries']`.
-
  ---
 
- ### Reply Required (`components/reply_required.py`)
 
  **Receives:** `data_loader` instance only.
 
  **Flow:**
- 1. Reads `st.session_state['dashboard_df']` for filter option lists.
- 2. Pre-populates platform, brand, and date from `st.session_state['global_filters']`.
- 3. On **Fetch Data** click: calls `data_loader.load_reply_required_data(...)` and stores result in `st.session_state['rr_df']`.
- 4. Shows urgency breakdown, in-page view filters (priority, platform, brand, intent β€” applied in Python, no new query), paginated comment cards, and a "Reply by Content" summary.
 
  **Pagination:** `st.session_state['reply_page']` (10 comments per page). Reset on new fetch.
 
  ---
 
  ## Global Filters & Session State
 
- Global filters live in the sidebar (`app.py`) and are stored in `st.session_state['global_filters']` as a dict:
 
  ```python
- {
-     'platforms': ['facebook', 'instagram'],  # list or []
      'brands': ['drumeo'],
      'sentiments': [],
      'date_range': (date(2025, 1, 1), date(2025, 12, 31)),  # or None
  }
  ```
 
- - **Dashboard:** `app.py` applies global filters to `dashboard_df` using `data_loader.apply_filters()` and passes the result to `render_dashboard()`.
- - **Sentiment Analysis / Reply Required:** global filters are used to pre-populate their own filter widgets. The actual Snowflake query uses those values when the user clicks Fetch. The pages do **not** receive a pre-filtered dataframe.
-
  ### Full session state key reference
 
  | Key | Set by | Used by |
  |-----|--------|---------|
- | `dashboard_df` | `app.py` on startup | sidebar (filter options), dashboard, SA + RR (filter option lists) |
- | `global_filters` | sidebar "Apply Filters" button | app.py (dashboard filter), SA + RR (pre-populate widgets) |
- | `filters_applied` | sidebar buttons | app.py (whether to apply filters) |
- | `sa_contents` | SA fetch button | SA page rendering |
- | `sa_comments` | SA fetch button | SA page rendering |
- | `sa_fetch_key` | SA fetch button | SA page (detect stale data) |
- | `rr_df` | RR fetch button | RR page rendering |
- | `rr_fetch_key` | RR fetch button | RR page (detect stale data) |
  | `sentiment_page` | SA page / fetch | SA pagination |
  | `reply_page` | RR page / fetch | RR pagination |
- | `content_summaries` | AI analysis buttons | SA AI analysis display |
 
  ---
 
  ## Snowflake Queries
 
- All query strings are either stored in `config/viz_config.json` (static queries) or built dynamically in `data/data_loader.py` (page-specific queries).
 
  ### Static queries (in `viz_config.json`)
 
  | Key | Purpose |
  |-----|---------|
- | `query` | Full query with all columns (legacy, kept for compatibility) |
- | `dashboard_query` | Lightweight query β€” no text, no DIM_CONTENT join |
- | `demographics_query` | Joins `usora_users` with `preprocessed.users` to get age/timezone/experience |
 
- ### Dynamic queries (built in `data_loader.py`)
 
  | Method | Description |
  |--------|-------------|
- | `_build_sa_content_query()` | Content aggregation for SA page; filters by platform + brand + date |
- | `_build_sa_comments_query()` | Sampled comments for SA page; uses `QUALIFY ROW_NUMBER() <= 50` |
- | `_build_rr_query()` | Reply-required comments; filters by platform/brand/date; conditionally includes social media and/or musora table |
 
- ### Data source tables
 
- | Table | Platform | Notes |
- |-------|----------|-------|
- | `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES` | facebook, instagram, youtube, twitter | Needs `LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT` for `PERMALINK_URL` |
- | `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES` | musora_app | Has `PERMALINK_URL` and `THUMBNAIL_URL` natively; platform stored as `'musora'`, mapped to `'musora_app'` in queries |
 
  ---
 
  ## Adding or Changing Things
 
- ### Add a new chart to the Dashboard
  1. Write the chart function in the appropriate `visualizations/` file.
- 2. Call it from `render_dashboard()` in `components/dashboard.py`, passing `filtered_df`.
- 3. The chart function receives a lightweight df β€” it has no text columns but has all the columns listed in `dashboard_query`.
 
- ### Add a new filter to the Dashboard sidebar
- 1. Add the widget in `app.py` under the "Global Filters" section.
- 2. Store the selected value in the `global_filters` dict under `st.session_state`.
- 3. Pass it to `data_loader.apply_filters()`.
 
- ### Change what the Sentiment Analysis page queries
- - Edit `_build_sa_content_query()` and/or `_build_sa_comments_query()` in `data_loader.py`.
- - If you add new columns to the content aggregation result, also update `_process_sa_content_stats()` so they are available in `contents_df`.
- - If you add new columns to the comments result, update `_process_sa_comments()`.
 
- ### Change what the Reply Required page queries
- - Edit `_build_rr_query()` in `data_loader.py`.
- - Remember: all column references inside the social media block (which has a `JOIN`) must be prefixed with `s.` to avoid Snowflake ambiguity errors.
 
  ### Change the cache duration
- - `@st.cache_data(ttl=86400)` is set on `load_dashboard_data`, `_fetch_sa_data`, `_fetch_rr_data`, and `load_demographics_data`.
- - Change `86400` (seconds) to the desired TTL, or set `ttl=None` for no expiry.
- - Users can always force a refresh with the "Reload Data" button in the sidebar (which calls `st.cache_data.clear()` and deletes `st.session_state['dashboard_df']`).
 
  ### Add a new page
- 1. Create `components/new_page.py` with a `render_new_page(data_loader)` function.
  2. Import and add a radio option in `app.py`.
- 3. If the page needs its own Snowflake data, add a `load_new_page_data()` method to `SentimentDataLoader` following the same pattern as `load_sa_data`.
 
- ### Add a new column to the Dashboard query
- - Edit `dashboard_query` in `config/viz_config.json`.
- - Both UNION branches must select the same columns in the same order.
- - `_process_dashboard_dataframe()` in `data_loader.py` handles basic type casting β€” add processing there if needed.
 
  ---
 
@@ -280,6 +411,8 @@ SNOWFLAKE_ROLE
  SNOWFLAKE_DATABASE
  SNOWFLAKE_WAREHOUSE
  SNOWFLAKE_SCHEMA
  ```
 
  ---
@@ -291,19 +424,25 @@ SNOWFLAKE_SCHEMA
  | Section | What it configures |
  |---------|-------------------|
  | `color_schemes.sentiment_polarity` | Hex colors for each sentiment level |
- | `color_schemes.intent` | Hex colors for each intent label |
- | `color_schemes.platform` | Hex colors for each platform |
- | `color_schemes.brand` | Hex colors for each brand |
- | `sentiment_order` | Display order for sentiment categories in charts |
  | `intent_order` | Display order for intent categories |
  | `negative_sentiments` | Which sentiment values count as "negative" |
- | `dashboard.default_date_range_days` | Default date filter window (days) |
- | `dashboard.max_comments_display` | Max comments shown per pagination page |
- | `dashboard.chart_height` | Default Plotly chart height |
- | `dashboard.top_n_contents` | Default top-N for content ranking |
- | `snowflake.query` | Full query (legacy, all columns) |
- | `snowflake.dashboard_query` | Lightweight dashboard query (no text columns) |
- | `snowflake.demographics_query` | Demographics join query |
  | `demographics.age_groups` | Age bucket definitions (label β†’ [min, max]) |
  | `demographics.experience_groups` | Experience bucket definitions |
  | `demographics.top_timezones_count` | How many timezones to show in the geographic chart |
  # Musora Sentiment Analysis Dashboard
 
+ A Streamlit dashboard for visualising sentiment analysis results from **social media comments** (Facebook, Instagram, YouTube, Twitter), the **Musora internal app**, and **HelpScout customer support conversations** across brands (Drumeo, Pianote, Guitareo, Singeo, Musora).
 
  ---
 
  4. [Pages](#pages)
  5. [Global Filters & Session State](#global-filters--session-state)
  6. [Snowflake Queries](#snowflake-queries)
+ 7. [Authentication](#authentication)
+ 8. [PDF Reports](#pdf-reports)
+ 9. [AI Agents](#ai-agents)
+ 10. [Adding or Changing Things](#adding-or-changing-things)
+ 11. [Running the App](#running-the-app)
+ 12. [Configuration Reference](#configuration-reference)
 
  ---
 
  ```
  visualization/
+ β”œβ”€β”€ app.py                          # Entry point β€” routing, sidebar, session state
  β”œβ”€β”€ config/
+ β”‚   └── viz_config.json             # Colors, query strings, dashboard settings
  β”œβ”€β”€ data/
+ β”‚   β”œβ”€β”€ data_loader.py              # Comment Snowflake queries and caching
+ β”‚   └── helpscout_data_loader.py    # HelpScout Snowflake queries and caching
  β”œβ”€β”€ utils/
+ β”‚   β”œβ”€β”€ auth.py                     # Login page, authentication helpers
+ β”‚   β”œβ”€β”€ data_processor.py           # Pandas aggregations (intent dist, content summary, etc.)
+ β”‚   β”œβ”€β”€ metrics.py                  # KPI calculations (sentiment score, urgency, etc.)
+ β”‚   β”œβ”€β”€ pdf_exporter.py             # DashboardPDFExporter (comment dashboard PDF)
+ β”‚   β”œβ”€β”€ helpscout_utils.py          # Pure helpers: parse_topics, explode_topics, boolean_flag_counts
+ β”‚   └── helpscout_pdf.py            # HelpScoutDashboardPDF + HelpScoutAnalysisPDF
  β”œβ”€β”€ components/
+ β”‚   β”œβ”€β”€ dashboard.py                # Comment Dashboard page renderer
+ β”‚   β”œβ”€β”€ sentiment_analysis.py       # Sentiment Analysis page renderer
+ β”‚   β”œβ”€β”€ reply_required.py           # Reply Required page renderer
+ β”‚   β”œβ”€β”€ helpscout_dashboard.py      # HelpScout Dashboard page + compact summary widget
+ β”‚   └── helpscout_analysis.py       # HelpScout Analysis page (filter β†’ fetch β†’ charts β†’ LLM β†’ PDF)
  β”œβ”€β”€ visualizations/
+ β”‚   β”œβ”€β”€ sentiment_charts.py         # Plotly sentiment chart functions
+ β”‚   β”œβ”€β”€ distribution_charts.py      # Plotly distribution / heatmap / scatter functions
+ β”‚   β”œβ”€β”€ demographic_charts.py       # Plotly demographic chart functions
+ β”‚   β”œβ”€β”€ content_cards.py            # Streamlit card components (comment + content cards)
+ β”‚   └── helpscout_charts.py         # HelpScoutCharts Plotly factory (16 chart types)
  β”œβ”€β”€ agents/
+ β”‚   β”œβ”€β”€ base_agent.py               # BaseVisualizationAgent (shared interface)
+ β”‚   β”œβ”€β”€ content_summary_agent.py    # AI analysis for comment content summarisation
+ β”‚   └── helpscout_summary_agent.py  # HelpScoutSummaryAgent β€” page-level LLM summary from SUMMARY fields
  β”œβ”€β”€ img/
+ β”‚   └── musora.png                  # Sidebar logo
+ └── SnowFlakeConnection.py          # Snowflake connection wrapper (Snowpark session)
  ```
 
  ---
 
  ```
  Snowflake
  β”‚
+ β”œβ”€β”€ data_loader.py (SentimentDataLoader)
+ β”‚   β”œβ”€β”€ load_dashboard_data() ──► st.session_state['dashboard_df']
+ β”‚   β”‚       └─► sidebar (filter options, counts)
+ β”‚   β”‚       └─► dashboard.py (all charts)
+ β”‚   β”œβ”€β”€ load_sa_data() ──► st.session_state['sa_contents', 'sa_comments']
+ β”‚   β”‚       (on-demand, Fetch button) └─► sentiment_analysis.py
+ β”‚   └── load_reply_required_data() ──► st.session_state['rr_df']
+ β”‚           (on-demand, Fetch button) └─► reply_required.py
  β”‚
+ └── helpscout_data_loader.py (HelpScoutDataLoader)
+     β”œβ”€β”€ load_dashboard_data() ──► st.session_state['helpscout_df']
+     β”‚       └─► helpscout_dashboard.py
+     β”‚       └─► dashboard.py (compact summary)
+     └── load_analysis_data() ──► st.session_state['hs_analysis_df']
+             (on-demand, Fetch button) └─► helpscout_analysis.py
  ```
 
  **Key principle:** Data is loaded as little as possible, as late as possible.
 
+ - **Dashboard** queries are lightweight (no text columns, no content join) and cached 24 hours.
+ - **Sentiment Analysis**, **Reply Required**, and **HelpScout Analysis** pages wait for the user to click **Fetch Data**.
+ - All data lives in `st.session_state` so page navigation and widget interactions never re-trigger Snowflake queries.
 
  ---
 
  ## Data Loading Strategy
 
+ ### Comment data (`data/data_loader.py` β€” `SentimentDataLoader`)
 
+ #### `load_dashboard_data()`
  - Fetches only: `comment_sk, content_sk, platform, brand, sentiment_polarity, intent, requires_reply, detected_language, comment_timestamp, processed_at, author_id`.
+ - No text columns, no `DIM_CONTENT` join.
+ - Merges demographics data if `demographics_query` is configured.
+ - Cached **24 hours**. Called once at startup; stored in `st.session_state['dashboard_df']`.
 
+ #### `load_sa_data(platform, brand, top_n, min_comments, sort_by, sentiments, intents, emotions, date_range)`
+ - Runs two Snowflake queries:
  1. **Content aggregation** β€” groups by `content_sk`, counts per sentiment, computes severity score, returns top N.
+ 2. **Sampled comments** β€” up to 50 per sentiment group per content (`QUALIFY ROW_NUMBER() <= 50`). `display_text` computed in SQL.
+ - Returns `(contents_df, comments_df)`. Cached **24 hours**.
+
+ #### `load_reply_required_data(platforms, brands, date_range)`
+ - Filters `REQUIRES_REPLY = TRUE`. Conditionally includes the social media table and/or musora table. Cached **24 hours**.
+
+ #### SQL column qualification note
+ The social media table and `DIM_CONTENT` share column names. Any `WHERE` clause inside a query that joins them **must** use the table alias prefix (e.g. `s.PLATFORM`, `s.COMMENT_TIMESTAMP`) to avoid Snowflake `ambiguous column name` errors.
+
+ ---
+
+ ### HelpScout data (`data/helpscout_data_loader.py` β€” `HelpScoutDataLoader`)
+
+ #### `load_dashboard_data()`
+ - Lightweight query from `SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES`.
+ - Columns: `conversation_id, status, source, created_at, updated_at, duration_hours, sentiment_polarity, topics, is_refund_request, is_cancellation, is_membership, customer_email`.
+ - Merges demographics (age/timezone/experience) via email join (`LOWER(customer_email) = LOWER(usora_users.email)`).
+ - Cached **24 hours**. Stored in `st.session_state['helpscout_df']`.
 
+ #### `load_analysis_data(date_start, date_end, topics, sentiments, statuses, sources, is_refund, is_cancellation, is_membership)`
+ - Adds `summary, sentiment_notes, topic_notes, customer_first_name, customer_last_name` columns.
+ - SQL `WHERE` pushdown for all filters; the multi-label topic filter uses `ARRAY_CONTAINS('topic_id'::VARIANT, SPLIT(TOPICS, ','))` (see the sketch after this list).
+ - Cached **24 hours** keyed on filter tuple. Stored in `st.session_state['hs_analysis_df']`.
 
+ #### `get_filter_options(df)`
+ - Returns `sentiments`, `topics` (exploded and label-mapped from taxonomy), `statuses`, `states`, `sources`.
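
To make the `ARRAY_CONTAINS` pushdown concrete, here is a hedged Python sketch of how such a clause can be assembled. The helper name and topic ids are hypothetical β€” the real clause is built inside `_build_analysis_query()`:

```python
def topic_filter_sql(topic_ids: list[str]) -> str:
    """Hypothetical helper: OR-join ARRAY_CONTAINS clauses against the
    comma-separated TOPICS column. Illustration only, not the loader's code."""
    if not topic_ids:
        return ""  # no topic filter selected
    clauses = [
        f"ARRAY_CONTAINS('{tid}'::VARIANT, SPLIT(TOPICS, ','))"
        for tid in topic_ids
    ]
    return "AND (" + " OR ".join(clauses) + ")"

# topic_filter_sql(["refund", "cancellation"]) yields roughly:
# AND (ARRAY_CONTAINS('refund'::VARIANT, SPLIT(TOPICS, ','))
#      OR ARRAY_CONTAINS('cancellation'::VARIANT, SPLIT(TOPICS, ',')))
```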
 
  ---
 
  ## Pages
 
+ The app has **5 pages** navigated via the sidebar radio:
 
+ ### 1. Sentiment Dashboard (`components/dashboard.py`)
 
+ **Receives:** `filtered_df` β€” lightweight comment dataframe (after optional global filter from `app.py`).
 
  **Key sections:**
  - Summary stats + health indicator
  - Sentiment distribution (pie + gauge)
  - Sentiment by brand and platform (stacked + percentage bar charts)
+ - Intent analysis (bar + pie)
+ - Emotion analysis (bar + pie) β€” only when `emotions` column is non-null
+ - Brand–Platform heatmap
  - Reply requirements + urgency breakdown
+ - Demographics (age, timezone, experience) β€” only when demographics were merged
+ - **HelpScout compact summary** β€” appended at bottom; reads `st.session_state['helpscout_df']` directly (guarded by `try/except` so failures never break the main dashboard)
 
  ---
 
+ ### 2. Custom Sentiment Queries (`components/sentiment_analysis.py`)
 
+ **Receives:** `data_loader` instance only.
 
  **Flow:**
+ 1. Reads `st.session_state['dashboard_df']` for filter option lists.
  2. Pre-populates platform/brand dropdowns from `st.session_state['global_filters']`.
+ 3. On **Fetch Data**: calls `data_loader.load_sa_data(...)`, stores results in `st.session_state['sa_contents']` and `['sa_comments']`.
+ 4. Renders content cards, per-content sentiment + intent + emotion charts, AI analysis buttons, sampled comment expanders.
 
  **Pagination:** `st.session_state['sentiment_page']` (5 contents per page). Reset on new fetch. The same fetch-and-stale-check idiom is shared by all on-demand pages; a sketch follows.
 
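A minimal sketch of that idiom, assuming hypothetical widget variables β€” see the actual page renderers for the real code:

```python
import streamlit as st

# Hedged sketch β€” not the actual page code. The current widget state is folded
# into a tuple; results are cached in session_state alongside the tuple, so a
# later filter change can be detected as "stale" without re-querying Snowflake.
platforms = st.multiselect("Platforms", ["facebook", "instagram", "youtube", "twitter"])
brands = st.multiselect("Brands", ["drumeo", "pianote", "guitareo", "singeo"])
fetch_key = (tuple(platforms), tuple(brands))  # one entry per filter widget

if st.button("Fetch Data"):
    contents_df, comments_df = data_loader.load_sa_data(  # data_loader assumed in scope
        platform=platforms, brand=brands,                 # remaining args omitted
    )
    st.session_state["sa_contents"] = contents_df
    st.session_state["sa_comments"] = comments_df
    st.session_state["sa_fetch_key"] = fetch_key
    st.session_state["sentiment_page"] = 0  # reset pagination on new fetch

if st.session_state.get("sa_fetch_key") not in (None, fetch_key):
    st.warning("Filters changed since the last fetch β€” click Fetch Data to refresh.")
```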
  ---
 
+ ### 3. Reply Required (`components/reply_required.py`)
 
  **Receives:** `data_loader` instance only.
 
  **Flow:**
+ 1. Pre-populates platform/brand/date from `st.session_state['global_filters']`.
+ 2. On **Fetch Data**: calls `data_loader.load_reply_required_data(...)`, stores result in `st.session_state['rr_df']`.
+ 3. Shows urgency breakdown, in-page filters (applied in Python, no extra query), paginated comment cards, and "Reply by Content" summary.
 
  **Pagination:** `st.session_state['reply_page']` (10 comments per page). Reset on new fetch.
 
  ---
 
+ ### 4. HelpScout Dashboard (`components/helpscout_dashboard.py`)
+
+ **Receives:** `helpscout_loader` instance.
+
+ **Reads from:** `st.session_state['helpscout_df']` (loaded at app startup).
+
+ **Key sections:**
+ - PDF export button (HelpScout Dashboard PDF)
+ - 6 KPI metrics: total conversations, average duration, refund requests, cancellations, negative rate, membership joins
+ - Sentiment distribution (pie + bar)
+ - Topic distribution and sentiment heatmap (from `process_helpscout/config_files/topics.json` taxonomy)
+ - Boolean flags (refund, cancellation, membership) breakdown
+ - Status and source breakdown
+ - Timelines expander (daily conversation volume, refund/cancel trend)
+ - Depth expander (topic co-occurrence, escalation funnel)
+ - Demographics (age, timezone, experience)
+
+ > **Note:** Global sidebar filters (brand, platform, sentiment, date) do **not** apply to HelpScout pages β€” HelpScout is brand-agnostic and uses its own filter panel.
+
+ ---
+
+ ### 5. HelpScout Analysis (`components/helpscout_analysis.py`)
+
+ **Receives:** `helpscout_loader` instance.
+
+ **Flow:**
+ 1. **Filter panel** β€” date range, top_n, topics (multi-select with human-readable labels), sentiments, statuses, sources, and 3 boolean checkboxes (refund / cancellation / membership).
+ 2. **Fetch Data** button β€” calls `helpscout_loader.load_analysis_data(...)`, stale-checked via `fetch_key` tuple.
+ 3. **KPI row** + distribution charts (sentiment, topics, flags, status).
+ 4. **AI Summary section:**
+    - "Generate AI Summary" button β†’ calls `HelpScoutSummaryAgent`, stores result in `st.session_state['hs_analysis_summary']`.
+    - Renders: executive summary, top themes, top complaints, unexpected insights, notable quotes.
+    - "Export Analysis PDF" button β†’ generates `HelpScoutAnalysisPDF`.
+ 5. **Paginated conversation cards** β€” 10 per page; each card shows customer name, status, topics (label-mapped), summary, sentiment/topic notes.
+ 6. **CSV export** button.
+
+ **Pagination:** `st.session_state['hs_analysis_page']`. Reset on new fetch.
+
+ **Date range default:** Clamps to `max(min_date, max_date βˆ’ default_date_range_days)` so the default is always within the available data window.
+
+ ---
+
  ## Global Filters & Session State
 
+ Global filters apply **only to comment pages** (Dashboard, Sentiment Analysis, Reply Required). They have no effect on HelpScout pages.
 
  ```python
+ st.session_state['global_filters'] = {
+     'platforms': ['facebook', 'instagram'],
      'brands': ['drumeo'],
      'sentiments': [],
      'date_range': (date(2025, 1, 1), date(2025, 12, 31)),  # or None
  }
  ```
 
  ### Full session state key reference
 
  | Key | Set by | Used by |
  |-----|--------|---------|
+ | `dashboard_df` | `app.py` startup | sidebar, dashboard.py, SA + RR filter lists |
+ | `global_filters` | sidebar "Apply Filters" | app.py (dashboard filter), SA + RR pre-populate |
+ | `filters_applied` | sidebar buttons | app.py |
+ | `sa_contents` | SA fetch button | sentiment_analysis.py |
+ | `sa_comments` | SA fetch button | sentiment_analysis.py |
+ | `sa_fetch_key` | SA fetch button | SA stale-check |
+ | `rr_df` | RR fetch button | reply_required.py |
+ | `rr_fetch_key` | RR fetch button | RR stale-check |
  | `sentiment_page` | SA page / fetch | SA pagination |
  | `reply_page` | RR page / fetch | RR pagination |
+ | `content_summaries` | SA AI buttons | SA AI analysis display |
+ | `helpscout_df` | `app.py` startup | helpscout_dashboard.py, dashboard.py compact summary |
+ | `hs_analysis_df` | HS Analysis fetch | helpscout_analysis.py charts + cards |
+ | `hs_analysis_fetch_key` | HS Analysis fetch | HS Analysis stale-check |
+ | `hs_analysis_filter_desc` | HS Analysis fetch | human-readable filter string for PDF + agent |
+ | `hs_analysis_summary` | "Generate AI Summary" | HS Analysis summary renderer |
+ | `hs_analysis_summary_key` | "Generate AI Summary" | invalidated on re-fetch |
+ | `hs_analysis_page` | HS Analysis page / fetch | HS Analysis pagination |
 
  ---
 
  ## Snowflake Queries
 
+ ### Comment tables
+
+ | Table | Platform | Notes |
+ |-------|----------|-------|
+ | `SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES` | facebook, instagram, youtube, twitter | Needs `LEFT JOIN DIM_CONTENT` for `PERMALINK_URL` |
+ | `SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES` | musora_app | Has `PERMALINK_URL` and `THUMBNAIL_URL` natively |
+
+ ### HelpScout table
+
+ | Table | Notes |
+ |-------|-------|
+ | `SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES` | One row per conversation; multi-label topics in comma-separated `TOPICS` column |
 
  ### Static queries (in `viz_config.json`)
 
  | Key | Purpose |
  |-----|---------|
+ | `dashboard_query` | Lightweight comment query β€” no text, no DIM_CONTENT join |
+ | `demographics_query` | Joins `usora_users` + `preprocessed.users` for age/timezone/experience |
+ | `helpscout.dashboard_query` | Lightweight HelpScout query (no SUMMARY/notes) |
+ | `helpscout.demographics_query` | Same demographics join, keyed on `customer_email` |
 
+ ### Dynamic queries (built in `helpscout_data_loader.py`)
 
  | Method | Description |
  |--------|-------------|
+ | `_build_analysis_query()` | Full HelpScout query including SUMMARY/notes; multi-label topic filter via `ARRAY_CONTAINS` |
 
+ ---
+
+ ## Authentication
+
+ Module: `utils/auth.py`
+
+ - `AUTHORIZED_EMAILS` allowlist + `APP_TOKEN` env var.
+ - `render_login_page()` renders the login form and calls `st.stop()` when not authenticated.
+ - Gate is placed at the top of `app.py` (after `st.set_page_config`, before data loaders).
+ - Current user and logout button are shown in the sidebar.
+
+ **Required env vars:**
+ ```
+ APP_TOKEN=<shared token>
+ ```
+
+ ---
+
+ ## PDF Reports
+
+ ### Comment Dashboard PDF (`utils/pdf_exporter.py` β€” `DashboardPDFExporter`)
+
+ Generated from the "Export PDF Report" expander at the top of the Dashboard page.
+
+ Sections: cover, executive summary, sentiment, brand, platform, intent, cross-dimensional, volume, reply requirements, demographics (optional), language (optional), HelpScout summary (if data loaded), data summary.
+
+ ### HelpScout Dashboard PDF (`utils/helpscout_pdf.py` β€” `HelpScoutDashboardPDF`)
+
+ Generated from the HelpScout Dashboard page. Sections: cover, KPI summary, sentiment, topics, flags & escalation, status & source, timelines, demographics.
+
+ ### HelpScout Analysis PDF (`utils/helpscout_pdf.py` β€” `HelpScoutAnalysisPDF`)
+
+ Generated from the "Export Analysis PDF" button on the HelpScout Analysis page (only available after an AI Summary has been generated).
+
+ Sections: cover, filter summary, KPI summary, chart snapshots, AI summary (executive summary, top themes, top complaints, unexpected insights, notable quotes), conversation cards sample, metadata.
+
+ **Dependencies:** `fpdf2`, `kaleido` (for Plotly PNG rendering at 3Γ— scale).
+
+ ---
+
+ ## AI Agents
+
+ ### `ContentSummaryAgent` (`agents/content_summary_agent.py`)
+
+ Summarises sampled comments for a single content item on the Sentiment Analysis page. Called per-content when the user clicks the AI analysis button. Results cached in `st.session_state['content_summaries']`.
+
+ ### `HelpScoutSummaryAgent` (`agents/helpscout_summary_agent.py`)
+
+ Produces a **page-level** executive report from the filtered HelpScout conversations by reading their pre-extracted `SUMMARY` fields through an LLM.
+
+ - Stratified sample by `sentiment_polarity` β€” capped at `max_conversations` (default 300).
+ - Builds aggregate context: sentiment breakdown, top topics, flag counts, average duration, then per-conversation summaries (capped at 250 chars each).
+ - Prompt asks the LLM to surface patterns **beyond** the pre-tagged topics/sentiments.
+ - Output structure:
+
+ ```json
+ {
+   "executive_summary": "...",
+   "top_themes": [{"theme": "...", "description": "...", "prevalence": "..."}],
+   "top_complaints": ["..."],
+   "unexpected_insights": ["..."],
+   "notable_quotes": ["..."]
+ }
+ ```
+
+ - Uses `LLMHelper.get_structured_completion()` with up to 3 retries.
 
  ---
 
  ## Adding or Changing Things
 
+ ### Add a new chart to the Comment Dashboard
  1. Write the chart function in the appropriate `visualizations/` file.
+ 2. Call it from `render_dashboard()` in `components/dashboard.py`.
 
+ ### Add a new chart to the HelpScout Dashboard
+ 1. Add the chart method to `HelpScoutCharts` in `visualizations/helpscout_charts.py`.
+ 2. Call it from `render_helpscout_dashboard()` in `components/helpscout_dashboard.py`.
 
+ ### Add a new HelpScout filter
+ 1. Add the widget to the filter panel in `helpscout_analysis.py`.
+ 2. Include the new value in the `fetch_key` tuple.
+ 3. Add the corresponding `WHERE` clause condition to `_build_analysis_query()` in `helpscout_data_loader.py`.
 
+ ### Add a new HelpScout topic
+ - Edit `process_helpscout/config_files/topics.json` (the taxonomy file).
+ - `helpscout_utils.load_topic_taxonomy()` reloads it on each app start; no other changes needed.
 
  ### Change the cache duration
+ `@st.cache_data(ttl=86400)` appears on `load_dashboard_data`, `_fetch_sa_data`, `_fetch_rr_data`, `load_demographics_data`, and their HelpScout equivalents. Change `86400` to the desired TTL. Users can always force a refresh with "Reload Data" in the sidebar.
 
  ### Add a new page
+ 1. Create `components/new_page.py` with a `render_new_page(...)` function.
  2. Import and add a radio option in `app.py`.
+ 3. Add data loading to the appropriate loader class.
+ 4. If the page should be excluded from global comment filters, extend the `_hs_page` guard in `app.py`.
 
+ ### Change what the Sentiment Analysis page queries
+ - Edit `_build_sa_content_query()` and/or `_build_sa_comments_query()` in `data_loader.py`.
+ - Update `_process_sa_content_stats()` and/or `_process_sa_comments()` for new columns.
 
  ---
 
  SNOWFLAKE_DATABASE
  SNOWFLAKE_WAREHOUSE
  SNOWFLAKE_SCHEMA
+ OPENAI_API_KEY
+ APP_TOKEN
  ```
 
  ---
 
  | Section | What it configures |
  |---------|-------------------|
  | `color_schemes.sentiment_polarity` | Hex colors for each sentiment level |
+ | `color_schemes.intent` | Hex colors per intent label |
+ | `color_schemes.emotion` | Hex colors per emotion label |
+ | `color_schemes.platform` | Hex colors per platform |
+ | `color_schemes.brand` | Hex colors per brand |
+ | `color_schemes_helpscout.topics` | Hex colors for HelpScout topic bars |
+ | `color_schemes_helpscout.status` | Hex colors for conversation status values |
+ | `color_schemes_helpscout.boolean_flags` | Hex colors for refund/cancellation/membership flags |
+ | `sentiment_order` | Display order for sentiment categories |
  | `intent_order` | Display order for intent categories |
+ | `emotion_order` | Display order for emotion categories |
  | `negative_sentiments` | Which sentiment values count as "negative" |
+ | `dashboard.default_date_range_days` | Default date filter window for comment pages |
+ | `helpscout.default_date_range_days` | Default date filter window for HelpScout Analysis |
+ | `helpscout.max_summary_conversations` | Cap on conversations sent to LLM summary agent |
+ | `helpscout.escalation_sentiments` | Sentiment values that count as escalation |
+ | `snowflake.dashboard_query` | Lightweight comment dashboard query |
+ | `snowflake.demographics_query` | Demographics join query (comment pages) |
+ | `helpscout.dashboard_query` | Lightweight HelpScout dashboard query |
+ | `helpscout.demographics_query` | Demographics join query (HelpScout, keyed on email) |
  | `demographics.age_groups` | Age bucket definitions (label β†’ [min, max]) |
  | `demographics.experience_groups` | Experience bucket definitions |
  | `demographics.top_timezones_count` | How many timezones to show in the geographic chart |
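
All of the keys above are plain JSON, so chart and loader code can read them directly. A minimal sketch, assuming the keys listed in the table (config is normally loaded centrally, not per chart function):

```python
import json
from pathlib import Path

# Hedged sketch β€” path and access pattern assume the config layout documented above.
config = json.loads(Path("visualization/config/viz_config.json").read_text())

sentiment_colors = config["color_schemes"]["sentiment_polarity"]   # label β†’ hex
hs_topic_colors = config["color_schemes_helpscout"]["topics"]
hs_default_days = config["helpscout"]["default_date_range_days"]
```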
visualization/agents/helpscout_summary_agent.py ADDED
@@ -0,0 +1,309 @@
+ """
+ HelpScout Summary Agent
+ Generates a page-level summary report from filtered HelpScout conversations.
+ Analyses the already-extracted SUMMARY fields to surface patterns and insights
+ beyond the pre-tagged topics / sentiments.
+ """
+ import sys
+ from pathlib import Path
+ from typing import Any, Dict
+
+ import pandas as pd
+
+ # Ensure visualization/ is on sys.path so agents.*, utils.* imports resolve
+ _parent = Path(__file__).resolve().parent.parent
+ if str(_parent) not in sys.path:
+     sys.path.insert(0, str(_parent))
+
+ from agents.base_agent import BaseVisualizationAgent
+ from utils.llm_helper import LLMHelper
+ from utils.helpscout_utils import topic_label, load_topic_taxonomy
+
+
+ class HelpScoutSummaryAgent(BaseVisualizationAgent):
+     """
+     Produces an executive summary report from a filtered set of HelpScout
+     conversations by reading their SUMMARY fields through an LLM.
+     """
+
+     MAX_SUMMARY_CHARS = 250  # per conversation summary sent to LLM
+
+     def __init__(self, model: str = "gpt-5-nano", temperature: float = 1,
+                  max_conversations: int = 300):
+         super().__init__(name="HelpScoutSummaryAgent", model=model, temperature=temperature)
+         self.llm_helper = LLMHelper(model=model, temperature=temperature)
+         self.max_conversations = max_conversations
+         self.taxonomy = load_topic_taxonomy()
+
+     # ─────────────────────────────────────────────────────────────
+     # BaseVisualizationAgent interface
+     # ─────────────────────────────────────────────────────────────
+
+     def validate_input(self, input_data: Dict[str, Any]) -> bool:
+         if "conversations" not in input_data:
+             self.log_processing("Missing 'conversations' key", level="error")
+             return False
+         if not isinstance(input_data["conversations"], pd.DataFrame):
+             self.log_processing("'conversations' must be a DataFrame", level="error")
+             return False
+         if "summary" not in input_data["conversations"].columns:
+             self.log_processing("DataFrame must contain a 'summary' column", level="error")
+             return False
+         return True
+
+     def process(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Generate an aggregate summary report from filtered HelpScout conversations.
+
+         Args:
+             input_data: {
+                 'conversations': pd.DataFrame (must have 'summary' column),
+                 'filter_description': str (human-readable applied filters),
+                 'max_conversations': int (optional; overrides instance default),
+             }
+
+         Returns:
+             {
+                 'success': bool,
+                 'summary': {
+                     'executive_summary': str,
+                     'top_themes': [{'theme': str, 'description': str, 'prevalence': str}],
+                     'top_complaints': [str],
+                     'unexpected_insights': [str],
+                     'notable_quotes': [str],
+                 },
+                 'metadata': {
+                     'total_conversations_analyzed': int,
+                     'model_used': str,
+                     'tokens_used': int,
+                     'filter_applied': str,
+                 },
+                 'error': str | None,
+             }
+         """
+         try:
+             if not self.validate_input(input_data):
+                 return {"success": False, "error": "Invalid input data", "summary": None}
+
+             df = input_data["conversations"]
+             filter_desc = input_data.get("filter_description", "No filters applied")
+             max_convs = input_data.get("max_conversations", self.max_conversations)
+
+             total_available = len(df)
+
+             if total_available == 0:
+                 return self._empty_result(filter_desc)
+
+             # Sample if over cap β€” stratified by sentiment to preserve signal
+             df_sample = self._stratified_sample(df, max_convs)
+             n_analyzed = len(df_sample)
+
+             self.log_processing(
+                 f"Analysing {n_analyzed} of {total_available} conversations"
+                 f" (filter: {filter_desc[:60]})"
+             )
+
+             # Build aggregate context for the LLM
+             agg_context = self._build_aggregate_context(df_sample, df)
+             prompt = self._build_prompt(agg_context, filter_desc, n_analyzed)
+
+             system_msg = (
+                 "You are an expert customer support analyst for Musora, "
+                 "a music education platform (Drumeo, Pianote, Guitareo, Singeo, PlayBass). "
+                 "Your role is to synthesize customer support conversation summaries "
+                 "and surface actionable insights that go beyond simple tagging."
+             )
+
+             response = self.llm_helper.get_structured_completion(
+                 prompt=prompt,
+                 system_message=system_msg,
+                 max_retries=3,
+             )
+
+             if not response["success"]:
+                 return self.handle_error(
+                     Exception(response.get("error", "LLM call failed")),
+                     context=f"filter={filter_desc[:60]}"
+                 )
+
+             summary = response["content"]
+             summary = self._ensure_defaults(summary)
+
+             return {
+                 "success": True,
+                 "summary": summary,
+                 "metadata": {
+                     "total_conversations_analyzed": n_analyzed,
+                     "total_available": total_available,
+                     "model_used": response["model"],
+                     "tokens_used": response["usage"]["total_tokens"],
+                     "filter_applied": filter_desc,
+                 },
+                 "error": None,
+             }
+
+         except Exception as e:
+             return self.handle_error(e, context=input_data.get("filter_description", ""))
+
+     # ─────────────────────────────────────────────────────────────
+     # Private helpers
+     # ─────────────────────────────────────────────────────────────
+
+     def _stratified_sample(self, df: pd.DataFrame, cap: int) -> pd.DataFrame:
+         """Stratified sample by sentiment to keep signal diversity."""
+         if len(df) <= cap:
+             return df
+         try:
+             strat_col = "sentiment_polarity"
+             if strat_col in df.columns and df[strat_col].nunique() > 1:
+                 # Proportional allocation per sentiment group
+                 groups = df.groupby(strat_col, group_keys=False)
+                 sampled = groups.apply(
+                     lambda g: g.sample(
+                         n=max(1, int(cap * len(g) / len(df))),
+                         random_state=42,
+                     )
+                 )
+                 return sampled.head(cap)
+         except Exception:
+             pass
+         return df.sample(n=cap, random_state=42)
+
+     def _build_aggregate_context(self, df_sample: pd.DataFrame,
+                                  df_full: pd.DataFrame) -> str:
+         """Build a text block with aggregate stats + conversation summaries."""
+         total = len(df_full)
+         n_sample = len(df_sample)
+
+         # Aggregate stats from the full filtered set
+         stats = []
+         if "sentiment_polarity" in df_full.columns:
+             sent_counts = df_full["sentiment_polarity"].value_counts()
+             sent_pct = (sent_counts / total * 100).round(1)
+             stats.append("Sentiment breakdown: " +
+                          ", ".join(f"{s} {pct}%" for s, pct in sent_pct.items()))
+
+         if "topics" in df_full.columns:
+             from utils.helpscout_utils import explode_topics
+             exploded = explode_topics(df_full)
+             if not exploded.empty:
+                 top_topics = exploded["topic_id"].value_counts().head(8)
+                 topic_strs = [f"{topic_label(t, self.taxonomy)} ({c})" for t, c in top_topics.items()]
+                 stats.append("Top topics: " + ", ".join(topic_strs))
+
+         from utils.helpscout_utils import boolean_flag_counts
+         flags = boolean_flag_counts(df_full)
+         flag_parts = []
+         if flags["is_refund_request"]:
+             flag_parts.append(f"Refund requests: {flags['is_refund_request']}")
+         if flags["is_cancellation"]:
+             flag_parts.append(f"Cancellations: {flags['is_cancellation']}")
+         if flags["is_membership"]:
+             flag_parts.append(f"Membership joins: {flags['is_membership']}")
+         if flag_parts:
+             stats.append(", ".join(flag_parts))
+
+         if "duration_hours" in df_full.columns:
+             avg_dur = df_full["duration_hours"].mean()
+             stats.append(f"Average conversation duration: {avg_dur:.1f} hours")
+
+         stats_block = "\n".join(stats)
+
+         # Individual summaries (capped per conversation); skip rows with no summary
+         summaries = []
+         for i, row in enumerate(df_sample.itertuples(), 1):
+             s = str(getattr(row, "summary", None) or "").strip()
+             if not s:
+                 continue
+             s = s[:self.MAX_SUMMARY_CHARS] + ("…" if len(s) > self.MAX_SUMMARY_CHARS else "")
+             sent = getattr(row, "sentiment_polarity", "")
+             summaries.append(f"[{i}] ({sent}) {s}")
+
+         summaries_block = "\n".join(summaries) if summaries else "No summaries available."
+
+         note = (f"Note: Showing {n_sample} of {total} matched conversations."
+                 if n_sample < total else f"Showing all {total} matched conversations.")
+
+         return f"""=== AGGREGATE STATISTICS ===
+ {stats_block}
+ {note}
+
+ === CONVERSATION SUMMARIES ===
+ {summaries_block}"""
+
+     def _build_prompt(self, context: str, filter_desc: str,
+                       n_analyzed: int) -> str:
+         return f"""Analyze the following {n_analyzed} HelpScout customer support conversation summaries for Musora.
+
+ Applied filters: {filter_desc}
+
+ {context}
+
+ Your task: Synthesize these conversations and produce insights that go BEYOND the pre-extracted tags.
+ Look for underlying patterns, recurring pain points, emotional signals, product gaps, and operational issues
+ that would not be obvious from simple topic counts alone.
+
+ Respond in JSON with this exact structure:
+ {{
+     "executive_summary": "3-5 sentence high-level synthesis of what customers are experiencing",
+     "top_themes": [
+         {{
+             "theme": "Short theme name (not a topic tag)",
+             "description": "What customers are actually saying and feeling about this",
+             "prevalence": "Rough estimate: e.g. 'Appears in ~30% of conversations'"
+         }}
+     ],
+     "top_complaints": [
+         "Specific actionable complaint statement (not generic)"
+     ],
+     "unexpected_insights": [
+         "A pattern, contradiction, or insight that would surprise a product manager"
+     ],
+     "notable_quotes": [
+         "Paraphrased quote or representative statement from conversations (not verbatim)"
+     ]
+ }}
+
+ Guidelines:
+ - Top themes: 5-8 items, each distinct from pre-extracted topics
+ - Top complaints: 5-8 bullet points, specific and actionable
+ - Unexpected insights: 3-5 items, must genuinely go beyond the tag taxonomy
+ - Notable quotes: 3-5 representative paraphrases
+ - If a section has fewer relevant items, use fewer β€” quality over quantity
+ """
+
+     @staticmethod
+     def _ensure_defaults(summary: dict) -> dict:
+         defaults = {
+             "executive_summary": "",
+             "top_themes": [],
+             "top_complaints": [],
+             "unexpected_insights": [],
+             "notable_quotes": [],
+         }
+         for k, v in defaults.items():
+             if k not in summary:
+                 summary[k] = v
+         return summary
+
+     def _empty_result(self, filter_desc: str) -> dict:
+         return {
+             "success": True,
+             "summary": {
+                 "executive_summary": "No conversations matched the selected filters.",
+                 "top_themes": [],
+                 "top_complaints": [],
+                 "unexpected_insights": [],
+                 "notable_quotes": [],
+             },
+             "metadata": {
+                 "total_conversations_analyzed": 0,
+                 "total_available": 0,
+                 "model_used": self.model,
+                 "tokens_used": 0,
+                 "filter_applied": filter_desc,
+             },
+             "error": None,
+         }
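
A hedged usage sketch of the agent, mirroring how the README describes the HelpScout Analysis page calling it. `hs_analysis_df` stands in for the fetched dataframe; the actual call site is `components/helpscout_analysis.py`:

```python
# Illustration only β€” the page renderer owns the real wiring and session_state caching.
agent = HelpScoutSummaryAgent(max_conversations=300)

result = agent.process({
    "conversations": hs_analysis_df,  # DataFrame from load_analysis_data(); needs a 'summary' column
    "filter_description": "2025-01-01 β†’ 2025-03-31, sentiment=negative",
})

if result["success"]:
    summary = result["summary"]
    print(summary["executive_summary"])
    for theme in summary["top_themes"]:
        print(f"- {theme['theme']} ({theme['prevalence']})")
else:
    print("Summary failed:", result["error"])
```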
visualization/app.py CHANGED
@@ -14,9 +14,12 @@ parent_dir = Path(__file__).resolve().parent
  sys.path.append(str(parent_dir))
 
  from data.data_loader import SentimentDataLoader
  from components.dashboard import render_dashboard
  from components.sentiment_analysis import render_sentiment_analysis
  from components.reply_required import render_reply_required
  from utils.auth import check_authentication, render_login_page, logout, get_current_user
 
  # ── Load configuration ────────────────────────────────────────────────────────
@@ -38,15 +41,13 @@ st.set_page_config(
  if not check_authentication():
      render_login_page()
 
- # ── Single data-loader instance (cheap: just reads config) ────────────────────
  data_loader = SentimentDataLoader()
 
 
  def _ensure_dashboard_data():
-     """
-     Load dashboard data once and store in session_state.
-     Subsequent calls within the same session (or until cache expires) are free.
-     """
      if 'dashboard_df' not in st.session_state or st.session_state['dashboard_df'] is None:
          with st.spinner("Loading dashboard data…"):
              df = data_loader.load_dashboard_data()
@@ -54,6 +55,15 @@ def _ensure_dashboard_data():
      return st.session_state['dashboard_df']
 
 
  def main():
      # ── Sidebar ───────────────────────────────────────────────────────────────
      with st.sidebar:
@@ -72,15 +82,22 @@ def main():
 
          page = st.radio(
              "Select Page",
-             ["πŸ“Š Sentiment Dashboard", "πŸ” Custom Sentiment Queries", "πŸ’¬ Reply Required"],
              index=0
          )
 
          st.markdown("---")
          st.markdown("### πŸ” Global Filters")
 
-         # Load / retrieve dashboard data for filter options
          dashboard_df = _ensure_dashboard_data()
 
          if dashboard_df.empty:
              st.error("No data available. Please check your Snowflake connection.")
@@ -148,22 +165,27 @@ def main():
          if st.button("♻️ Reload Data", use_container_width=True):
              st.cache_data.clear()
              st.session_state.pop('dashboard_df', None)
              st.rerun()
 
          # Data info
          st.markdown("---")
          st.markdown("### ℹ️ Data Info")
-         st.info(f"**Total Records:** {len(dashboard_df):,}")
          if 'processed_at' in dashboard_df.columns and not dashboard_df.empty:
              last_update = dashboard_df['processed_at'].max()
              if hasattr(last_update, 'strftime'):
                  st.info(f"**Last Updated:** {last_update.strftime('%Y-%m-%d %H:%M')}")
 
-     # ── Build filtered dashboard_df for the Dashboard page ───────────────────
      filters_applied = st.session_state.get('filters_applied', False)
      global_filters = st.session_state.get('global_filters', {})
 
-     if filters_applied and global_filters:
          filtered_df = data_loader.apply_filters(
              dashboard_df,
              platforms=global_filters.get('platforms') or None,
@@ -190,6 +212,12 @@ def main():
      # RR page fetches its own data on demand; receives only data_loader
      render_reply_required(data_loader)
 
      # ── Footer ────────────────────────────────────────────────────────────────
      st.markdown("---")
      st.markdown(
  sys.path.append(str(parent_dir))
 
  from data.data_loader import SentimentDataLoader
+ from data.helpscout_data_loader import HelpScoutDataLoader
  from components.dashboard import render_dashboard
  from components.sentiment_analysis import render_sentiment_analysis
  from components.reply_required import render_reply_required
+ from components.helpscout_dashboard import render_helpscout_dashboard
+ from components.helpscout_analysis import render_helpscout_analysis
  from utils.auth import check_authentication, render_login_page, logout, get_current_user
 
  # ── Load configuration ────────────────────────────────────────────────────────
 
  if not check_authentication():
      render_login_page()
 
+ # ── Data loader instances (cheap: just read config) ───────────────────────────
  data_loader = SentimentDataLoader()
+ helpscout_loader = HelpScoutDataLoader()
 
 
  def _ensure_dashboard_data():
+     """Load comment dashboard data once and store in session_state."""
      if 'dashboard_df' not in st.session_state or st.session_state['dashboard_df'] is None:
          with st.spinner("Loading dashboard data…"):
              df = data_loader.load_dashboard_data()
      return st.session_state['dashboard_df']
 
 
+ def _ensure_helpscout_data():
+     """Load HelpScout dashboard data once and store in session_state."""
+     if 'helpscout_df' not in st.session_state or st.session_state['helpscout_df'] is None:
+         with st.spinner("Loading HelpScout data…"):
+             hs_df = helpscout_loader.load_dashboard_data()
+             st.session_state['helpscout_df'] = hs_df
+     return st.session_state['helpscout_df']
+
+
  def main():
      # ── Sidebar ───────────────────────────────────────────────────────────────
      with st.sidebar:
 
          page = st.radio(
              "Select Page",
+             [
+                 "πŸ“Š Sentiment Dashboard",
+                 "πŸ” Custom Sentiment Queries",
+                 "πŸ’¬ Reply Required",
+                 "🎧 HelpScout Dashboard",
+                 "πŸ”¬ HelpScout Analysis",
+             ],
              index=0
          )
 
          st.markdown("---")
          st.markdown("### πŸ” Global Filters")
 
+         # Load both data sources at startup
          dashboard_df = _ensure_dashboard_data()
+         _ensure_helpscout_data()
 
          if dashboard_df.empty:
              st.error("No data available. Please check your Snowflake connection.")
 
          if st.button("♻️ Reload Data", use_container_width=True):
              st.cache_data.clear()
              st.session_state.pop('dashboard_df', None)
+             st.session_state.pop('helpscout_df', None)
              st.rerun()
 
          # Data info
          st.markdown("---")
          st.markdown("### ℹ️ Data Info")
+         st.info(f"**Comments:** {len(dashboard_df):,}")
+         hs_df_info = st.session_state.get('helpscout_df')
+         if hs_df_info is not None and not hs_df_info.empty:
+             st.info(f"**HelpScout:** {len(hs_df_info):,} conversations")
          if 'processed_at' in dashboard_df.columns and not dashboard_df.empty:
              last_update = dashboard_df['processed_at'].max()
              if hasattr(last_update, 'strftime'):
                  st.info(f"**Last Updated:** {last_update.strftime('%Y-%m-%d %H:%M')}")
 
+     # ── Build filtered dashboard_df (only applies to comment pages) ─────────
+     _hs_page = page in ("🎧 HelpScout Dashboard", "πŸ”¬ HelpScout Analysis")
      filters_applied = st.session_state.get('filters_applied', False)
      global_filters = st.session_state.get('global_filters', {})
 
+     if not _hs_page and filters_applied and global_filters:
          filtered_df = data_loader.apply_filters(
              dashboard_df,
              platforms=global_filters.get('platforms') or None,
 
      # RR page fetches its own data on demand; receives only data_loader
      render_reply_required(data_loader)
 
+     elif page == "🎧 HelpScout Dashboard":
+         render_helpscout_dashboard(helpscout_loader)
+
+     elif page == "πŸ”¬ HelpScout Analysis":
+         render_helpscout_analysis(helpscout_loader)
+
      # ── Footer ────────────────────────────────────────────────────────────────
      st.markdown("---")
      st.markdown(
visualization/components/dashboard.py CHANGED
@@ -220,6 +220,51 @@ def render_dashboard(df):
 
      st.markdown("---")
 
      # Brand-Platform Matrix
      st.markdown("## πŸ”€ Cross-Dimensional Analysis")
 
@@ -580,4 +625,13 @@ def render_dashboard(df):
      sunburst = distribution_charts.create_combined_distribution_sunburst(
          df, title="Brand > Platform > Sentiment Distribution"
      )
-     st.plotly_chart(sunburst, use_container_width=True)
220
 
221
  st.markdown("---")
222
 
223
+ # Emotion Analysis
224
+ st.markdown("## πŸ’­ Emotion Analysis")
225
+
226
+ if 'emotions' in df.columns and df['emotions'].notna().any():
227
+ col1, col2 = st.columns(2)
228
+
229
+ with col1:
230
+ emotion_bar = distribution_charts.create_emotion_bar_chart(
231
+ df, title="Emotion Distribution", orientation='h'
232
+ )
233
+ st.plotly_chart(emotion_bar, use_container_width=True)
234
+
235
+ with col2:
236
+ emotion_pie = distribution_charts.create_emotion_pie_chart(
237
+ df, title="Emotion Distribution"
238
+ )
239
+ st.plotly_chart(emotion_pie, use_container_width=True)
240
+
241
+ with st.expander("πŸ’‘ Emotion Insights"):
242
+ emotion_dist = processor.get_emotion_distribution(df)
243
+ if not emotion_dist.empty:
244
+ top_emotion = emotion_dist.iloc[0]
245
+ st.write(f"**Most common emotion:** {top_emotion['emotions'].title()} "
246
+ f"({int(top_emotion['count']):,} comments, {top_emotion['percentage']:.1f}%)")
247
+
248
+ negative_emotions = ['frustration', 'disappointment', 'sadness', 'anger']
249
+ neg_emotion_dist = emotion_dist[emotion_dist['emotions'].isin(negative_emotions)]
250
+ if not neg_emotion_dist.empty:
251
+ total_neg = neg_emotion_dist['count'].sum()
252
+ total = emotion_dist['count'].sum()
253
+ st.write(f"**Negative emotions** (frustration, disappointment, sadness, anger): "
254
+ f"{int(total_neg):,} occurrences ({total_neg / total * 100:.1f}%)")
255
+
256
+ positive_emotions = ['joy', 'excitement', 'gratitude', 'admiration']
257
+ pos_emotion_dist = emotion_dist[emotion_dist['emotions'].isin(positive_emotions)]
258
+ if not pos_emotion_dist.empty:
259
+ total_pos = pos_emotion_dist['count'].sum()
260
+ total = emotion_dist['count'].sum()
261
+ st.write(f"**Positive emotions** (joy, excitement, gratitude, admiration): "
262
+ f"{int(total_pos):,} occurrences ({total_pos / total * 100:.1f}%)")
263
+ else:
264
+ st.info("No emotion data available. Emotions are extracted for newly processed comments.")
265
+
266
+ st.markdown("---")
267
+
268
  # Brand-Platform Matrix
269
  st.markdown("## πŸ”€ Cross-Dimensional Analysis")
270
 
 
625
  sunburst = distribution_charts.create_combined_distribution_sunburst(
626
  df, title="Brand > Platform > Sentiment Distribution"
627
  )
628
+ st.plotly_chart(sunburst, use_container_width=True)
629
+
630
+ # ── HelpScout compact summary (additive — no impact on existing charts) ──
631
+ hs_df = st.session_state.get("helpscout_df")
632
+ if hs_df is not None and not hs_df.empty:
633
+ try:
634
+ from components.helpscout_dashboard import render_helpscout_compact_summary
635
+ render_helpscout_compact_summary(hs_df)
636
+ except Exception:
637
+ pass # never break the main dashboard if helpscout module fails
visualization/components/helpscout_analysis.py ADDED
@@ -0,0 +1,491 @@
1
+ """
2
+ HelpScout Analysis Page
3
+ Purpose-built analysis page for HelpScout conversations.
4
+ Mirrors the SA page architecture: filter → fetch → charts → LLM summary → export.
5
+ One page-level summary report for the entire filtered set.
6
+ """
7
+ import sys
8
+ from datetime import date, timedelta
9
+ from pathlib import Path
10
+
11
+ import pandas as pd
12
+ import streamlit as st
13
+
14
+ parent_dir = Path(__file__).resolve().parent.parent
15
+ sys.path.append(str(parent_dir))
16
+
17
+ from visualizations.helpscout_charts import HelpScoutCharts
18
+ from utils.helpscout_utils import (
19
+ boolean_flag_counts, build_filter_description, topic_label, load_topic_taxonomy
20
+ )
21
+ from agents.helpscout_summary_agent import HelpScoutSummaryAgent
22
+
23
+
24
+ def render_helpscout_analysis(data_loader):
25
+ """
26
+ Render the HelpScout Analysis page.
27
+
28
+ Args:
29
+ data_loader: HelpScoutDataLoader instance
30
+ """
31
+ st.title("πŸ”¬ HelpScout Analysis")
32
+ st.markdown(
33
+ "Deep-dive into customer support conversations. Apply filters, fetch the data, "
34
+ "explore distributions, and generate an AI-powered summary report."
35
+ )
36
+ st.markdown("---")
37
+
38
+ charts = HelpScoutCharts()
39
+ taxonomy = load_topic_taxonomy()
40
+
41
+ # ── Filter options from already-loaded dashboard df ───────────────────────
42
+ hs_df = st.session_state.get("helpscout_df")
43
+ if hs_df is None or hs_df.empty:
44
+ st.warning("HelpScout dashboard data not loaded yet. Please wait for the app to initialise.")
45
+ return
46
+
47
+ filter_options = data_loader.get_filter_options(hs_df)
48
+
49
+ # ── Filters ───────────────────────────────────────────────────────────────
50
+ st.markdown("### 🎯 Filters")
51
+
52
+ row1_col1, row1_col2 = st.columns(2)
53
+ with row1_col1:
54
+ min_date = hs_df["first_message_at"].min().date() if "first_message_at" in hs_df.columns and not hs_df.empty else date.today() - timedelta(days=60)
55
+ max_date = hs_df["first_message_at"].max().date() if "first_message_at" in hs_df.columns and not hs_df.empty else date.today()
56
+ default_start = max(min_date, max_date - timedelta(days=data_loader.default_date_range_days))
57
+ date_range = st.date_input(
58
+ "Date Range (First Message At)",
59
+ value=(default_start, max_date),
60
+ min_value=min_date, max_value=max_date,
61
+ key="hs_analysis_date_range",
62
+ )
63
+ with row1_col2:
64
+ top_n_options = [("All", 0), ("50", 50), ("100", 100), ("200", 200), ("500", 500), ("1000", 1000)]
65
+ top_n_label = st.selectbox(
66
+ "Limit Results",
67
+ options=[x[0] for x in top_n_options],
68
+ index=0,
69
+ help="Limit number of conversations fetched. 'All' fetches everything matching your filters.",
70
+ key="hs_analysis_top_n",
71
+ )
72
+ top_n = dict(top_n_options)[top_n_label]
73
+
74
+ row2_col1, row2_col2, row2_col3, row2_col4 = st.columns(4)
75
+ with row2_col1:
76
+ topic_options = filter_options.get("topics", [])
77
+ topic_labels_map = {t: topic_label(t, taxonomy) for t in topic_options}
78
+ selected_topic_labels = st.multiselect(
79
+ "Topics",
80
+ options=[topic_labels_map[t] for t in topic_options],
81
+ default=[],
82
+ key="hs_analysis_topics",
83
+ )
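+ # Map display labels back to raw topic ids (assumes taxonomy labels are unique).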
84
+ label_to_id = {v: k for k, v in topic_labels_map.items()}
85
+ selected_topics = [label_to_id[l] for l in selected_topic_labels if l in label_to_id]
86
+
87
+ with row2_col2:
88
+ selected_sentiments = st.multiselect(
89
+ "Sentiments",
90
+ options=filter_options.get("sentiments", []),
91
+ default=[],
92
+ key="hs_analysis_sentiments",
93
+ )
94
+
95
+ with row2_col3:
96
+ selected_statuses = st.multiselect(
97
+ "Status",
98
+ options=filter_options.get("statuses", []),
99
+ default=[],
100
+ key="hs_analysis_statuses",
101
+ )
102
+
103
+ with row2_col4:
104
+ selected_sources = st.multiselect(
105
+ "Source Type",
106
+ options=filter_options.get("sources", []),
107
+ default=[],
108
+ key="hs_analysis_sources",
109
+ )
110
+
111
+ row3_col1, row3_col2, row3_col3 = st.columns(3)
112
+ with row3_col1:
113
+ refund_only = st.checkbox("Refund Requests Only", key="hs_analysis_refund")
114
+ with row3_col2:
115
+ cancel_only = st.checkbox("Cancellations Only", key="hs_analysis_cancel")
116
+ with row3_col3:
117
+ membership_only = st.checkbox("Membership Joins Only", key="hs_analysis_membership")
118
+
119
+ st.markdown("---")
120
+
121
+ # ── Fetch button ─────────────────────────────────────────────────────────
122
+ dr_tuple = (str(date_range[0]), str(date_range[1])) if date_range and len(date_range) == 2 else None
123
+
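+ # fetch_key is the cache-invalidation token: any change to the filter inputs
+ # below yields a new tuple, marking previously fetched data (and any
+ # generated summary) as stale.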
124
+ fetch_key = (
125
+ dr_tuple,
126
+ tuple(sorted(selected_sentiments)),
127
+ tuple(sorted(selected_topics)),
128
+ tuple(sorted(selected_statuses)),
129
+ tuple(sorted(selected_sources)),
130
+ bool(refund_only), bool(cancel_only), bool(membership_only),
131
+ top_n,
132
+ )
133
+
134
+ has_data = (
135
+ "hs_analysis_df" in st.session_state
136
+ and st.session_state.get("hs_analysis_fetch_key") == fetch_key
137
+ and not st.session_state["hs_analysis_df"].empty
138
+ )
139
+
140
+ fetch_col, info_col = st.columns([1, 3])
141
+ with fetch_col:
142
+ fetch_clicked = st.button("πŸš€ Fetch Data", type="primary",
143
+ use_container_width=True, key="hs_fetch_btn")
144
+ with info_col:
145
+ if has_data:
146
+ n = len(st.session_state["hs_analysis_df"])
147
+ st.success(f"βœ… Showing **{n:,}** conversations matching your filters")
148
+ elif not fetch_clicked:
149
+ st.info("πŸ‘† Set your filters and click **Fetch Data** to query Snowflake.")
150
+
151
+ if fetch_clicked:
152
+ with st.spinner("Fetching HelpScout data from Snowflake…"):
153
+ result_df = data_loader.load_analysis_data(
154
+ sentiments=selected_sentiments or None,
155
+ topics=selected_topics or None,
156
+ refund_only=refund_only,
157
+ cancel_only=cancel_only,
158
+ membership_only=membership_only,
159
+ statuses=selected_statuses or None,
160
+ sources=selected_sources or None,
161
+ date_range=(date_range[0], date_range[1]) if dr_tuple else None,
162
+ top_n=top_n or None,
163
+ )
164
+ applied_filters = {
165
+ "date_range": (date_range[0], date_range[1]) if dr_tuple else None,
166
+ "sentiments": selected_sentiments,
167
+ "topics": selected_topics,
168
+ "statuses": selected_statuses,
169
+ "sources": selected_sources,
170
+ "refund_only": refund_only,
171
+ "cancel_only": cancel_only,
172
+ "membership_only": membership_only,
173
+ }
174
+ st.session_state["hs_analysis_df"] = result_df
175
+ st.session_state["hs_analysis_fetch_key"] = fetch_key
176
+ st.session_state["hs_analysis_filter_desc"] = build_filter_description(applied_filters, taxonomy)
177
+ # Invalidate any prior summary when filters change
178
+ st.session_state.pop("hs_analysis_summary", None)
179
+ st.session_state.pop("hs_analysis_summary_key", None)
180
+ st.session_state["hs_analysis_page"] = 1
181
+ st.rerun()
182
+
183
+ if not has_data and not fetch_clicked:
184
+ return
185
+
186
+ analysis_df = st.session_state.get("hs_analysis_df", pd.DataFrame())
187
+ filter_desc = st.session_state.get("hs_analysis_filter_desc", "No filters applied")
188
+
189
+ if analysis_df.empty:
190
+ st.warning("No conversations found for the selected filters. Try adjusting and re-fetching.")
191
+ return
192
+
193
+ total = len(analysis_df)
194
+ flags = boolean_flag_counts(analysis_df)
195
+ neg_pct = analysis_df["sentiment_polarity"].isin(["negative", "very_negative"]).sum() / total * 100
196
+ avg_dur = float(analysis_df["duration_hours"].mean()) if "duration_hours" in analysis_df.columns else 0.0
197
+
198
+ # ── KPI Row ───────────────────────────────────────────────────────────────
199
+ st.markdown("### πŸ“Š Overview")
200
+ k1, k2, k3, k4, k5 = st.columns(5)
201
+ k1.metric("Conversations", f"{total:,}")
202
+ k2.metric("Negative %", f"{neg_pct:.1f}%")
203
+ k3.metric("Refund Requests", f"{flags['is_refund_request']:,}")
204
+ k4.metric("Cancellations", f"{flags['is_cancellation']:,}")
205
+ k5.metric("Avg Duration (h)", f"{avg_dur:.1f}")
206
+
207
+ st.caption(f"**Active filters:** {filter_desc}")
208
+ st.markdown("---")
209
+
210
+ # ── Distributions ─────────────────────────────────────────────────────────
211
+ st.markdown("### πŸ“ˆ Distributions")
212
+
213
+ col1, col2 = st.columns(2)
214
+ with col1:
215
+ st.plotly_chart(charts.create_sentiment_pie_chart(analysis_df, title="Sentiment Distribution"),
216
+ use_container_width=True, key="hs_analysis_sent_pie")
217
+ with col2:
218
+ st.plotly_chart(charts.create_topic_bar_chart(analysis_df, title="Topic Distribution"),
219
+ use_container_width=True, key="hs_analysis_topic_bar")
220
+
221
+ col1, col2 = st.columns(2)
222
+ with col1:
223
+ st.plotly_chart(charts.create_topic_sentiment_heatmap(analysis_df),
224
+ use_container_width=True, key="hs_analysis_topic_heatmap")
225
+ with col2:
226
+ st.plotly_chart(charts.create_boolean_flags_chart(analysis_df),
227
+ use_container_width=True, key="hs_analysis_flags")
228
+
229
+ if "emotions" in analysis_df.columns and analysis_df["emotions"].notna().any():
230
+ col1, col2 = st.columns(2)
231
+ with col1:
232
+ st.plotly_chart(charts.create_emotion_bar_chart(analysis_df, title="Emotion Distribution"),
233
+ use_container_width=True, key="hs_analysis_emotion")
234
+ with col2:
235
+ st.plotly_chart(charts.create_volume_timeline(analysis_df, title="Volume Over Time"),
236
+ use_container_width=True, key="hs_analysis_vol_timeline")
237
+ else:
238
+ st.plotly_chart(charts.create_volume_timeline(analysis_df, title="Volume Over Time"),
239
+ use_container_width=True, key="hs_analysis_vol_timeline2")
240
+
241
+ st.markdown("---")
242
+
243
+ # ── AI Summary Report ─────────────────────────────────────────────────────
244
+ st.markdown("### πŸ€– AI Summary Report")
245
+ st.markdown(
246
+ "Generate an LLM-powered report from the conversation summaries matching your filters. "
247
+ "The AI looks beyond the pre-extracted tags to surface patterns, pain points, "
248
+ "and actionable insights."
249
+ )
250
+
251
+ summary_available = (
252
+ "hs_analysis_summary" in st.session_state
253
+ and st.session_state.get("hs_analysis_summary_key") == fetch_key
254
+ and st.session_state["hs_analysis_summary"] is not None
255
+ )
256
+
257
+ gen_col, pdf_col = st.columns([1, 1])
258
+ with gen_col:
259
+ gen_clicked = st.button("🧠 Generate Summary Report", type="primary",
260
+ use_container_width=True, key="hs_gen_summary_btn")
261
+ with pdf_col:
262
+ export_pdf_clicked = st.button("πŸ“„ Export as PDF", use_container_width=True,
263
+ key="hs_export_pdf_btn")
264
+
265
+ if gen_clicked:
266
+ with st.spinner("Analysing conversations with AI… this may take 20–40 seconds…"):
267
+ agent = HelpScoutSummaryAgent()
268
+ result = agent.process({
269
+ "conversations": analysis_df,
270
+ "filter_description": filter_desc,
271
+ })
272
+ st.session_state["hs_analysis_summary"] = result
273
+ st.session_state["hs_analysis_summary_key"] = fetch_key
274
+ st.rerun()
275
+
276
+ if export_pdf_clicked:
277
+ with st.spinner("Generating PDF…"):
278
+ try:
279
+ from utils.helpscout_pdf import HelpScoutAnalysisPDF
280
+ import datetime
281
+ summary_result = st.session_state.get("hs_analysis_summary")
282
+ exporter = HelpScoutAnalysisPDF()
283
+ pdf_bytes = exporter.generate_report(
284
+ analysis_df,
285
+ filter_info={"Filters": filter_desc, "Total Conversations": str(total)},
286
+ summary_result=summary_result,
287
+ )
288
+ filename = f"helpscout_analysis_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.pdf"
289
+ st.success("Report generated!")
290
+ st.download_button(
291
+ label="Download Analysis PDF",
292
+ data=pdf_bytes,
293
+ file_name=filename,
294
+ mime="application/pdf",
295
+ use_container_width=True,
296
+ key="hs_download_pdf_btn",
297
+ )
298
+ except Exception as e:
299
+ st.error(f"Failed to generate PDF: {e}")
300
+ st.exception(e)
301
+
302
+ # Render the summary if available
303
+ if summary_available:
304
+ result = st.session_state["hs_analysis_summary"]
305
+ _render_summary_report(result)
306
+
307
+ st.markdown("---")
308
+
309
+ # ── Conversation Cards ────────────────────────────────────────────────────
310
+ st.markdown("### πŸ’¬ Conversations")
311
+
312
+ if "hs_analysis_page" not in st.session_state:
313
+ st.session_state.hs_analysis_page = 1
314
+
315
+ per_page = 10
316
+ total_pages = max(1, (total + per_page - 1) // per_page)
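+ # Integer ceiling division: e.g. 25 conversations at 10 per page -> 3 pages.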
317
+
318
+ if total > per_page:
319
+ st.info(f"Page {st.session_state.hs_analysis_page} of {total_pages} ({total:,} conversations)")
320
+ pc1, pc2, pc3 = st.columns([1, 2, 1])
321
+ with pc1:
322
+ if st.button("⬅️ Previous", key="hs_prev_top",
323
+ disabled=st.session_state.hs_analysis_page == 1):
324
+ st.session_state.hs_analysis_page -= 1
325
+ st.rerun()
326
+ with pc2:
327
+ st.markdown(
328
+ f"<div style='text-align:center;padding-top:8px;'>"
329
+ f"Page {st.session_state.hs_analysis_page} / {total_pages}</div>",
330
+ unsafe_allow_html=True,
331
+ )
332
+ with pc3:
333
+ if st.button("Next ➑️", key="hs_next_top",
334
+ disabled=st.session_state.hs_analysis_page >= total_pages):
335
+ st.session_state.hs_analysis_page += 1
336
+ st.rerun()
337
+ st.markdown("---")
338
+
339
+ start = (st.session_state.hs_analysis_page - 1) * per_page
340
+ end = min(start + per_page, total)
341
+ page_df = analysis_df.iloc[start:end]
342
+
343
+ for _, row in page_df.iterrows():
344
+ _render_conversation_card(row, taxonomy)
345
+
346
+ # Bottom pagination
347
+ if total > per_page:
348
+ pb1, pb2, pb3 = st.columns([1, 2, 1])
349
+ with pb1:
350
+ if st.button("⬅️ Previous", key="hs_prev_bot",
351
+ disabled=st.session_state.hs_analysis_page == 1):
352
+ st.session_state.hs_analysis_page -= 1
353
+ st.rerun()
354
+ with pb2:
355
+ st.markdown(
356
+ f"<div style='text-align:center;padding-top:8px;'>"
357
+ f"Page {st.session_state.hs_analysis_page} / {total_pages}</div>",
358
+ unsafe_allow_html=True,
359
+ )
360
+ with pb3:
361
+ if st.button("Next ➑️", key="hs_next_bot",
362
+ disabled=st.session_state.hs_analysis_page >= total_pages):
363
+ st.session_state.hs_analysis_page += 1
364
+ st.rerun()
365
+
366
+ st.markdown("---")
367
+
368
+ # ── Export CSV ────────────────────────────────────────────────────────────
369
+ st.markdown("### πŸ’Ύ Export Data")
370
+ export_cols = [c for c in ["conversation_id", "customer_email", "first_message_at",
371
+ "status", "sentiment_polarity", "topics", "summary",
372
+ "is_refund_request", "is_cancellation", "is_membership",
373
+ "duration_hours"] if c in analysis_df.columns]
374
+ csv = analysis_df[export_cols].to_csv(index=False)
375
+ st.download_button(
376
+ label="πŸ“₯ Download as CSV",
377
+ data=csv,
378
+ file_name=f"helpscout_analysis_{total}conversations.csv",
379
+ mime="text/csv",
380
+ key="hs_csv_download",
381
+ )
382
+
383
+
384
+ # ─────────────────────────────────────────────────────────────────────────────
385
+ # Helper renderers
386
+ # ─────────────────────────────────────────────────────────────────────────────
387
+
388
+ def _render_summary_report(result: dict):
389
+ """Render the LLM summary result with nice formatting."""
390
+ if not result.get("success"):
391
+ st.error(f"AI analysis failed: {result.get('error', 'Unknown error')}")
392
+ return
393
+
394
+ summary = result.get("summary", {})
395
+ meta = result.get("metadata", {})
396
+
397
+ with st.container():
398
+ st.markdown("---")
399
+ st.markdown("#### πŸ“‹ Executive Summary")
400
+ st.info(summary.get("executive_summary", ""))
401
+
402
+ col1, col2 = st.columns(2)
403
+
404
+ with col1:
405
+ themes = summary.get("top_themes", [])
406
+ if themes:
407
+ st.markdown("#### 🎯 Top Themes")
408
+ for t in themes:
409
+ st.markdown(
410
+ f"**{t.get('theme', '')}** _{t.get('prevalence', '')}_ \n"
411
+ f"{t.get('description', '')}"
412
+ )
413
+ st.markdown("")
414
+
415
+ insights = summary.get("unexpected_insights", [])
416
+ if insights:
417
+ st.markdown("#### πŸ’‘ Unexpected Insights")
418
+ for ins in insights:
419
+ st.markdown(f"- {ins}")
420
+
421
+ with col2:
422
+ complaints = summary.get("top_complaints", [])
423
+ if complaints:
424
+ st.markdown("#### ⚠️ Top Complaints")
425
+ for c in complaints:
426
+ st.markdown(f"- {c}")
427
+
428
+ quotes = summary.get("notable_quotes", [])
429
+ if quotes:
430
+ st.markdown("#### πŸ’¬ Notable Quotes")
431
+ for q in quotes:
432
+ st.markdown(f"> {q}")
433
+
434
+ with st.expander("ℹ️ Analysis Metadata"):
435
+ mc1, mc2, mc3 = st.columns(3)
436
+ mc1.metric("Conversations Analysed", meta.get("total_conversations_analyzed", 0))
437
+ mc2.metric("Model Used", meta.get("model_used", "N/A"))
438
+ mc3.metric("Tokens Used", meta.get("tokens_used", 0))
439
+ if meta.get("total_available", 0) > meta.get("total_conversations_analyzed", 0):
440
+ st.caption(
441
+ f"Sampled {meta['total_conversations_analyzed']} of "
442
+ f"{meta['total_available']} conversations for this analysis."
443
+ )
444
+
445
+
446
+ def _render_conversation_card(row, taxonomy: dict):
447
+ """Render a single conversation card."""
448
+ sent = str(row.get("sentiment_polarity", "unknown"))
449
+ sent_emoji = {
450
+ "very_positive": "🟒", "positive": "🟩", "neutral": "🟑",
451
+ "negative": "🟠", "very_negative": "πŸ”΄",
452
+ }.get(sent, "βšͺ")
453
+
454
+ topics_list = row.get("topics_list") or []
455
+ topic_labels_str = ", ".join(topic_label(t, taxonomy) for t in topics_list) if topics_list else "—"
456
+
457
+ first_name = str(row.get("customer_first") or "").strip()
458
+ last_name = str(row.get("customer_last") or "").strip()
459
+ customer_str = f"{first_name} {last_name[:1]}." if first_name or last_name else "Anonymous"
460
+
461
+ first_msg = row.get("first_message_at")
462
+ date_str = first_msg.strftime("%Y-%m-%d") if hasattr(first_msg, "strftime") else str(first_msg or "")
463
+
464
+ flags = []
465
+ if row.get("is_refund_request"): flags.append("πŸ’° Refund")
466
+ if row.get("is_cancellation"): flags.append("🚫 Cancel")
467
+ if row.get("is_membership"): flags.append("βœ… Membership")
468
+ flags_str = " | ".join(flags) if flags else ""
469
+
470
+ with st.expander(
471
+ f"{sent_emoji} {customer_str} β€” {topic_labels_str} | {sent.replace('_', ' ').title()} | {date_str}"
472
+ + (f" [{flags_str}]" if flags_str else ""),
473
+ expanded=False,
474
+ ):
475
+ info_col1, info_col2, info_col3 = st.columns(3)
476
+ info_col1.markdown(f"**Status:** {row.get('status', 'β€”')}")
477
+ info_col2.markdown(f"**Source:** {row.get('source_type', 'β€”')}")
478
+ info_col3.markdown(f"**Duration:** {row.get('duration_hours', 0):.1f}h | **Threads:** {row.get('thread_count', 0)}")
479
+
480
+ summary = str(row.get("summary") or "No summary available.")
481
+ st.markdown(f"**Summary:** {summary}")
482
+
483
+ notes_col1, notes_col2 = st.columns(2)
484
+ with notes_col1:
485
+ sent_note = str(row.get("sentiment_notes") or "")
486
+ if sent_note:
487
+ st.markdown(f"**Sentiment Note:** _{sent_note}_")
488
+ with notes_col2:
489
+ topic_note = str(row.get("topic_notes") or "")
490
+ if topic_note:
491
+ st.markdown(f"**Topic Note:** _{topic_note}_")
visualization/components/helpscout_dashboard.py ADDED
@@ -0,0 +1,278 @@
1
+ """
2
+ HelpScout Dashboard Page
3
+ Full dedicated dashboard for HelpScout customer support conversation analysis.
4
+ """
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import pandas as pd
9
+ import streamlit as st
10
+
11
+ parent_dir = Path(__file__).resolve().parent.parent
12
+ sys.path.append(str(parent_dir))
13
+
14
+ from utils.helpscout_utils import boolean_flag_counts, topic_label, load_topic_taxonomy
15
+ from visualizations.helpscout_charts import HelpScoutCharts
16
+ from visualizations.demographic_charts import DemographicCharts
17
+ from utils.data_processor import SentimentDataProcessor
18
+
19
+
20
+ def _sentiment_score(df) -> float:
21
+ """Compute average sentiment score on a -2 to +2 scale."""
22
+ score_map = {"very_positive": 2, "positive": 1, "neutral": 0,
23
+ "negative": -1, "very_negative": -2}
24
+ if "sentiment_polarity" not in df.columns or df.empty:
25
+ return 0.0
26
+ scores = df["sentiment_polarity"].map(score_map).fillna(0)
27
+ return float(scores.mean())
28
+
29
+
30
+ def render_helpscout_dashboard(data_loader):
31
+ """
32
+ Render the full HelpScout Dashboard page.
33
+
34
+ Args:
35
+ data_loader: HelpScoutDataLoader instance
36
+ """
37
+ st.title("🎧 HelpScout Support Dashboard")
38
+ st.markdown("Customer support conversation analysis from HelpScout.")
39
+
40
+ hs_df = st.session_state.get("helpscout_df")
41
+ if hs_df is None or hs_df.empty:
42
+ st.warning("No HelpScout data available. Please check your Snowflake connection.")
43
+ return
44
+
45
+ charts = HelpScoutCharts()
46
+ taxonomy = load_topic_taxonomy()
47
+
48
+ # ── PDF Export ────────────────────────────────────────────────────────────
49
+ with st.expander("πŸ“„ Export PDF Report", expanded=False):
50
+ st.markdown(
51
+ "Generate a comprehensive HelpScout support report. "
52
+ "Covers sentiment, topics, billing flags, timelines, and demographics."
53
+ )
54
+ if st.button("Generate HelpScout PDF Report", type="primary",
55
+ use_container_width=True, key="hs_dash_pdf_btn"):
56
+ with st.spinner("Generating HelpScout PDF report…"):
57
+ try:
58
+ from utils.helpscout_pdf import HelpScoutDashboardPDF
59
+ exporter = HelpScoutDashboardPDF()
60
+ pdf_bytes = exporter.generate_report(hs_df)
61
+ import datetime
62
+ filename = f"helpscout_dashboard_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}.pdf"
63
+ st.success("Report generated successfully!")
64
+ st.download_button(
65
+ label="Download HelpScout Dashboard PDF",
66
+ data=pdf_bytes,
67
+ file_name=filename,
68
+ mime="application/pdf",
69
+ use_container_width=True,
70
+ )
71
+ except Exception as e:
72
+ st.error(f"Failed to generate report: {e}")
73
+ st.exception(e)
74
+
75
+ st.markdown("---")
76
+
77
+ # ── KPI Row ───────────────────────────────────────────────────────────────
78
+ total = len(hs_df)
79
+ escalation_count = int(hs_df["is_escalation"].sum()) if "is_escalation" in hs_df.columns else 0
80
+ flags = boolean_flag_counts(hs_df)
81
+ neg_pct = (hs_df["sentiment_polarity"].isin(["negative", "very_negative"]).sum() / total * 100) if total else 0
82
+ avg_duration = float(hs_df["duration_hours"].mean()) if "duration_hours" in hs_df.columns else 0.0
83
+
84
+ k1, k2, k3, k4, k5, k6 = st.columns(6)
85
+ k1.metric("Total Conversations", f"{total:,}")
86
+ k2.metric("Avg Duration (h)", f"{avg_duration:.1f}")
87
+ k3.metric("Escalations", f"{escalation_count:,}", delta=f"{escalation_count/total*100:.1f}% of total" if total else None, delta_color="inverse")
88
+ k4.metric("Refund Requests", f"{flags['is_refund_request']:,}")
89
+ k5.metric("Cancellations", f"{flags['is_cancellation']:,}")
90
+ k6.metric("Membership Joins",f"{flags['is_membership']:,}")
91
+
92
+ st.markdown("---")
93
+
94
+ # ── Sentiment ─────────────────────────────────────────────────────────────
95
+ st.markdown("## 🎯 Sentiment Distribution")
96
+ col1, col2 = st.columns(2)
97
+ with col1:
98
+ st.plotly_chart(charts.create_sentiment_pie_chart(hs_df), use_container_width=True)
99
+ with col2:
100
+ avg_score = _sentiment_score(hs_df)
101
+ st.plotly_chart(charts.create_sentiment_score_gauge(avg_score), use_container_width=True)
102
+ m1, m2 = st.columns(2)
103
+ pos_pct = hs_df["sentiment_polarity"].isin(["positive", "very_positive"]).sum() / total * 100 if total else 0
104
+ m1.metric("Positive %", f"{pos_pct:.1f}%")
105
+ m2.metric("Negative %", f"{neg_pct:.1f}%")
106
+
107
+ st.markdown("---")
108
+
109
+ # ── Topics ────────────────────────────────────────────────────────────────
110
+ st.markdown("## 🏷️ Topic Analysis")
111
+ col1, col2 = st.columns(2)
112
+ with col1:
113
+ st.plotly_chart(charts.create_topic_bar_chart(hs_df, title="Conversations by Topic"),
114
+ use_container_width=True)
115
+ with col2:
116
+ st.plotly_chart(charts.create_topic_pie_chart(hs_df, title="Topic Share"),
117
+ use_container_width=True)
118
+
119
+ st.plotly_chart(charts.create_topic_sentiment_heatmap(hs_df), use_container_width=True)
120
+
121
+ st.markdown("---")
122
+
123
+ # ── Emotions ─────────────────────────────────────────────────────────────
124
+ if "emotions" in hs_df.columns and hs_df["emotions"].notna().any():
125
+ st.markdown("## πŸ’­ Emotion Analysis")
126
+ col1, col2 = st.columns(2)
127
+ with col1:
128
+ st.plotly_chart(charts.create_emotion_bar_chart(hs_df, title="Emotion Distribution"),
129
+ use_container_width=True)
130
+ with col2:
131
+ # Reuse the existing DistributionCharts emotion pie (same df structure with emotions col)
132
+ from visualizations.distribution_charts import DistributionCharts
133
+ dist_charts = DistributionCharts()
134
+ st.plotly_chart(dist_charts.create_emotion_pie_chart(hs_df, title="Emotion Share"),
135
+ use_container_width=True)
136
+ st.markdown("---")
137
+
138
+ # ── Billing Flags ─────────────────────────────────────────────────────────
139
+ st.markdown("## πŸ’³ Billing & Membership Flags")
140
+ col1, col2 = st.columns(2)
141
+ with col1:
142
+ st.plotly_chart(charts.create_boolean_flags_chart(hs_df), use_container_width=True)
143
+ with col2:
144
+ st.plotly_chart(charts.create_escalation_breakdown(hs_df), use_container_width=True)
145
+
146
+ st.markdown("---")
147
+
148
+ # ── Status / Source ───────────────────────────────────────────────────────
149
+ st.markdown("## πŸ“¬ Status & Source Distribution")
150
+ col1, col2 = st.columns(2)
151
+ with col1:
152
+ st.plotly_chart(charts.create_status_distribution(hs_df), use_container_width=True)
153
+ with col2:
154
+ st.plotly_chart(charts.create_source_distribution(hs_df), use_container_width=True)
155
+
156
+ st.markdown("---")
157
+
158
+ # ── Volume & Timelines ────────────────────────────────────────────────────
159
+ with st.expander("πŸ“ˆ Volume & Trends", expanded=False):
160
+ freq_col, _ = st.columns([1, 3])
161
+ with freq_col:
162
+ freq = st.selectbox("Time Granularity", ["D", "W", "M"],
163
+ format_func=lambda x: {"D": "Daily", "W": "Weekly", "M": "Monthly"}[x],
164
+ index=1, key="hs_dash_freq")
165
+ st.plotly_chart(charts.create_volume_timeline(hs_df, freq=freq), use_container_width=True)
166
+ st.plotly_chart(charts.create_sentiment_timeline(hs_df, freq=freq), use_container_width=True)
167
+ st.plotly_chart(charts.create_topic_timeline(hs_df, freq=freq), use_container_width=True)
168
+ st.plotly_chart(charts.create_refund_cancel_timeline(hs_df, freq=freq), use_container_width=True)
169
+
170
+ # ── Duration & Thread Count ───────────────────────────────────────────────
171
+ with st.expander("πŸ“Š Conversation Depth", expanded=False):
172
+ col1, col2 = st.columns(2)
173
+ with col1:
174
+ st.plotly_chart(charts.create_duration_histogram(hs_df), use_container_width=True)
175
+ with col2:
176
+ st.plotly_chart(charts.create_thread_count_histogram(hs_df), use_container_width=True)
177
+
178
+ # ── Demographics ─────────────────────────────────────────────────────────
179
+ has_demographics = (
180
+ "age_group" in hs_df.columns
181
+ and "timezone_region" in hs_df.columns
182
+ and (hs_df["age_group"] != "Unknown").any()
183
+ )
184
+ if has_demographics:
185
+ st.markdown("---")
186
+ st.markdown("## πŸ‘₯ Customer Demographics")
187
+ st.info(f"Demographics available for customers whose email matched Musora user records.")
188
+
189
+ processor = SentimentDataProcessor()
190
+ demo_charts = DemographicCharts()
191
+
192
+ demo_col1, demo_col2, demo_col3, demo_col4 = st.columns(4)
193
+ known_demo = int((hs_df["age_group"] != "Unknown").sum())
194
+ demo_col1.metric("With Demographics", f"{known_demo:,}", f"{known_demo/total*100:.1f}% matched")
195
+
196
+ avg_age = hs_df["age"].mean() if "age" in hs_df.columns else None
197
+ demo_col2.metric("Average Age", f"{avg_age:.1f}" if avg_age else "N/A")
198
+
199
+ top_region = hs_df["timezone_region"].value_counts().index[0] if "timezone_region" in hs_df.columns and not hs_df.empty else "N/A"
200
+ demo_col3.metric("Top Region", str(top_region))
201
+
202
+ avg_exp = hs_df["experience_level"].mean() if "experience_level" in hs_df.columns else None
203
+ demo_col4.metric("Avg Experience", f"{avg_exp:.1f}/10" if avg_exp else "N/A")
204
+
205
+ st.markdown("---")
206
+ age_dist = processor.get_demographics_distribution(hs_df, "age_group")
207
+ if not age_dist.empty:
208
+ st.markdown("### Age Distribution")
209
+ col1, col2 = st.columns(2)
210
+ with col1:
211
+ st.plotly_chart(demo_charts.create_age_distribution_chart(age_dist), use_container_width=True)
212
+ with col2:
213
+ age_sent = processor.get_demographics_by_sentiment(hs_df, "age_group")
214
+ if not age_sent.empty:
215
+ st.plotly_chart(demo_charts.create_age_sentiment_chart(age_sent), use_container_width=True)
216
+
217
+ region_dist = processor.get_timezone_regions_distribution(hs_df)
218
+ if not region_dist.empty:
219
+ st.markdown("### Geographic Distribution")
220
+ col1, col2 = st.columns(2)
221
+ with col1:
222
+ st.plotly_chart(demo_charts.create_region_distribution_chart(region_dist), use_container_width=True)
223
+ with col2:
224
+ region_sent = processor.get_demographics_by_sentiment(hs_df, "timezone_region")
225
+ if not region_sent.empty:
226
+ st.plotly_chart(demo_charts.create_region_sentiment_chart(region_sent), use_container_width=True)
227
+
228
+ st.markdown("---")
229
+ st.caption(
230
+ "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES | "
231
+ f"Last processed: {hs_df['processed_at'].max().strftime('%Y-%m-%d %H:%M') if 'processed_at' in hs_df.columns and not hs_df.empty else 'Unknown'}"
232
+ )
233
+
234
+
235
+ # ─────────────────────────────────────────────────────────────────────────────
236
+ # Compact summary for embedding in the main Sentiment Dashboard
237
+ # ─────────────────────────────────────────────────────────────────────────────
238
+
239
+ def render_helpscout_compact_summary(hs_df):
240
+ """
241
+ A one-screen HelpScout summary section embedded at the bottom of the
242
+ main Sentiment Dashboard. Kept purposely brief.
243
+ """
244
+ st.markdown("---")
245
+ st.markdown("## 🎧 HelpScout Support β€” Quick View")
246
+ st.caption(f"{len(hs_df):,} processed customer conversations")
247
+
248
+ total = len(hs_df)
249
+ if total == 0:
250
+ st.info("No HelpScout conversations available.")
251
+ return
252
+
253
+ charts = HelpScoutCharts()
254
+ flags = boolean_flag_counts(hs_df)
255
+ escalation_count = int(hs_df["is_escalation"].sum()) if "is_escalation" in hs_df.columns else 0
256
+ avg_dur = float(hs_df["duration_hours"].mean()) if "duration_hours" in hs_df.columns else 0.0
257
+
258
+ k1, k2, k3, k4 = st.columns(4)
259
+ k1.metric("Conversations", f"{total:,}")
260
+ k2.metric("Escalations", f"{escalation_count:,}", delta=f"{escalation_count/total*100:.1f}%", delta_color="inverse")
261
+ k3.metric("Refund Requests", f"{flags['is_refund_request']:,}")
262
+ k4.metric("Avg Duration (h)", f"{avg_dur:.1f}")
263
+
264
+ col1, col2 = st.columns(2)
265
+ with col1:
266
+ st.plotly_chart(
267
+ charts.create_sentiment_pie_chart(hs_df, title="HelpScout Sentiment"),
268
+ use_container_width=True,
269
+ key="hs_compact_sentiment_pie",
270
+ )
271
+ with col2:
272
+ st.plotly_chart(
273
+ charts.create_topic_bar_chart(hs_df, title="Top Topics", top_n=5),
274
+ use_container_width=True,
275
+ key="hs_compact_topic_bar",
276
+ )
277
+
278
+ st.info("πŸ‘‰ Navigate to **🎧 HelpScout Dashboard** for the full analysis.")
visualization/components/sentiment_analysis.py CHANGED
@@ -116,7 +116,7 @@ def render_sentiment_analysis(data_loader):
116
  mask = (dashboard_df['platform'] == selected_platform) & (dashboard_df['brand'] == selected_brand)
117
  preview_df = dashboard_df[mask]
118
 
119
- filter_col1, filter_col2, filter_col3, filter_col4 = st.columns(4)
120
 
121
  with filter_col1:
122
  sentiment_options = sorted(preview_df['sentiment_polarity'].unique().tolist())
@@ -141,6 +141,20 @@ def render_sentiment_analysis(data_loader):
141
  )
142
 
143
  with filter_col3:
144
  top_n = st.selectbox(
145
  "Top N Contents",
146
  options=[5, 10, 15, 20, 25],
@@ -148,12 +162,12 @@ def render_sentiment_analysis(data_loader):
148
  help="Number of contents to display"
149
  )
150
 
151
- with filter_col4:
152
- filter_active = bool(selected_sentiments or selected_intents)
153
  st.metric(
154
  "Filters Active",
155
  "βœ“ Yes" if filter_active else "βœ— No",
156
- help="Sentiment or intent filters applied" if filter_active else "Showing all sentiments"
157
  )
158
 
159
  st.markdown("---")
@@ -200,6 +214,7 @@ def render_sentiment_analysis(data_loader):
200
  fetch_key = (
201
  selected_platform, selected_brand, top_n, min_comments, sort_by_value,
202
  tuple(sorted(selected_sentiments)), tuple(sorted(selected_intents)),
 
203
  str(query_date_range)
204
  )
205
 
@@ -234,6 +249,7 @@ def render_sentiment_analysis(data_loader):
234
  sort_by=sort_by_value,
235
  sentiments=selected_sentiments or None,
236
  intents=selected_intents or None,
 
237
  date_range=query_date_range,
238
  )
239
  st.session_state['sa_contents'] = contents_df
@@ -332,7 +348,7 @@ def render_sentiment_analysis(data_loader):
332
  if content_comments.empty:
333
  st.info("No sampled comment details available for this content.")
334
  else:
335
- viz_col1, viz_col2 = st.columns(2)
336
  with viz_col1:
337
  pie = sentiment_charts.create_sentiment_pie_chart(
338
  content_comments, title="Sentiment Distribution (sample)"
@@ -345,6 +361,12 @@ def render_sentiment_analysis(data_loader):
345
  )
346
  st.plotly_chart(bar, use_container_width=True,
347
  key=f"intent_bar_{content_row['content_sk']}")
348
 
349
  # AI Analysis
350
  st.markdown("#### πŸ€– AI-Powered Analysis")
@@ -500,7 +522,7 @@ def render_sentiment_analysis(data_loader):
500
  comments_df['content_sk'].isin(filtered_contents['content_sk'])
501
  ] if not comments_df.empty else pd.DataFrame()
502
 
503
- insight_col1, insight_col2 = st.columns(2)
504
  with insight_col1:
505
  st.markdown("#### 🎯 Common Intent Patterns")
506
  if not all_sampled.empty:
@@ -509,6 +531,16 @@ def render_sentiment_analysis(data_loader):
509
  st.markdown(f"- **{row['intent']}**: {row['count']} ({row['percentage']:.1f}%)")
510
 
511
  with insight_col2:
512
  st.markdown("#### 🌐 Platform Breakdown")
513
  if not all_sampled.empty:
514
  for platform, count in all_sampled['platform'].value_counts().items():
 
116
  mask = (dashboard_df['platform'] == selected_platform) & (dashboard_df['brand'] == selected_brand)
117
  preview_df = dashboard_df[mask]
118
 
119
+ filter_col1, filter_col2, filter_col3, filter_col4, filter_col5 = st.columns(5)
120
 
121
  with filter_col1:
122
  sentiment_options = sorted(preview_df['sentiment_polarity'].unique().tolist())
 
141
  )
142
 
143
  with filter_col3:
144
+ emotion_list = (
145
+ preview_df['emotions']
146
+ .str.split(',').explode().str.strip()
147
+ .dropna().unique().tolist()
148
+ if 'emotions' in preview_df.columns else []
149
+ )
150
+ selected_emotions = st.multiselect(
151
+ "Emotion",
152
+ options=sorted(e for e in emotion_list if e),
153
+ default=[],
154
+ help="Filter contents that have comments with these emotions"
155
+ )
156
+
157
+ with filter_col4:
158
  top_n = st.selectbox(
159
  "Top N Contents",
160
  options=[5, 10, 15, 20, 25],
 
162
  help="Number of contents to display"
163
  )
164
 
165
+ with filter_col5:
166
+ filter_active = bool(selected_sentiments or selected_intents or selected_emotions)
167
  st.metric(
168
  "Filters Active",
169
  "βœ“ Yes" if filter_active else "βœ— No",
170
+ help="Sentiment, intent, or emotion filters applied" if filter_active else "Showing all sentiments"
171
  )
172
 
173
  st.markdown("---")
 
214
  fetch_key = (
215
  selected_platform, selected_brand, top_n, min_comments, sort_by_value,
216
  tuple(sorted(selected_sentiments)), tuple(sorted(selected_intents)),
217
+ tuple(sorted(selected_emotions)),
218
  str(query_date_range)
219
  )
220
 
 
249
  sort_by=sort_by_value,
250
  sentiments=selected_sentiments or None,
251
  intents=selected_intents or None,
252
+ emotions=selected_emotions or None,
253
  date_range=query_date_range,
254
  )
255
  st.session_state['sa_contents'] = contents_df
 
348
  if content_comments.empty:
349
  st.info("No sampled comment details available for this content.")
350
  else:
351
+ viz_col1, viz_col2, viz_col3 = st.columns(3)
352
  with viz_col1:
353
  pie = sentiment_charts.create_sentiment_pie_chart(
354
  content_comments, title="Sentiment Distribution (sample)"
 
361
  )
362
  st.plotly_chart(bar, use_container_width=True,
363
  key=f"intent_bar_{content_row['content_sk']}")
364
+ with viz_col3:
365
+ emotion_bar = distribution_charts.create_emotion_bar_chart(
366
+ content_comments, title="Emotion Distribution (sample)", orientation='h'
367
+ )
368
+ st.plotly_chart(emotion_bar, use_container_width=True,
369
+ key=f"emotion_bar_{content_row['content_sk']}")
370
 
371
  # AI Analysis
372
  st.markdown("#### πŸ€– AI-Powered Analysis")
 
522
  comments_df['content_sk'].isin(filtered_contents['content_sk'])
523
  ] if not comments_df.empty else pd.DataFrame()
524
 
525
+ insight_col1, insight_col2, insight_col3 = st.columns(3)
526
  with insight_col1:
527
  st.markdown("#### 🎯 Common Intent Patterns")
528
  if not all_sampled.empty:
 
531
  st.markdown(f"- **{row['intent']}**: {row['count']} ({row['percentage']:.1f}%)")
532
 
533
  with insight_col2:
534
+ st.markdown("#### πŸ’­ Top Emotions")
535
+ if not all_sampled.empty:
536
+ emotion_dist = processor.get_emotion_distribution(all_sampled)
537
+ if not emotion_dist.empty:
538
+ for _, row in emotion_dist.sort_values('count', ascending=False).head(5).iterrows():
539
+ st.markdown(f"- **{row['emotions'].title()}**: {row['count']} ({row['percentage']:.1f}%)")
540
+ else:
541
+ st.info("No emotion data available.")
542
+
543
+ with insight_col3:
544
  st.markdown("#### 🌐 Platform Breakdown")
545
  if not all_sampled.empty:
546
  for platform, count in all_sampled['platform'].value_counts().items():
visualization/config/viz_config.json CHANGED
@@ -17,6 +17,19 @@
17
  "off_topic": "#9E9E9E",
18
  "spam_selfpromo": "#795548"
19
  },
20
  "platform": {
21
  "facebook": "#1877F2",
22
  "instagram": "#E4405F",
@@ -49,6 +62,19 @@
49
  "off_topic",
50
  "spam_selfpromo"
51
  ],
52
  "negative_sentiments": [
53
  "negative",
54
  "very_negative"
@@ -67,7 +93,7 @@
67
  },
68
  "snowflake": {
69
  "query": "SELECT s.COMMENT_SK, s.COMMENT_ID, s.ORIGINAL_TEXT, s.PLATFORM, s.COMMENT_TIMESTAMP, s.AUTHOR_NAME, s.AUTHOR_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_TEXT, s.CONTENT_SK, s.CONTENT_ID, s.CONTENT_DESCRIPTION, s.CHANNEL_SK, s.CHANNEL_NAME, s.CHANNEL_DISPLAY_NAME, s.DETECTED_LANGUAGE, s.LANGUAGE_CODE, s.IS_ENGLISH, s.LANGUAGE_CONFIDENCE, s.DETECTION_METHOD, s.HAS_TEXT, s.TRANSLATED_TEXT, s.TRANSLATION_PERFORMED, s.TRANSLATION_CONFIDENCE, s.TRANSLATION_NOTES, s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.ANALYSIS_NOTES, s.PROCESSING_SUCCESS, CAST(NULL AS VARCHAR(16777216)) as PROCESSING_ERRORS, s.PROCESSED_AT, s.WORKFLOW_VERSION, CAST(NULL AS TIMESTAMP_NTZ(9)) as CREATED_AT, CAST(NULL AS TIMESTAMP_NTZ(9)) as UPDATED_AT, s.CHANNEL_NAME as BRAND, c.PERMALINK_URL, CAST(NULL AS VARCHAR(16777216)) as THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK UNION ALL SELECT COMMENT_SK, COMMENT_ID, ORIGINAL_TEXT, CASE WHEN PLATFORM = 'musora' THEN 'musora_app' ELSE PLATFORM END as PLATFORM, COMMENT_TIMESTAMP, AUTHOR_NAME, AUTHOR_ID, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT, CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME, DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH, LANGUAGE_CONFIDENCE, DETECTION_METHOD, HAS_TEXT, TRANSLATED_TEXT, TRANSLATION_PERFORMED, TRANSLATION_CONFIDENCE, TRANSLATION_NOTES, SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, SENTIMENT_CONFIDENCE, ANALYSIS_NOTES, PROCESSING_SUCCESS, PROCESSING_ERRORS, PROCESSED_AT, WORKFLOW_VERSION, CREATED_AT, UPDATED_AT, CHANNEL_NAME as BRAND, PERMALINK_URL, THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES",
70
- "dashboard_query": "SELECT s.COMMENT_SK, s.CONTENT_SK, LOWER(s.PLATFORM) AS PLATFORM, LOWER(s.CHANNEL_NAME) AS BRAND, s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.DETECTED_LANGUAGE, s.COMMENT_TIMESTAMP, s.PROCESSED_AT, s.AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s UNION ALL SELECT COMMENT_SK, CONTENT_SK, CASE WHEN LOWER(PLATFORM) = 'musora' THEN 'musora_app' ELSE LOWER(PLATFORM) END AS PLATFORM, LOWER(CHANNEL_NAME) AS BRAND, SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, DETECTED_LANGUAGE, COMMENT_TIMESTAMP, PROCESSED_AT, AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES",
71
  "demographics_query": "SELECT u.id as USER_ID, u.birthday as BIRTHDAY, u.timezone as TIMEZONE, GREATEST(COALESCE(p.difficulty, 0), COALESCE(p.self_report_difficulty, 0), COALESCE(p.method_experience, 0)) AS EXPERIENCE_LEVEL FROM stitch.musora_ecom_db.usora_users u JOIN online_recsys.preprocessed.users p ON u.id = p.user_id"
72
  },
73
  "demographics": {
@@ -84,5 +110,39 @@
84
  "Advanced (8-10)": [8, 10]
85
  },
86
  "top_timezones_count": 15
87
  }
88
  }
 
17
  "off_topic": "#9E9E9E",
18
  "spam_selfpromo": "#795548"
19
  },
20
+ "emotion": {
21
+ "joy": "#FFD700",
22
+ "excitement": "#FF6B35",
23
+ "gratitude": "#4CAF50",
24
+ "admiration": "#2196F3",
25
+ "curiosity": "#00BCD4",
26
+ "humor": "#9C27B0",
27
+ "frustration": "#FF9800",
28
+ "disappointment": "#795548",
29
+ "sadness": "#607D8B",
30
+ "anger": "#D32F2F",
31
+ "neutral": "#9E9E9E"
32
+ },
33
  "platform": {
34
  "facebook": "#1877F2",
35
  "instagram": "#E4405F",
 
62
  "off_topic",
63
  "spam_selfpromo"
64
  ],
65
+ "emotion_order": [
66
+ "joy",
67
+ "excitement",
68
+ "gratitude",
69
+ "admiration",
70
+ "curiosity",
71
+ "humor",
72
+ "frustration",
73
+ "disappointment",
74
+ "sadness",
75
+ "anger",
76
+ "neutral"
77
+ ],
78
  "negative_sentiments": [
79
  "negative",
80
  "very_negative"
 
93
  },
94
  "snowflake": {
95
  "query": "SELECT s.COMMENT_SK, s.COMMENT_ID, s.ORIGINAL_TEXT, s.PLATFORM, s.COMMENT_TIMESTAMP, s.AUTHOR_NAME, s.AUTHOR_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_ID, CAST(NULL AS VARCHAR(16777216)) as PARENT_COMMENT_TEXT, s.CONTENT_SK, s.CONTENT_ID, s.CONTENT_DESCRIPTION, s.CHANNEL_SK, s.CHANNEL_NAME, s.CHANNEL_DISPLAY_NAME, s.DETECTED_LANGUAGE, s.LANGUAGE_CODE, s.IS_ENGLISH, s.LANGUAGE_CONFIDENCE, s.DETECTION_METHOD, s.HAS_TEXT, s.TRANSLATED_TEXT, s.TRANSLATION_PERFORMED, s.TRANSLATION_CONFIDENCE, s.TRANSLATION_NOTES, s.SENTIMENT_POLARITY, s.INTENT, s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.ANALYSIS_NOTES, s.PROCESSING_SUCCESS, CAST(NULL AS VARCHAR(16777216)) as PROCESSING_ERRORS, s.PROCESSED_AT, s.WORKFLOW_VERSION, CAST(NULL AS TIMESTAMP_NTZ(9)) as CREATED_AT, CAST(NULL AS TIMESTAMP_NTZ(9)) as UPDATED_AT, s.CHANNEL_NAME as BRAND, c.PERMALINK_URL, CAST(NULL AS VARCHAR(16777216)) as THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s LEFT JOIN SOCIAL_MEDIA_DB.CORE.DIM_CONTENT c ON s.CONTENT_SK = c.CONTENT_SK UNION ALL SELECT COMMENT_SK, COMMENT_ID, ORIGINAL_TEXT, CASE WHEN PLATFORM = 'musora' THEN 'musora_app' ELSE PLATFORM END as PLATFORM, COMMENT_TIMESTAMP, AUTHOR_NAME, AUTHOR_ID, PARENT_COMMENT_ID, PARENT_COMMENT_TEXT, CONTENT_SK, CONTENT_ID, CONTENT_DESCRIPTION, CHANNEL_SK, CHANNEL_NAME, CHANNEL_DISPLAY_NAME, DETECTED_LANGUAGE, LANGUAGE_CODE, IS_ENGLISH, LANGUAGE_CONFIDENCE, DETECTION_METHOD, HAS_TEXT, TRANSLATED_TEXT, TRANSLATION_PERFORMED, TRANSLATION_CONFIDENCE, TRANSLATION_NOTES, SENTIMENT_POLARITY, INTENT, REQUIRES_REPLY, SENTIMENT_CONFIDENCE, ANALYSIS_NOTES, PROCESSING_SUCCESS, PROCESSING_ERRORS, PROCESSED_AT, WORKFLOW_VERSION, CREATED_AT, UPDATED_AT, CHANNEL_NAME as BRAND, PERMALINK_URL, THUMBNAIL_URL FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES",
96
+ "dashboard_query": "SELECT s.COMMENT_SK, s.CONTENT_SK, LOWER(s.PLATFORM) AS PLATFORM, LOWER(s.CHANNEL_NAME) AS BRAND, s.SENTIMENT_POLARITY, s.INTENT, s.EMOTIONS, s.REQUIRES_REPLY, s.DETECTED_LANGUAGE, s.COMMENT_TIMESTAMP, s.PROCESSED_AT, s.AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s UNION ALL SELECT COMMENT_SK, CONTENT_SK, CASE WHEN LOWER(PLATFORM) = 'musora' THEN 'musora_app' ELSE LOWER(PLATFORM) END AS PLATFORM, LOWER(CHANNEL_NAME) AS BRAND, SENTIMENT_POLARITY, INTENT, EMOTIONS, REQUIRES_REPLY, DETECTED_LANGUAGE, COMMENT_TIMESTAMP, PROCESSED_AT, AUTHOR_ID FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES",
97
  "demographics_query": "SELECT u.id as USER_ID, u.birthday as BIRTHDAY, u.timezone as TIMEZONE, GREATEST(COALESCE(p.difficulty, 0), COALESCE(p.self_report_difficulty, 0), COALESCE(p.method_experience, 0)) AS EXPERIENCE_LEVEL FROM stitch.musora_ecom_db.usora_users u JOIN online_recsys.preprocessed.users p ON u.id = p.user_id"
98
  },
99
  "demographics": {
 
110
  "Advanced (8-10)": [8, 10]
111
  },
112
  "top_timezones_count": 15
113
+ },
114
+ "helpscout": {
115
+ "dashboard_query": "SELECT CONVERSATION_ID, LOWER(CUSTOMER_EMAIL) AS CUSTOMER_EMAIL, THREAD_COUNT, FIRST_MESSAGE_AT, LAST_MESSAGE_AT, DURATION_HOURS, STATUS, STATE, SOURCE_TYPE, SOURCE_VIA, SENTIMENT_POLARITY, EMOTIONS, TOPICS, IS_REFUND_REQUEST, IS_CANCELLATION, IS_MEMBERSHIP, SENTIMENT_CONFIDENCE, TOPIC_CONFIDENCE, PROCESSED_AT FROM SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES",
116
+ "demographics_query": "SELECT LOWER(u.email) AS CUSTOMER_EMAIL, TO_VARCHAR(u.birthday, 'YYYY-MM-DD HH24:MI:SS.FF6 TZHTZM') AS BIRTHDAY, u.timezone AS TIMEZONE, GREATEST(COALESCE(p.difficulty, 0), COALESCE(p.self_report_difficulty, 0), COALESCE(p.method_experience, 0)) AS EXPERIENCE_LEVEL FROM stitch.musora_ecom_db.usora_users u JOIN online_recsys.preprocessed.users p ON u.id = p.user_id WHERE u.email IS NOT NULL",
117
+ "default_top_n": 10,
118
+ "default_date_range_days": 60,
119
+ "escalation_sentiments": ["negative", "very_negative"],
120
+ "max_summary_conversations": 300
121
+ },
122
+ "color_schemes_helpscout": {
123
+ "topics": {
124
+ "video_and_playback": "#1982C4",
125
+ "app_and_technical_errors": "#D32F2F",
126
+ "navigation_and_ux": "#9C27B0",
127
+ "account_and_access": "#FF6F00",
128
+ "billing_and_subscription": "#00C851",
129
+ "learning_and_progress": "#2196F3",
130
+ "content_and_resources": "#4CAF50",
131
+ "community_and_notifications":"#FFB300",
132
+ "feedback_and_suggestions": "#00BCD4",
133
+ "uncategorized": "#9E9E9E"
134
+ },
135
+ "status": {
136
+ "active": "#FF6F00",
137
+ "pending": "#FFB300",
138
+ "closed": "#4CAF50",
139
+ "spam": "#9E9E9E",
140
+ "default": "#607D8B"
141
+ },
142
+ "boolean_flags": {
143
+ "is_refund_request": "#D32F2F",
144
+ "is_cancellation": "#FF6F00",
145
+ "is_membership": "#00C851"
146
+ }
147
  }
148
  }
visualization/data/data_loader.py CHANGED
@@ -90,6 +90,10 @@ class SentimentDataLoader:
90
  df['platform'] = df['platform'].fillna('unknown').str.lower()
91
  df['brand'] = df['brand'].fillna('unknown').str.lower()
92
 
93
  if 'requires_reply' in df.columns:
94
  df['requires_reply'] = df['requires_reply'].astype(bool)
95
 
@@ -166,7 +170,7 @@ class SentimentDataLoader:
166
 
167
  def load_sa_data(self, platform, brand, top_n=10, min_comments=10,
168
  sort_by='severity_score', sentiments=None, intents=None,
169
- date_range=None):
170
  """
171
  Load Sentiment Analysis page data:
172
  1. Content aggregation stats for top-N contents
@@ -180,6 +184,7 @@ class SentimentDataLoader:
180
  sort_by: 'severity_score' | 'sentiment_percentage' | 'sentiment_count' | 'total_comments'
181
  sentiments: List of sentiments to filter by (dominant_sentiment)
182
  intents: List of intents to filter by
 
183
  date_range: Tuple (start_date, end_date) or None
184
 
185
  Returns:
@@ -187,16 +192,17 @@ class SentimentDataLoader:
187
  """
188
  sentiments_key = tuple(sorted(sentiments)) if sentiments else ()
189
  intents_key = tuple(sorted(intents)) if intents else ()
 
190
  date_key = (str(date_range[0]), str(date_range[1])) if date_range and len(date_range) == 2 else ()
191
 
192
  return self._fetch_sa_data(
193
  platform, brand, top_n, min_comments, sort_by,
194
- sentiments_key, intents_key, date_key
195
  )
196
 
197
  @st.cache_data(ttl=86400)
198
  def _fetch_sa_data(_self, platform, brand, top_n, min_comments, sort_by,
199
- sentiments, intents, date_range):
200
  """Cached SA data fetch β€” returns (contents_df, comments_df)."""
201
  try:
202
  conn = SnowFlakeConn()
@@ -245,6 +251,16 @@ class SentimentDataLoader:
245
  ]['content_sk'].unique()
246
  contents_df = contents_df[contents_df['content_sk'].isin(valid_sks)]
247
  comments_df = comments_df[comments_df['content_sk'].isin(valid_sks)]
248
  else:
249
  comments_df = pd.DataFrame()
250
 
@@ -387,7 +403,7 @@ class SentimentDataLoader:
387
  LOWER(s.PLATFORM) AS PLATFORM,
388
  LOWER(s.CHANNEL_NAME) AS BRAND,
389
  s.COMMENT_TIMESTAMP, s.AUTHOR_NAME,
390
- s.DETECTED_LANGUAGE, s.SENTIMENT_POLARITY, s.INTENT,
391
  s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.IS_ENGLISH,
392
  c.PERMALINK_URL
393
  FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s
@@ -407,7 +423,7 @@ class SentimentDataLoader:
407
  'musora_app' AS PLATFORM,
408
  LOWER(CHANNEL_NAME) AS BRAND,
409
  COMMENT_TIMESTAMP, AUTHOR_NAME,
410
- DETECTED_LANGUAGE, SENTIMENT_POLARITY, INTENT,
411
  REQUIRES_REPLY, SENTIMENT_CONFIDENCE, IS_ENGLISH,
412
  PERMALINK_URL
413
  FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES
@@ -448,6 +464,10 @@ class SentimentDataLoader:
448
  df['intent'] = df['intent'].fillna('unknown')
449
  df['platform'] = df['platform'].fillna('unknown').str.lower()
450
 
451
  if 'requires_reply' in df.columns:
452
  df['requires_reply'] = df['requires_reply'].astype(bool)
453
 
 
90
  df['platform'] = df['platform'].fillna('unknown').str.lower()
91
  df['brand'] = df['brand'].fillna('unknown').str.lower()
92
 
93
+ # emotions is optional (soft-fail); keep NaN as-is
94
+ if 'emotions' not in df.columns:
95
+ df['emotions'] = None
96
+
97
  if 'requires_reply' in df.columns:
98
  df['requires_reply'] = df['requires_reply'].astype(bool)
99
 
 
170
 
171
  def load_sa_data(self, platform, brand, top_n=10, min_comments=10,
172
  sort_by='severity_score', sentiments=None, intents=None,
173
+ emotions=None, date_range=None):
174
  """
175
  Load Sentiment Analysis page data:
176
  1. Content aggregation stats for top-N contents
 
184
  sort_by: 'severity_score' | 'sentiment_percentage' | 'sentiment_count' | 'total_comments'
185
  sentiments: List of sentiments to filter by (dominant_sentiment)
186
  intents: List of intents to filter by
187
+ emotions: List of emotions to filter by (content must have at least one comment with these emotions)
188
  date_range: Tuple (start_date, end_date) or None
189
 
190
  Returns:
 
192
  """
193
  sentiments_key = tuple(sorted(sentiments)) if sentiments else ()
194
  intents_key = tuple(sorted(intents)) if intents else ()
195
+ emotions_key = tuple(sorted(emotions)) if emotions else ()
196
  date_key = (str(date_range[0]), str(date_range[1])) if date_range and len(date_range) == 2 else ()
197
 
198
  return self._fetch_sa_data(
199
  platform, brand, top_n, min_comments, sort_by,
200
+ sentiments_key, intents_key, emotions_key, date_key
201
  )
202
 
203
  @st.cache_data(ttl=86400)
204
  def _fetch_sa_data(_self, platform, brand, top_n, min_comments, sort_by,
205
+ sentiments, intents, emotions, date_range):
206
  """Cached SA data fetch β€” returns (contents_df, comments_df)."""
207
  try:
208
  conn = SnowFlakeConn()
 
251
  ]['content_sk'].unique()
252
  contents_df = contents_df[contents_df['content_sk'].isin(valid_sks)]
253
  comments_df = comments_df[comments_df['content_sk'].isin(valid_sks)]
254
+
255
+ # Python-side emotion filter — keep only content_sks that have
256
+ # at least one comment matching any selected emotion
257
+ if emotions:
258
+ pattern = '|'.join(re.escape(e) for e in emotions)
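+ # e.g. ["joy", "anger"] -> "joy|anger", matched case-insensitively against
+ # the comma-separated emotions column (assumes `re` is imported in this module).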
259
+ valid_sks = comments_df[
260
+ comments_df['emotions'].str.contains(pattern, na=False, case=False)
261
+ ]['content_sk'].unique()
262
+ contents_df = contents_df[contents_df['content_sk'].isin(valid_sks)]
263
+ comments_df = comments_df[comments_df['content_sk'].isin(valid_sks)]
264
  else:
265
  comments_df = pd.DataFrame()
266
 
 
403
  LOWER(s.PLATFORM) AS PLATFORM,
404
  LOWER(s.CHANNEL_NAME) AS BRAND,
405
  s.COMMENT_TIMESTAMP, s.AUTHOR_NAME,
406
+ s.DETECTED_LANGUAGE, s.SENTIMENT_POLARITY, s.INTENT, s.EMOTIONS,
407
  s.REQUIRES_REPLY, s.SENTIMENT_CONFIDENCE, s.IS_ENGLISH,
408
  c.PERMALINK_URL
409
  FROM SOCIAL_MEDIA_DB.ML_FEATURES.COMMENT_SENTIMENT_FEATURES s
 
423
  'musora_app' AS PLATFORM,
424
  LOWER(CHANNEL_NAME) AS BRAND,
425
  COMMENT_TIMESTAMP, AUTHOR_NAME,
426
+ DETECTED_LANGUAGE, SENTIMENT_POLARITY, INTENT, EMOTIONS,
427
  REQUIRES_REPLY, SENTIMENT_CONFIDENCE, IS_ENGLISH,
428
  PERMALINK_URL
429
  FROM SOCIAL_MEDIA_DB.ML_FEATURES.MUSORA_COMMENT_SENTIMENT_FEATURES
 
464
  df['intent'] = df['intent'].fillna('unknown')
465
  df['platform'] = df['platform'].fillna('unknown').str.lower()
466
 
467
+ # emotions is optional (soft-fail); keep NaN as-is for chart filtering
468
+ if 'emotions' not in df.columns:
469
+ df['emotions'] = None
470
+
471
  if 'requires_reply' in df.columns:
472
  df['requires_reply'] = df['requires_reply'].astype(bool)
473
 
visualization/data/helpscout_data_loader.py ADDED
@@ -0,0 +1,382 @@
1
+ """
2
+ HelpScout data loader — mirrors SentimentDataLoader architecture.
3
+
4
+ Three loading modes:
5
+ - load_dashboard_data() : lightweight (no long text), cached 24 h
6
+ - load_analysis_data(...) : filtered with SUMMARY + notes, on-demand, cached 24 h
7
+ - load_demographics_data() : email-keyed user demographics, cached 24 h
8
+ """
9
+ import re
10
+ import sys
11
+ from datetime import datetime, timedelta
12
+ from pathlib import Path
13
+
14
+ import pandas as pd
15
+ import streamlit as st
16
+ from dateutil.relativedelta import relativedelta
17
+
18
+ root_dir = Path(__file__).resolve().parent.parent.parent
19
+ sys.path.append(str(root_dir))
20
+
21
+ from visualization.SnowFlakeConnection import SnowFlakeConn
22
+ from visualization.utils.helpscout_utils import (
23
+ load_topic_taxonomy, parse_topics, compute_escalation_flag
24
+ )
25
+ import json
26
+
27
+
28
+ class HelpScoutDataLoader:
29
+ """
30
+ Loads HelpScout conversation features from Snowflake with caching.
31
+ """
32
+
33
+ def __init__(self, config_path=None):
34
+ if config_path is None:
35
+ config_path = Path(__file__).parent.parent / "config" / "viz_config.json"
36
+ with open(config_path, "r") as f:
37
+ self.config = json.load(f)
38
+
39
+ self.hs_config = self.config.get("helpscout", {})
40
+ self.dashboard_query = self.hs_config.get("dashboard_query", "")
41
+ self.demographics_query = self.hs_config.get("demographics_query", "")
42
+ self.escalation_sentiments = self.hs_config.get("escalation_sentiments", ["negative", "very_negative"])
43
+ self.default_date_range_days = self.hs_config.get("default_date_range_days", 60)
44
+ self.max_summary_conversations = self.hs_config.get("max_summary_conversations", 300)
45
+ self.topic_colors = self.config.get("color_schemes_helpscout", {}).get("topics", {})
46
+ self.status_colors = self.config.get("color_schemes_helpscout", {}).get("status", {})
47
+ self.flag_colors = self.config.get("color_schemes_helpscout", {}).get("boolean_flags", {})
48
+ self.sentiment_colors = self.config.get("color_schemes", {}).get("sentiment_polarity", {})
49
+ self.demographics_config = self.config.get("demographics", {})
50
+
51
+ self.taxonomy = load_topic_taxonomy()
52
+
53
+ # ─────────────────────────────────────────────────────────────
54
+ # Dashboard data (lightweight, 24-hour cache)
55
+ # ─────────────────────────────────────────────────────────────
56
+
57
+ @st.cache_data(ttl=86400)
58
+ def load_dashboard_data(_self):
59
+ """Load lightweight HelpScout dashboard data β€” no long-form text columns."""
60
+ try:
61
+ conn = SnowFlakeConn()
62
+ df = conn.run_read_query(_self.dashboard_query, "HelpScout dashboard data")
63
+ conn.close_connection()
64
+
65
+ if df is None or df.empty:
66
+ st.error("No HelpScout data returned from Snowflake")
67
+ return pd.DataFrame()
68
+
69
+ df = _self._process_dashboard_df(df)
70
+
71
+ if _self.demographics_query:
72
+ demo_df = _self.load_demographics_data()
73
+ if not demo_df.empty:
74
+ df = _self.merge_demographics(df, demo_df)
75
+
76
+ return df
77
+ except Exception as e:
78
+ st.error(f"Error loading HelpScout dashboard data: {e}")
79
+ return pd.DataFrame()
80
+
81
+ def _process_dashboard_df(self, df):
82
+ df.columns = df.columns.str.lower()
83
+
84
+ for ts_col in ("first_message_at", "last_message_at", "processed_at"):
85
+ if ts_col in df.columns:
86
+ df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce", utc=True).dt.tz_localize(None)
87
+
88
+ df["sentiment_polarity"] = df["sentiment_polarity"].fillna("unknown")
89
+ df["status"] = df["status"].fillna("unknown").str.lower()
90
+ df["state"] = df["state"].fillna("unknown").str.lower()
91
+ df["source_type"] = df["source_type"].fillna("unknown").str.lower()
92
+
93
+ for bool_col in ("is_refund_request", "is_cancellation", "is_membership"):
94
+ if bool_col in df.columns:
95
+ df[bool_col] = df[bool_col].fillna(False).astype(bool)
96
+
97
+ if "emotions" not in df.columns:
98
+ df["emotions"] = None
99
+
100
+ # topics_list for filter options
101
+ df["topics_list"] = df["topics"].apply(parse_topics)
102
+
103
+ # escalation flag
104
+ df["is_escalation"] = compute_escalation_flag(df, self.escalation_sentiments)
105
+
106
+ return df
107
+
108
+ # ─────────────────────────────────────────────────────────────
109
+ # Analysis page data (on-demand, 24-hour cache)
110
+ # ─────────────────────────────────────────────────────────────
111
+
112
+ def load_analysis_data(self, sentiments=None, topics=None,
113
+ refund_only=False, cancel_only=False,
114
+ membership_only=False, statuses=None,
115
+ sources=None, date_range=None, top_n=None):
116
+ """
117
+ Load filtered HelpScout conversations with full text for the Analysis page.
118
+ Caches based on argument tuple.
119
+ """
120
+ sentiments_key = tuple(sorted(sentiments)) if sentiments else ()
121
+ topics_key = tuple(sorted(topics)) if topics else ()
122
+ statuses_key = tuple(sorted(statuses)) if statuses else ()
123
+ sources_key = tuple(sorted(sources)) if sources else ()
124
+ date_key = (str(date_range[0]), str(date_range[1])) if date_range and len(date_range) == 2 else ()
125
+ return self._fetch_analysis_data(
126
+ sentiments_key, topics_key, bool(refund_only), bool(cancel_only),
127
+ bool(membership_only), statuses_key, sources_key, date_key, top_n or 0
128
+ )
129
+
130
+ @st.cache_data(ttl=86400)
131
+ def _fetch_analysis_data(_self, sentiments, topics, refund_only, cancel_only,
132
+ membership_only, statuses, sources, date_range, top_n):
133
+ """Cached analysis data fetch β€” returns full-detail conversation df."""
134
+ try:
135
+ query = _self._build_analysis_query(
136
+ sentiments, topics, refund_only, cancel_only,
137
+ membership_only, statuses, sources, date_range, top_n
138
+ )
139
+ conn = SnowFlakeConn()
140
+ df = conn.run_read_query(query, "HelpScout analysis data")
141
+ conn.close_connection()
142
+
143
+ if df is None or df.empty:
144
+ return pd.DataFrame()
145
+
146
+ df = _self._process_analysis_df(df)
147
+ return df
148
+ except Exception as e:
149
+ st.error(f"Error loading HelpScout analysis data: {e}")
150
+ return pd.DataFrame()
151
+
152
+ def _build_analysis_query(self, sentiments, topics, refund_only, cancel_only,
153
+ membership_only, statuses, sources, date_range, top_n):
154
+ """Build dynamic SQL for the analysis page with all filters pushed to Snowflake."""
155
+ where_clauses = []
156
+
157
+ if date_range and len(date_range) == 2:
158
+ where_clauses.append(f"FIRST_MESSAGE_AT >= '{date_range[0]}' AND FIRST_MESSAGE_AT <= '{date_range[1]}'")
159
+
160
+ if sentiments:
161
+ safe = "', '".join(self._sanitize(s) for s in sentiments)
162
+ where_clauses.append(f"SENTIMENT_POLARITY IN ('{safe}')")
163
+
164
+ if topics:
165
+ topic_conditions = []
166
+ for t in topics:
167
+ safe_t = self._sanitize(t)
168
+ topic_conditions.append(
169
+ f"ARRAY_CONTAINS('{safe_t}'::VARIANT, SPLIT(TOPICS, ','))"
170
+ )
171
+ where_clauses.append("(" + " OR ".join(topic_conditions) + ")")
172
+
173
+ if statuses:
174
+ safe = "', '".join(self._sanitize(s.lower()) for s in statuses)
175
+ where_clauses.append(f"LOWER(STATUS) IN ('{safe}')")
176
+
177
+ if sources:
178
+ safe = "', '".join(self._sanitize(s.lower()) for s in sources)
179
+ where_clauses.append(f"LOWER(SOURCE_TYPE) IN ('{safe}')")
180
+
181
+ if refund_only:
182
+ where_clauses.append("IS_REFUND_REQUEST = TRUE")
183
+ if cancel_only:
184
+ where_clauses.append("IS_CANCELLATION = TRUE")
185
+ if membership_only:
186
+ where_clauses.append("IS_MEMBERSHIP = TRUE")
187
+
188
+ where_sql = ("WHERE " + " AND ".join(where_clauses)) if where_clauses else ""
189
+ limit_sql = f"LIMIT {int(top_n)}" if top_n and top_n > 0 else ""
190
+
191
+ return f"""
192
+ SELECT
193
+ CONVERSATION_ID,
194
+ LOWER(CUSTOMER_EMAIL) AS CUSTOMER_EMAIL,
195
+ CUSTOMER_FIRST,
196
+ CUSTOMER_LAST,
197
+ THREAD_COUNT,
198
+ FIRST_MESSAGE_AT,
199
+ LAST_MESSAGE_AT,
200
+ DURATION_HOURS,
201
+ STATUS,
202
+ STATE,
203
+ SOURCE_TYPE,
204
+ SOURCE_VIA,
205
+ SENTIMENT_POLARITY,
206
+ EMOTIONS,
207
+ SENTIMENT_CONFIDENCE,
208
+ SENTIMENT_NOTES,
209
+ TOPICS,
210
+ IS_REFUND_REQUEST,
211
+ IS_CANCELLATION,
212
+ IS_MEMBERSHIP,
213
+ TOPIC_CONFIDENCE,
214
+ TOPIC_NOTES,
215
+ SUMMARY,
216
+ PROCESSED_AT
217
+ FROM SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES
218
+ {where_sql}
219
+ ORDER BY FIRST_MESSAGE_AT DESC
220
+ {limit_sql}
221
+ """
222
+
223
+ def _process_analysis_df(self, df):
224
+ df.columns = df.columns.str.lower()
225
+
226
+ for ts_col in ("first_message_at", "last_message_at", "processed_at"):
227
+ if ts_col in df.columns:
228
+ df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce", utc=True).dt.tz_localize(None)
229
+
230
+ df["sentiment_polarity"] = df["sentiment_polarity"].fillna("unknown")
231
+ df["status"] = df["status"].fillna("unknown").str.lower()
232
+ df["source_type"] = df["source_type"].fillna("unknown").str.lower()
233
+
234
+ for bool_col in ("is_refund_request", "is_cancellation", "is_membership"):
235
+ if bool_col in df.columns:
236
+ df[bool_col] = df[bool_col].fillna(False).astype(bool)
237
+
238
+ if "emotions" not in df.columns:
239
+ df["emotions"] = None
240
+
241
+ df["topics_list"] = df["topics"].apply(parse_topics)
242
+ df["is_escalation"] = compute_escalation_flag(df, self.escalation_sentiments)
243
+
244
+ # Short summary for cards (truncated at 120 chars)
245
+ if "summary" in df.columns:
246
+ text = df["summary"].fillna("").astype(str)
247
+ df["summary_short"] = text.where(text.str.len() <= 120, text.str[:120] + "…")
248
+
249
+ return df
250
+
251
+ # ─────────────────────────────────────────────────────────────
252
+ # Demographics (email-keyed, 24-hour cache)
253
+ # ─────────────────────────────────────────────────────────────
254
+
255
+ @st.cache_data(ttl=86400)
256
+ def load_demographics_data(_self):
257
+ """Load user demographics keyed by email."""
258
+ if not _self.demographics_query:
259
+ return pd.DataFrame()
260
+ try:
261
+ conn = SnowFlakeConn()
262
+ df = conn.run_read_query(_self.demographics_query, "HelpScout user demographics")
263
+ conn.close_connection()
264
+
265
+ if df is None or df.empty:
266
+ return pd.DataFrame()
267
+
268
+ return _self._process_demographics_df(df)
269
+ except Exception as e:
270
+ st.warning(f"Could not load HelpScout demographics: {e}")
271
+ return pd.DataFrame()
272
+
273
+ def _process_demographics_df(self, df):
274
+ df.columns = df.columns.str.lower()
275
+
276
+ if "birthday" in df.columns:
277
+ df["birthday"] = df["birthday"].astype(str)
278
+ df["birthday"] = pd.to_datetime(df["birthday"], errors="coerce", utc=True)
279
+ df["birthday"] = df["birthday"].dt.tz_localize(None)
280
+ df["age"] = df["birthday"].apply(self._calculate_age)
281
+ df["age_group"] = df["age"].apply(self._categorize_age)
282
+
283
+ if "timezone" in df.columns:
284
+ df["timezone_region"] = df["timezone"].apply(self._extract_timezone_region)
285
+
286
+ if "experience_level" in df.columns:
287
+ df["experience_group"] = df["experience_level"].apply(self._categorize_experience)
288
+
289
+ if "customer_email" in df.columns:
290
+ df = df[df["customer_email"].notna()]
291
+ df["customer_email"] = df["customer_email"].str.lower()
292
+
293
+ return df
294
+
295
+ def merge_demographics(self, df, demo_df):
296
+ """Merge demographic data with HelpScout conversations on customer_email."""
297
+ if demo_df.empty or "customer_email" not in df.columns:
298
+ for col, val in [("age", None), ("age_group", "Unknown"),
299
+ ("timezone", None), ("timezone_region", "Unknown"),
300
+ ("experience_level", None), ("experience_group", "Unknown")]:
301
+ df[col] = val
302
+ return df
303
+
304
+ if "customer_email" not in demo_df.columns:
305
+ return df
306
+
307
+ merge_cols = ["customer_email"]
308
+ for c in ["age", "age_group", "timezone", "timezone_region", "experience_level", "experience_group"]:
309
+ if c in demo_df.columns:
310
+ merge_cols.append(c)
311
+
312
+ merged = df.merge(demo_df[merge_cols], on="customer_email", how="left")
313
+
314
+ for col in ["age_group", "timezone_region", "experience_group"]:
315
+ if col in merged.columns:
316
+ merged[col] = merged[col].fillna("Unknown")
317
+
318
+ return merged
319
+
320
+ # ─────────────────────────────────────────────────────────────
321
+ # Filter helpers
322
+ # ─────────────────────────────────────────────────────────────
323
+
324
+ def get_filter_options(self, df):
325
+ """Return unique values for all in-page filters from the dashboard df."""
326
+ topics_flat = df["topics_list"].explode().dropna().unique().tolist() if "topics_list" in df.columns else []
327
+ return {
328
+ "sentiments": sorted(df["sentiment_polarity"].dropna().unique().tolist()),
329
+ "topics": sorted(t for t in topics_flat if t),
330
+ "statuses": sorted(df["status"].dropna().unique().tolist()),
331
+ "states": sorted(df["state"].dropna().unique().tolist()) if "state" in df.columns else [],
332
+ "sources": sorted(df["source_type"].dropna().unique().tolist()),
333
+ }
334
+
335
+ # ─────────────────────────────────────────────────────────────
336
+ # Demographics calculation helpers (mirrors SentimentDataLoader)
337
+ # ─────────────────────────────────────────────────────────────
338
+
339
+ @staticmethod
340
+ def _calculate_age(birthday):
341
+ if pd.isna(birthday):
342
+ return None
343
+ try:
344
+ age = relativedelta(datetime.now(), birthday).years
345
+ return age if 0 <= age <= 120 else None
346
+ except Exception:
347
+ return None
348
+
349
+ def _categorize_age(self, age):
350
+ if pd.isna(age) or age is None:
351
+ return "Unknown"
352
+ for group_name, (min_age, max_age) in self.demographics_config.get("age_groups", {}).items():
353
+ if min_age <= age <= max_age:
354
+ return group_name
355
+ return "Unknown"
356
+
357
+ @staticmethod
358
+ def _extract_timezone_region(timezone):
359
+ if pd.isna(timezone) or not isinstance(timezone, str):
360
+ return "Unknown"
361
+ parts = timezone.split("/")
362
+ return parts[0] if parts else "Unknown"
363
+
364
+ def _categorize_experience(self, experience_level):
365
+ if pd.isna(experience_level):
366
+ return "Unknown"
367
+ try:
368
+ exp_level = float(experience_level)
369
+ except Exception:
370
+ return "Unknown"
371
+ for group_name, (min_exp, max_exp) in self.demographics_config.get("experience_groups", {}).items():
372
+ if min_exp <= exp_level <= max_exp:
373
+ return group_name
374
+ return "Unknown"
375
+
376
+ # ─────────────────────────────────────────────────────────────
377
+ # Internal helpers
378
+ # ─────────────────────────────────────────────────────────────
379
+
380
+ @staticmethod
381
+ def _sanitize(value: str) -> str:
382
+ return re.sub(r"['\";\\]", "", str(value))
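
A sketch of how the loading modes compose in a Streamlit page, assuming the `helpscout` block of `viz_config.json` is populated and Snowflake credentials are available:

```python
# Sketch only: dashboard load feeds filter options; analysis load is on-demand.
from visualization.data.helpscout_data_loader import HelpScoutDataLoader

loader = HelpScoutDataLoader()
dashboard_df = loader.load_dashboard_data()          # lightweight, 24 h cache
options = loader.get_filter_options(dashboard_df)

analysis_df = loader.load_analysis_data(             # full text, on-demand
    sentiments=["negative", "very_negative"],
    topics=options["topics"][:2],                    # illustrative selection
    refund_only=True,
    date_range=("2025-01-01", "2025-02-28"),
    top_n=200,
)
```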
visualization/utils/auth.py CHANGED
@@ -24,8 +24,6 @@ AUTHORIZED_EMAILS = {
24
  "gabriel@musora.com",
25
  "jmilligan@musora.com",
26
  "dave@musora.com",
27
- "amy@musora.com",
28
- "karissa@musora.com"
29
  }
30
 
31
 
 
24
  "gabriel@musora.com",
25
  "jmilligan@musora.com",
26
  "dave@musora.com",
27
  }
28
 
29
 
visualization/utils/data_processor.py CHANGED
@@ -113,6 +113,52 @@ class SentimentDataProcessor:
113
 
114
  return intent_counts
115
 
116
  @staticmethod
117
  def get_content_summary(df):
118
  """
 
113
 
114
  return intent_counts
115
 
116
+ @staticmethod
117
+ def get_emotion_distribution(df, group_by=None):
118
+ """
119
+ Calculate emotion distribution (handles multi-label).
120
+
121
+ Args:
122
+ df: Sentiment dataframe with 'emotions' column
123
+ group_by: Optional column(s) to group by
124
+
125
+ Returns:
126
+ pd.DataFrame: Emotion distribution with columns [emotion, count, percentage]
127
+ """
128
+ if 'emotions' not in df.columns:
129
+ return pd.DataFrame()
130
+
131
+ df_exploded = df.dropna(subset=['emotions']).copy()
132
+ df_exploded['emotions'] = df_exploded['emotions'].str.split(',')
133
+ df_exploded = df_exploded.explode('emotions')
134
+ df_exploded['emotions'] = df_exploded['emotions'].str.strip()
135
+ df_exploded = df_exploded[df_exploded['emotions'] != '']
136
+
137
+ if df_exploded.empty:
138
+ return pd.DataFrame()
139
+
140
+ if group_by:
141
+ if isinstance(group_by, str):
142
+ group_by = [group_by]
143
+
144
+ emotion_counts = df_exploded.groupby(
145
+ group_by + ['emotions'],
146
+ as_index=False
147
+ ).size().rename(columns={'size': 'count'})
148
+
149
+ emotion_counts['percentage'] = emotion_counts.groupby(group_by)['count'].transform(
150
+ lambda x: (x / x.sum() * 100).round(2)
151
+ )
152
+
153
+ else:
154
+ emotion_counts = df_exploded['emotions'].value_counts().reset_index()
155
+ emotion_counts.columns = ['emotions', 'count']
156
+ emotion_counts['percentage'] = (
157
+ emotion_counts['count'] / emotion_counts['count'].sum() * 100
158
+ ).round(2)
159
+
160
+ return emotion_counts
161
+
162
  @staticmethod
163
  def get_content_summary(df):
164
  """
visualization/utils/helpscout_pdf.py ADDED
@@ -0,0 +1,471 @@
1
+ """
2
+ HelpScout PDF Exporters.
3
+
4
+ Two classes sharing the MusoraPDF base from pdf_exporter.py:
5
+ - HelpScoutDashboardPDF : full HelpScout dashboard report
6
+ - HelpScoutAnalysisPDF : filtered analysis report + optional LLM summary
7
+ """
8
+ import logging
9
+ import os
10
+ import sys
11
+ import tempfile
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+
15
+ import plotly.io as pio
16
+
17
+ _parent = Path(__file__).resolve().parent.parent
18
+ if str(_parent) not in sys.path:
19
+ sys.path.insert(0, str(_parent))
20
+
21
+ from utils.pdf_exporter import MusoraPDF # reuse base class
22
+ from utils.helpscout_utils import boolean_flag_counts, topic_label, load_topic_taxonomy
23
+ from visualizations.helpscout_charts import HelpScoutCharts
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ _RENDER_SCALE = 3
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Shared rendering helpers (module-level functions used by both exporters)
32
+ # ---------------------------------------------------------------------------
33
+
34
+ def _prepare_fig(fig, is_side_by_side=False):
35
+ base_fs = 13 if is_side_by_side else 14
36
+ fig.update_layout(
37
+ paper_bgcolor="white", plot_bgcolor="white",
38
+ font=dict(color="black", size=base_fs),
39
+ title_font_size=base_fs + 4,
40
+ margin=(dict(l=60, r=40, t=60, b=60) if is_side_by_side else dict(l=80, r=40, t=60, b=80)),
41
+ )
42
+ fig.update_xaxes(automargin=True)
43
+ fig.update_yaxes(automargin=True)
44
+
45
+
46
+ def _fig_to_tmp(fig, width=800, height=400, is_side_by_side=False) -> str:
47
+ _prepare_fig(fig, is_side_by_side)
48
+ img = pio.to_image(fig, format="png", width=width, height=height,
49
+ scale=_RENDER_SCALE, engine="kaleido")
50
+ tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
51
+ tmp.write(img)
52
+ tmp.close()
53
+ return tmp.name
54
+
55
+
56
+ def _cleanup(paths):
57
+ for p in paths:
58
+ try:
59
+ os.unlink(p)
60
+ except OSError:
61
+ pass
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # HelpScoutDashboardPDF
66
+ # ---------------------------------------------------------------------------
67
+
68
+ class HelpScoutDashboardPDF:
69
+ """
70
+ Generates a comprehensive HelpScout dashboard PDF report.
71
+ """
72
+
73
+ def __init__(self):
74
+ self.charts = HelpScoutCharts()
75
+ self.taxonomy = load_topic_taxonomy()
76
+ self._tmp: list = []
77
+
78
+ def generate_report(self, df, filter_info: dict = None) -> bytes:
79
+ """Build and return the full dashboard PDF."""
80
+ self.pdf = MusoraPDF()
81
+ self._tmp = []
82
+ try:
83
+ self._cover(df, filter_info)
84
+ self._executive_summary(df)
85
+ self._sentiment_section(df)
86
+ self._topic_section(df)
87
+ self._emotion_section(df)
88
+ self._flags_section(df)
89
+ self._status_source_section(df)
90
+ self._timelines_section(df)
91
+ self._depth_section(df)
92
+ self._data_summary(df, filter_info)
93
+ return bytes(self.pdf.output())
94
+ finally:
95
+ _cleanup(self._tmp)
96
+
97
+ # ── Rendering helpers ──
98
+
99
+ def _add_chart(self, fig, width=180, img_w=800, img_h=400):
100
+ try:
101
+ p = _fig_to_tmp(fig, img_w, img_h)
102
+ self._tmp.append(p)
103
+ h_mm = width * (img_h / img_w)
104
+ self.pdf.check_page_break(h_mm + 5)
105
+ self.pdf.image(p, x=10, w=width)
106
+ self.pdf.ln(3)
107
+ except Exception:
108
+ logger.exception("Chart render failed")
109
+ self.pdf.body_text("[Chart could not be rendered]")
110
+
111
+ def _add_two_charts(self, fig1, fig2, width=92):
112
+ try:
113
+ p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True); self._tmp.append(p1)
114
+ p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True); self._tmp.append(p2)
115
+ h_mm = width * (450 / 700)
116
+ self.pdf.check_page_break(h_mm + 5)
117
+ y = self.pdf.get_y()
118
+ self.pdf.image(p1, x=10, y=y, w=width)
119
+ self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
120
+ self.pdf.set_y(y + h_mm + 3)
121
+ except Exception:
122
+ logger.exception("Side-by-side render failed")
123
+ self.pdf.body_text("[Charts could not be rendered]")
124
+
125
+ # ── Sections ──
126
+
127
+ def _cover(self, df, filter_info):
128
+ self.pdf.add_page()
129
+ self.pdf.ln(40)
130
+ r, g, b = MusoraPDF.PRIMARY
131
+ self.pdf.set_fill_color(r, g, b)
132
+ self.pdf.rect(0, 60, 210, 4, style="F")
133
+ self.pdf.ln(20)
134
+ self.pdf.set_font("Helvetica", "B", 28)
135
+ self.pdf.set_text_color(r, g, b)
136
+ self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
137
+ self.pdf.set_font("Helvetica", "", 16)
138
+ self.pdf.set_text_color(80, 80, 80)
139
+ self.pdf.cell(0, 10, "HelpScout Support Dashboard Report",
140
+ align="C", new_x="LMARGIN", new_y="NEXT")
141
+ self.pdf.ln(10)
142
+ self.pdf.set_font("Helvetica", "", 12)
143
+ self.pdf.set_text_color(100, 100, 100)
144
+ self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
145
+ align="C", new_x="LMARGIN", new_y="NEXT")
146
+ self.pdf.ln(5)
147
+ self.pdf.set_font("Helvetica", "", 10)
148
+ self.pdf.cell(0, 7, f"Total Conversations: {len(df):,}",
149
+ align="C", new_x="LMARGIN", new_y="NEXT")
150
+ if "first_message_at" in df.columns and not df.empty:
151
+ valid = df["first_message_at"].dropna()
152
+ if not valid.empty:
153
+ dr = f"{valid.min().strftime('%b %d, %Y')} to {valid.max().strftime('%b %d, %Y')}"
154
+ self.pdf.ln(3)
155
+ self.pdf.set_font("Helvetica", "I", 9)
156
+ self.pdf.set_text_color(120, 120, 120)
157
+ self.pdf.cell(0, 6, MusoraPDF._sanitize(f"Data period: {dr}"),
158
+ align="C", new_x="LMARGIN", new_y="NEXT")
159
+ self.pdf.ln(20)
160
+ self.pdf.set_font("Helvetica", "I", 8)
161
+ self.pdf.set_text_color(150, 150, 150)
162
+ self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
163
+ align="C", new_x="LMARGIN", new_y="NEXT")
164
+
165
+ def _executive_summary(self, df):
166
+ self.pdf.add_page()
167
+ self.pdf.section_header("Executive Summary")
168
+ total = len(df)
169
+ flags = boolean_flag_counts(df)
170
+ neg = df["sentiment_polarity"].isin(["negative", "very_negative"]).sum()
171
+ pos = df["sentiment_polarity"].isin(["positive", "very_positive"]).sum()
172
+ neg_pct = neg / total * 100 if total else 0
173
+ pos_pct = pos / total * 100 if total else 0
174
+ esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0
175
+ avg_dur = float(df["duration_hours"].mean()) if "duration_hours" in df.columns else 0
176
+
177
+ self.pdf.metric_row([
178
+ ("Total Conversations", f"{total:,}"),
179
+ ("Positive %", f"{pos_pct:.1f}%"),
180
+ ("Negative %", f"{neg_pct:.1f}%"),
181
+ ("Avg Duration (h)", f"{avg_dur:.1f}"),
182
+ ])
183
+ self.pdf.metric_row([
184
+ ("Escalations", f"{esc:,}"),
185
+ ("Refund Requests", f"{flags['is_refund_request']:,}"),
186
+ ("Cancellations", f"{flags['is_cancellation']:,}"),
187
+ ("Membership Joins", f"{flags['is_membership']:,}"),
188
+ ])
189
+
190
+ def _sentiment_section(self, df):
191
+ self.pdf.add_page()
192
+ self.pdf.section_header("Sentiment Distribution")
193
+ pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
194
+ gauge = self.charts.create_sentiment_score_gauge(self._avg_score(df))
195
+ self._add_two_charts(pie, gauge)
196
+
197
+ def _topic_section(self, df):
198
+ self.pdf.add_page()
199
+ self.pdf.section_header("Topic Analysis")
200
+ bar = self.charts.create_topic_bar_chart(df, title="Conversations by Topic")
201
+ pie = self.charts.create_topic_pie_chart(df, title="Topic Share")
202
+ self._add_two_charts(bar, pie)
203
+ self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)
204
+
205
+ def _emotion_section(self, df):
206
+ if "emotions" not in df.columns or df["emotions"].dropna().empty:
207
+ return
208
+ self.pdf.add_page()
209
+ self.pdf.section_header("Emotion Analysis")
210
+ self._add_chart(self.charts.create_emotion_bar_chart(df, title="Emotion Distribution"))
211
+
212
+ def _flags_section(self, df):
213
+ self.pdf.add_page()
214
+ self.pdf.section_header("Billing & Membership Flags")
215
+ flags_chart = self.charts.create_boolean_flags_chart(df)
216
+ esc_chart = self.charts.create_escalation_breakdown(df)
217
+ self._add_two_charts(flags_chart, esc_chart)
218
+
219
+ def _status_source_section(self, df):
220
+ self.pdf.add_page()
221
+ self.pdf.section_header("Status & Source Distribution")
222
+ status_chart = self.charts.create_status_distribution(df)
223
+ source_chart = self.charts.create_source_distribution(df)
224
+ self._add_two_charts(status_chart, source_chart)
225
+
226
+ def _timelines_section(self, df):
227
+ self.pdf.add_page()
228
+ self.pdf.section_header("Volume & Trends (Weekly)")
229
+ self._add_chart(self.charts.create_volume_timeline(df, freq="W"))
230
+ self._add_chart(self.charts.create_sentiment_timeline(df, freq="W"))
231
+ self._add_chart(self.charts.create_refund_cancel_timeline(df, freq="W"))
232
+
233
+ def _depth_section(self, df):
234
+ self.pdf.add_page()
235
+ self.pdf.section_header("Conversation Depth")
236
+ dur = self.charts.create_duration_histogram(df)
237
+ thd = self.charts.create_thread_count_histogram(df)
238
+ self._add_two_charts(dur, thd)
239
+
240
+ def _data_summary(self, df, filter_info):
241
+ self.pdf.add_page()
242
+ self.pdf.section_header("Data Summary")
243
+ self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
244
+ self.pdf.body_text(f"Total conversations: {len(df):,}")
245
+ self.pdf.callout_box(
246
+ "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
247
+ "This report is confidential and intended for internal Musora team use only.",
248
+ bg_color=(245, 245, 245),
249
+ )
250
+
251
+ @staticmethod
252
+ def _avg_score(df) -> float:
253
+ score_map = {"very_positive": 2, "positive": 1, "neutral": 0,
254
+ "negative": -1, "very_negative": -2}
255
+ if "sentiment_polarity" not in df.columns or df.empty:
256
+ return 0.0
257
+ return float(df["sentiment_polarity"].map(score_map).fillna(0).mean())
258
+
259
+
260
+ # ---------------------------------------------------------------------------
261
+ # HelpScoutAnalysisPDF
262
+ # ---------------------------------------------------------------------------
263
+
264
+ class HelpScoutAnalysisPDF:
265
+ """
266
+ Generates a focused analysis PDF from the HelpScout Analysis page.
267
+ Includes filter summary, distributions, and optionally the LLM summary report.
268
+ """
269
+
270
+ def __init__(self):
271
+ self.charts = HelpScoutCharts()
272
+ self.taxonomy = load_topic_taxonomy()
273
+ self._tmp: list = []
274
+
275
+ def generate_report(self, df, filter_info: dict = None,
276
+ summary_result: dict = None) -> bytes:
277
+ """
278
+ Build and return the analysis PDF.
279
+
280
+ Args:
281
+ df: Filtered HelpScout analysis DataFrame.
282
+ filter_info: Dict of filter descriptions for the cover.
283
+ summary_result: Output from HelpScoutSummaryAgent.process() or None.
284
+ """
285
+ self.pdf = MusoraPDF()
286
+ self._tmp = []
287
+ try:
288
+ self._cover(df, filter_info)
289
+ self._filter_summary_section(filter_info, df)
290
+ self._kpi_section(df)
291
+ self._distributions_section(df)
292
+ self._summary_section(summary_result)
293
+ self._data_summary(df, filter_info)
294
+ return bytes(self.pdf.output())
295
+ finally:
296
+ _cleanup(self._tmp)
297
+
298
+ # ── Rendering helpers ──
299
+
300
+ def _add_chart(self, fig, width=180, img_w=800, img_h=400):
301
+ try:
302
+ p = _fig_to_tmp(fig, img_w, img_h)
303
+ self._tmp.append(p)
304
+ h_mm = width * (img_h / img_w)
305
+ self.pdf.check_page_break(h_mm + 5)
306
+ self.pdf.image(p, x=10, w=width)
307
+ self.pdf.ln(3)
308
+ except Exception:
309
+ logger.exception("Chart render failed")
310
+ self.pdf.body_text("[Chart could not be rendered]")
311
+
312
+ def _add_two_charts(self, fig1, fig2, width=92):
313
+ try:
314
+ p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True); self._tmp.append(p1)
315
+ p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True); self._tmp.append(p2)
316
+ h_mm = width * (450 / 700)
317
+ self.pdf.check_page_break(h_mm + 5)
318
+ y = self.pdf.get_y()
319
+ self.pdf.image(p1, x=10, y=y, w=width)
320
+ self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
321
+ self.pdf.set_y(y + h_mm + 3)
322
+ except Exception:
323
+ logger.exception("Side-by-side render failed")
324
+ self.pdf.body_text("[Charts could not be rendered]")
325
+
326
+ # ── Sections ──
327
+
328
+ def _cover(self, df, filter_info):
329
+ self.pdf.add_page()
330
+ self.pdf.ln(40)
331
+ r, g, b = MusoraPDF.PRIMARY
332
+ self.pdf.set_fill_color(r, g, b)
333
+ self.pdf.rect(0, 60, 210, 4, style="F")
334
+ self.pdf.ln(20)
335
+ self.pdf.set_font("Helvetica", "B", 28)
336
+ self.pdf.set_text_color(r, g, b)
337
+ self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
338
+ self.pdf.set_font("Helvetica", "", 16)
339
+ self.pdf.set_text_color(80, 80, 80)
340
+ self.pdf.cell(0, 10, "HelpScout Analysis Report",
341
+ align="C", new_x="LMARGIN", new_y="NEXT")
342
+ self.pdf.ln(10)
343
+ self.pdf.set_font("Helvetica", "", 12)
344
+ self.pdf.set_text_color(100, 100, 100)
345
+ self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
346
+ align="C", new_x="LMARGIN", new_y="NEXT")
347
+ self.pdf.ln(5)
348
+ self.pdf.set_font("Helvetica", "", 10)
349
+ self.pdf.cell(0, 7, f"Matched Conversations: {len(df):,}",
350
+ align="C", new_x="LMARGIN", new_y="NEXT")
351
+ if filter_info:
352
+ self.pdf.ln(8)
353
+ self.pdf.set_font("Helvetica", "B", 9)
354
+ self.pdf.set_text_color(80, 80, 80)
355
+ self.pdf.cell(0, 6, "Applied Filters:", align="C", new_x="LMARGIN", new_y="NEXT")
356
+ self.pdf.set_font("Helvetica", "", 9)
357
+ for k, v in filter_info.items():
358
+ if v:
359
+ self.pdf.cell(0, 5, MusoraPDF._sanitize(f"{k}: {v}"),
360
+ align="C", new_x="LMARGIN", new_y="NEXT")
361
+ self.pdf.ln(20)
362
+ self.pdf.set_font("Helvetica", "I", 8)
363
+ self.pdf.set_text_color(150, 150, 150)
364
+ self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
365
+ align="C", new_x="LMARGIN", new_y="NEXT")
366
+
367
+ def _filter_summary_section(self, filter_info, df):
368
+ self.pdf.add_page()
369
+ self.pdf.section_header("Filter Set Summary")
370
+ if filter_info:
371
+ rows = [(k, MusoraPDF._sanitize(str(v))) for k, v in filter_info.items() if v]
372
+ if rows:
373
+ self.pdf.add_table(["Filter", "Value"], rows, col_widths=[80, 110])
374
+ else:
375
+ self.pdf.body_text("No filters applied β€” report covers all available conversations.")
376
+
377
+ def _kpi_section(self, df):
378
+ total = len(df)
379
+ flags = boolean_flag_counts(df)
380
+ neg_pct = df["sentiment_polarity"].isin(["negative", "very_negative"]).sum() / total * 100 if total else 0
381
+ pos_pct = df["sentiment_polarity"].isin(["positive", "very_positive"]).sum() / total * 100 if total else 0
382
+ avg_dur = float(df["duration_hours"].mean()) if "duration_hours" in df.columns else 0
383
+ esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0
384
+
385
+ self.pdf.section_header("Key Metrics")
386
+ self.pdf.metric_row([
387
+ ("Conversations", f"{total:,}"),
388
+ ("Positive %", f"{pos_pct:.1f}%"),
389
+ ("Negative %", f"{neg_pct:.1f}%"),
390
+ ("Avg Duration (h)", f"{avg_dur:.1f}"),
391
+ ])
392
+ self.pdf.metric_row([
393
+ ("Escalations", f"{esc:,}"),
394
+ ("Refund Requests", f"{flags['is_refund_request']:,}"),
395
+ ("Cancellations", f"{flags['is_cancellation']:,}"),
396
+ ("Membership Joins", f"{flags['is_membership']:,}"),
397
+ ])
398
+
399
+ def _distributions_section(self, df):
400
+ self.pdf.add_page()
401
+ self.pdf.section_header("Distributions")
402
+ pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
403
+ tbar = self.charts.create_topic_bar_chart(df, title="Topic Distribution")
404
+ self._add_two_charts(pie, tbar)
405
+ self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)
406
+
407
+ def _summary_section(self, result: dict):
408
+ self.pdf.add_page()
409
+ self.pdf.section_header("AI Summary Report")
410
+
411
+ if result is None or not result.get("success"):
412
+ self.pdf.callout_box(
413
+ "AI summary not generated. To include it, click 'Generate Summary Report' "
414
+ "in the app before exporting the PDF.",
415
+ bg_color=(255, 250, 230),
416
+ )
417
+ return
418
+
419
+ summary = result.get("summary", {})
420
+ meta = result.get("metadata", {})
421
+
422
+ exec_summary = MusoraPDF._sanitize(summary.get("executive_summary", ""))
423
+ if exec_summary:
424
+ self.pdf.subsection_header("Executive Summary")
425
+ self.pdf.section_description(exec_summary)
426
+
427
+ themes = summary.get("top_themes", [])
428
+ if themes:
429
+ self.pdf.subsection_header("Top Themes")
430
+ for t in themes:
431
+ theme_text = MusoraPDF._sanitize(
432
+ f"{t.get('theme', '')} β€” {t.get('prevalence', '')}: {t.get('description', '')}"
433
+ )
434
+ self.pdf.body_text(f" * {theme_text}")
435
+
436
+ complaints = summary.get("top_complaints", [])
437
+ if complaints:
438
+ self.pdf.subsection_header("Top Complaints")
439
+ for c in complaints:
440
+ self.pdf.body_text(f" * {MusoraPDF._sanitize(c)}")
441
+
442
+ insights = summary.get("unexpected_insights", [])
443
+ if insights:
444
+ self.pdf.subsection_header("Unexpected Insights")
445
+ for ins in insights:
446
+ self.pdf.body_text(f" * {MusoraPDF._sanitize(ins)}")
447
+
448
+ quotes = summary.get("notable_quotes", [])
449
+ if quotes:
450
+ self.pdf.subsection_header("Notable Quotes")
451
+ for q in quotes:
452
+ self.pdf.body_text(f' "{MusoraPDF._sanitize(q)}"')
453
+
454
+ self.pdf.ln(4)
455
+ self.pdf.callout_box(
456
+ f"Analysis based on {meta.get('total_conversations_analyzed', 0)} conversations "
457
+ f"| Model: {meta.get('model_used', 'N/A')} "
458
+ f"| Tokens: {meta.get('tokens_used', 0):,}",
459
+ bg_color=(240, 248, 255),
460
+ )
461
+
462
+ def _data_summary(self, df, filter_info):
463
+ self.pdf.add_page()
464
+ self.pdf.section_header("Data Summary")
465
+ self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
466
+ self.pdf.body_text(f"Total conversations in report: {len(df):,}")
467
+ self.pdf.callout_box(
468
+ "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
469
+ "This report is confidential and intended for internal Musora team use only.",
470
+ bg_color=(245, 245, 245),
471
+ )
visualization/utils/helpscout_utils.py ADDED
@@ -0,0 +1,107 @@
1
+ """
2
+ HelpScout utility helpers — pure functions, no Streamlit dependency.
3
+ """
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+
10
+ # ---------------------------------------------------------------------------
11
+ # Topic taxonomy helpers
12
+ # ---------------------------------------------------------------------------
13
+
14
+ def load_topic_taxonomy(path: str = None) -> dict:
15
+ """
16
+ Load topics.json and return {id: {'label': str, 'description': str}}.
17
+ Default path resolves to process_helpscout/config_files/topics.json
18
+ relative to the project root.
19
+ """
20
+ if path is None:
21
+ root = Path(__file__).resolve().parent.parent.parent
22
+ path = root / "process_helpscout" / "config_files" / "topics.json"
23
+ with open(path, "r", encoding="utf-8") as f:
24
+ raw = json.load(f)
25
+ return {t["id"]: {"label": t["label"], "description": t.get("description", "")}
26
+ for t in raw.get("topics", [])}
27
+
28
+
29
+ def topic_label(topic_id: str, taxonomy: dict) -> str:
30
+ """Return human-readable label for a topic id. Falls back to title-cased id."""
31
+ if topic_id in taxonomy:
32
+ return taxonomy[topic_id]["label"]
33
+ return topic_id.replace("_", " ").title()
34
+
35
+
36
+ def parse_topics(value) -> list:
37
+ """Split a comma-separated TOPICS string into a list of stripped lowercase ids."""
38
+ if pd.isna(value) or not isinstance(value, str) or not value.strip():
39
+ return []
40
+ return [t.strip().lower() for t in value.split(",") if t.strip()]
41
+
42
+
43
+ def explode_topics(df: pd.DataFrame, topics_col: str = "topics") -> pd.DataFrame:
44
+ """
45
+ Return a new dataframe with one row per (conversation_id, topic_id).
46
+ Requires df to have a 'conversation_id' column and a topics_col column.
47
+ """
48
+ df = df.copy()
49
+ df["_topic_list"] = df[topics_col].apply(parse_topics)
50
+ exploded = df.explode("_topic_list").rename(columns={"_topic_list": "topic_id"})
51
+ exploded = exploded[exploded["topic_id"].notna() & (exploded["topic_id"] != "")]
52
+ return exploded.drop(columns=[topics_col], errors="ignore").reset_index(drop=True)
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Boolean flag helpers
57
+ # ---------------------------------------------------------------------------
58
+
59
+ def boolean_flag_counts(df: pd.DataFrame) -> dict:
60
+ """Return counts for refund / cancellation / membership flags."""
61
+ return {
62
+ "is_refund_request": int(df["is_refund_request"].sum()) if "is_refund_request" in df.columns else 0,
63
+ "is_cancellation": int(df["is_cancellation"].sum()) if "is_cancellation" in df.columns else 0,
64
+ "is_membership": int(df["is_membership"].sum()) if "is_membership" in df.columns else 0,
65
+ }
66
+
67
+
68
+ def compute_escalation_flag(df: pd.DataFrame, escalation_sentiments: list) -> pd.Series:
69
+ """
70
+ Boolean Series: True when conversation is negative-sentiment
71
+ OR is a refund request OR is a cancellation.
72
+ """
73
+ is_neg = df["sentiment_polarity"].isin(escalation_sentiments)
74
+ is_refund = df.get("is_refund_request", pd.Series(False, index=df.index)).fillna(False).astype(bool)
75
+ is_cancel = df.get("is_cancellation", pd.Series(False, index=df.index)).fillna(False).astype(bool)
76
+ return is_neg | is_refund | is_cancel
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # Filter description builder
81
+ # ---------------------------------------------------------------------------
82
+
83
+ def build_filter_description(filters: dict, taxonomy: dict) -> str:
84
+ """
85
+ Convert the filter dict from the analysis page into a human-readable string
86
+ suitable for the agent prompt and PDF cover.
87
+ """
88
+ parts = []
89
+ if filters.get("date_range"):
90
+ s, e = filters["date_range"]
91
+ parts.append(f"Date: {s} to {e}")
92
+ if filters.get("sentiments"):
93
+ parts.append(f"Sentiments: {', '.join(filters['sentiments'])}")
94
+ if filters.get("topics"):
95
+ labels = [topic_label(t, taxonomy) for t in filters["topics"]]
96
+ parts.append(f"Topics: {', '.join(labels)}")
97
+ if filters.get("statuses"):
98
+ parts.append(f"Status: {', '.join(filters['statuses'])}")
99
+ if filters.get("sources"):
100
+ parts.append(f"Source: {', '.join(filters['sources'])}")
101
+ if filters.get("refund_only"):
102
+ parts.append("Refund requests only")
103
+ if filters.get("cancel_only"):
104
+ parts.append("Cancellations only")
105
+ if filters.get("membership_only"):
106
+ parts.append("Membership requests only")
107
+ return "; ".join(parts) if parts else "No filters applied β€” showing all conversations"
visualization/utils/pdf_exporter.py CHANGED
@@ -79,6 +79,13 @@ _DESCRIPTIONS = {
79
  "Note: These charts reflect only users who have filled in their profile information - "
80
  "they do not represent all community members."
81
  ),
82
  "language": (
83
  "Language distribution shows what languages comments are written in. "
84
  "Non-English comments are automatically translated for analysis."
@@ -342,6 +349,7 @@ class DashboardPDFExporter:
342
  self._add_brand_section(df)
343
  self._add_platform_section(df)
344
  self._add_intent_section(df)
 
345
  self._add_cross_dimensional_section(df)
346
  self._add_volume_section(df)
347
  self._add_reply_requirements_section(df)
@@ -350,6 +358,7 @@ class DashboardPDFExporter:
350
  if "detected_language" in df.columns:
351
  self._add_language_section(df)
352
  self._add_data_summary(df, filter_info)
 
353
 
354
  return bytes(self.pdf.output())
355
  finally:
@@ -782,6 +791,39 @@ class DashboardPDFExporter:
782
  )
783
  self._add_two_charts(intent_bar, intent_pie)
784
 
785
  def _add_cross_dimensional_section(self, df) -> None:
786
  if "brand" not in df.columns or "platform" not in df.columns:
787
  return
@@ -913,6 +955,44 @@ class DashboardPDFExporter:
913
  self.distribution_charts.create_language_distribution(df, top_n=10, title="Top 10 Languages")
914
  )
915
 
916
  def _add_data_summary(self, df, filter_info: dict) -> None:
917
  self.pdf.add_page()
918
  self.pdf.section_header("Data Summary")
 
79
  "Note: These charts reflect only users who have filled in their profile information - "
80
  "they do not represent all community members."
81
  ),
82
+ "emotion": (
83
+ "Beyond sentiment polarity, the AI identifies the underlying emotion in each comment: "
84
+ "joy, excitement, gratitude, admiration, curiosity, humor, frustration, "
85
+ "disappointment, sadness, anger, or neutral. "
86
+ "Comments can have multiple emotions (multi-label). "
87
+ "Emotions with no data are omitted from the charts."
88
+ ),
89
  "language": (
90
  "Language distribution shows what languages comments are written in. "
91
  "Non-English comments are automatically translated for analysis."
 
349
  self._add_brand_section(df)
350
  self._add_platform_section(df)
351
  self._add_intent_section(df)
352
+ self._add_emotion_section(df)
353
  self._add_cross_dimensional_section(df)
354
  self._add_volume_section(df)
355
  self._add_reply_requirements_section(df)
 
358
  if "detected_language" in df.columns:
359
  self._add_language_section(df)
360
  self._add_data_summary(df, filter_info)
361
+ self._add_helpscout_summary_section()
362
 
363
  return bytes(self.pdf.output())
364
  finally:
 
791
  )
792
  self._add_two_charts(intent_bar, intent_pie)
793
 
794
+ def _add_emotion_section(self, df) -> None:
795
+ if "emotions" not in df.columns or df["emotions"].dropna().empty:
796
+ return
797
+
798
+ self.pdf.add_page()
799
+ self.pdf.section_header("Emotion Analysis")
800
+ self.pdf.section_description(_DESCRIPTIONS["emotion"])
801
+
802
+ emotion_bar = self.distribution_charts.create_emotion_bar_chart(
803
+ df, title="Emotion Distribution", orientation="h"
804
+ )
805
+ emotion_pie = self.distribution_charts.create_emotion_pie_chart(
806
+ df, title="Emotion Distribution"
807
+ )
808
+ self._add_two_charts(emotion_bar, emotion_pie)
809
+
810
+ # Top 5 emotions summary
811
+ emotion_dist = self.processor.get_emotion_distribution(df)
812
+ if not emotion_dist.empty:
813
+ self.pdf.subsection_header("Top Emotions")
814
+ rows = []
815
+ for _, row in emotion_dist.sort_values('count', ascending=False).head(8).iterrows():
816
+ rows.append((
817
+ str(row['emotions']).title(),
818
+ f"{int(row['count']):,}",
819
+ f"{row['percentage']:.1f}%",
820
+ ))
821
+ self.pdf.add_table(
822
+ headers=["Emotion", "Count", "Percentage"],
823
+ rows=rows,
824
+ col_widths=[80, 55, 55],
825
+ )
826
+
827
  def _add_cross_dimensional_section(self, df) -> None:
828
  if "brand" not in df.columns or "platform" not in df.columns:
829
  return
 
955
  self.distribution_charts.create_language_distribution(df, top_n=10, title="Top 10 Languages")
956
  )
957
 
958
+ def _add_helpscout_summary_section(self) -> None:
959
+ """Short HelpScout overview appended to the combined dashboard PDF."""
960
+ try:
961
+ import streamlit as st
962
+ hs_df = st.session_state.get("helpscout_df")
963
+ if hs_df is None or hs_df.empty:
964
+ return
965
+
966
+ from utils.helpscout_utils import boolean_flag_counts
967
+ from visualizations.helpscout_charts import HelpScoutCharts
968
+
969
+ self.pdf.add_page()
970
+ self.pdf.section_header("HelpScout Support Overview")
971
+ self.pdf.section_description(
972
+ "Summary of customer support conversations processed through the "
973
+ "HelpScout sentiment pipeline."
974
+ )
975
+
976
+ total = len(hs_df)
977
+ flags = boolean_flag_counts(hs_df)
978
+ neg_pct = hs_df["sentiment_polarity"].isin(["negative", "very_negative"]).sum() / total * 100 if total else 0
979
+ esc = int(hs_df["is_escalation"].sum()) if "is_escalation" in hs_df.columns else 0
980
+
981
+ self.pdf.metric_row([
982
+ ("Conversations", f"{total:,}"),
983
+ ("Negative %", f"{neg_pct:.1f}%"),
984
+ ("Escalations", f"{esc:,}"),
985
+ ("Refund Requests", f"{flags['is_refund_request']:,}"),
986
+ ])
987
+
988
+ hs_charts = HelpScoutCharts()
989
+ pie = hs_charts.create_sentiment_pie_chart(hs_df, title="HelpScout Sentiment Distribution")
990
+ tbar = hs_charts.create_topic_bar_chart(hs_df, title="Top Topics", top_n=5)
991
+ self._add_two_charts(pie, tbar)
992
+
993
+ except Exception:
994
+ logger.exception("HelpScout summary section failed β€” skipping")
995
+
996
  def _add_data_summary(self, df, filter_info: dict) -> None:
997
  self.pdf.add_page()
998
  self.pdf.section_header("Data Summary")
visualization/visualizations/distribution_charts.py CHANGED
@@ -29,9 +29,11 @@ class DistributionCharts:
29
  self.config = json.load(f)
30
 
31
  self.intent_colors = self.config['color_schemes']['intent']
 
32
  self.platform_colors = self.config['color_schemes']['platform']
33
  self.brand_colors = self.config['color_schemes']['brand']
34
  self.intent_order = self.config['intent_order']
 
35
  self.chart_height = self.config['dashboard']['chart_height']
36
 
37
  def create_intent_bar_chart(self, df, title="Intent Distribution", orientation='h'):
@@ -141,6 +143,135 @@ class DistributionCharts:
141
 
142
  return fig
143
 
144
  def create_platform_distribution(self, df, title="Comments by Platform"):
145
  """
146
  Create bar chart for platform distribution
 
29
  self.config = json.load(f)
30
 
31
  self.intent_colors = self.config['color_schemes']['intent']
32
+ self.emotion_colors = self.config['color_schemes'].get('emotion', {})
33
  self.platform_colors = self.config['color_schemes']['platform']
34
  self.brand_colors = self.config['color_schemes']['brand']
35
  self.intent_order = self.config['intent_order']
36
+ self.emotion_order = self.config.get('emotion_order', [])
37
  self.chart_height = self.config['dashboard']['chart_height']
38
 
39
  def create_intent_bar_chart(self, df, title="Intent Distribution", orientation='h'):
 
143
 
144
  return fig
145
 
146
+ def create_emotion_bar_chart(self, df, title="Emotion Distribution", orientation='h'):
147
+ """
148
+ Create bar chart for emotion distribution (handles multi-label).
149
+
150
+ Args:
151
+ df: Sentiment dataframe with 'emotions' column
152
+ title: Chart title
153
+ orientation: 'h' for horizontal, 'v' for vertical
154
+
155
+ Returns:
156
+ plotly.graph_objects.Figure
157
+ """
158
+ if 'emotions' not in df.columns:
159
+ return go.Figure().add_annotation(
160
+ text="No emotion data available",
161
+ xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False
162
+ )
163
+
164
+ df_exploded = df.dropna(subset=['emotions']).copy()
165
+ df_exploded['emotions'] = df_exploded['emotions'].str.split(',')
166
+ df_exploded = df_exploded.explode('emotions')
167
+ df_exploded['emotions'] = df_exploded['emotions'].str.strip()
168
+ df_exploded = df_exploded[df_exploded['emotions'] != '']
169
+
170
+ if df_exploded.empty:
171
+ return go.Figure().add_annotation(
172
+ text="No emotion data available",
173
+ xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False
174
+ )
175
+
176
+ emotion_counts = df_exploded['emotions'].value_counts()
177
+ ordered_emotions = [e for e in self.emotion_order if e in emotion_counts.index]
178
+ # include any emotions not in the order list
179
+ remaining = [e for e in emotion_counts.index if e not in ordered_emotions]
180
+ ordered_emotions = ordered_emotions + remaining
181
+ emotion_counts = emotion_counts[ordered_emotions]
182
+
183
+ colors = [self.emotion_colors.get(e, '#CCCCCC') for e in emotion_counts.index]
184
+
185
+ if orientation == 'h':
186
+ fig = go.Figure(data=[go.Bar(
187
+ y=emotion_counts.index,
188
+ x=emotion_counts.values,
189
+ orientation='h',
190
+ marker=dict(color=colors),
191
+ text=emotion_counts.values,
192
+ textposition='auto',
193
+ hovertemplate='<b>%{y}</b><br>Count: %{x}<extra></extra>'
194
+ )])
195
+ fig.update_layout(
196
+ title=title,
197
+ xaxis_title="Number of Comments",
198
+ yaxis_title="Emotion",
199
+ height=self.chart_height,
200
+ yaxis={'categoryorder': 'total ascending'}
201
+ )
202
+ else:
203
+ fig = go.Figure(data=[go.Bar(
204
+ x=emotion_counts.index,
205
+ y=emotion_counts.values,
206
+ marker=dict(color=colors),
207
+ text=emotion_counts.values,
208
+ textposition='auto',
209
+ hovertemplate='<b>%{x}</b><br>Count: %{y}<extra></extra>'
210
+ )])
211
+ fig.update_layout(
212
+ title=title,
213
+ xaxis_title="Emotion",
214
+ yaxis_title="Number of Comments",
215
+ height=self.chart_height
216
+ )
217
+
218
+ return fig
219
+
220
+ def create_emotion_pie_chart(self, df, title="Emotion Distribution"):
221
+ """
222
+ Create pie chart for emotion distribution.
223
+
224
+ Args:
225
+ df: Sentiment dataframe with 'emotions' column
226
+ title: Chart title
227
+
228
+ Returns:
229
+ plotly.graph_objects.Figure
230
+ """
231
+ if 'emotions' not in df.columns:
232
+ return go.Figure().add_annotation(
233
+ text="No emotion data available",
234
+ xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False
235
+ )
236
+
237
+ df_exploded = df.dropna(subset=['emotions']).copy()
238
+ df_exploded['emotions'] = df_exploded['emotions'].str.split(',')
239
+ df_exploded = df_exploded.explode('emotions')
240
+ df_exploded['emotions'] = df_exploded['emotions'].str.strip()
241
+ df_exploded = df_exploded[df_exploded['emotions'] != '']
242
+
243
+ if df_exploded.empty:
244
+ return go.Figure().add_annotation(
245
+ text="No emotion data available",
246
+ xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False
247
+ )
248
+
249
+ emotion_counts = df_exploded['emotions'].value_counts()
250
+ ordered_emotions = [e for e in self.emotion_order if e in emotion_counts.index]
251
+ remaining = [e for e in emotion_counts.index if e not in ordered_emotions]
252
+ ordered_emotions = ordered_emotions + remaining
253
+ emotion_counts = emotion_counts[ordered_emotions]
254
+
255
+ colors = [self.emotion_colors.get(e, '#CCCCCC') for e in emotion_counts.index]
256
+
257
+ fig = go.Figure(data=[go.Pie(
258
+ labels=emotion_counts.index,
259
+ values=emotion_counts.values,
260
+ marker=dict(colors=colors),
261
+ textinfo='label+percent',
262
+ textposition='auto',
263
+ hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>'
264
+ )])
265
+
266
+ fig.update_layout(
267
+ title=title,
268
+ height=self.chart_height,
269
+ showlegend=True,
270
+ legend=dict(orientation="v", yanchor="middle", y=0.5, xanchor="left", x=1.05)
271
+ )
272
+
273
+ return fig
274
+
275
  def create_platform_distribution(self, df, title="Comments by Platform"):
276
  """
277
  Create bar chart for platform distribution
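
The emotion charts read `color_schemes.emotion` and `emotion_order` from `viz_config.json`. A plausible fragment of that config, shown as a Python dict; the emotion names and colors below are placeholders, not the project's actual palette:

```python
# Sketch only: shape of the config keys the emotion charts consume.
# Color values and ordering are hypothetical placeholders.
viz_config_fragment = {
    "color_schemes": {
        "emotion": {
            "joy": "#00C851",
            "gratitude": "#7CB342",
            "frustration": "#FF6F00",
            "anger": "#D32F2F",
            "neutral": "#9E9E9E",
        }
    },
    "emotion_order": ["joy", "gratitude", "frustration", "anger", "neutral"],
}
```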
visualization/visualizations/helpscout_charts.py ADDED
@@ -0,0 +1,413 @@
+"""
+HelpScout-specific Plotly chart functions.
+All functions accept a HelpScout conversations DataFrame and return a
+plotly.graph_objects.Figure.
+"""
+import json
+import sys
+from pathlib import Path
+
+import pandas as pd
+import plotly.graph_objects as go
+
+# Ensure project root is on sys.path so visualization.* imports resolve
+_root = Path(__file__).resolve().parent.parent.parent
+if str(_root) not in sys.path:
+    sys.path.insert(0, str(_root))
+
+from visualization.utils.helpscout_utils import (
+    explode_topics, parse_topics, topic_label, load_topic_taxonomy
+)
+
+
+class HelpScoutCharts:
+    """Plotly chart factory for HelpScout conversation data."""
+
+    def __init__(self, config_path=None):
+        if config_path is None:
+            config_path = Path(__file__).parent.parent / "config" / "viz_config.json"
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
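+        # Every lookup below falls back to a default, so a sparse viz_config.json still loads.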
+        hs_colors = config.get("color_schemes_helpscout", {})
+        self.topic_colors = hs_colors.get("topics", {})
+        self.status_colors = hs_colors.get("status", {})
+        self.flag_colors = hs_colors.get("boolean_flags", {})
+        self.sentiment_colors = config.get("color_schemes", {}).get("sentiment_polarity", {})
+        self.sentiment_order = config.get("sentiment_order", [])
+        self.chart_height = config.get("dashboard", {}).get("chart_height", 400)
+        self.taxonomy = load_topic_taxonomy()
+
+    # ─────────────────────────────────────────────────────────────
+    # Sentiment charts
+    # ─────────────────────────────────────────────────────────────
+
+    def create_sentiment_pie_chart(self, df, title="Sentiment Distribution"):
+        counts = df["sentiment_polarity"].value_counts()
+        ordered = [s for s in self.sentiment_order if s in counts.index]
+        counts = counts[ordered]
+        colors = [self.sentiment_colors.get(s, "#CCCCCC") for s in counts.index]
+
+        fig = go.Figure(go.Pie(
+            labels=counts.index,
+            values=counts.values,
+            marker=dict(colors=colors),
+            textinfo="label+percent",
+            hovertemplate="<b>%{label}</b><br>Count: %{value}<br>%{percent}<extra></extra>",
+        ))
+        fig.update_layout(title=title, height=self.chart_height,
+                          legend=dict(orientation="v", yanchor="middle", y=0.5))
+        return fig
+
+    def create_sentiment_score_gauge(self, avg_score, title="Sentiment Score"):
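+        # Assumes avg_score lies in [-2, 2]; rescale it linearly onto the 0-100 gauge axis.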
+        normalized = ((avg_score + 2) / 4) * 100
+        fig = go.Figure(go.Indicator(
+            mode="gauge+number",
+            value=normalized,
+            title={"text": title, "font": {"size": 18}},
+            number={"font": {"size": 36}},
+            gauge={
+                "axis": {"range": [0, 100]},
+                "bar": {"color": "darkblue"},
+                "steps": [
+                    {"range": [0, 20], "color": "#D32F2F"},
+                    {"range": [20, 40], "color": "#FF6F00"},
+                    {"range": [40, 60], "color": "#FFB300"},
+                    {"range": [60, 80], "color": "#7CB342"},
+                    {"range": [80, 100], "color": "#00C851"},
+                ],
+            },
+        ))
+        fig.update_layout(height=300, margin=dict(l=20, r=20, t=60, b=20))
+        return fig
+
+    def create_sentiment_timeline(self, df, title="Sentiment Over Time", freq="W"):
+        if "first_message_at" not in df.columns:
+            return self._empty_fig(title, "No timestamp data")
+        df_t = df.copy()
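+        # Bucket timestamps into calendar periods (weekly by default), anchored at the period start.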
+        df_t["date"] = pd.to_datetime(df_t["first_message_at"]).dt.to_period(freq).dt.to_timestamp()
+        agg = df_t.groupby(["date", "sentiment_polarity"]).size().reset_index(name="count")
+        fig = go.Figure()
+        for s in self.sentiment_order:
+            d = agg[agg["sentiment_polarity"] == s]
+            if not d.empty:
+                fig.add_trace(go.Scatter(
+                    x=d["date"], y=d["count"], name=s, mode="lines+markers",
+                    line=dict(color=self.sentiment_colors.get(s, "#CCCCCC"), width=2),
+                    hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
+                ))
+        fig.update_layout(title=title, xaxis_title="Date",
+                          yaxis_title="Conversations", height=self.chart_height,
+                          hovermode="x unified")
+        return fig
+
+    # ─────────────────────────────────────────────────────────────
+    # Topic charts
+    # ─────────────────────────────────────────────────────────────
+
+    def create_topic_bar_chart(self, df, title="Topic Distribution",
+                               orientation="h", top_n=None):
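+        # explode_topics (see helpscout_utils) is expected to yield one row per (conversation, topic) pair.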
+        exploded = explode_topics(df)
+        if exploded.empty:
+            return self._empty_fig(title, "No topic data")
+        counts = exploded["topic_id"].value_counts()
+        if top_n:
+            counts = counts.head(top_n)
+        labels = [topic_label(t, self.taxonomy) for t in counts.index]
+        colors = [self.topic_colors.get(t, "#607D8B") for t in counts.index]
+
+        if orientation == "h":
+            fig = go.Figure(go.Bar(
+                y=labels, x=counts.values, orientation="h",
+                marker=dict(color=colors),
+                text=counts.values, textposition="auto",
+                hovertemplate="<b>%{y}</b><br>%{x} conversations<extra></extra>",
+            ))
+            fig.update_layout(title=title, xaxis_title="Conversations",
+                              yaxis_title="Topic", height=self.chart_height,
+                              yaxis={"categoryorder": "total ascending"})
+        else:
+            fig = go.Figure(go.Bar(
+                x=labels, y=counts.values,
+                marker=dict(color=colors),
+                text=counts.values, textposition="auto",
+                hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
+            ))
+            fig.update_layout(title=title, xaxis_title="Topic",
+                              yaxis_title="Conversations", height=self.chart_height)
+        return fig
+
+    def create_topic_pie_chart(self, df, title="Topic Distribution"):
+        exploded = explode_topics(df)
+        if exploded.empty:
+            return self._empty_fig(title, "No topic data")
+        counts = exploded["topic_id"].value_counts()
+        labels = [topic_label(t, self.taxonomy) for t in counts.index]
+        colors = [self.topic_colors.get(t, "#607D8B") for t in counts.index]
+        fig = go.Figure(go.Pie(
+            labels=labels, values=counts.values,
+            marker=dict(colors=colors),
+            textinfo="label+percent",
+            hovertemplate="<b>%{label}</b><br>%{value}<br>%{percent}<extra></extra>",
+        ))
+        fig.update_layout(title=title, height=self.chart_height)
+        return fig
+
+    def create_topic_sentiment_heatmap(self, df, title="Topic Γ— Sentiment Heatmap"):
+        exploded = explode_topics(df)
+        if exploded.empty or "sentiment_polarity" not in exploded.columns:
+            return self._empty_fig(title, "No data")
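+        # Count conversations per (topic, sentiment) cell, then relabel rows for display.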
+        pivot = pd.crosstab(exploded["topic_id"], exploded["sentiment_polarity"])
+        pivot.index = [topic_label(t, self.taxonomy) for t in pivot.index]
+        ordered_cols = [s for s in self.sentiment_order if s in pivot.columns]
+        pivot = pivot[ordered_cols] if ordered_cols else pivot
+
+        fig = go.Figure(go.Heatmap(
+            z=pivot.values,
+            x=pivot.columns.tolist(),
+            y=pivot.index.tolist(),
+            colorscale="Blues",
+            text=pivot.values,
+            texttemplate="%{text}",
+            hovertemplate="<b>%{y} β€” %{x}</b><br>%{z}<extra></extra>",
+            colorbar=dict(title="Conversations"),
+        ))
+        fig.update_layout(title=title, xaxis_title="Sentiment",
+                          yaxis_title="Topic", height=self.chart_height + 100)
+        return fig
+
+    def create_topic_timeline(self, df, title="Topic Volume Over Time",
+                              freq="W", top_n=5):
+        if "first_message_at" not in df.columns:
+            return self._empty_fig(title, "No timestamp data")
+        exploded = explode_topics(df)
+        if exploded.empty:
+            return self._empty_fig(title, "No topic data")
+
+        top_topics = exploded["topic_id"].value_counts().head(top_n).index.tolist()
+        exploded = exploded[exploded["topic_id"].isin(top_topics)].copy()
+        exploded["date"] = pd.to_datetime(exploded["first_message_at"]).dt.to_period(freq).dt.to_timestamp()
+        agg = exploded.groupby(["date", "topic_id"]).size().reset_index(name="count")
+
+        fig = go.Figure()
+        for t in top_topics:
+            d = agg[agg["topic_id"] == t]
+            if not d.empty:
+                fig.add_trace(go.Scatter(
+                    x=d["date"], y=d["count"],
+                    name=topic_label(t, self.taxonomy), mode="lines+markers",
+                    line=dict(color=self.topic_colors.get(t, "#607D8B"), width=2),
+                    hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
+                ))
+        fig.update_layout(title=title, xaxis_title="Date",
+                          yaxis_title="Conversations", height=self.chart_height,
+                          hovermode="x unified")
+        return fig
+
+    # ─────────────────────────────────────────────────────────────
+    # Volume & timelines
+    # ─────────────────────────────────────────────────────────────
+
+    def create_volume_timeline(self, df, title="Conversation Volume Over Time",
+                               freq="W"):
+        if "first_message_at" not in df.columns:
+            return self._empty_fig(title, "No timestamp data")
+        df_t = df.copy()
+        df_t["date"] = pd.to_datetime(df_t["first_message_at"]).dt.to_period(freq).dt.to_timestamp()
+        agg = df_t.groupby("date").size().reset_index(name="count")
+        fig = go.Figure(go.Bar(
+            x=agg["date"], y=agg["count"],
+            marker_color="#1982C4",
+            hovertemplate="<b>%{x}</b><br>%{y} conversations<extra></extra>",
+        ))
+        fig.update_layout(title=title, xaxis_title="Date",
+                          yaxis_title="Conversations", height=self.chart_height)
+        return fig
+
+    def create_refund_cancel_timeline(self, df, title="Refund & Cancellation Over Time",
+                                      freq="W"):
+        if "first_message_at" not in df.columns:
+            return self._empty_fig(title, "No timestamp data")
+        df_t = df.copy()
+        df_t["date"] = pd.to_datetime(df_t["first_message_at"]).dt.to_period(freq).dt.to_timestamp()
+
+        fig = go.Figure()
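+        # One trace per boolean flag; columns missing from the dataframe are skipped.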
+        for col, label, color in [
+            ("is_refund_request", "Refund Requests", "#D32F2F"),
+            ("is_cancellation", "Cancellations", "#FF6F00"),
+            ("is_membership", "Membership Joins", "#00C851"),
+        ]:
+            if col in df_t.columns:
+                agg = df_t[df_t[col] == True].groupby("date").size().reset_index(name="count")
+                if not agg.empty:
+                    fig.add_trace(go.Scatter(
+                        x=agg["date"], y=agg["count"], name=label,
+                        mode="lines+markers", line=dict(color=color, width=2),
+                        hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
+                    ))
+        fig.update_layout(title=title, xaxis_title="Date",
+                          yaxis_title="Conversations", height=self.chart_height,
+                          hovermode="x unified")
+        return fig
+
+    # ─────────────────────────────────────────────────────────────
+    # Status / source / flags
+    # ─────────────────────────────────────────────────────────────
+
+    def create_status_distribution(self, df, title="Conversations by Status"):
+        if "status" not in df.columns:
+            return self._empty_fig(title, "No status data")
+        counts = df["status"].value_counts()
+        colors = [self.status_colors.get(s, self.status_colors.get("default", "#607D8B"))
+                  for s in counts.index]
+        fig = go.Figure(go.Bar(
+            x=counts.index, y=counts.values,
+            marker=dict(color=colors),
+            text=counts.values, textposition="auto",
+            hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
+        ))
+        fig.update_layout(title=title, xaxis_title="Status",
+                          yaxis_title="Conversations", height=self.chart_height)
+        return fig
+
+    def create_source_distribution(self, df, title="Conversations by Source Type"):
+        if "source_type" not in df.columns:
+            return self._empty_fig(title, "No source data")
+        counts = df["source_type"].value_counts()
+        fig = go.Figure(go.Bar(
+            x=counts.index, y=counts.values,
+            marker_color="#1982C4",
+            text=counts.values, textposition="auto",
+            hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
+        ))
+        fig.update_layout(title=title, xaxis_title="Source",
+                          yaxis_title="Conversations", height=self.chart_height)
+        return fig
+
+    def create_boolean_flags_chart(self, df, title="Key Billing & Membership Flags"):
+        labels, values, colors = [], [], []
+        for col, label in [("is_refund_request", "Refund Requests"),
+                           ("is_cancellation", "Cancellations"),
+                           ("is_membership", "Membership Joins")]:
+            if col in df.columns:
+                labels.append(label)
+                values.append(int(df[col].sum()))
+                colors.append(self.flag_colors.get(col, "#607D8B"))
+
+        if not values:
+            return self._empty_fig(title, "No flag data")
+
+        fig = go.Figure(go.Bar(
+            x=labels, y=values,
+            marker=dict(color=colors),
+            text=values, textposition="auto",
+            hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
+        ))
+        fig.update_layout(title=title, xaxis_title="Flag",
+                          yaxis_title="Conversations", height=self.chart_height)
+        return fig
+
+    def create_escalation_breakdown(self, df, title="Escalation Queue by Topic"):
+        if "is_escalation" not in df.columns:
+            return self._empty_fig(title, "No escalation data")
+
+        exploded = explode_topics(df)
+        if exploded.empty:
+            return self._empty_fig(title, "No topic data")
+
+        pivot = pd.crosstab(exploded["topic_id"], exploded["is_escalation"])
+        pivot.index = [topic_label(t, self.taxonomy) for t in pivot.index]
+
+        fig = go.Figure()
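+        # Stack non-escalated vs. escalated counts per topic as horizontal bars.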
+        for flag, label, color in [(False, "Normal", "#4CAF50"), (True, "Escalation", "#D32F2F")]:
+            if flag in pivot.columns:
+                fig.add_trace(go.Bar(
+                    name=label, y=pivot.index, x=pivot[flag],
+                    orientation="h", marker_color=color,
+                    hovertemplate="<b>%{y}</b><br>%{x}<extra></extra>",
+                ))
+        fig.update_layout(title=title, barmode="stack", xaxis_title="Conversations",
+                          yaxis_title="Topic", height=self.chart_height,
+                          yaxis={"categoryorder": "total ascending"})
+        return fig
+
+    # ─────────────────────────────────────────────────────────────
+    # Duration & thread count
+    # ─────────────────────────────────────────────────────────────
+
+    def create_duration_histogram(self, df, title="Conversation Duration Distribution"):
+        if "duration_hours" not in df.columns:
+            return self._empty_fig(title, "No duration data")
+        d = df["duration_hours"].dropna()
+        fig = go.Figure(go.Histogram(
+            x=d, nbinsx=40, marker_color="#1982C4",
+            hovertemplate="Duration: %{x:.1f}h<br>Count: %{y}<extra></extra>",
+        ))
+        fig.update_layout(title=title, xaxis_title="Duration (hours)",
+                          yaxis_title="Conversations", height=self.chart_height)
+        return fig
+
+    def create_thread_count_histogram(self, df, title="Thread Count Distribution"):
+        if "thread_count" not in df.columns:
+            return self._empty_fig(title, "No thread data")
+        t = df["thread_count"].dropna()
+        fig = go.Figure(go.Histogram(
+            x=t, nbinsx=30, marker_color="#9C27B0",
+            hovertemplate="Threads: %{x}<br>Count: %{y}<extra></extra>",
+        ))
+        fig.update_layout(title=title, xaxis_title="Number of Threads",
+                          yaxis_title="Conversations", height=self.chart_height)
+        return fig
+
+    # ─────────────────────────────────────────────────────────────
+    # Emotion (same logic as DistributionCharts but with helpscout df)
+    # ─────────────────────────────────────────────────────────────
+
+    def create_emotion_bar_chart(self, df, title="Emotion Distribution",
+                                 orientation="h"):
+        if "emotions" not in df.columns or df["emotions"].isna().all():
+            return self._empty_fig(title, "No emotion data")
+
+        emotion_colors = {
+            "joy": "#FFD700", "excitement": "#FF6B35", "gratitude": "#4CAF50",
+            "admiration": "#2196F3", "curiosity": "#00BCD4", "humor": "#9C27B0",
+            "frustration": "#FF9800", "disappointment": "#795548",
+            "sadness": "#607D8B", "anger": "#D32F2F", "neutral": "#9E9E9E",
+        }
+        df_e = df.copy()
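+        # 'emotions' is a comma-separated string; split and explode to one emotion per row.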
+        df_e["emotions"] = df_e["emotions"].str.split(",")
+        df_e = df_e.explode("emotions")
+        df_e["emotions"] = df_e["emotions"].str.strip().str.lower()
+        counts = df_e["emotions"].dropna().value_counts()
+        colors = [emotion_colors.get(e, "#CCCCCC") for e in counts.index]
+
+        if orientation == "h":
+            fig = go.Figure(go.Bar(
+                y=counts.index, x=counts.values, orientation="h",
+                marker=dict(color=colors), text=counts.values, textposition="auto",
+                hovertemplate="<b>%{y}</b><br>%{x}<extra></extra>",
+            ))
+            fig.update_layout(title=title, xaxis_title="Conversations",
+                              yaxis_title="Emotion", height=self.chart_height,
+                              yaxis={"categoryorder": "total ascending"})
+        else:
+            fig = go.Figure(go.Bar(
+                x=counts.index, y=counts.values,
+                marker=dict(color=colors), text=counts.values, textposition="auto",
+                hovertemplate="<b>%{x}</b><br>%{y}<extra></extra>",
+            ))
+            fig.update_layout(title=title, xaxis_title="Emotion",
+                              yaxis_title="Conversations", height=self.chart_height)
+        return fig
+
+    # ─────────────────────────────────────────────────────────────
+    # Helpers
+    # ─────────────────────────────────────────────────────────────
+
+    @staticmethod
+    def _empty_fig(title, message):
+        fig = go.Figure()
+        fig.add_annotation(text=message, xref="paper", yref="paper",
+                           x=0.5, y=0.5, showarrow=False, font=dict(size=14))
+        fig.update_layout(title=title, height=300)
+        return fig
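
A hedged end-to-end sketch of the new `HelpScoutCharts` class. The column names match the ones the methods read (`sentiment_polarity`, `first_message_at`, `status`, the `is_*` flags); the sample values and the `topics` serialization are invented, since parsing is delegated to `explode_topics`, and the sketch assumes viz_config.json defines a `sentiment_order` covering these labels.

```python
import pandas as pd

from visualization.visualizations.helpscout_charts import HelpScoutCharts

# Toy conversations frame using the column names the chart methods read.
df = pd.DataFrame({
    "sentiment_polarity": ["positive", "negative", "neutral"],
    "topics": ['["billing"]', '["billing", "bugs"]', '["how_to"]'],  # format assumed
    "first_message_at": pd.to_datetime(["2024-01-02", "2024-01-09", "2024-01-16"]),
    "status": ["closed", "active", "closed"],
    "is_refund_request": [True, False, False],
    "is_cancellation": [False, True, False],
    "is_membership": [False, False, True],
})

charts = HelpScoutCharts()  # reads visualization/config/viz_config.json by default
charts.create_sentiment_pie_chart(df).show()
charts.create_refund_cancel_timeline(df, freq="W").show()
charts.create_boolean_flags_chart(df).show()
```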