ceeyyuhhh commited on
Commit
06ac1f3
·
verified ·
1 Parent(s): 0c0c5e8

Upload 16 files

Browse files
.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ archive.zip filter=lfs diff=lfs merge=lfs -text
2
+ background_top.png filter=lfs diff=lfs merge=lfs -text
3
+ ecommerce_returns_cleaned.csv filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

WORKDIR /app

# Install dependencies BEFORE copying the full source tree so Docker can reuse
# these layers when only application code, notebooks or data files change.
COPY requirements.txt /app/requirements.txt

# Python deps (from requirements.txt)
RUN pip install --no-cache-dir -r requirements.txt

# Notebook execution deps
RUN pip install --no-cache-dir notebook ipykernel papermill

# Pre-install packages the notebooks use via !pip install
RUN pip install --no-cache-dir textblob faker vaderSentiment transformers

RUN python -m ipykernel install --user --name python3 --display-name "Python 3"

# Application code, notebooks and data (changes here no longer bust the pip layers)
COPY . /app

EXPOSE 7860

CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: SE21 App Template
3
+ emoji: 📊
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ short_description: AI-enhanced analytics dashboard template for SE21 students
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Womens%20Clothing%20E-Commerce%20Reviews.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,864 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import time
5
+ import traceback
6
+ from pathlib import Path
7
+ from typing import Dict, Any, List, Tuple
8
+
9
+ import pandas as pd
10
+ import gradio as gr
11
+ import papermill as pm
12
+ import plotly.graph_objects as go
13
+
14
+ # Optional LLM (HuggingFace Inference API)
15
+ try:
16
+ from huggingface_hub import InferenceClient
17
+ except Exception:
18
+ InferenceClient = None
19
+
20
+ # =========================================================
21
+ # CONFIG
22
+ # =========================================================
23
+
24
+ BASE_DIR = Path(__file__).resolve().parent
25
+
26
+ NB1 = os.environ.get("NB1", "datacreation.ipynb").strip()
27
+ NB2 = os.environ.get("NB2", "pythonanalysis.ipynb").strip()
28
+
29
+ RUNS_DIR = BASE_DIR / "runs"
30
+ ART_DIR = BASE_DIR / "artifacts"
31
+ PY_FIG_DIR = ART_DIR / "py" / "figures"
32
+ PY_TAB_DIR = ART_DIR / "py" / "tables"
33
+
34
+ PAPERMILL_TIMEOUT = int(os.environ.get("PAPERMILL_TIMEOUT", "1800"))
35
+ MAX_PREVIEW_ROWS = int(os.environ.get("MAX_FILE_PREVIEW_ROWS", "50"))
36
+ MAX_LOG_CHARS = int(os.environ.get("MAX_LOG_CHARS", "8000"))
37
+
38
+ HF_API_KEY = os.environ.get("HF_API_KEY", "").strip()
39
+ MODEL_NAME = os.environ.get("MODEL_NAME", "deepseek-ai/DeepSeek-R1").strip()
40
+ HF_PROVIDER = os.environ.get("HF_PROVIDER", "novita").strip()
41
+ N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()
42
+
43
+ LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None
44
+ llm_client = (
45
+ InferenceClient(provider=HF_PROVIDER, api_key=HF_API_KEY)
46
+ if LLM_ENABLED
47
+ else None
48
+ )
49
+
50
+ # =========================================================
51
+ # HELPERS
52
+ # =========================================================
53
+
54
def ensure_dirs():
    """Create the run-output and artifact directories if they do not exist."""
    required = (RUNS_DIR, ART_DIR, PY_FIG_DIR, PY_TAB_DIR)
    for directory in required:
        # parents/exist_ok make the call safe to repeat and safe on first start.
        directory.mkdir(parents=True, exist_ok=True)
57
+
58
def stamp():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    now = time.localtime()
    return time.strftime("%Y%m%d-%H%M%S", now)
60
+
61
def tail(text: str, n: int = MAX_LOG_CHARS) -> str:
    """Return at most the last ``n`` characters of ``text``.

    A falsy ``text`` (``None`` or ``""``) is treated as an empty string.
    A non-positive ``n`` yields an empty string: the bare slice ``text[-n:]``
    would return the *whole* text for ``n == 0`` and drop a prefix for
    ``n < 0``, neither of which is a "tail".
    """
    if n <= 0:
        return ""
    return (text or "")[-n:]
63
+
64
+ def _ls(dir_path: Path, exts: Tuple[str, ...]) -> List[str]:
65
+ if not dir_path.is_dir():
66
+ return []
67
+ return sorted(p.name for p in dir_path.iterdir() if p.is_file() and p.suffix.lower() in exts)
68
+
69
def _read_csv(path: Path) -> pd.DataFrame:
    """Load a preview of a CSV file, reading at most ``MAX_PREVIEW_ROWS`` rows."""
    row_limit = MAX_PREVIEW_ROWS
    preview = pd.read_csv(path, nrows=row_limit)
    return preview
71
+
72
+ def _read_json(path: Path):
73
+ with path.open(encoding="utf-8") as f:
74
+ return json.load(f)
75
+
76
def artifacts_index() -> Dict[str, Any]:
    """Return the names of the figures and tables produced by the Python notebook.

    The result has the shape ``{"python": {"figures": [...], "tables": [...]}}``
    where each list holds sorted file names (not full paths).
    """
    figure_exts = (".png", ".jpg", ".jpeg")
    table_exts = (".csv", ".json")
    python_artifacts = {
        "figures": _ls(PY_FIG_DIR, figure_exts),
        "tables": _ls(PY_TAB_DIR, table_exts),
    }
    return {"python": python_artifacts}
83
+
84
+ # =========================================================
85
+ # PIPELINE RUNNERS
86
+ # =========================================================
87
+
88
def run_notebook(nb_name: str) -> str:
    """Execute a notebook from ``BASE_DIR`` with papermill.

    Args:
        nb_name: Notebook path relative to ``BASE_DIR``.

    Returns:
        ``"Executed <nb_name>"`` on success, or an ``"ERROR: ..."`` string when
        the notebook file does not exist.

    Raises:
        Whatever ``papermill.execute_notebook`` raises when execution fails;
        callers are expected to catch it.
    """
    ensure_dirs()
    nb_in = BASE_DIR / nb_name
    # is_file() (not exists()) so that a directory is reported as "not found"
    # instead of failing later inside papermill.
    if not nb_in.is_file():
        return f"ERROR: {nb_name} not found."
    # Use only the file name for the output: nb_name may contain a
    # sub-directory, which would otherwise point into a non-existent folder
    # under RUNS_DIR.
    nb_out = RUNS_DIR / f"run_{stamp()}_{nb_in.name}"
    pm.execute_notebook(
        input_path=str(nb_in),
        output_path=str(nb_out),
        cwd=str(BASE_DIR),
        log_output=True,
        progress_bar=False,
        request_save_on_cell_execute=True,
        execution_timeout=PAPERMILL_TIMEOUT,
        kernel_name="python3",
    )
    return f"Executed {nb_name}"
105
+
106
+
107
def run_datacreation() -> str:
    """Run the data-creation notebook and return a human-readable log.

    Returns:
        A string starting with ``"OK"`` followed by the CSV files now present
        in ``BASE_DIR``, or starting with ``"FAILED"`` when the notebook is
        missing or its execution raised.
    """
    try:
        log = run_notebook(NB1)
        # run_notebook signals a missing notebook through its return value,
        # not an exception; do not label that outcome "OK".
        if log.startswith("ERROR"):
            return f"FAILED {log}"
        csvs = sorted(f.name for f in BASE_DIR.glob("*.csv"))
        return f"OK {log}\n\nCSVs now in /app:\n" + "\n".join(f" - {c}" for c in csvs)
    except Exception as e:
        # UI boundary: report the failure in the log box instead of crashing.
        return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
114
+
115
+
116
def run_pythonanalysis() -> str:
    """Run the analysis notebook and return a human-readable log.

    Returns:
        A string starting with ``"OK"`` that lists the generated figures and
        tables, or starting with ``"FAILED"`` when the notebook is missing or
        its execution raised.
    """
    try:
        log = run_notebook(NB2)
        # A missing notebook is reported via the return value; surface it as a
        # failure rather than "OK ERROR: ...".
        if log.startswith("ERROR"):
            return f"FAILED {log}"
        idx = artifacts_index()
        figs = idx["python"]["figures"]
        tabs = idx["python"]["tables"]
        return (
            f"OK {log}\n\n"
            f"Figures: {', '.join(figs) or '(none)'}\n"
            f"Tables: {', '.join(tabs) or '(none)'}"
        )
    except Exception as e:
        # UI boundary: report the failure in the log box instead of crashing.
        return f"FAILED {e}\n\n{traceback.format_exc()[-2000:]}"
129
+
130
+
131
def run_full_pipeline() -> str:
    """Run both pipeline steps in order and return their combined log text."""
    rule = "=" * 50
    # List elements are evaluated left to right, so step 1 runs before step 2.
    sections = [
        rule,
        "STEP 1/2: Data Creation (web scraping + synthetic data)",
        rule,
        run_datacreation(),
        "",
        rule,
        "STEP 2/2: Python Analysis (sentiment, ARIMA, dashboard)",
        rule,
        run_pythonanalysis(),
    ]
    return "\n".join(sections)
143
+
144
+
145
+ # =========================================================
146
+ # GALLERY LOADERS
147
+ # =========================================================
148
+
149
def _load_all_figures() -> List[Tuple[str, str]]:
    """Return ``(filepath, caption)`` pairs for every figure, for the Gallery.

    Uses the same extension set as ``artifacts_index`` (png/jpg/jpeg, matched
    case-insensitively) so the gallery and the artifact index agree on what
    counts as a figure. Captions are the file stem with underscores replaced
    by spaces, in title case.
    """
    names = _ls(PY_FIG_DIR, (".png", ".jpg", ".jpeg"))
    return [
        (str(PY_FIG_DIR / name), Path(name).stem.replace("_", " ").title())
        for name in names
    ]
155
+
156
+
157
def _load_table_safe(path: Path) -> pd.DataFrame:
    """Load a CSV or JSON table as a DataFrame without ever raising.

    JSON objects become a one-row frame; any other JSON value is handed to
    ``pd.DataFrame``. Everything that is not ``.json`` is read as CSV. On any
    failure a one-row frame with an ``error`` column is returned so the UI
    can display the problem.
    """
    try:
        # Compare case-insensitively: the file listing lower-cases suffixes,
        # so "KPIS.JSON" is offered to the user and must be parsed as JSON.
        if path.suffix.lower() == ".json":
            obj = _read_json(path)
            if isinstance(obj, dict):
                return pd.DataFrame([obj])
            return pd.DataFrame(obj)
        return _read_csv(path)
    except Exception as e:
        # Deliberate best-effort: show the error in the table widget.
        return pd.DataFrame([{"error": str(e)}])
167
+
168
+
169
def refresh_gallery():
    """Collect gallery figures, table choices and a default table preview.

    Returns:
        A 3-tuple ``(figures, dropdown_update, dataframe)`` where the dropdown
        update selects the first table (or ``None`` when there are no tables)
        and the dataframe previews that table (empty when there are none).
    """
    figures = _load_all_figures()
    table_choices = list(artifacts_index()["python"]["tables"])

    if table_choices:
        selected = table_choices[0]
        default_df = _load_table_safe(PY_TAB_DIR / selected)
    else:
        selected = None
        default_df = pd.DataFrame()

    dropdown_update = gr.update(choices=table_choices, value=selected)
    return (figures or [], dropdown_update, default_df)
185
+
186
+
187
def on_table_select(choice: str):
    """Return the preview frame for the table picked in the dropdown.

    An empty selection yields a one-row hint frame; a name that does not exist
    under the tables directory yields a one-row error frame.
    """
    if choice:
        candidate = PY_TAB_DIR / choice
        if candidate.exists():
            return _load_table_safe(candidate)
        return pd.DataFrame([{"error": f"File not found: {choice}"}])
    return pd.DataFrame([{"hint": "Select a table above."}])
194
+
195
+
196
+ # =========================================================
197
+ # KPI LOADER
198
+ # =========================================================
199
+
200
def load_kpis() -> Dict[str, Any]:
    """Load the KPI dictionary written by the analysis notebook.

    Looks for ``kpis.json`` in the tables directory first, then in the figures
    directory, and returns the first one that decodes to a JSON object.

    Returns:
        The KPI mapping, or ``{}`` when no usable file is found.
    """
    for candidate in (PY_TAB_DIR / "kpis.json", PY_FIG_DIR / "kpis.json"):
        if not candidate.exists():
            continue
        try:
            data = _read_json(candidate)
        except Exception:
            # Best-effort: an unreadable or corrupt file just means "try the
            # next candidate"; the dashboard then shows its empty state.
            continue
        # Callers use .get()/.items(); a JSON list or scalar would crash them.
        if isinstance(data, dict):
            return data
    return {}
208
+
209
+
210
+ # =========================================================
211
+ # AI DASHBOARD -- LLM picks what to display
212
+ # =========================================================
213
+
214
+ DASHBOARD_SYSTEM = """You are an AI dashboard assistant for a retail analytics app focused on e-commerce return prediction and review intelligence.
215
+
216
+ The user asks questions about product reviews, return risk, customer complaints, sentiment, rating distribution, and product/category return patterns. You have access to pre-computed artifacts from a Python analysis pipeline.
217
+
218
+ AVAILABLE ARTIFACTS:
219
+ {artifacts_json}
220
+
221
+ KPI SUMMARY:
222
+ {kpis_json}
223
+
224
+ YOUR JOB:
225
+ 1. Answer the user's question using the KPIs and available artifacts.
226
+ 2. At the END of your response, output a JSON block fenced with ```json ... ``` using this exact shape:
227
+ {{"show": "figure"|"table"|"none", "scope": "python", "filename": "..."}}
228
+
229
+ RULES:
230
+ - If the user asks about return rate, return risk, returned products, or high-risk categories, show category_return_rate.csv if available.
231
+ - If the user asks about sentiment or reviews, show sentiment_counts_sampled.csv or sentiment_distribution.png.
232
+ - If the user asks about ratings, show rating_distribution.png or rating_distribution.csv.
233
+ - If the user asks for dashboard overview, show df_dashboard.csv.
234
+ - If no artifact is relevant, use "show": "none".
235
+ - Keep the answer concise and business-focused.
236
+ """
237
+
238
+ JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
239
+ FALLBACK_JSON_RE = re.compile(r"\{[^{}]*\"show\"[^{}]*\}", re.DOTALL)
240
+
241
+
242
+ def _parse_display_directive(text: str) -> Dict[str, str]:
243
+ m = JSON_BLOCK_RE.search(text)
244
+ if m:
245
+ try:
246
+ return json.loads(m.group(1))
247
+ except json.JSONDecodeError:
248
+ pass
249
+ m = FALLBACK_JSON_RE.search(text)
250
+ if m:
251
+ try:
252
+ return json.loads(m.group(0))
253
+ except json.JSONDecodeError:
254
+ pass
255
+ return {"show": "none"}
256
+
257
+
258
+ def _clean_response(text: str) -> str:
259
+ """Strip the JSON directive block from the displayed response."""
260
+ return JSON_BLOCK_RE.sub("", text).strip()
261
+
262
+
263
def _n8n_call(msg: str) -> Tuple[str, Dict]:
    """Call the student's n8n webhook and return ``(reply, directive)``.

    The webhook is expected to answer with a JSON object containing an
    ``answer`` string and an optional ``chart`` name. A JSON array whose first
    element is such an object is accepted as well.

    Returns:
        ``(answer, directive)`` on success. On ANY failure (network error,
        non-2xx status, invalid payload, missing ``requests``) the second
        element is ``None`` so the caller can fall back to keyword matching.
    """
    try:
        # Imported lazily so the app still starts when requests is absent;
        # inside the try so a missing package triggers the fallback path.
        import requests as req

        resp = req.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=20)
        # Treat HTTP errors as failures instead of parsing an error page.
        resp.raise_for_status()
        data = resp.json()
        if isinstance(data, list) and data and isinstance(data[0], dict):
            data = data[0]
        if not isinstance(data, dict):
            raise ValueError(f"unexpected payload type {type(data).__name__}")
        # Coerce to str so a null/number answer cannot break the chat history.
        answer = str(data.get("answer") or "No response from n8n workflow.")
        chart = data.get("chart", "none")
        if chart and chart != "none":
            return answer, {"show": "figure", "chart": chart}
        return answer, {"show": "none"}
    except Exception as e:
        return f"n8n error: {e}. Falling back to keyword matching.", None
276
+
277
+
278
+ def ai_chat(user_msg: str, history: list):
279
+ """Chat function for the AI Dashboard tab."""
280
+ if not user_msg or not user_msg.strip():
281
+ return history, "", None, None
282
+
283
+ idx = artifacts_index()
284
+ kpis = load_kpis()
285
+
286
+ # Priority: n8n webhook > HF LLM > keyword fallback
287
+ if N8N_WEBHOOK_URL:
288
+ reply, directive = _n8n_call(user_msg)
289
+ if directive is None:
290
+ reply_fb, directive = _keyword_fallback(user_msg, idx, kpis)
291
+ reply += "\n\n" + reply_fb
292
+ elif not LLM_ENABLED:
293
+ reply, directive = _keyword_fallback(user_msg, idx, kpis)
294
+ else:
295
+ system = DASHBOARD_SYSTEM.format(
296
+ artifacts_json=json.dumps(idx, indent=2),
297
+ kpis_json=json.dumps(kpis, indent=2) if kpis else "(no KPIs yet, run the pipeline first)",
298
+ )
299
+ msgs = [{"role": "system", "content": system}]
300
+ for entry in (history or [])[-6:]:
301
+ msgs.append(entry)
302
+ msgs.append({"role": "user", "content": user_msg})
303
+
304
+ try:
305
+ r = llm_client.chat_completion(
306
+ model=MODEL_NAME,
307
+ messages=msgs,
308
+ temperature=0.3,
309
+ max_tokens=600,
310
+ stream=False,
311
+ )
312
+ raw = (
313
+ r["choices"][0]["message"]["content"]
314
+ if isinstance(r, dict)
315
+ else r.choices[0].message.content
316
+ )
317
+ directive = _parse_display_directive(raw)
318
+ reply = _clean_response(raw)
319
+ except Exception as e:
320
+ reply = f"LLM error: {e}. Falling back to keyword matching."
321
+ reply_fb, directive = _keyword_fallback(user_msg, idx, kpis)
322
+ reply += "\n\n" + reply_fb
323
+
324
+ # Resolve artifacts — build interactive Plotly charts when possible
325
+ chart_out = None
326
+ tab_out = None
327
+
328
+ show = directive.get("show", "none") if isinstance(directive, dict) else "none"
329
+ fname = directive.get("filename", "") if isinstance(directive, dict) else ""
330
+ chart_name = directive.get("chart", "") if isinstance(directive, dict) else ""
331
+
332
+ # Interactive chart builders keyed by name. Old aliases are included so
333
+ # the app will still work if an LLM/webhook returns an older chart name.
334
+ chart_builders = {
335
+ "sales": build_monthly_return_chart,
336
+ "returns": build_monthly_return_chart,
337
+ "monthly_returns": build_monthly_return_chart,
338
+ "sentiment": build_sentiment_chart,
339
+ "top_sellers": build_top_return_categories_chart,
340
+ "top_returns": build_top_return_categories_chart,
341
+ "rating": build_rating_chart,
342
+ "ratings": build_rating_chart,
343
+ }
344
+
345
+ if chart_name and chart_name in chart_builders:
346
+ chart_out = chart_builders[chart_name]()
347
+
348
+ elif show == "figure" and fname:
349
+ low_fname = fname.lower()
350
+ if "sentiment" in low_fname:
351
+ chart_out = build_sentiment_chart()
352
+ elif "rating" in low_fname:
353
+ chart_out = build_rating_chart()
354
+ elif "category" in low_fname or "top" in low_fname or "risk" in low_fname:
355
+ chart_out = build_top_return_categories_chart()
356
+ elif "return" in low_fname or "dashboard" in low_fname or "monthly" in low_fname:
357
+ chart_out = build_monthly_return_chart()
358
+ else:
359
+ chart_out = _empty_chart(f"No interactive chart for {fname}")
360
+
361
+ if show == "table" and fname:
362
+ fp = PY_TAB_DIR / fname
363
+ if fp.exists():
364
+ tab_out = _load_table_safe(fp)
365
+ else:
366
+ reply += f"\n\n*(Could not find table: {fname})*"
367
+
368
+ new_history = (history or []) + [
369
+ {"role": "user", "content": user_msg},
370
+ {"role": "assistant", "content": reply},
371
+ ]
372
+
373
+ return new_history, "", chart_out, tab_out
374
+
375
+ def _keyword_fallback(msg: str, idx: Dict, kpis: Dict) -> Tuple[str, Dict]:
376
+ """Retail return/review keyword matcher when LLM is unavailable."""
377
+ msg_lower = msg.lower()
378
+
379
+ if not idx["python"]["figures"] and not idx["python"]["tables"]:
380
+ return (
381
+ "No artifacts found yet. Please run the pipeline first, then come back here to explore the results.",
382
+ {"show": "none"},
383
+ )
384
+
385
+ reviews_rows = kpis.get("reviews_rows", "?")
386
+ returns_rows = kpis.get("returns_rows", "?")
387
+ n_products = kpis.get("n_products", kpis.get("n_titles", "?"))
388
+ return_rate = kpis.get("estimated_return_rate", None)
389
+
390
+ if isinstance(return_rate, (int, float)):
391
+ return_rate_text = f"{return_rate:.1%}"
392
+ else:
393
+ return_rate_text = "not available"
394
+
395
+ kpi_text = (
396
+ f"Quick summary: **{reviews_rows} reviews**, **{returns_rows} return records**, "
397
+ f"and **{n_products} products/classes** analyzed. Estimated return rate: **{return_rate_text}**."
398
+ )
399
+
400
+ if any(w in msg_lower for w in ["return", "returned", "returns", "highest return", "return rate", "risk"]):
401
+ return (
402
+ f"Here are the highest return-risk products/categories. {kpi_text}",
403
+ {"show": "table", "scope": "python", "filename": "category_return_rate.csv"},
404
+ )
405
+
406
+ if any(w in msg_lower for w in ["complaint", "complaints", "problem", "issues", "review", "reviews"]):
407
+ return (
408
+ f"Here is the review intelligence summary. {kpi_text}",
409
+ {"show": "table", "scope": "python", "filename": "sentiment_counts_sampled.csv"},
410
+ )
411
+
412
+ if any(w in msg_lower for w in ["sentiment", "positive", "negative", "neutral"]):
413
+ return (
414
+ f"Here is the sentiment breakdown from customer reviews. {kpi_text}",
415
+ {"show": "figure", "chart": "sentiment"},
416
+ )
417
+
418
+ if any(w in msg_lower for w in ["rating", "ratings", "stars"]):
419
+ return (
420
+ f"Here is the rating distribution. {kpi_text}",
421
+ {"show": "figure", "scope": "python", "filename": "rating_distribution.png"},
422
+ )
423
+
424
+ if any(w in msg_lower for w in ["dashboard", "overview", "summary", "kpi"]):
425
+ return (
426
+ f"Dashboard overview: {kpi_text}\n\nAsk me about return rates, review complaints, sentiment, ratings, or high-risk products.",
427
+ {"show": "table", "scope": "python", "filename": "df_dashboard.csv"},
428
+ )
429
+
430
+ return (
431
+ f"I can help analyze e-commerce returns and review intelligence. {kpi_text}\n\n"
432
+ "Try asking about: **highest return-rate categories**, **review complaints**, "
433
+ "**sentiment**, **ratings**, or **dashboard overview**.",
434
+ {"show": "none"},
435
+ )
436
+
437
+ # =========================================================
438
+ # KPI CARDS (BubbleBusters style)
439
+ # =========================================================
440
+
441
def render_kpi_cards() -> str:
    """Render the KPI summary as an HTML grid of cards.

    Reads the KPI mapping via ``load_kpis``. When no KPIs are available a
    single "No data yet" placeholder card is returned. Otherwise the four
    configured headline KPIs are rendered first (skipping missing ones),
    followed by every remaining KPI except the legacy keys that are only used
    as aliases.

    Labels and values originate from a file written by the notebooks, so they
    are HTML-escaped before being interpolated into the markup.

    Returns:
        An HTML fragment (string).
    """
    # Local import: the accumulator used to be named ``html``; keep the module
    # under a distinct name.
    import html as html_lib

    kpis = load_kpis()
    # Guard against a non-mapping payload as well as an empty one.
    if not isinstance(kpis, dict) or not kpis:
        return (
            '<div style="background:rgba(255,255,255,.65);backdrop-filter:blur(16px);'
            'border-radius:20px;padding:28px;text-align:center;'
            'border:1.5px solid rgba(255,255,255,.7);'
            'box-shadow:0 8px 32px rgba(124,92,191,.08);">'
            '<div style="font-size:36px;margin-bottom:10px;">📊</div>'
            '<div style="color:#a48de8;font-size:14px;'
            'font-weight:800;margin-bottom:6px;">No data yet</div>'
            '<div style="color:#9d8fc4;font-size:12px;">'
            'Run the pipeline to populate these cards.</div>'
            '</div>'
        )

    def fmt_value(key, val):
        """Format one KPI value for display."""
        if val is None:
            return "—"
        if key == "estimated_return_rate" and isinstance(val, (int, float)):
            return f"{val:.1%}"
        if isinstance(val, (int, float)) and abs(val) >= 100:
            return f"{val:,.0f}"
        return str(val)

    def card(icon, label, value, colour):
        """Return the markup for one card; label and value are escaped."""
        safe_label = html_lib.escape(str(label))
        safe_value = html_lib.escape(str(value))
        return f"""
        <div style="background:rgba(255,255,255,.72);backdrop-filter:blur(16px);
        border-radius:20px;padding:18px 14px 16px;text-align:center;
        border:1.5px solid rgba(255,255,255,.8);
        box-shadow:0 4px 16px rgba(124,92,191,.08);
        border-top:3px solid {colour};">
        <div style="font-size:26px;margin-bottom:7px;line-height:1;">{icon}</div>
        <div style="color:#9d8fc4;font-size:9.5px;text-transform:uppercase;
        letter-spacing:1.8px;margin-bottom:7px;font-weight:800;">{safe_label}</div>
        <div style="color:#2d1f4e;font-size:16px;font-weight:800;">{safe_value}</div>
        </div>"""

    # Backwards-compatible aliases: older notebook versions may still write
    # n_titles/n_months/total_units_sold, while the retail notebook writes
    # n_products/n_periods/total_return_records.
    aliases = {
        "n_products": kpis.get("n_products", kpis.get("n_titles")),
        "n_periods": kpis.get("n_periods", kpis.get("n_months")),
        "total_return_records": kpis.get("total_return_records", kpis.get("total_units_sold")),
        "estimated_return_rate": kpis.get("estimated_return_rate"),
        "reviews_rows": kpis.get("reviews_rows"),
        "returns_rows": kpis.get("returns_rows"),
    }

    kpi_config = [
        ("reviews_rows", "💬", "Reviews", "#a48de8"),
        ("returns_rows", "↩️", "Return Records", "#7aa6f8"),
        ("n_products", "🛍️", "Products", "#6ee7c7"),
        ("estimated_return_rate", "📉", "Return Rate", "#e8537a"),
    ]
    legacy_keys = {"n_titles", "n_months", "total_units_sold", "total_revenue"}

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        '<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));'
        'gap:12px;margin-bottom:24px;">'
    ]

    used = set()
    for key, icon, label, colour in kpi_config:
        val = aliases.get(key)
        if val is None:
            continue
        used.add(key)
        parts.append(card(icon, label, fmt_value(key, val), colour))

    # Extra KPIs not in config
    for key, val in kpis.items():
        if key in used or key in legacy_keys:
            continue
        label = str(key).replace("_", " ").title()
        parts.append(card("📈", label, fmt_value(key, val), "#8fa8f8"))

    parts.append("</div>")
    return "".join(parts)
520
+
521
+ # =========================================================
522
+ # INTERACTIVE PLOTLY CHARTS (BubbleBusters style)
523
+ # =========================================================
524
+
525
+ CHART_PALETTE = ["#7c5cbf", "#2ec4a0", "#e8537a", "#e8a230", "#5e8fef",
526
+ "#c45ea8", "#3dbacc", "#a0522d", "#6aaa3a", "#d46060"]
527
+
528
+ def _styled_layout(**kwargs) -> dict:
529
+ defaults = dict(
530
+ template="plotly_white",
531
+ paper_bgcolor="rgba(255,255,255,0.95)",
532
+ plot_bgcolor="rgba(255,255,255,0.98)",
533
+ font=dict(family="system-ui, sans-serif", color="#2d1f4e", size=12),
534
+ margin=dict(l=60, r=20, t=70, b=70),
535
+ legend=dict(
536
+ orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
537
+ bgcolor="rgba(255,255,255,0.92)",
538
+ bordercolor="rgba(124,92,191,0.35)", borderwidth=1,
539
+ ),
540
+ title=dict(font=dict(size=15, color="#4b2d8a")),
541
+ )
542
+ defaults.update(kwargs)
543
+ return defaults
544
+
545
+
546
def _empty_chart(title: str) -> go.Figure:
    """Return a blank figure titled ``title`` with a centred placeholder note."""
    placeholder = dict(
        text="Run the pipeline to generate data",
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        showarrow=False,
        font=dict(size=14, color="rgba(124,92,191,0.5)"),
    )
    layout = dict(
        title=title,
        height=420,
        template="plotly_white",
        paper_bgcolor="rgba(255,255,255,0.95)",
        annotations=[placeholder],
    )
    figure = go.Figure()
    figure.update_layout(**layout)
    return figure
556
+
557
+
558
def build_monthly_return_chart() -> go.Figure:
    """Build the "Monthly Return Overview" line chart from ``df_dashboard.csv``.

    The date axis is auto-detected from the column names (``month``, ``date``
    or ``time``) and every other numeric column becomes one line. Returns a
    placeholder figure when the file is missing, empty, or its columns cannot
    be interpreted.
    """
    path = PY_TAB_DIR / "df_dashboard.csv"
    if not path.exists():
        return _empty_chart("Monthly Return Overview — run the pipeline first")

    df = pd.read_csv(path)
    if df.empty:
        return _empty_chart("df_dashboard.csv is empty")

    candidates = [
        c for c in df.columns
        if any(key in c.lower() for key in ("month", "date", "time"))
    ]
    # Substring matching also hits e.g. "sentiment" (contains "time"). Prefer
    # a non-numeric candidate, since a real date column is read as text; fall
    # back to the first candidate to keep the previous behaviour otherwise.
    date_col = next(
        (c for c in candidates if not pd.api.types.is_numeric_dtype(df[c])),
        candidates[0] if candidates else None,
    )
    val_cols = [c for c in df.columns if c != date_col and pd.api.types.is_numeric_dtype(df[c])]

    if not val_cols:
        # Try converting numeric-looking columns
        for c in df.columns:
            if c != date_col:
                df[c] = pd.to_numeric(df[c], errors="coerce")
        val_cols = [c for c in df.columns if c != date_col and pd.api.types.is_numeric_dtype(df[c])]

    if not date_col or not val_cols:
        return _empty_chart("Could not auto-detect columns in df_dashboard.csv")

    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    # Drop unparseable dates and order chronologically; lines drawn through
    # unsorted x values zig-zag across the plot.
    df = df.dropna(subset=[date_col]).sort_values(date_col, kind="stable")
    if df.empty:
        return _empty_chart("Could not parse dates in df_dashboard.csv")

    fig = go.Figure()

    for i, col in enumerate(val_cols):
        # Rates/risks are shown as percentages, everything else as integers.
        y_format = ":.1%" if "rate" in col.lower() or "risk" in col.lower() else ":,.0f"
        fig.add_trace(go.Scatter(
            x=df[date_col], y=df[col], name=col.replace("_", " ").title(),
            mode="lines+markers", line=dict(color=CHART_PALETTE[i % len(CHART_PALETTE)], width=2),
            marker=dict(size=5),
            hovertemplate=f"<b>{col.replace('_',' ').title()}</b><br>%{{x|%b %Y}}: %{{y{y_format}}}<extra></extra>",
        ))

    fig.update_layout(**_styled_layout(
        height=450,
        hovermode="x unified",
        title=dict(text="Monthly Return Overview"),
    ))
    fig.update_xaxes(gridcolor="rgba(124,92,191,0.15)", showgrid=True)
    fig.update_yaxes(gridcolor="rgba(124,92,191,0.15)", showgrid=True)
    return fig
600
+
601
+
602
+ # Backwards-compatible name used by older template code
603
+ build_sales_chart = build_monthly_return_chart
604
+
605
+
606
def build_rating_chart() -> go.Figure:
    """Build the rating-distribution bar chart from ``rating_distribution.csv``.

    The rating column is the first one whose name contains "rating" (else the
    first column); the count column is the first one whose name contains
    "count" or "number" (else the last column). A placeholder figure is
    returned when the file is missing or empty.
    """
    csv_path = PY_TAB_DIR / "rating_distribution.csv"
    if not csv_path.exists():
        return _empty_chart("Rating Distribution — run the pipeline first")

    ratings = pd.read_csv(csv_path)
    if ratings.empty:
        return _empty_chart("rating_distribution.csv is empty")

    rating_col = ratings.columns[0]
    for name in ratings.columns:
        if "rating" in name.lower():
            rating_col = name
            break

    count_col = ratings.columns[-1]
    for name in ratings.columns:
        lowered = name.lower()
        if "count" in lowered or "number" in lowered:
            count_col = name
            break

    bars = go.Bar(
        x=ratings[rating_col].astype(str),
        y=ratings[count_col],
        marker_color="#7c5cbf",
        hovertemplate="<b>Rating %{x}</b><br>Reviews: %{y:,.0f}<extra></extra>",
    )
    figure = go.Figure(bars)
    layout = _styled_layout(
        height=420,
        title=dict(text="Distribution of Customer Ratings"),
        showlegend=False,
    )
    figure.update_layout(**layout)
    figure.update_xaxes(title="Rating")
    figure.update_yaxes(title="Number of Reviews")
    return figure
632
+
633
+
634
def build_sentiment_chart() -> go.Figure:
    """Build the stacked sentiment bar chart from ``sentiment_counts_sampled.csv``.

    The first CSV column supplies the bar labels; the columns named exactly
    ``negative``, ``neutral`` and ``positive`` (whichever are present) supply
    the stacked segments. A placeholder figure is returned when the file is
    missing, empty, or has none of those columns.
    """
    csv_path = PY_TAB_DIR / "sentiment_counts_sampled.csv"
    if not csv_path.exists():
        return _empty_chart("Sentiment Distribution — run the pipeline first")

    counts = pd.read_csv(csv_path)
    if counts.empty:
        return _empty_chart("sentiment_counts_sampled.csv is empty")

    label_col = counts.columns[0]
    # Colour per sentiment; dict order also fixes the stacking order.
    palette = {"negative": "#e8537a", "neutral": "#5e8fef", "positive": "#2ec4a0"}
    present = [name for name in palette if name in counts.columns]
    if len(present) == 0:
        return _empty_chart("No sentiment columns found in sentiment_counts_sampled.csv")

    labels = counts[label_col].astype(str)
    figure = go.Figure()
    for name in present:
        segment = go.Bar(
            name=name.title(),
            y=labels,
            x=counts[name],
            orientation="h",
            marker_color=palette.get(name, "#888"),
            hovertemplate=f"<b>{name.title()}</b>: %{{x:,.0f}}<extra></extra>",
        )
        figure.add_trace(segment)

    layout = _styled_layout(
        height=max(420, len(counts) * 30),
        barmode="stack",
        title=dict(text="Sentiment Distribution by Product/Class"),
    )
    figure.update_layout(**layout)
    figure.update_xaxes(title="Number of Reviews")
    figure.update_yaxes(autorange="reversed")
    return figure
668
+
669
+
670
def build_top_return_categories_chart() -> go.Figure:
    """Build the "highest return-risk" horizontal bar chart.

    Reads ``category_return_rate.csv`` (falling back to the legacy
    ``top_titles_by_units_sold.csv``), auto-detects the label and value
    columns from their names, and shows the 15 rows with the LARGEST values.
    Returns a placeholder figure when no file is available or it is empty.
    """
    path = PY_TAB_DIR / "category_return_rate.csv"
    if not path.exists():
        path = PY_TAB_DIR / "top_titles_by_units_sold.csv"  # backwards-compatible fallback

    if not path.exists():
        return _empty_chart("Highest Return-Risk Categories — run the pipeline first")

    df = pd.read_csv(path)
    if df.empty:
        return _empty_chart(f"{path.name} is empty")

    category_col = next(
        (c for c in df.columns if "category" in c.lower() or "class" in c.lower() or "title" in c.lower() or "product" in c.lower()),
        df.columns[0],
    )
    # Never pick the label column as the value column.
    value_col = next(
        (
            c for c in df.columns
            if c != category_col
            and ("return" in c.lower() or "risk" in c.lower() or "rate" in c.lower() or "unit" in c.lower())
        ),
        df.columns[-1],
    )

    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
    # "Highest" requires ranking: sort before truncating instead of relying
    # on the CSV already being ordered. NaN values sort last.
    df = df.sort_values(value_col, ascending=False, kind="stable").head(15)

    # Only fractions in [0, 1] are rates; the legacy file holds unit counts,
    # which a percent format would display as e.g. 120000.00%.
    valid = df[value_col].dropna()
    value_format = ".2%" if valid.between(0, 1).all() else ",.2f"

    fig = go.Figure(go.Bar(
        y=df[category_col].astype(str),
        x=df[value_col],
        orientation="h",
        marker=dict(color=df[value_col], colorscale=[[0, "#c5b4f0"], [1, "#7c5cbf"]]),
        hovertemplate=f"<b>%{{y}}</b><br>Return rate/risk: %{{x:{value_format}}}<extra></extra>",
    ))
    fig.update_layout(**_styled_layout(
        height=max(420, len(df) * 32),
        title=dict(text="Highest Return-Risk Products / Categories"),
        showlegend=False,
    ))
    fig.update_yaxes(autorange="reversed")
    fig.update_xaxes(title="Return Rate / Risk Score")
    return fig
707
+
708
+
709
+ # Backwards-compatible name used by older template code
710
+ build_top_sellers_chart = build_top_return_categories_chart
711
+
712
def refresh_dashboard():
    """Rebuild the KPI cards and the three interactive dashboard charts.

    Returns:
        ``(kpi_html, monthly_chart, sentiment_chart, top_categories_chart)``.
    """
    kpi_cards = render_kpi_cards()
    monthly = build_sales_chart()
    sentiment = build_sentiment_chart()
    top_categories = build_top_sellers_chart()
    return kpi_cards, monthly, sentiment, top_categories
714
+
715
+
716
+ # =========================================================
717
+ # UI
718
+ # =========================================================
719
+
720
+ ensure_dirs()
721
+
722
def load_css() -> str:
    """Return the contents of ``style.css`` next to the app, or ``""`` if absent."""
    stylesheet = BASE_DIR / "style.css"
    if stylesheet.exists():
        return stylesheet.read_text(encoding="utf-8")
    return ""
725
+
726
+
727
# Three-tab Gradio UI: pipeline runner, dashboard, and chat-driven "AI" dashboard.
with gr.Blocks(title="AIBDM 2026 Workshop App") as demo:

    gr.Markdown(
        "# SE21 App Template\n"
        "*This is an app template for SE21 students*",
        elem_id="escp_title",
    )

    # ===========================================================
    # TAB 1 -- Pipeline Runner
    # ===========================================================
    with gr.Tab("Pipeline Runner"):
        gr.Markdown()

        with gr.Row():
            with gr.Column(scale=1):
                btn_nb1 = gr.Button("Step 1: Data Creation", variant="secondary")
            with gr.Column(scale=1):
                btn_nb2 = gr.Button("Step 2: Python Analysis", variant="secondary")

        with gr.Row():
            btn_all = gr.Button("Run Full Pipeline (Both Steps)", variant="primary")

        run_log = gr.Textbox(
            label="Execution Log",
            lines=18,
            max_lines=30,
            interactive=False,
        )

        btn_nb1.click(run_datacreation, outputs=[run_log])
        btn_nb2.click(run_pythonanalysis, outputs=[run_log])
        btn_all.click(run_full_pipeline, outputs=[run_log])

    # ===========================================================
    # TAB 2 -- Dashboard (KPIs + Interactive Charts + Gallery)
    # ===========================================================
    with gr.Tab("Dashboard"):
        # A callable value is re-evaluated on every page load.
        kpi_html = gr.HTML(value=render_kpi_cards)

        refresh_btn = gr.Button("Refresh Dashboard", variant="primary")

        gr.Markdown("#### Interactive Charts")
        chart_sales = gr.Plot(label="Monthly Return Overview")
        chart_sentiment = gr.Plot(label="Sentiment Distribution")
        chart_top = gr.Plot(label="Highest Return Risk")

        gr.Markdown("#### Static Figures (from notebooks)")
        gallery = gr.Gallery(
            label="Generated Figures",
            columns=2,
            height=480,
            object_fit="contain",
        )

        gr.Markdown("#### Data Tables")
        table_dropdown = gr.Dropdown(
            label="Select a table to view",
            choices=[],
            interactive=True,
        )
        table_display = gr.Dataframe(
            label="Table Preview",
            interactive=False,
        )

        def _on_refresh():
            """Rebuild every dashboard widget from the artifacts on disk."""
            kpi, c1, c2, c3 = refresh_dashboard()
            figs, dd, df = refresh_gallery()
            return kpi, c1, c2, c3, figs, dd, df

        dashboard_outputs = [
            kpi_html, chart_sales, chart_sentiment, chart_top,
            gallery, table_dropdown, table_display,
        ]

        refresh_btn.click(_on_refresh, outputs=dashboard_outputs)
        # Populate charts, gallery and table list on page load as well;
        # otherwise everything except the KPI cards stays empty until the
        # user clicks "Refresh Dashboard".
        demo.load(_on_refresh, outputs=dashboard_outputs)

        table_dropdown.change(
            on_table_select,
            inputs=[table_dropdown],
            outputs=[table_display],
        )

    # ===========================================================
    # TAB 3 -- AI Dashboard
    # ===========================================================
    with gr.Tab('"AI" Dashboard'):
        _ai_status = (
            "Connected to your **n8n workflow**." if N8N_WEBHOOK_URL
            else "**LLM active.**" if LLM_ENABLED
            else "Using **keyword matching**. Upgrade options: "
            "set `N8N_WEBHOOK_URL` to connect your n8n workflow, "
            "or set `HF_API_KEY` for direct LLM access."
        )
        gr.Markdown(
            "### Ask questions, get interactive visualisations\n\n"
            f"Type a question and the system will pick the right interactive chart or table. {_ai_status}"
        )

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                chatbot = gr.Chatbot(
                    label="Conversation",
                    height=380,
                )
                user_input = gr.Textbox(
                    label="Ask about your data",
                    placeholder="e.g. Which products have the highest return rate? / What are the main complaints? / Sentiment analysis",
                    lines=1,
                )
                gr.Examples(
                    examples=[
                        "Which products have the highest return rate?",
                        "What are the main complaints in the reviews?",
                        "What does the sentiment look like?",
                        "Show me the rating distribution",
                        "Give me a dashboard overview",
                        "Which categories are highest risk?",
                    ],
                    inputs=user_input,
                )

            with gr.Column(scale=1):
                ai_figure = gr.Plot(
                    label="Interactive Chart",
                )
                ai_table = gr.Dataframe(
                    label="Data Table",
                    interactive=False,
                )

        user_input.submit(
            ai_chat,
            inputs=[user_input, chatbot],
            outputs=[chatbot, user_input, ai_figure, ai_table],
        )


# Launch only when run as a script (``python app.py``) so that importing the
# module, e.g. for tests or tooling, does not start a blocking web server.
if __name__ == "__main__":
    demo.launch(css=load_css(), allowed_paths=[str(BASE_DIR)])
archive.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c1046fecda157adb083485fae9e219d4f76de68d26904c01e4a39b8fa8b9ecf
3
+ size 2924120
background_bottom.png ADDED
background_mid.png ADDED
background_top.png ADDED

Git LFS Details

  • SHA256: 27e963d20dbb7ae88368fb527d475c85ef0de3df63d8f0d7d5e2af7403a5b365
  • Pointer size: 131 Bytes
  • Size of remote file: 726 kB
datacreation.ipynb ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "source": [
20
+ "# ==================================================\n",
21
+ "# UNIVERSAL SETUP CELL\n",
22
+ "# Works in BOTH Google Colab and Hugging Face Spaces\n",
23
+ "# ==================================================\n",
24
+ "\n",
25
+ "import os\n",
26
+ "import random\n",
27
+ "import warnings\n",
28
+ "from pathlib import Path\n",
29
+ "\n",
30
+ "import numpy as np\n",
31
+ "import pandas as pd\n",
32
+ "\n",
33
+ "warnings.filterwarnings(\"ignore\")\n",
34
+ "\n",
35
+ "# Reproducibility\n",
36
+ "random.seed(42)\n",
37
+ "np.random.seed(42)\n",
38
+ "\n",
39
+ "# Detect environment automatically\n",
40
+ "if Path(\"/app\").exists():\n",
41
+ " BASE_PATH = Path(\"/app\") # Hugging Face Space\n",
42
+ "elif Path(\"/content\").exists():\n",
43
+ " BASE_PATH = Path(\"/content\") # Google Colab\n",
44
+ "else:\n",
45
+ " BASE_PATH = Path.cwd() # Local fallback\n",
46
+ "\n",
47
+ "DATA_PROCESSED = BASE_PATH / \"data_processed\"\n",
48
+ "OUTPUTS = BASE_PATH / \"outputs\"\n",
49
+ "\n",
50
+ "DATA_PROCESSED.mkdir(exist_ok=True)\n",
51
+ "OUTPUTS.mkdir(exist_ok=True)\n",
52
+ "\n",
53
+ "print(\"Environment ready.\")\n",
54
+ "print(\"Using BASE_PATH:\", BASE_PATH)\n",
55
+ "\n",
56
+ "# Find CSV files anywhere under BASE_PATH\n",
57
+ "csv_paths = [\n",
58
+ " p for p in BASE_PATH.rglob(\"*.csv\")\n",
59
+ " if \"sample_data\" not in str(p)\n",
60
+ "]\n",
61
+ "\n",
62
+ "print(\"Found CSV files:\")\n",
63
+ "for p in csv_paths:\n",
64
+ " print(\"-\", p)\n",
65
+ "\n",
66
+ "# Locate required files\n",
67
+ "reviews_matches = [\n",
68
+ " p for p in csv_paths\n",
69
+ " if \"clothing\" in p.name.lower()\n",
70
+ "]\n",
71
+ "\n",
72
+ "returns_matches = [\n",
73
+ " p for p in csv_paths\n",
74
+ " if \"return\" in p.name.lower()\n",
75
+ "]\n",
76
+ "\n",
77
+ "if not reviews_matches:\n",
78
+ " raise FileNotFoundError(\n",
79
+ " \"Could not find the Womens Clothing E-Commerce Reviews CSV. \"\n",
80
+ " \"Upload it to the Colab runtime OR put it in the same folder as app.py on Hugging Face.\"\n",
81
+ " )\n",
82
+ "\n",
83
+ "if not returns_matches:\n",
84
+ " raise FileNotFoundError(\n",
85
+ " \"Could not find the ecommerce returns CSV. \"\n",
86
+ " \"Upload it to the Colab runtime OR put it in the same folder as app.py on Hugging Face.\"\n",
87
+ " )\n",
88
+ "\n",
89
+ "reviews_path = reviews_matches[0]\n",
90
+ "returns_path = returns_matches[0]\n",
91
+ "\n",
92
+ "print(\"Using reviews file:\", reviews_path)\n",
93
+ "print(\"Using returns file:\", returns_path)\n",
94
+ "\n",
95
+ "reviews_df = pd.read_csv(reviews_path)\n",
96
+ "returns_df = pd.read_csv(returns_path)\n",
97
+ "\n",
98
+ "# Main dataframe used by the rest of this notebook\n",
99
+ "df = reviews_df.copy()\n",
100
+ "\n",
101
+ "print(\"Loaded successfully.\")\n",
102
+ "print(\"Reviews shape:\", reviews_df.shape)\n",
103
+ "print(\"Returns shape:\", returns_df.shape)\n"
104
+ ],
105
+ "metadata": {
106
+ "colab": {
107
+ "base_uri": "https://localhost:8080/"
108
+ },
109
+ "id": "bsHVIP13nWFe",
110
+ "outputId": "a44c956d-10cc-4879-f45f-e84c19bf7631"
111
+ },
112
+ "execution_count": null,
113
+ "outputs": []
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "source": [
118
+ "missing_summary = pd.DataFrame({\n",
119
+ " \"column\": df.columns,\n",
120
+ " \"missing_count\": df.isna().sum().values,\n",
121
+ " \"missing_pct\": (df.isna().mean().values * 100).round(2)\n",
122
+ "}).sort_values(by=\"missing_pct\", ascending=False)\n",
123
+ "\n",
124
+ "display(missing_summary)"
125
+ ],
126
+ "metadata": {
127
+ "colab": {
128
+ "base_uri": "https://localhost:8080/",
129
+ "height": 390
130
+ },
131
+ "id": "3qu3XSzfnV4-",
132
+ "outputId": "6480a5bb-b427-438e-8d5d-b9af2ebc67a0"
133
+ },
134
+ "execution_count": null,
135
+ "outputs": []
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "source": [
140
+ "df_clean = df.copy()\n",
141
+ "\n",
142
+ "# remove useless index column\n",
143
+ "df_clean = df_clean.drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
144
+ "\n",
145
+ "# fill text columns\n",
146
+ "df_clean[\"Title\"] = df_clean[\"Title\"].fillna(\"No Title\")\n",
147
+ "df_clean[\"Review Text\"] = df_clean[\"Review Text\"].fillna(\"No Review\")\n",
148
+ "\n",
149
+ "# fill category columns\n",
150
+ "for col in [\"Department Name\", \"Class Name\", \"Division Name\"]:\n",
151
+ " df_clean[col] = df_clean[col].fillna(\"Unknown\")\n",
152
+ "\n",
153
+ "# create sentiment label from rating (4-5 = positive; anything lower or missing = negative)\n",
154
+ "df_clean[\"sentiment\"] = df_clean[\"Rating\"].apply(\n",
155
+ " lambda x: \"positive\" if x >= 4 else \"negative\"\n",
156
+ ")\n",
157
+ "\n",
158
+ "# create engagement score (positive feedback count + recommended flag)\n",
159
+ "df_clean[\"engagement_score\"] = (\n",
160
+ " df_clean[\"Positive Feedback Count\"] + df_clean[\"Recommended IND\"]\n",
161
+ ")\n",
162
+ "\n",
163
+ "print(\"Cleaned shape:\", df_clean.shape)\n",
164
+ "display(df_clean.head())\n"
165
+ ],
166
+ "metadata": {
167
+ "colab": {
168
+ "base_uri": "https://localhost:8080/",
169
+ "height": 633
170
+ },
171
+ "id": "da3lANHOnV13",
172
+ "outputId": "9704c238-2e77-4d74-a4c0-042526c0dfa2"
173
+ },
174
+ "execution_count": null,
175
+ "outputs": []
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "source": [
180
+ "class_summary = (\n",
181
+ " df_clean.groupby(\"Class Name\")\n",
182
+ " .agg(\n",
183
+ " reviews=(\"Rating\", \"count\"),\n",
184
+ " avg_rating=(\"Rating\", \"mean\"),\n",
185
+ " recommendation_rate=(\"Recommended IND\", \"mean\"),\n",
186
+ " avg_feedback=(\"Positive Feedback Count\", \"mean\")\n",
187
+ " )\n",
188
+ " .reset_index()\n",
189
+ ")\n",
190
+ "\n",
191
+ "# keep only classes with at least 100 reviews\n",
192
+ "class_summary = class_summary[class_summary[\"reviews\"] >= 100]\n",
193
+ "\n",
194
+ "# sort by rating\n",
195
+ "class_summary = class_summary.sort_values(\n",
196
+ " by=\"avg_rating\",\n",
197
+ " ascending=False\n",
198
+ ")\n",
199
+ "\n",
200
+ "display(class_summary.head(10))\n",
201
+ "display(class_summary.tail(10))"
202
+ ],
203
+ "metadata": {
204
+ "colab": {
205
+ "base_uri": "https://localhost:8080/",
206
+ "height": 701
207
+ },
208
+ "id": "MAW6wxDCnVzA",
209
+ "outputId": "c06b3a75-44d9-4620-ed82-e54a6c451c44"
210
+ },
211
+ "execution_count": null,
212
+ "outputs": []
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "source": [
217
+ "negative_reviews = df_clean[df_clean[\"sentiment\"] == \"negative\"].copy()\n",
218
+ "\n",
219
+ "print(\"Negative reviews:\", negative_reviews.shape)\n",
220
+ "\n",
221
+ "negative_reviews[\"Review Text\"] = negative_reviews[\"Review Text\"].astype(str)\n",
222
+ "\n",
223
+ "common_words = (\n",
224
+ " negative_reviews[\"Review Text\"]\n",
225
+ " .str.lower()\n",
226
+ " .str.split(expand=True)\n",
227
+ " .stack()\n",
228
+ " .value_counts()\n",
229
+ " .head(30)\n",
230
+ ")\n",
231
+ "\n",
232
+ "display(common_words)"
233
+ ],
234
+ "metadata": {
235
+ "colab": {
236
+ "base_uri": "https://localhost:8080/",
237
+ "height": 1000
238
+ },
239
+ "id": "C6TafBY7nVv3",
240
+ "outputId": "ed36279a-1846-48a4-98d9-5c7839bd0b4b"
241
+ },
242
+ "execution_count": null,
243
+ "outputs": []
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "source": [
248
+ "import numpy as np\n",
249
+ "\n",
250
+ "np.random.seed(42)\n",
251
+ "\n",
252
+ "synthetic_df = pd.DataFrame({\n",
253
+ " \"customer_id\": range(1, 501),\n",
254
+ " \"predicted_return_risk\": np.random.choice(\n",
255
+ " [\"low\", \"medium\", \"high\"],\n",
256
+ " size=500,\n",
257
+ " p=[0.5, 0.3, 0.2]\n",
258
+ " ),\n",
259
+ " \"predicted_size_issue\": np.random.choice(\n",
260
+ " [\"yes\", \"no\"],\n",
261
+ " size=500,\n",
262
+ " p=[0.25, 0.75]\n",
263
+ " ),\n",
264
+ " \"predicted_satisfaction_next_purchase\": np.random.randint(1, 6, 500)\n",
265
+ "})\n",
266
+ "\n",
267
+ "print(synthetic_df.shape)\n",
268
+ "display(synthetic_df.head())\n",
269
+ "\n",
270
+ "# Save outputs for the Hugging Face app / next analysis notebook\n",
271
+ "df_clean.to_csv(DATA_PROCESSED / \"reviews_cleaned.csv\", index=False)\n",
272
+ "class_summary.to_csv(DATA_PROCESSED / \"class_summary.csv\", index=False)\n",
273
+ "synthetic_df.to_csv(DATA_PROCESSED / \"synthetic_return_risk.csv\", index=False)\n",
274
+ "returns_df.to_csv(DATA_PROCESSED / \"returns_input.csv\", index=False)\n",
275
+ "\n",
276
+ "common_words.reset_index().rename(\n",
277
+ " columns={\"index\": \"word\", \"Review Text\": \"count\", 0: \"count\"}\n",
278
+ ").to_csv(DATA_PROCESSED / \"common_negative_words.csv\", index=False)\n",
279
+ "\n",
280
+ "print(\"Saved processed files to:\", DATA_PROCESSED)\n",
281
+ "print([p.name for p in DATA_PROCESSED.glob(\"*.csv\")])\n"
282
+ ],
283
+ "metadata": {
284
+ "colab": {
285
+ "base_uri": "https://localhost:8080/",
286
+ "height": 222
287
+ },
288
+ "id": "gmcDmANBnVsP",
289
+ "outputId": "bba8768d-f088-419a-92ff-8092654ee950"
290
+ },
291
+ "execution_count": null,
292
+ "outputs": []
293
+ },
294
+ {
295
+ "cell_type": "markdown",
296
+ "source": [
297
+ "# Automation Opportunities\n",
298
+ "\n",
299
+ "## Automation 1 — Review Sentiment Alert\n",
300
+ "Automatically flag clothing classes when average rating drops below 3.5.\n",
301
+ "\n",
302
+ "## Automation 2 — Product Improvement Suggestions\n",
303
+ "Use negative review keywords to automatically suggest:\n",
304
+ "- sizing guide improvements\n",
305
+ "- fabric description clarification\n",
306
+ "- fit recommendations\n",
307
+ "- photo quality updates\n",
308
+ "\n",
309
+ "## Automation 3 — Future Return Risk Dashboard\n",
310
+ "Combine real reviews with synthetic future risk signals to monitor:\n",
311
+ "- high-risk customer segments\n",
312
+ "- classes with repeated size complaints\n",
313
+ "- products likely to receive negative reviews next season"
314
+ ],
315
+ "metadata": {
316
+ "id": "fmUnLL36pT_z"
317
+ },
318
+ "outputs": [],
319
+ "execution_count": null
320
+ }
321
+ ]
322
+ }
ecommerce_returns_cleaned.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7feb1ef02dff68370245fbef6e84b04c691b631bfd56bf1afb3c547eb08cef17
3
+ size 36312047
gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ background_top.png filter=lfs diff=lfs merge=lfs -text
37
+ ecommerce_returns_cleaned.csv filter=lfs diff=lfs merge=lfs -text
gitattributes (1) ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ background_top.png filter=lfs diff=lfs merge=lfs -text
gitattributes (2) ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ background_top.png filter=lfs diff=lfs merge=lfs -text
pythonanalysis.ipynb ADDED
@@ -0,0 +1,1046 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "85361b58",
6
+ "metadata": {
7
+ "id": "85361b58"
8
+ },
9
+ "source": [
10
+ "# Step 2 — Python Analysis / Modeling\n",
11
+ "\n",
12
+ "Clean version for the Hugging Face SE21 app template. It loads the reviews and returns data, then saves the tables and figures the app uses as dashboard artifacts."
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 5,
18
+ "id": "c88b847c",
19
+ "metadata": {
20
+ "colab": {
21
+ "base_uri": "https://localhost:8080/"
22
+ },
23
+ "id": "c88b847c",
24
+ "outputId": "d0c3643a-d491-4746-a55b-35ed016e4fe4"
25
+ },
26
+ "outputs": [
27
+ {
28
+ "output_type": "stream",
29
+ "name": "stdout",
30
+ "text": [
31
+ "Environment ready.\n",
32
+ "BASE_PATH: /content\n",
33
+ "CSV files found:\n",
34
+ "- /content/Womens Clothing E-Commerce Reviews.csv\n",
35
+ "- /content/ecommerce_returns_cleaned.csv\n",
36
+ "Using reviews file: /content/Womens Clothing E-Commerce Reviews.csv\n",
37
+ "Using returns file: /content/ecommerce_returns_cleaned.csv\n",
38
+ "Reviews shape: (23486, 10)\n",
39
+ "Returns shape: (113314, 29)\n",
40
+ "Reviews columns: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name']\n",
41
+ "Returns columns: ['order_id', 'order_item_id', 'product_id', 'seller_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_delivered_customer_date', 'order_estimated_delivery_date', 'review_score', 'review_comment_title', 'review_comment_message', 'price', 'freight_value', 'total_cost', 'product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'has_review_text', 'review_text_length', 'delivery_delay_days', 'negative_keyword_flag', 'synthetic_return_risk', 'likely_return']\n",
42
+ "Data loaded and cleaned.\n"
43
+ ]
44
+ }
45
+ ],
46
+ "source": [
47
+ "# ==================================================\n",
48
+ "# STEP 2: UNIVERSAL ANALYSIS SETUP\n",
49
+ "# Works in BOTH Hugging Face Spaces and Google Colab\n",
50
+ "# ==================================================\n",
51
+ "\n",
52
+ "import os\n",
53
+ "import json\n",
54
+ "import random\n",
55
+ "import warnings\n",
56
+ "from pathlib import Path\n",
57
+ "\n",
58
+ "os.environ.setdefault(\"MPLCONFIGDIR\", \"/tmp/matplotlib\")\n",
59
+ "\n",
60
+ "import numpy as np\n",
61
+ "import pandas as pd\n",
62
+ "import matplotlib.pyplot as plt\n",
63
+ "\n",
64
+ "warnings.filterwarnings(\"ignore\")\n",
65
+ "random.seed(42)\n",
66
+ "np.random.seed(42)\n",
67
+ "\n",
68
+ "# Pick the runtime folder automatically: the first candidate root that exists and contains an input CSV.\n",
69
+ "# Hugging Face Space uses /app. Colab uses /content.\n",
70
+ "candidate_roots = [Path(\"/app\"), Path(\"/content\"), Path.cwd(), Path(\"/mnt/data\")]\n",
71
+ "BASE_PATH = None\n",
72
+ "\n",
73
+ "for root in candidate_roots:\n",
74
+ " if root.exists():\n",
75
+ " csvs = []\n",
76
+ " for p in root.rglob(\"*.csv\"):\n",
77
+ " parts = {part.lower() for part in p.parts}\n",
78
+ " if \"sample_data\" in parts:\n",
79
+ " continue\n",
80
+ " if \"outputs\" in parts or \"figures\" in parts or \"tables\" in parts or \"artifacts\" in parts:\n",
81
+ " continue\n",
82
+ " csvs.append(p)\n",
83
+ " if csvs:\n",
84
+ " BASE_PATH = root\n",
85
+ " break\n",
86
+ "\n",
87
+ "if BASE_PATH is None:\n",
88
+ " if Path(\"/app\").exists():\n",
89
+ " BASE_PATH = Path(\"/app\")\n",
90
+ " elif Path(\"/content\").exists():\n",
91
+ " BASE_PATH = Path(\"/content\")\n",
92
+ " else:\n",
93
+ " BASE_PATH = Path.cwd()\n",
94
+ "\n",
95
+ "DATA_PROCESSED = BASE_PATH / \"data_processed\"\n",
96
+ "\n",
97
+ "OUTPUTS = BASE_PATH / \"outputs\"\n",
98
+ "FIGURES = BASE_PATH / \"figures\"\n",
99
+ "TABLES = BASE_PATH / \"tables\"\n",
100
+ "ARTIFACTS = BASE_PATH / \"artifacts\"\n",
101
+ "\n",
102
+ "# Extra folders because different templates check different places\n",
103
+ "OUTPUT_FIGURES = OUTPUTS / \"figures\"\n",
104
+ "OUTPUT_TABLES = OUTPUTS / \"tables\"\n",
105
+ "ARTIFACT_FIGURES = ARTIFACTS / \"figures\"\n",
106
+ "ARTIFACT_TABLES = ARTIFACTS / \"tables\"\n",
107
+ "\n",
108
+ "ALL_OUTPUT_DIRS = [\n",
109
+ " DATA_PROCESSED,\n",
110
+ " OUTPUTS,\n",
111
+ " FIGURES,\n",
112
+ " TABLES,\n",
113
+ " ARTIFACTS,\n",
114
+ " OUTPUT_FIGURES,\n",
115
+ " OUTPUT_TABLES,\n",
116
+ " ARTIFACT_FIGURES,\n",
117
+ " ARTIFACT_TABLES,\n",
118
+ "]\n",
119
+ "\n",
120
+ "for folder in ALL_OUTPUT_DIRS:\n",
121
+ " folder.mkdir(parents=True, exist_ok=True)\n",
122
+ "\n",
123
+ "print(\"Environment ready.\")\n",
124
+ "print(\"BASE_PATH:\", BASE_PATH)\n",
125
+ "\n",
126
+ "# Collect candidate input CSVs under BASE_PATH, skipping generated output folders (Step 1 files are preferred below).\n",
127
+ "csv_paths = []\n",
128
+ "for p in BASE_PATH.rglob(\"*.csv\"):\n",
129
+ " parts = {part.lower() for part in p.parts}\n",
130
+ " if \"sample_data\" in parts:\n",
131
+ " continue\n",
132
+ " if \"outputs\" in parts or \"figures\" in parts or \"tables\" in parts or \"artifacts\" in parts:\n",
133
+ " continue\n",
134
+ " csv_paths.append(p)\n",
135
+ "\n",
136
+ "print(\"CSV files found:\")\n",
137
+ "for p in csv_paths:\n",
138
+ " print(\"-\", p)\n",
139
+ "\n",
140
+ "def first_existing(paths):\n",
141
+ " for p in paths:\n",
142
+ " if Path(p).exists():\n",
143
+ " return Path(p)\n",
144
+ " return None\n",
145
+ "\n",
146
+ "reviews_path = first_existing([\n",
147
+ " DATA_PROCESSED / \"reviews_cleaned.csv\",\n",
148
+ " DATA_PROCESSED / \"womens_reviews_cleaned.csv\",\n",
149
+ " BASE_PATH / \"Womens Clothing E-Commerce Reviews.csv\",\n",
150
+ "])\n",
151
+ "\n",
152
+ "returns_path = first_existing([\n",
153
+ " DATA_PROCESSED / \"returns_input.csv\",\n",
154
+ " DATA_PROCESSED / \"returns_cleaned.csv\",\n",
155
+ " BASE_PATH / \"ecommerce_returns_cleaned.csv\",\n",
156
+ " DATA_PROCESSED / \"synthetic_return_risk.csv\",\n",
157
+ "])\n",
158
+ "\n",
159
+ "# Fallback search.\n",
160
+ "if reviews_path is None:\n",
161
+ " review_matches = [\n",
162
+ " p for p in csv_paths\n",
163
+ " if (\"clothing\" in p.name.lower()) or (\"review\" in p.name.lower() and \"return\" not in p.name.lower())\n",
164
+ " ]\n",
165
+ " reviews_path = review_matches[0] if review_matches else None\n",
166
+ "\n",
167
+ "if returns_path is None:\n",
168
+ " return_matches = [\n",
169
+ " p for p in csv_paths\n",
170
+ " if \"return\" in p.name.lower()\n",
171
+ " ]\n",
172
+ " returns_path = return_matches[0] if return_matches else None\n",
173
+ "\n",
174
+ "\n",
175
+ "if returns_path is None:\n",
176
+ " raise FileNotFoundError(\"Step 2 could not find the ecommerce returns CSV.\")\n",
177
+ "\n",
178
+ "print(\"Using reviews file:\", reviews_path)\n",
179
+ "print(\"Using returns file:\", returns_path)\n",
180
+ "\n",
181
+ "reviews_df = pd.read_csv(reviews_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
182
+ "returns_df = pd.read_csv(returns_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
183
+ "\n",
184
+ "print(\"Reviews shape:\", reviews_df.shape)\n",
185
+ "print(\"Returns shape:\", returns_df.shape)\n",
186
+ "print(\"Reviews columns:\", reviews_df.columns.tolist())\n",
187
+ "print(\"Returns columns:\", returns_df.columns.tolist())\n",
188
+ "\n",
189
+ "# Basic cleanup / type safety\n",
190
+ "for col in [\"Age\", \"Rating\", \"Recommended IND\", \"Positive Feedback Count\"]:\n",
191
+ " if col in reviews_df.columns:\n",
192
+ " reviews_df[col] = pd.to_numeric(reviews_df[col], errors=\"coerce\")\n",
193
+ "\n",
194
+ "if \"Review Text\" in reviews_df.columns:\n",
195
+ " reviews_df[\"Review Text\"] = reviews_df[\"Review Text\"].fillna(\"\").astype(str)\n",
196
+ "\n",
197
+ "if \"Class Name\" in reviews_df.columns:\n",
198
+ " reviews_df[\"Class Name\"] = reviews_df[\"Class Name\"].fillna(\"Unknown\").astype(str)\n",
199
+ "\n",
200
+ "for col in [\"review_score\", \"likely_return\", \"price\", \"freight_value\", \"delivery_delay_days\", \"synthetic_return_risk\"]:\n",
201
+ " if col in returns_df.columns:\n",
202
+ " returns_df[col] = pd.to_numeric(returns_df[col], errors=\"coerce\")\n",
203
+ "\n",
204
+ "print(\"Data loaded and cleaned.\")"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 6,
210
+ "id": "f9eb3801",
211
+ "metadata": {
212
+ "id": "f9eb3801"
213
+ },
214
+ "outputs": [],
215
+ "source": [
216
+ "# ==================================================\n",
217
+ "# HELPERS: save artifacts where the app can find them\n",
218
+ "# ==================================================\n",
219
+ "# ==================================================\n",
220
+ "# HELPERS: save artifacts everywhere the app may check\n",
221
+ "# ==================================================\n",
222
+ "\n",
223
+ "def safe_write_csv(df, path):\n",
224
+ " try:\n",
225
+ " df.to_csv(path)\n",
226
+ " return True\n",
227
+ " except Exception as e:\n",
228
+ " print(f\"Could not save {path}: {e}\")\n",
229
+ " return False\n",
230
+ "\n",
231
+ "\n",
232
+ "def safe_savefig(path):\n",
233
+ " try:\n",
234
+ " plt.savefig(path, dpi=150, bbox_inches=\"tight\")\n",
235
+ " return True\n",
236
+ " except Exception as e:\n",
237
+ " print(f\"Could not save {path}: {e}\")\n",
238
+ " return False\n",
239
+ "\n",
240
+ "\n",
241
+ "def safe_write_text(text, path):\n",
242
+ " try:\n",
243
+ " path.write_text(text, encoding=\"utf-8\")\n",
244
+ " return True\n",
245
+ " except Exception as e:\n",
246
+ " print(f\"Could not save {path}: {e}\")\n",
247
+ " return False\n",
248
+ "\n",
249
+ "\n",
250
+ "def save_table(df, name):\n",
251
+ " if isinstance(df, pd.Series):\n",
252
+ " df = df.to_frame()\n",
253
+ "\n",
254
+ " table_folders = [\n",
255
+ " TABLES,\n",
256
+ " OUTPUT_TABLES,\n",
257
+ " OUTPUTS,\n",
258
+ " ARTIFACT_TABLES,\n",
259
+ " ARTIFACTS,\n",
260
+ " ]\n",
261
+ "\n",
262
+ " saved_anywhere = False\n",
263
+ "\n",
264
+ " for folder in table_folders:\n",
265
+ " folder.mkdir(parents=True, exist_ok=True)\n",
266
+ " path = folder / f\"{name}.csv\"\n",
267
+ " saved_anywhere = safe_write_csv(df, path) or saved_anywhere\n",
268
+ "\n",
269
+ " if saved_anywhere:\n",
270
+ " print(f\"Saved table everywhere: {name}.csv\")\n",
271
+ " else:\n",
272
+ " raise RuntimeError(f\"Could not save table {name}.csv\")\n",
273
+ "\n",
274
+ "\n",
275
+ "def save_figure(name):\n",
276
+ " figure_folders = [\n",
277
+ " FIGURES,\n",
278
+ " OUTPUT_FIGURES,\n",
279
+ " OUTPUTS,\n",
280
+ " ARTIFACT_FIGURES,\n",
281
+ " ARTIFACTS,\n",
282
+ " ]\n",
283
+ "\n",
284
+ " saved_anywhere = False\n",
285
+ "\n",
286
+ " for folder in figure_folders:\n",
287
+ " folder.mkdir(parents=True, exist_ok=True)\n",
288
+ " path = folder / f\"{name}.png\"\n",
289
+ " saved_anywhere = safe_savefig(path) or saved_anywhere\n",
290
+ "\n",
291
+ " if saved_anywhere:\n",
292
+ " print(f\"Saved figure everywhere: {name}.png\")\n",
293
+ " else:\n",
294
+ " raise RuntimeError(f\"Could not save figure {name}.png\")\n",
295
+ "\n",
296
+ "\n",
297
+ "def save_text(text, name):\n",
298
+ " text_folders = [\n",
299
+ " TABLES,\n",
300
+ " OUTPUT_TABLES,\n",
301
+ " OUTPUTS,\n",
302
+ " ARTIFACT_TABLES,\n",
303
+ " ARTIFACTS,\n",
304
+ " ]\n",
305
+ "\n",
306
+ " saved_anywhere = False\n",
307
+ "\n",
308
+ " for folder in text_folders:\n",
309
+ " folder.mkdir(parents=True, exist_ok=True)\n",
310
+ " path = folder / f\"{name}.txt\"\n",
311
+ " saved_anywhere = safe_write_text(text, path) or saved_anywhere\n",
312
+ "\n",
313
+ " if saved_anywhere:\n",
314
+ " print(f\"Saved text everywhere: {name}.txt\")\n",
315
+ " else:\n",
316
+ " raise RuntimeError(f\"Could not save text {name}.txt\")"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 7,
322
+ "id": "a99949ac",
323
+ "metadata": {
324
+ "colab": {
325
+ "base_uri": "https://localhost:8080/"
326
+ },
327
+ "id": "a99949ac",
328
+ "outputId": "33b9f5b0-67b0-4a44-8eef-b572cb8f7492"
329
+ },
330
+ "outputs": [
331
+ {
332
+ "output_type": "stream",
333
+ "name": "stdout",
334
+ "text": [
335
+ "Saved table everywhere: rating_distribution.csv\n",
336
+ "Saved figure everywhere: rating_distribution.png\n",
337
+ "Saved table everywhere: recommendation_by_class.csv\n",
338
+ "Saved figure everywhere: recommendation_by_class.png\n",
339
+ "Saved table everywhere: average_rating_by_age.csv\n",
340
+ "Saved figure everywhere: average_rating_by_age.png\n",
341
+ "Saved table everywhere: negative_keyword_counts.csv\n",
342
+ "Saved figure everywhere: negative_keyword_counts.png\n",
343
+ "Saved table everywhere: category_return_rate.csv\n",
344
+ "Saved figure everywhere: category_return_rate.png\n",
345
+ "Saved table everywhere: monthly_return_rate.csv\n",
346
+ "Saved figure everywhere: monthly_return_rate.png\n",
347
+ "Saved table everywhere: feature_importance.csv\n",
348
+ "Saved figure everywhere: feature_importance.png\n",
349
+ "Saved text everywhere: classification_report.txt\n",
350
+ "Artifact creation section finished.\n"
351
+ ]
352
+ }
353
+ ],
354
+ "source": [
355
+ "# ==================================================\n",
356
+ "# CREATE DASHBOARD ARTIFACTS\n",
357
+ "# ==================================================\n",
358
+ "\n",
359
+ "created_figures = []\n",
360
+ "created_tables = []\n",
361
+ "\n",
362
+ "# 1) Rating distribution\n",
363
+ "if \"Rating\" in reviews_df.columns:\n",
364
+ " rating_distribution = reviews_df[\"Rating\"].dropna().value_counts().sort_index().to_frame(\"count\")\n",
365
+ " save_table(rating_distribution, \"rating_distribution\")\n",
366
+ " created_tables.append(\"rating_distribution.csv\")\n",
367
+ "\n",
368
+ " plt.figure(figsize=(7, 4))\n",
369
+ " plt.bar(rating_distribution.index.astype(str), rating_distribution[\"count\"])\n",
370
+ " plt.title(\"Distribution of Customer Ratings\")\n",
371
+ " plt.xlabel(\"Rating\")\n",
372
+ " plt.ylabel(\"Number of Reviews\")\n",
373
+ " plt.tight_layout()\n",
374
+ " save_figure(\"rating_distribution\")\n",
375
+ " created_figures.append(\"rating_distribution.png\")\n",
376
+ " plt.close()\n",
377
+ "\n",
378
+ "# 2) Recommendation rate by clothing class\n",
379
+ "if {\"Class Name\", \"Recommended IND\"}.issubset(reviews_df.columns):\n",
380
+ " recommendation_by_class = (\n",
381
+ " reviews_df.groupby(\"Class Name\")[\"Recommended IND\"]\n",
382
+ " .mean()\n",
383
+ " .sort_values(ascending=False)\n",
384
+ " .head(10)\n",
385
+ " .to_frame(\"recommendation_rate\")\n",
386
+ " )\n",
387
+ " save_table(recommendation_by_class, \"recommendation_by_class\")\n",
388
+ " created_tables.append(\"recommendation_by_class.csv\")\n",
389
+ "\n",
390
+ " plt.figure(figsize=(10, 5))\n",
391
+ " plt.bar(recommendation_by_class.index.astype(str), recommendation_by_class[\"recommendation_rate\"])\n",
392
+ " plt.title(\"Top 10 Most Recommended Clothing Classes\")\n",
393
+ " plt.xlabel(\"Class Name\")\n",
394
+ " plt.ylabel(\"Recommendation Rate\")\n",
395
+ " plt.xticks(rotation=75)\n",
396
+ " plt.tight_layout()\n",
397
+ " save_figure(\"recommendation_by_class\")\n",
398
+ " created_figures.append(\"recommendation_by_class.png\")\n",
399
+ " plt.close()\n",
400
+ "\n",
401
+ "# 3) Average rating by age\n",
402
+ "if {\"Age\", \"Rating\"}.issubset(reviews_df.columns):\n",
403
+ " average_rating_by_age = (\n",
404
+ " reviews_df.groupby(\"Age\")[\"Rating\"]\n",
405
+ " .mean()\n",
406
+ " .dropna()\n",
407
+ " .to_frame(\"average_rating\")\n",
408
+ " )\n",
409
+ " save_table(average_rating_by_age, \"average_rating_by_age\")\n",
410
+ " created_tables.append(\"average_rating_by_age.csv\")\n",
411
+ "\n",
412
+ " plt.figure(figsize=(10, 4))\n",
413
+ " plt.plot(average_rating_by_age.index, average_rating_by_age[\"average_rating\"])\n",
414
+ " plt.title(\"Average Rating by Customer Age\")\n",
415
+ " plt.xlabel(\"Age\")\n",
416
+ " plt.ylabel(\"Average Rating\")\n",
417
+ " plt.tight_layout()\n",
418
+ " save_figure(\"average_rating_by_age\")\n",
419
+ " created_figures.append(\"average_rating_by_age.png\")\n",
420
+ " plt.close()\n",
421
+ "\n",
422
+ "# 4) Complaint / return-risk keyword counts (plain substring matches, so \"return\" also counts \"returned\")\n",
423
+ "review_text_column = None\n",
424
+ "for candidate in [\"Review Text\", \"review_text\", \"review_comment_message\"]:\n",
425
+ " if candidate in reviews_df.columns:\n",
426
+ " review_text_column = candidate\n",
427
+ " break\n",
428
+ "\n",
429
+ "if review_text_column is not None:\n",
430
+ " keywords = [\n",
431
+ " \"bad\", \"poor\", \"cheap\", \"small\", \"large\", \"tight\", \"loose\",\n",
432
+ " \"scratchy\", \"thin\", \"return\", \"returned\", \"disappointed\",\n",
433
+ " \"quality\", \"fit\", \"sizing\", \"fabric\", \"uncomfortable\"\n",
434
+ " ]\n",
435
+ " text_series = reviews_df[review_text_column].fillna(\"\").astype(str).str.lower()\n",
436
+ " keyword_counts = {}\n",
437
+ " for word in keywords:\n",
438
+ " keyword_counts[word] = int(text_series.str.contains(word, regex=False).sum())\n",
439
+ "\n",
440
+ " negative_keyword_counts = (\n",
441
+ " pd.DataFrame(keyword_counts.items(), columns=[\"keyword\", \"review_count\"])\n",
442
+ " .sort_values(\"review_count\", ascending=False)\n",
443
+ " .set_index(\"keyword\")\n",
444
+ " )\n",
445
+ " save_table(negative_keyword_counts, \"negative_keyword_counts\")\n",
446
+ " created_tables.append(\"negative_keyword_counts.csv\")\n",
447
+ "\n",
448
+ " top_keywords = negative_keyword_counts.head(10)\n",
449
+ " plt.figure(figsize=(9, 4))\n",
450
+ " plt.bar(top_keywords.index.astype(str), top_keywords[\"review_count\"])\n",
451
+ " plt.title(\"Most Common Return-Risk Keywords in Reviews\")\n",
452
+ " plt.xlabel(\"Keyword\")\n",
453
+ " plt.ylabel(\"Number of Reviews\")\n",
454
+ " plt.xticks(rotation=45)\n",
455
+ " plt.tight_layout()\n",
456
+ " save_figure(\"negative_keyword_counts\")\n",
457
+ " created_figures.append(\"negative_keyword_counts.png\")\n",
458
+ " plt.close()\n",
459
+ "\n",
460
+ "# 5) Product category return rate\n",
461
+ "if {\"product_category_name\", \"likely_return\"}.issubset(returns_df.columns):\n",
462
+ " category_return_rate = (\n",
463
+ " returns_df.groupby(\"product_category_name\")[\"likely_return\"]\n",
464
+ " .mean()\n",
465
+ " .sort_values(ascending=False)\n",
466
+ " .head(15)\n",
467
+ " .to_frame(\"return_rate\")\n",
468
+ " )\n",
469
+ " save_table(category_return_rate, \"category_return_rate\")\n",
470
+ " created_tables.append(\"category_return_rate.csv\")\n",
471
+ "\n",
472
+ " plt.figure(figsize=(11, 5))\n",
473
+ " plt.bar(category_return_rate.index.astype(str), category_return_rate[\"return_rate\"])\n",
474
+ " plt.title(\"Top Product Categories by Estimated Return Rate\")\n",
475
+ " plt.xlabel(\"Product Category\")\n",
476
+ " plt.ylabel(\"Return Rate\")\n",
477
+ " plt.xticks(rotation=75)\n",
478
+ " plt.tight_layout()\n",
479
+ " save_figure(\"category_return_rate\")\n",
480
+ " created_figures.append(\"category_return_rate.png\")\n",
481
+ " plt.close()\n",
482
+ "\n",
483
+ "# 6) Monthly return rate\n",
484
+ "if {\"order_purchase_timestamp\", \"likely_return\"}.issubset(returns_df.columns):\n",
485
+ " monthly_df = returns_df.copy()\n",
486
+ " monthly_df[\"order_purchase_timestamp\"] = pd.to_datetime(monthly_df[\"order_purchase_timestamp\"], errors=\"coerce\")\n",
487
+ " monthly_df = monthly_df.dropna(subset=[\"order_purchase_timestamp\"])\n",
488
+ "\n",
489
+ " if len(monthly_df) > 0:\n",
490
+ " monthly_return_rate = (\n",
491
+ " monthly_df.set_index(\"order_purchase_timestamp\")\n",
492
+ " .resample(\"M\")[\"likely_return\"]\n",
493
+ " .mean()\n",
494
+ " .dropna()\n",
495
+ " .to_frame(\"return_rate\")\n",
496
+ " )\n",
497
+ " save_table(monthly_return_rate, \"monthly_return_rate\")\n",
498
+ " created_tables.append(\"monthly_return_rate.csv\")\n",
499
+ "\n",
500
+ " plt.figure(figsize=(10, 4))\n",
501
+ " plt.plot(monthly_return_rate.index, monthly_return_rate[\"return_rate\"])\n",
502
+ " plt.title(\"Monthly Estimated Return Rate\")\n",
503
+ " plt.xlabel(\"Month\")\n",
504
+ " plt.ylabel(\"Return Rate\")\n",
505
+ " plt.tight_layout()\n",
506
+ " save_figure(\"monthly_return_rate\")\n",
507
+ " created_figures.append(\"monthly_return_rate.png\")\n",
508
+ " plt.close()\n",
509
+ "\n",
510
+ "# 7) Simple feature importance if sklearn is available\n",
511
+ "try:\n",
512
+ " from sklearn.ensemble import RandomForestClassifier\n",
513
+ " from sklearn.model_selection import train_test_split\n",
514
+ " from sklearn.metrics import accuracy_score, classification_report\n",
515
+ "\n",
516
+ " feature_columns = [c for c in [\"Age\", \"Rating\", \"Positive Feedback Count\"] if c in reviews_df.columns]\n",
517
+ " if \"Recommended IND\" in reviews_df.columns and len(feature_columns) > 0:\n",
518
+ " model_df = reviews_df[feature_columns + [\"Recommended IND\"]].dropna().copy()\n",
519
+ " if model_df[\"Recommended IND\"].nunique() >= 2:\n",
520
+ " X = model_df[feature_columns]\n",
521
+ " y = model_df[\"Recommended IND\"].astype(int)\n",
522
+ " X_train, X_test, y_train, y_test = train_test_split(\n",
523
+ " X, y, test_size=0.2, random_state=42, stratify=y\n",
524
+ " )\n",
525
+ "\n",
526
+ " clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
527
+ " clf.fit(X_train, y_train)\n",
528
+ " predictions = clf.predict(X_test)\n",
529
+ " accuracy = accuracy_score(y_test, predictions)\n",
530
+ "\n",
531
+ " feature_importance = (\n",
532
+ " pd.Series(clf.feature_importances_, index=feature_columns)\n",
533
+ " .sort_values(ascending=False)\n",
534
+ " .to_frame(\"importance\")\n",
535
+ " )\n",
536
+ " save_table(feature_importance, \"feature_importance\")\n",
537
+ " created_tables.append(\"feature_importance.csv\")\n",
538
+ "\n",
539
+ " plt.figure(figsize=(7, 4))\n",
540
+ " plt.bar(feature_importance.index.astype(str), feature_importance[\"importance\"])\n",
541
+ " plt.title(\"Feature Importance for Recommendation Prediction\")\n",
542
+ " plt.xlabel(\"Feature\")\n",
543
+ " plt.ylabel(\"Importance\")\n",
544
+ " plt.tight_layout()\n",
545
+ " save_figure(\"feature_importance\")\n",
546
+ " created_figures.append(\"feature_importance.png\")\n",
547
+ " plt.close()\n",
548
+ "\n",
549
+ " report = \"Model accuracy: {:.4f}\\n\\n{}\".format(\n",
550
+ " accuracy,\n",
551
+ " classification_report(y_test, predictions)\n",
552
+ " )\n",
553
+ " save_text(report, \"classification_report\")\n",
554
+ "except Exception as e:\n",
555
+ " print(\"ML section skipped:\", repr(e))\n",
556
+ "\n",
557
+ "print(\"Artifact creation section finished.\")"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": 8,
563
+ "id": "c4bbc916",
564
+ "metadata": {
565
+ "colab": {
566
+ "base_uri": "https://localhost:8080/"
567
+ },
568
+ "id": "c4bbc916",
569
+ "outputId": "1dc63b01-ed81-47cd-cf56-3e193b2f87f2"
570
+ },
571
+ "outputs": [
572
+ {
573
+ "output_type": "stream",
574
+ "name": "stdout",
575
+ "text": [
576
+ "Saved table everywhere: dashboard_summary.csv\n",
577
+ "Saved text everywhere: business_insights_report.txt\n",
578
+ "STEP 2 COMPLETE.\n",
579
+ "Figures: ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n",
580
+ "Tables: ['average_rating_by_age.csv', 'category_return_rate.csv', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n",
581
+ "Outputs: ['average_rating_by_age.csv', 'average_rating_by_age.png', 'business_insights_report.txt', 'category_return_rate.csv', 'category_return_rate.png', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'feature_importance.png', 'monthly_return_rate.csv', 'monthly_return_rate.png', 'negative_keyword_counts.csv', 'negative_keyword_counts.png', 'rating_distribution.csv', 'rating_distribution.png', 'recommendation_by_class.csv', 'recommendation_by_class.png']\n"
582
+ ]
583
+ }
584
+ ],
585
+ "source": [
586
+ "# ==================================================\n",
587
+ "# FINAL REPORT + MANIFEST\n",
588
+ "# ==================================================\n",
589
+ "\n",
590
+ "summary_rows = [\n",
591
+ " {\"metric\": \"reviews_rows\", \"value\": int(len(reviews_df))},\n",
592
+ " {\"metric\": \"returns_rows\", \"value\": int(len(returns_df))},\n",
593
+ " {\"metric\": \"figures_created\", \"value\": int(len(list(FIGURES.glob(\"*.png\"))))},\n",
594
+ " {\"metric\": \"tables_created\", \"value\": int(len(list(TABLES.glob(\"*.csv\"))))},\n",
595
+ "]\n",
596
+ "\n",
597
+ "summary_df = pd.DataFrame(summary_rows).set_index(\"metric\")\n",
598
+ "save_table(summary_df, \"dashboard_summary\")\n",
599
+ "\n",
600
+ "insights = \"\"\"\n",
601
+ "FINAL BUSINESS INSIGHTS\n",
602
+ "=======================\n",
603
+ "\n",
604
+ "This analysis supports an e-commerce return prediction and review intelligence assistant.\n",
605
+ "\n",
606
+ "Main findings:\n",
607
+ "- Customer ratings and recommendation behavior are useful signals for product satisfaction.\n",
608
+ "- Review text reveals return-risk themes such as fit, sizing, fabric, quality, and discomfort.\n",
609
+ "- Product categories with higher estimated return rates should be prioritized for improvement.\n",
610
+ "- Monthly return-rate tracking can help the business monitor operational or seasonal changes.\n",
611
+ "\n",
612
+ "Recommended automations:\n",
613
+ "1. Automatically scan new reviews for return-risk keywords.\n",
614
+ "2. Automatically rank products and categories by estimated return risk.\n",
615
+ "3. Automatically generate business recommendations for product pages, sizing guidance, and quality control.\n",
616
+ "\"\"\"\n",
617
+ "\n",
618
+ "save_text(insights, \"business_insights_report\")\n",
619
+ "\n",
620
+ "manifest = {\n",
621
+ " \"base_path\": str(BASE_PATH),\n",
622
+ " \"figures\": sorted([p.name for p in FIGURES.glob(\"*.png\")]),\n",
623
+ " \"tables\": sorted([p.name for p in TABLES.glob(\"*.csv\")]),\n",
624
+ " \"outputs\": sorted([p.name for p in OUTPUTS.iterdir() if p.is_file()]),\n",
625
+ "}\n",
626
+ "\n",
627
+ "for folder in [OUTPUTS, ARTIFACTS, TABLES]:\n",
628
+ " try:\n",
629
+ " with open(folder / \"artifacts_manifest.json\", \"w\", encoding=\"utf-8\") as f:\n",
630
+ " json.dump(manifest, f, indent=2)\n",
631
+ " except Exception as e:\n",
632
+ " print(f\"Could not save manifest in {folder}: {e}\")\n",
633
+ "\n",
634
+ "print(\"STEP 2 COMPLETE.\")\n",
635
+ "print(\"Figures:\", manifest[\"figures\"])\n",
636
+ "print(\"Tables:\", manifest[\"tables\"])\n",
637
+ "print(\"Outputs:\", manifest[\"outputs\"])"
638
+ ]
639
+ },
640
+ {
641
+ "cell_type": "code",
642
+ "source": [
643
+ "print(\"\\nFINAL ARTIFACT CHECK\")\n",
644
+ "\n",
645
+ "check_dirs = {\n",
646
+ " \"FIGURES\": FIGURES,\n",
647
+ " \"TABLES\": TABLES,\n",
648
+ " \"OUTPUTS\": OUTPUTS,\n",
649
+ " \"OUTPUT_FIGURES\": OUTPUT_FIGURES,\n",
650
+ " \"OUTPUT_TABLES\": OUTPUT_TABLES,\n",
651
+ " \"ARTIFACTS\": ARTIFACTS,\n",
652
+ " \"ARTIFACT_FIGURES\": ARTIFACT_FIGURES,\n",
653
+ " \"ARTIFACT_TABLES\": ARTIFACT_TABLES,\n",
654
+ "}\n",
655
+ "\n",
656
+ "for label, folder in check_dirs.items():\n",
657
+ " files = sorted([p.name for p in folder.iterdir() if p.is_file()])\n",
658
+ " print(label, \"=\", files)"
659
+ ],
660
+ "metadata": {
661
+ "colab": {
662
+ "base_uri": "https://localhost:8080/"
663
+ },
664
+ "id": "fexa62gDM2c7",
665
+ "outputId": "e84626f3-e126-43f8-a408-665ccd7eb914"
666
+ },
667
+ "id": "fexa62gDM2c7",
668
+ "execution_count": 9,
669
+ "outputs": [
670
+ {
671
+ "output_type": "stream",
672
+ "name": "stdout",
673
+ "text": [
674
+ "\n",
675
+ "FINAL ARTIFACT CHECK\n",
676
+ "FIGURES = ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n",
677
+ "TABLES = ['artifacts_manifest.json', 'average_rating_by_age.csv', 'business_insights_report.txt', 'category_return_rate.csv', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n",
678
+ "OUTPUTS = ['artifacts_manifest.json', 'average_rating_by_age.csv', 'average_rating_by_age.png', 'business_insights_report.txt', 'category_return_rate.csv', 'category_return_rate.png', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'feature_importance.png', 'monthly_return_rate.csv', 'monthly_return_rate.png', 'negative_keyword_counts.csv', 'negative_keyword_counts.png', 'rating_distribution.csv', 'rating_distribution.png', 'recommendation_by_class.csv', 'recommendation_by_class.png']\n",
679
+ "OUTPUT_FIGURES = ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n",
680
+ "OUTPUT_TABLES = ['average_rating_by_age.csv', 'business_insights_report.txt', 'category_return_rate.csv', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n",
681
+ "ARTIFACTS = ['artifacts_manifest.json', 'average_rating_by_age.csv', 'average_rating_by_age.png', 'business_insights_report.txt', 'category_return_rate.csv', 'category_return_rate.png', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'feature_importance.png', 'monthly_return_rate.csv', 'monthly_return_rate.png', 'negative_keyword_counts.csv', 'negative_keyword_counts.png', 'rating_distribution.csv', 'rating_distribution.png', 'recommendation_by_class.csv', 'recommendation_by_class.png']\n",
682
+ "ARTIFACT_FIGURES = ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n",
683
+ "ARTIFACT_TABLES = ['average_rating_by_age.csv', 'business_insights_report.txt', 'category_return_rate.csv', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n"
684
+ ]
685
+ }
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "source": [
691
+ "# ==================================================\n",
692
+ "# FORCE DASHBOARD ARTIFACTS FOR SE21 HUGGING FACE APP\n",
693
+ "# Put this as the VERY LAST CELL of pythonanalysis.ipynb\n",
694
+ "# ==================================================\n",
695
+ "\n",
696
+ "import os\n",
697
+ "import json\n",
698
+ "from pathlib import Path\n",
699
+ "\n",
700
+ "import pandas as pd\n",
701
+ "import numpy as np\n",
702
+ "\n",
703
+ "import matplotlib\n",
704
+ "matplotlib.use(\"Agg\")\n",
705
+ "import matplotlib.pyplot as plt\n",
706
+ "\n",
707
+ "# Detect runtime: /app (Docker image WORKDIR), /content (Colab), otherwise the current working directory\n",
708
+ "if Path(\"/app\").exists():\n",
709
+ " BASE_PATH = Path(\"/app\")\n",
710
+ "elif Path(\"/content\").exists():\n",
711
+ " BASE_PATH = Path(\"/content\")\n",
712
+ "else:\n",
713
+ " BASE_PATH = Path.cwd()\n",
714
+ "\n",
715
+ "# THESE ARE THE EXACT FOLDERS app.py READS\n",
716
+ "PY_FIG_DIR = BASE_PATH / \"artifacts\" / \"py\" / \"figures\"\n",
717
+ "PY_TAB_DIR = BASE_PATH / \"artifacts\" / \"py\" / \"tables\"\n",
718
+ "\n",
719
+ "PY_FIG_DIR.mkdir(parents=True, exist_ok=True)\n",
720
+ "PY_TAB_DIR.mkdir(parents=True, exist_ok=True)\n",
721
+ "\n",
722
+ "print(\"Saving dashboard artifacts to:\")\n",
723
+ "print(\"Figures:\", PY_FIG_DIR)\n",
724
+ "print(\"Tables:\", PY_TAB_DIR)\n",
725
+ "\n",
726
+ "# Find CSV files\n",
727
+ "csv_paths = [\n",
728
+ " p for p in BASE_PATH.rglob(\"*.csv\")\n",
729
+ " if \"sample_data\" not in str(p)\n",
730
+ " and \"artifacts\" not in str(p)\n",
731
+ " and \"outputs\" not in str(p)\n",
732
+ " and \"figures\" not in str(p)\n",
733
+ " and \"tables\" not in str(p)\n",
734
+ "]\n",
735
+ "\n",
736
+ "print(\"CSV files found:\")\n",
737
+ "for p in csv_paths:\n",
738
+ " print(\"-\", p)\n",
739
+ "\n",
740
+ "# Find reviews dataset\n",
741
+ "reviews_candidates = [\n",
742
+ " BASE_PATH / \"data_processed\" / \"reviews_cleaned.csv\",\n",
743
+ " BASE_PATH / \"Womens Clothing E-Commerce Reviews.csv\",\n",
744
+ "]\n",
745
+ "\n",
746
+ "reviews_path = next((p for p in reviews_candidates if p.exists()), None)\n",
747
+ "\n",
748
+ "if reviews_path is None:\n",
749
+ " matches = [\n",
750
+ " p for p in csv_paths\n",
751
+ " if \"clothing\" in p.name.lower() or \"review\" in p.name.lower()\n",
752
+ " ]\n",
753
+ " reviews_path = matches[0] if matches else None\n",
754
+ "\n",
755
+ "# Find returns dataset\n",
756
+ "returns_candidates = [\n",
757
+ " BASE_PATH / \"data_processed\" / \"returns_input.csv\",\n",
758
+ " BASE_PATH / \"data_processed\" / \"returns_cleaned.csv\",\n",
759
+ " BASE_PATH / \"ecommerce_returns_cleaned.csv\",\n",
760
+ " BASE_PATH / \"data_processed\" / \"synthetic_return_risk.csv\",\n",
761
+ "]\n",
762
+ "\n",
763
+ "returns_path = next((p for p in returns_candidates if p.exists()), None)\n",
764
+ "\n",
765
+ "if returns_path is None:\n",
766
+ " matches = [\n",
767
+ " p for p in csv_paths\n",
768
+ " if \"return\" in p.name.lower()\n",
769
+ " ]\n",
770
+ " returns_path = matches[0] if matches else None\n",
771
+ "\n",
772
+ "if reviews_path is None:\n",
773
+ " raise FileNotFoundError(\"Could not find reviews CSV.\")\n",
774
+ "\n",
775
+ "if returns_path is None:\n",
776
+ " raise FileNotFoundError(\"Could not find returns CSV.\")\n",
777
+ "\n",
778
+ "print(\"Using reviews:\", reviews_path)\n",
779
+ "print(\"Using returns:\", returns_path)\n",
780
+ "\n",
781
+ "reviews_df = pd.read_csv(reviews_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
782
+ "returns_df = pd.read_csv(returns_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
783
+ "\n",
784
+ "print(\"Reviews shape:\", reviews_df.shape)\n",
785
+ "print(\"Returns shape:\", returns_df.shape)\n",
786
+ "\n",
787
+ "# --------------------------------------------------\n",
788
+ "# 1. Rating distribution\n",
789
+ "# --------------------------------------------------\n",
790
+ "if \"Rating\" in reviews_df.columns:\n",
791
+ " rating_distribution = (\n",
792
+ " reviews_df[\"Rating\"]\n",
793
+ " .dropna()\n",
794
+ " .value_counts()\n",
795
+ " .sort_index()\n",
796
+ " .reset_index()\n",
797
+ " )\n",
798
+ " rating_distribution.columns = [\"rating\", \"count\"]\n",
799
+ "\n",
800
+ " rating_distribution.to_csv(PY_TAB_DIR / \"rating_distribution.csv\", index=False)\n",
801
+ "\n",
802
+ " plt.figure(figsize=(7, 4))\n",
803
+ " plt.bar(rating_distribution[\"rating\"].astype(str), rating_distribution[\"count\"])\n",
804
+ " plt.title(\"Distribution of Customer Ratings\")\n",
805
+ " plt.xlabel(\"Rating\")\n",
806
+ " plt.ylabel(\"Number of Reviews\")\n",
807
+ " plt.tight_layout()\n",
808
+ " plt.savefig(PY_FIG_DIR / \"rating_distribution.png\", dpi=150, bbox_inches=\"tight\")\n",
809
+ " plt.close()\n",
810
+ "\n",
811
+ "# --------------------------------------------------\n",
812
+ "# 2. Sentiment counts for app's sentiment chart\n",
813
+ "# The app specifically looks for sentiment_counts_sampled.csv\n",
814
+ "# --------------------------------------------------\n",
815
+ "if \"Rating\" in reviews_df.columns:\n",
816
+ " temp = reviews_df.copy()\n",
817
+ "\n",
818
+ " def rating_to_sentiment(r):\n",
819
+ " try:\n",
820
+ " r = float(r)\n",
821
+ " if r <= 2:\n",
822
+ " return \"negative\"\n",
823
+ " elif r == 3:\n",
824
+ " return \"neutral\"\n",
825
+ " else:\n",
826
+ " return \"positive\"\n",
827
+ " except:\n",
828
+ " return \"neutral\"\n",
829
+ "\n",
830
+ " temp[\"sentiment\"] = temp[\"Rating\"].apply(rating_to_sentiment)\n",
831
+ "\n",
832
+ " group_col = \"Class Name\" if \"Class Name\" in temp.columns else None\n",
833
+ "\n",
834
+ " if group_col:\n",
835
+ " sentiment_counts = (\n",
836
+ " temp.groupby([group_col, \"sentiment\"])\n",
837
+ " .size()\n",
838
+ " .unstack(fill_value=0)\n",
839
+ " .reset_index()\n",
840
+ " .head(15)\n",
841
+ " )\n",
842
+ " sentiment_counts = sentiment_counts.rename(columns={group_col: \"title\"})\n",
843
+ " else:\n",
844
+ " sentiment_counts = (\n",
845
+ " temp[\"sentiment\"]\n",
846
+ " .value_counts()\n",
847
+ " .to_frame()\n",
848
+ " .T\n",
849
+ " .reset_index(drop=True)\n",
850
+ " )\n",
851
+ " sentiment_counts.insert(0, \"title\", \"All Reviews\")\n",
852
+ "\n",
853
+ " for col in [\"negative\", \"neutral\", \"positive\"]:\n",
854
+ " if col not in sentiment_counts.columns:\n",
855
+ " sentiment_counts[col] = 0\n",
856
+ "\n",
857
+ " sentiment_counts[[\"title\", \"negative\", \"neutral\", \"positive\"]].to_csv(\n",
858
+ " PY_TAB_DIR / \"sentiment_counts_sampled.csv\",\n",
859
+ " index=False\n",
860
+ " )\n",
861
+ "\n",
862
+ "    # Also save a bar chart of the overall sentiment counts\n",
863
+ " sentiment_total = temp[\"sentiment\"].value_counts().reindex(\n",
864
+ " [\"negative\", \"neutral\", \"positive\"],\n",
865
+ " fill_value=0\n",
866
+ " )\n",
867
+ "\n",
868
+ " plt.figure(figsize=(7, 4))\n",
869
+ " plt.bar(sentiment_total.index, sentiment_total.values)\n",
870
+ " plt.title(\"Review Sentiment Distribution\")\n",
871
+ " plt.xlabel(\"Sentiment\")\n",
872
+ " plt.ylabel(\"Number of Reviews\")\n",
873
+ " plt.tight_layout()\n",
874
+ " plt.savefig(PY_FIG_DIR / \"sentiment_distribution.png\", dpi=150, bbox_inches=\"tight\")\n",
875
+ " plt.close()\n",
876
+ "\n",
877
+ "# --------------------------------------------------\n",
878
+ "# 3. Category return rate\n",
879
+ "# --------------------------------------------------\n",
880
+ "return_col = None\n",
881
+ "for candidate in [\"likely_return\", \"synthetic_return_risk\", \"returned\", \"return_flag\"]:\n",
882
+ " if candidate in returns_df.columns:\n",
883
+ " return_col = candidate\n",
884
+ " break\n",
885
+ "\n",
886
+ "category_col = None\n",
887
+ "for candidate in [\"product_category_name\", \"category\", \"Class Name\", \"product_id\"]:\n",
888
+ " if candidate in returns_df.columns:\n",
889
+ " category_col = candidate\n",
890
+ " break\n",
891
+ "\n",
892
+ "if return_col is not None:\n",
893
+ " returns_df[return_col] = pd.to_numeric(returns_df[return_col], errors=\"coerce\")\n",
894
+ "\n",
895
+ "if return_col is not None and category_col is not None:\n",
896
+ " category_return_rate = (\n",
897
+ " returns_df.groupby(category_col)[return_col]\n",
898
+ " .mean()\n",
899
+ " .sort_values(ascending=False)\n",
900
+ " .head(15)\n",
901
+ " .reset_index()\n",
902
+ " )\n",
903
+ " category_return_rate.columns = [\"category\", \"return_rate\"]\n",
904
+ "\n",
905
+ " category_return_rate.to_csv(PY_TAB_DIR / \"category_return_rate.csv\", index=False)\n",
906
+ "\n",
907
+ " plt.figure(figsize=(11, 5))\n",
908
+ " plt.bar(category_return_rate[\"category\"].astype(str), category_return_rate[\"return_rate\"])\n",
909
+ " plt.title(\"Highest Return-Rate Categories\")\n",
910
+ " plt.xlabel(\"Category\")\n",
911
+ " plt.ylabel(\"Return Rate\")\n",
912
+ " plt.xticks(rotation=75)\n",
913
+ " plt.tight_layout()\n",
914
+ " plt.savefig(PY_FIG_DIR / \"category_return_rate.png\", dpi=150, bbox_inches=\"tight\")\n",
915
+ " plt.close()\n",
916
+ "\n",
917
+ "    # The template's AI fallback expects this filename for \"top\" questions.\n",
918
+ "    # We reuse it for the highest return-rate categories, so its \"units_sold\" column holds return rates, not unit counts.\n",
919
+ " top_titles_by_units_sold = category_return_rate.copy()\n",
920
+ " top_titles_by_units_sold.columns = [\"title\", \"units_sold\"]\n",
921
+ " top_titles_by_units_sold.to_csv(PY_TAB_DIR / \"top_titles_by_units_sold.csv\", index=False)\n",
922
+ "\n",
923
+ "# --------------------------------------------------\n",
924
+ "# 4. Dashboard time-series file\n",
925
+ "# The app's dashboard chart specifically looks for df_dashboard.csv\n",
926
+ "# --------------------------------------------------\n",
927
+ "if \"order_purchase_timestamp\" in returns_df.columns and return_col is not None:\n",
928
+ " ts = returns_df.copy()\n",
929
+ " ts[\"order_purchase_timestamp\"] = pd.to_datetime(\n",
930
+ " ts[\"order_purchase_timestamp\"],\n",
931
+ " errors=\"coerce\"\n",
932
+ " )\n",
933
+ " ts = ts.dropna(subset=[\"order_purchase_timestamp\"])\n",
934
+ "\n",
935
+ " if not ts.empty:\n",
936
+ " dashboard_df = (\n",
937
+ " ts.set_index(\"order_purchase_timestamp\")\n",
938
+ " .resample(\"M\")\n",
939
+ " .agg(\n",
940
+ " return_rate=(return_col, \"mean\"),\n",
941
+ " orders=(return_col, \"count\")\n",
942
+ " )\n",
943
+ " .reset_index()\n",
944
+ " )\n",
945
+ " dashboard_df = dashboard_df.rename(columns={\"order_purchase_timestamp\": \"month\"})\n",
946
+ " else:\n",
947
+ " dashboard_df = pd.DataFrame({\n",
948
+ " \"month\": pd.date_range(\"2024-01-01\", periods=3, freq=\"M\"),\n",
949
+ " \"return_rate\": [0, 0, 0],\n",
950
+ " \"orders\": [0, 0, 0],\n",
951
+ " })\n",
952
+ "else:\n",
953
+ " dashboard_df = pd.DataFrame({\n",
954
+ " \"month\": pd.date_range(\"2024-01-01\", periods=3, freq=\"M\"),\n",
955
+ " \"return_rate\": [0, 0, 0],\n",
956
+ " \"orders\": [0, 0, 0],\n",
957
+ " })\n",
958
+ "\n",
959
+ "dashboard_df.to_csv(PY_TAB_DIR / \"df_dashboard.csv\", index=False)\n",
960
+ "\n",
961
+ "plt.figure(figsize=(9, 4))\n",
962
+ "plt.plot(pd.to_datetime(dashboard_df[\"month\"]), dashboard_df[\"return_rate\"], marker=\"o\")\n",
963
+ "plt.title(\"Monthly Estimated Return Rate\")\n",
964
+ "plt.xlabel(\"Month\")\n",
965
+ "plt.ylabel(\"Return Rate\")\n",
966
+ "plt.tight_layout()\n",
967
+ "plt.savefig(PY_FIG_DIR / \"monthly_return_rate.png\", dpi=150, bbox_inches=\"tight\")\n",
968
+ "plt.close()\n",
969
+ "\n",
970
+ "# --------------------------------------------------\n",
971
+ "# 5. KPIs (note: \"total_units_sold\" is the number of rows in the returns data)\n",
972
+ "# --------------------------------------------------\n",
973
+ "kpis = {\n",
974
+ " \"reviews_rows\": int(len(reviews_df)),\n",
975
+ " \"returns_rows\": int(len(returns_df)),\n",
976
+ " \"n_titles\": int(reviews_df[\"Clothing ID\"].nunique()) if \"Clothing ID\" in reviews_df.columns else int(len(reviews_df)),\n",
977
+ " \"n_months\": int(len(dashboard_df)),\n",
978
+ " \"total_units_sold\": int(len(returns_df)),\n",
979
+ " \"estimated_return_rate\": float(returns_df[return_col].mean()) if return_col is not None else None,\n",
980
+ "}\n",
981
+ "\n",
982
+ "with open(PY_TAB_DIR / \"kpis.json\", \"w\", encoding=\"utf-8\") as f:\n",
983
+ " json.dump(kpis, f, indent=2)\n",
984
+ "\n",
985
+ "# --------------------------------------------------\n",
986
+ "# Final verification\n",
987
+ "# --------------------------------------------------\n",
988
+ "print(\"\\nFORCE ARTIFACT CELL RAN SUCCESSFULLY\")\n",
989
+ "print(\"Figures now in app-readable folder:\")\n",
990
+ "print(sorted([p.name for p in PY_FIG_DIR.glob(\"*\")]))\n",
991
+ "\n",
992
+ "print(\"Tables now in app-readable folder:\")\n",
993
+ "print(sorted([p.name for p in PY_TAB_DIR.glob(\"*\")]))"
994
+ ],
995
+ "metadata": {
996
+ "id": "G-jXRriWP1TW",
997
+ "outputId": "23349a23-0bdc-476f-fb72-8e388be9630c",
998
+ "colab": {
999
+ "base_uri": "https://localhost:8080/"
1000
+ }
1001
+ },
1002
+ "id": "G-jXRriWP1TW",
1003
+ "execution_count": 10,
1004
+ "outputs": [
1005
+ {
1006
+ "output_type": "stream",
1007
+ "name": "stdout",
1008
+ "text": [
1009
+ "Saving dashboard artifacts to:\n",
1010
+ "Figures: /content/artifacts/py/figures\n",
1011
+ "Tables: /content/artifacts/py/tables\n",
1012
+ "CSV files found:\n",
1013
+ "- /content/Womens Clothing E-Commerce Reviews.csv\n",
1014
+ "- /content/ecommerce_returns_cleaned.csv\n",
1015
+ "Using reviews: /content/Womens Clothing E-Commerce Reviews.csv\n",
1016
+ "Using returns: /content/ecommerce_returns_cleaned.csv\n",
1017
+ "Reviews shape: (23486, 10)\n",
1018
+ "Returns shape: (113314, 29)\n",
1019
+ "\n",
1020
+ "FORCE ARTIFACT CELL RAN SUCCESSFULLY\n",
1021
+ "Figures now in app-readable folder:\n",
1022
+ "['category_return_rate.png', 'monthly_return_rate.png', 'rating_distribution.png', 'sentiment_distribution.png']\n",
1023
+ "Tables now in app-readable folder:\n",
1024
+ "['category_return_rate.csv', 'df_dashboard.csv', 'kpis.json', 'rating_distribution.csv', 'sentiment_counts_sampled.csv', 'top_titles_by_units_sold.csv']\n"
1025
+ ]
1026
+ }
1027
+ ]
1028
+ }
1029
+ ],
1030
+ "metadata": {
1031
+ "kernelspec": {
1032
+ "display_name": "Python 3",
1033
+ "language": "python",
1034
+ "name": "python3"
1035
+ },
1036
+ "language_info": {
1037
+ "name": "python",
1038
+ "version": "3.10"
1039
+ },
1040
+ "colab": {
1041
+ "provenance": []
1042
+ }
1043
+ },
1044
+ "nbformat": 4,
1045
+ "nbformat_minor": 5
1046
+ }
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==6.0.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ matplotlib>=3.7.0
5
+ seaborn>=0.13.0
6
+ statsmodels>=0.14.0
7
+ scikit-learn>=1.3.0
8
+ papermill>=2.5.0
9
+ nbformat>=5.9.0
10
+ pillow>=10.0.0
11
+ requests>=2.31.0
12
+ beautifulsoup4>=4.12.0
13
+ vaderSentiment>=3.3.2
14
+ huggingface_hub>=0.20.0
15
+ textblob>=0.18.0
16
+ faker>=20.0.0
17
+ plotly>=5.18.0
style.css ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* --- Target the Gradio app wrapper for backgrounds --- */
2
+ gradio-app,
3
+ .gradio-app,
4
+ .main,
5
+ #app,
6
+ [data-testid="app"] {
7
+ background-color: rgb(40,9,109) !important;
8
+ background-image:
9
+ url('https://huggingface.co/spaces/atascioglu/SE21AppTemplate/resolve/main/background_top.png'),
10
+ url('https://huggingface.co/spaces/atascioglu/SE21AppTemplate/resolve/main/background_mid.png'),
11
+ url('https://huggingface.co/spaces/atascioglu/SE21AppTemplate/resolve/main/background_bottom.png') !important;
12
+ background-position:
13
+ top center,
14
+ 0 913px,
15
+ bottom center !important;
16
+ background-repeat:
17
+ no-repeat,
18
+ repeat-y,
19
+ no-repeat !important;
20
+ background-size:
21
+ 100% auto,
22
+ 100% auto,
23
+ 100% auto !important;
24
+ min-height: 100vh !important;
25
+ }
26
+
27
+ /* --- Fallback on html/body --- */
28
+ html, body {
29
+ background-color: rgb(40,9,109) !important;
30
+ margin: 0 !important;
31
+ padding: 0 !important;
32
+ min-height: 100vh !important;
33
+ }
34
+
35
+ /* The bottom banner image is drawn as the third background layer of the app-wrapper rule above (positioned at bottom center), so it needs no separate rule here. */
36
+
37
+ /* --- Main container --- */
38
+ .gradio-container {
39
+ max-width: 1400px !important;
40
+ width: 94vw !important;
41
+ margin: 0 auto !important;
42
+ padding-top: 220px !important;
43
+ padding-bottom: 150px !important;
44
+ background: transparent !important;
45
+ }
46
+
47
+ /* --- Title in ESCP gold --- */
48
+ #escp_title h1 {
49
+ color: rgb(242,198,55) !important;
50
+ font-size: 3rem !important;
51
+ font-weight: 800 !important;
52
+ text-align: center !important;
53
+ margin: 0 0 12px 0 !important;
54
+ }
55
+
56
+ /* --- Subtitle --- */
57
+ #escp_title p, #escp_title em {
58
+ color: rgba(255,255,255,0.85) !important;
59
+ text-align: center !important;
60
+ }
61
+
62
+ /* --- Tab bar background --- */
63
+ .tabs > .tab-nav,
64
+ .tab-nav,
65
+ div[role="tablist"],
66
+ .svelte-tabs > .tab-nav {
67
+ background: rgba(40,9,109,0.6) !important;
68
+ border-radius: 10px 10px 0 0 !important;
69
+ padding: 4px !important;
70
+ }
71
+
72
+ /* --- ALL tab buttons: force white text --- */
73
+ .tabs > .tab-nav button,
74
+ .tab-nav button,
75
+ div[role="tablist"] button,
76
+ button[role="tab"],
77
+ .svelte-tabs button,
78
+ .tab-nav > button,
79
+ .tabs button {
80
+ color: #ffffff !important;
81
+ font-weight: 600 !important;
82
+ border: none !important;
83
+ background: transparent !important;
84
+ padding: 10px 20px !important;
85
+ border-radius: 8px 8px 0 0 !important;
86
+ opacity: 1 !important;
87
+ }
88
+
89
+ /* --- Selected tab: ESCP gold --- */
90
+ .tabs > .tab-nav button.selected,
91
+ .tab-nav button.selected,
92
+ button[role="tab"][aria-selected="true"],
93
+ button[role="tab"].selected,
94
+ div[role="tablist"] button[aria-selected="true"],
95
+ .svelte-tabs button.selected {
96
+ color: rgb(242,198,55) !important;
97
+ background: rgba(255,255,255,0.12) !important;
98
+ }
99
+
100
+ /* --- Unselected tabs: ensure visibility --- */
101
+ .tabs > .tab-nav button:not(.selected),
102
+ .tab-nav button:not(.selected),
103
+ button[role="tab"][aria-selected="false"],
104
+ button[role="tab"]:not(.selected),
105
+ div[role="tablist"] button:not([aria-selected="true"]) {
106
+ color: #ffffff !important;
107
+ opacity: 1 !important;
108
+ }
109
+
110
+ /* --- White card panels --- */
111
+ .gradio-container .gr-block,
112
+ .gradio-container .gr-box,
113
+ .gradio-container .gr-panel,
114
+ .gradio-container .gr-group {
115
+ background: #ffffff !important;
116
+ border-radius: 10px !important;
117
+ }
118
+
119
+ /* --- Tab content area --- */
120
+ .tabitem {
121
+ background: rgba(255,255,255,0.95) !important;
122
+ border-radius: 0 0 10px 10px !important;
123
+ padding: 16px !important;
124
+ }
125
+
126
+ /* --- Inputs --- */
127
+ .gradio-container input,
128
+ .gradio-container textarea,
129
+ .gradio-container select {
130
+ background: #ffffff !important;
131
+ border: 1px solid #d1d5db !important;
132
+ border-radius: 8px !important;
133
+ }
134
+
135
+ /* --- Buttons: shared sizing for all non-tab buttons, then ESCP purple primary and outlined secondary variants --- */
136
+ .gradio-container button:not([role="tab"]) {
137
+ font-weight: 600 !important;
138
+ padding: 10px 16px !important;
139
+ border-radius: 10px !important;
140
+ }
141
+
142
+ button.primary {
143
+ background-color: rgb(40,9,109) !important;
144
+ color: #ffffff !important;
145
+ border: none !important;
146
+ }
147
+
148
+ button.primary:hover {
149
+ background-color: rgb(60,20,140) !important;
150
+ }
151
+
152
+ button.secondary {
153
+ background-color: #ffffff !important;
154
+ color: rgb(40,9,109) !important;
155
+ border: 2px solid rgb(40,9,109) !important;
156
+ }
157
+
158
+ button.secondary:hover {
159
+ background-color: rgb(240,238,250) !important;
160
+ }
161
+
162
+ /* --- Dataframes --- */
163
+ [data-testid="dataframe"] {
164
+ background-color: #ffffff !important;
165
+ border-radius: 10px !important;
166
+ }
167
+
168
+ table {
169
+ font-size: 0.85rem !important;
170
+ }
171
+
172
+ /* --- Chatbot (AI Dashboard tab) --- */
173
+ .gr-chatbot {
174
+ min-height: 380px !important;
175
+ background-color: #ffffff !important;
176
+ border-radius: 12px !important;
177
+ }
178
+
179
+ .gr-chatbot .message.user {
180
+ background-color: rgb(232,225,250) !important;
181
+ border-radius: 12px !important;
182
+ }
183
+
184
+ .gr-chatbot .message.bot {
185
+ background-color: #f3f4f6 !important;
186
+ border-radius: 12px !important;
187
+ }
188
+
189
+ /* --- Gallery --- */
190
+ .gallery {
191
+ background: #ffffff !important;
192
+ border-radius: 10px !important;
193
+ }
194
+
195
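+ /* --- Log textbox (note: the bare `textarea` selector applies this monospace styling to every textarea, not only the log) --- */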
196
+ textarea {
197
+ font-family: monospace !important;
198
+ font-size: 0.8rem !important;
199
+ }
200
+
201
+ /* --- Markdown headings inside tabs --- */
202
+ .tabitem h3 {
203
+ color: rgb(40,9,109) !important;
204
+ font-weight: 700 !important;
205
+ }
206
+
207
+ .tabitem h4 {
208
+ color: #374151 !important;
209
+ }
210
+
211
+ /* --- Examples row (AI Dashboard) --- */
212
+ .examples-row button {
213
+ background: rgb(240,238,250) !important;
214
+ color: rgb(40,9,109) !important;
215
+ border: 1px solid rgb(40,9,109) !important;
216
+ border-radius: 8px !important;
217
+ font-size: 0.85rem !important;
218
+ }
219
+
220
+ .examples-row button:hover {
221
+ background: rgb(232,225,250) !important;
222
+ }
223
+
224
+ /* --- Header / footer: transparent over banner --- */
225
+ header, header *,
226
+ footer, footer * {
227
+ background: transparent !important;
228
+ box-shadow: none !important;
229
+ }
230
+
231
+ footer a, footer button,
232
+ header a, header button {
233
+ background: transparent !important;
234
+ border: none !important;
235
+ box-shadow: none !important;
236
+ }
237
+
238
+ #footer, #footer *,
239
+ [class*="footer"], [class*="footer"] *,
240
+ [class*="chip"], [class*="pill"], [class*="chip"] *, [class*="pill"] * {
241
+ background: transparent !important;
242
+ border: none !important;
243
+ box-shadow: none !important;
244
+ }
245
+
246
+ [data-testid*="api"], [data-testid*="settings"],
247
+ [id*="api"], [id*="settings"],
248
+ [class*="api"], [class*="settings"],
249
+ [class*="bottom"], [class*="toolbar"], [class*="controls"] {
250
+ background: transparent !important;
251
+ box-shadow: none !important;
252
+ }
253
+
254
+ [data-testid*="api"] *, [data-testid*="settings"] *,
255
+ [id*="api"] *, [id*="settings"] *,
256
+ [class*="api"] *, [class*="settings"] * {
257
+ background: transparent !important;
258
+ box-shadow: none !important;
259
+ }
260
+
261
+ section footer {
262
+ background: transparent !important;
263
+ }
264
+
265
+ section footer button,
266
+ section footer a {
267
+ background: transparent !important;
268
+ background-color: transparent !important;
269
+ border: none !important;
270
+ box-shadow: none !important;
271
+ color: white !important;
272
+ }
273
+
274
+ section footer button:hover,
275
+ section footer button:focus,
276
+ section footer a:hover,
277
+ section footer a:focus {
278
+ background: transparent !important;
279
+ background-color: transparent !important;
280
+ box-shadow: none !important;
281
+ }
282
+
283
+ section footer button,
284
+ section footer button * {
285
+ background: transparent !important;
286
+ background-color: transparent !important;
287
+ background-image: none !important;
288
+ box-shadow: none !important;
289
+ filter: none !important;
290
+ }
291
+
292
+ section footer button::before,
293
+ section footer button::after {
294
+ background: transparent !important;
295
+ background-color: transparent !important;
296
+ background-image: none !important;
297
+ box-shadow: none !important;
298
+ filter: none !important;
299
+ }
300
+
301
+ section footer a,
302
+ section footer a * {
303
+ background: transparent !important;
304
+ background-color: transparent !important;
305
+ box-shadow: none !important;
306
+ }
307
+
308
+ .gradio-container footer button,
309
+ .gradio-container footer button *,
310
+ .gradio-container .footer button,
311
+ .gradio-container .footer button * {
312
+ background: transparent !important;
313
+ background-color: transparent !important;
314
+ background-image: none !important;
315
+ box-shadow: none !important;
316
+ }
317
+
318
+ .gradio-container footer button::before,
319
+ .gradio-container footer button::after,
320
+ .gradio-container .footer button::before,
321
+ .gradio-container .footer button::after {
322
+ background: transparent !important;
323
+ background-color: transparent !important;
324
+ background-image: none !important;
325
+ box-shadow: none !important;
326
+ }