ceeyyuhhh committed on
Commit
5bb4638
·
verified ·
1 Parent(s): 3b4a320

Delete pythonanalysis.ipynb

Browse files
Files changed (1) hide show
  1. pythonanalysis.ipynb +0 -1046
pythonanalysis.ipynb DELETED
@@ -1,1046 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "85361b58",
6
- "metadata": {
7
- "id": "85361b58"
8
- },
9
- "source": [
10
- "# Step 2 — Python Analysis / Modeling\n",
11
- "\n",
12
- "Clean version for the Hugging Face SE21 app template. It creates dashboard artifacts."
13
- ]
14
- },
15
- {
16
- "cell_type": "code",
17
- "execution_count": 5,
18
- "id": "c88b847c",
19
- "metadata": {
20
- "colab": {
21
- "base_uri": "https://localhost:8080/"
22
- },
23
- "id": "c88b847c",
24
- "outputId": "d0c3643a-d491-4746-a55b-35ed016e4fe4"
25
- },
26
- "outputs": [
27
- {
28
- "output_type": "stream",
29
- "name": "stdout",
30
- "text": [
31
- "Environment ready.\n",
32
- "BASE_PATH: /content\n",
33
- "CSV files found:\n",
34
- "- /content/Womens Clothing E-Commerce Reviews.csv\n",
35
- "- /content/ecommerce_returns_cleaned.csv\n",
36
- "Using reviews file: /content/Womens Clothing E-Commerce Reviews.csv\n",
37
- "Using returns file: /content/ecommerce_returns_cleaned.csv\n",
38
- "Reviews shape: (23486, 10)\n",
39
- "Returns shape: (113314, 29)\n",
40
- "Reviews columns: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name']\n",
41
- "Returns columns: ['order_id', 'order_item_id', 'product_id', 'seller_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_delivered_customer_date', 'order_estimated_delivery_date', 'review_score', 'review_comment_title', 'review_comment_message', 'price', 'freight_value', 'total_cost', 'product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'has_review_text', 'review_text_length', 'delivery_delay_days', 'negative_keyword_flag', 'synthetic_return_risk', 'likely_return']\n",
42
- "Data loaded and cleaned.\n"
43
- ]
44
- }
45
- ],
46
- "source": [
47
- "# ==================================================\n",
48
- "# STEP 2: UNIVERSAL ANALYSIS SETUP\n",
49
- "# Works in BOTH Hugging Face Spaces and Google Colab\n",
50
- "# ==================================================\n",
51
- "\n",
52
- "import os\n",
53
- "import json\n",
54
- "import random\n",
55
- "import warnings\n",
56
- "from pathlib import Path\n",
57
- "\n",
58
- "os.environ.setdefault(\"MPLCONFIGDIR\", \"/tmp/matplotlib\")\n",
59
- "\n",
60
- "import numpy as np\n",
61
- "import pandas as pd\n",
62
- "import matplotlib.pyplot as plt\n",
63
- "\n",
64
- "warnings.filterwarnings(\"ignore\")\n",
65
- "random.seed(42)\n",
66
- "np.random.seed(42)\n",
67
- "\n",
68
- "# Pick the correct runtime folder automatically.\n",
69
- "# Hugging Face Space uses /app. Colab uses /content.\n",
70
- "candidate_roots = [Path(\"/app\"), Path(\"/content\"), Path.cwd(), Path(\"/mnt/data\")]\n",
71
- "BASE_PATH = None\n",
72
- "\n",
73
- "for root in candidate_roots:\n",
74
- " if root.exists():\n",
75
- " csvs = []\n",
76
- " for p in root.rglob(\"*.csv\"):\n",
77
- " parts = {part.lower() for part in p.parts}\n",
78
- " if \"sample_data\" in parts:\n",
79
- " continue\n",
80
- " if \"outputs\" in parts or \"figures\" in parts or \"tables\" in parts or \"artifacts\" in parts:\n",
81
- " continue\n",
82
- " csvs.append(p)\n",
83
- " if csvs:\n",
84
- " BASE_PATH = root\n",
85
- " break\n",
86
- "\n",
87
- "if BASE_PATH is None:\n",
88
- " if Path(\"/app\").exists():\n",
89
- " BASE_PATH = Path(\"/app\")\n",
90
- " elif Path(\"/content\").exists():\n",
91
- " BASE_PATH = Path(\"/content\")\n",
92
- " else:\n",
93
- " BASE_PATH = Path.cwd()\n",
94
- "\n",
95
- "DATA_PROCESSED = BASE_PATH / \"data_processed\"\n",
96
- "\n",
97
- "OUTPUTS = BASE_PATH / \"outputs\"\n",
98
- "FIGURES = BASE_PATH / \"figures\"\n",
99
- "TABLES = BASE_PATH / \"tables\"\n",
100
- "ARTIFACTS = BASE_PATH / \"artifacts\"\n",
101
- "\n",
102
- "# Extra folders because different templates check different places\n",
103
- "OUTPUT_FIGURES = OUTPUTS / \"figures\"\n",
104
- "OUTPUT_TABLES = OUTPUTS / \"tables\"\n",
105
- "ARTIFACT_FIGURES = ARTIFACTS / \"figures\"\n",
106
- "ARTIFACT_TABLES = ARTIFACTS / \"tables\"\n",
107
- "\n",
108
- "ALL_OUTPUT_DIRS = [\n",
109
- " DATA_PROCESSED,\n",
110
- " OUTPUTS,\n",
111
- " FIGURES,\n",
112
- " TABLES,\n",
113
- " ARTIFACTS,\n",
114
- " OUTPUT_FIGURES,\n",
115
- " OUTPUT_TABLES,\n",
116
- " ARTIFACT_FIGURES,\n",
117
- " ARTIFACT_TABLES,\n",
118
- "]\n",
119
- "\n",
120
- "for folder in ALL_OUTPUT_DIRS:\n",
121
- " folder.mkdir(parents=True, exist_ok=True)\n",
122
- "\n",
123
- "print(\"Environment ready.\")\n",
124
- "print(\"BASE_PATH:\", BASE_PATH)\n",
125
- "\n",
126
- "# Load data created by Step 1 if available.\n",
127
- "csv_paths = []\n",
128
- "for p in BASE_PATH.rglob(\"*.csv\"):\n",
129
- " parts = {part.lower() for part in p.parts}\n",
130
- " if \"sample_data\" in parts:\n",
131
- " continue\n",
132
- " if \"outputs\" in parts or \"figures\" in parts or \"tables\" in parts or \"artifacts\" in parts:\n",
133
- " continue\n",
134
- " csv_paths.append(p)\n",
135
- "\n",
136
- "print(\"CSV files found:\")\n",
137
- "for p in csv_paths:\n",
138
- " print(\"-\", p)\n",
139
- "\n",
140
- "def first_existing(paths):\n",
141
- " for p in paths:\n",
142
- " if Path(p).exists():\n",
143
- " return Path(p)\n",
144
- " return None\n",
145
- "\n",
146
- "reviews_path = first_existing([\n",
147
- " DATA_PROCESSED / \"reviews_cleaned.csv\",\n",
148
- " DATA_PROCESSED / \"womens_reviews_cleaned.csv\",\n",
149
- " BASE_PATH / \"Womens Clothing E-Commerce Reviews.csv\",\n",
150
- "])\n",
151
- "\n",
152
- "returns_path = first_existing([\n",
153
- " DATA_PROCESSED / \"returns_input.csv\",\n",
154
- " DATA_PROCESSED / \"returns_cleaned.csv\",\n",
155
- " BASE_PATH / \"ecommerce_returns_cleaned.csv\",\n",
156
- " DATA_PROCESSED / \"synthetic_return_risk.csv\",\n",
157
- "])\n",
158
- "\n",
159
- "# Fallback search.\n",
160
- "if reviews_path is None:\n",
161
- " review_matches = [\n",
162
- " p for p in csv_paths\n",
163
- " if (\"clothing\" in p.name.lower()) or (\"review\" in p.name.lower() and \"return\" not in p.name.lower())\n",
164
- " ]\n",
165
- " reviews_path = review_matches[0] if review_matches else None\n",
166
- "\n",
167
- "if returns_path is None:\n",
168
- " return_matches = [\n",
169
- " p for p in csv_paths\n",
170
- " if \"return\" in p.name.lower()\n",
171
- " ]\n",
172
- " returns_path = return_matches[0] if return_matches else None\n",
173
- "\n",
174
- "if reviews_path is None:  # fail early: pd.read_csv(None) below would raise an unclear error\n",
175
- "    raise FileNotFoundError(\"Step 2 could not find the reviews CSV.\")\n",
176
- "if returns_path is None:\n",
177
- "    raise FileNotFoundError(\"Step 2 could not find the ecommerce returns CSV.\")\n",
178
- "print(\"Using reviews file:\", reviews_path)\n",
179
- "print(\"Using returns file:\", returns_path)\n",
180
- "\n",
181
- "reviews_df = pd.read_csv(reviews_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
182
- "returns_df = pd.read_csv(returns_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
183
- "\n",
184
- "print(\"Reviews shape:\", reviews_df.shape)\n",
185
- "print(\"Returns shape:\", returns_df.shape)\n",
186
- "print(\"Reviews columns:\", reviews_df.columns.tolist())\n",
187
- "print(\"Returns columns:\", returns_df.columns.tolist())\n",
188
- "\n",
189
- "# Basic cleanup / type safety\n",
190
- "for col in [\"Age\", \"Rating\", \"Recommended IND\", \"Positive Feedback Count\"]:\n",
191
- " if col in reviews_df.columns:\n",
192
- " reviews_df[col] = pd.to_numeric(reviews_df[col], errors=\"coerce\")\n",
193
- "\n",
194
- "if \"Review Text\" in reviews_df.columns:\n",
195
- " reviews_df[\"Review Text\"] = reviews_df[\"Review Text\"].fillna(\"\").astype(str)\n",
196
- "\n",
197
- "if \"Class Name\" in reviews_df.columns:\n",
198
- " reviews_df[\"Class Name\"] = reviews_df[\"Class Name\"].fillna(\"Unknown\").astype(str)\n",
199
- "\n",
200
- "for col in [\"review_score\", \"likely_return\", \"price\", \"freight_value\", \"delivery_delay_days\", \"synthetic_return_risk\"]:\n",
201
- " if col in returns_df.columns:\n",
202
- " returns_df[col] = pd.to_numeric(returns_df[col], errors=\"coerce\")\n",
203
- "\n",
204
- "print(\"Data loaded and cleaned.\")"
205
- ]
206
- },
207
- {
208
- "cell_type": "code",
209
- "execution_count": 6,
210
- "id": "f9eb3801",
211
- "metadata": {
212
- "id": "f9eb3801"
213
- },
214
- "outputs": [],
215
- "source": [
216
- "# ==================================================\n",
217
- "# HELPERS: save artifacts everywhere the app may check\n",
218
- "# ==================================================\n",
219
- "\n",
220
- "def safe_write_csv(df, path):\n",
221
- "    \"\"\"Write df to path as CSV. Return True on success; on any error print it and return False.\"\"\"\n",
222
- "    try:\n",
223
- "        df.to_csv(path)\n",
224
- "        return True\n",
225
- "    except Exception as e:\n",
226
- "        print(f\"Could not save {path}: {e}\")\n",
227
- "        return False\n",
228
- "\n",
229
- "\n",
230
- "def safe_savefig(path):\n",
231
- "    \"\"\"Save the current matplotlib figure to path. Return True on success; on any error print it and return False.\"\"\"\n",
232
- "    try:\n",
233
- "        plt.savefig(path, dpi=150, bbox_inches=\"tight\")\n",
234
- "        return True\n",
235
- "    except Exception as e:\n",
236
- "        print(f\"Could not save {path}: {e}\")\n",
237
- "        return False\n",
238
- "\n",
239
- "\n",
240
- "def safe_write_text(text, path):\n",
241
- "    \"\"\"Write text to path (a Path) as UTF-8. Return True on success; on any error print it and return False.\"\"\"\n",
242
- "    try:\n",
243
- "        path.write_text(text, encoding=\"utf-8\")\n",
244
- "        return True\n",
245
- "    except Exception as e:\n",
246
- "        print(f\"Could not save {path}: {e}\")\n",
247
- "        return False\n",
248
- "\n",
249
- "\n",
250
- "def _save_everywhere(folders, filename, writer):\n",
251
- "    \"\"\"Write one file into every folder in folders.\n",
252
- "\n",
253
- "    folders  -- list of Path directories (created if missing)\n",
254
- "    filename -- file name including its extension\n",
255
- "    writer   -- callable taking the full Path and returning True/False\n",
256
- "                (one of the safe_* helpers, which catch their own errors)\n",
257
- "\n",
258
- "    Returns True if at least one folder received the file.\n",
259
- "    \"\"\"\n",
260
- "    saved_anywhere = False\n",
261
- "    for folder in folders:\n",
262
- "        try:\n",
263
- "            folder.mkdir(parents=True, exist_ok=True)\n",
264
- "        except OSError as e:\n",
265
- "            # One unwritable location must not abort the remaining folders.\n",
266
- "            print(f\"Could not create {folder}: {e}\")\n",
267
- "            continue\n",
268
- "        saved_anywhere = writer(folder / filename) or saved_anywhere\n",
269
- "    return saved_anywhere\n",
270
- "\n",
271
- "\n",
272
- "def save_table(df, name):\n",
273
- "    \"\"\"Save a DataFrame (or Series) as <name>.csv in every table folder.\n",
274
- "\n",
275
- "    Raises RuntimeError if the file could not be written anywhere.\n",
276
- "    \"\"\"\n",
277
- "    if isinstance(df, pd.Series):\n",
278
- "        df = df.to_frame()\n",
279
- "\n",
280
- "    table_folders = [TABLES, OUTPUT_TABLES, OUTPUTS, ARTIFACT_TABLES, ARTIFACTS]\n",
281
- "    saved = _save_everywhere(\n",
282
- "        table_folders,\n",
283
- "        f\"{name}.csv\",\n",
284
- "        lambda path: safe_write_csv(df, path),\n",
285
- "    )\n",
286
- "    if not saved:\n",
287
- "        raise RuntimeError(f\"Could not save table {name}.csv\")\n",
288
- "    print(f\"Saved table everywhere: {name}.csv\")\n",
289
- "\n",
290
- "\n",
291
- "def save_figure(name):\n",
292
- "    \"\"\"Save the current matplotlib figure as <name>.png in every figure folder.\n",
293
- "\n",
294
- "    Call it before plt.close(). Raises RuntimeError if nothing was written.\n",
295
- "    \"\"\"\n",
296
- "    figure_folders = [FIGURES, OUTPUT_FIGURES, OUTPUTS, ARTIFACT_FIGURES, ARTIFACTS]\n",
297
- "    saved = _save_everywhere(figure_folders, f\"{name}.png\", safe_savefig)\n",
298
- "    if not saved:\n",
299
- "        raise RuntimeError(f\"Could not save figure {name}.png\")\n",
300
- "    print(f\"Saved figure everywhere: {name}.png\")\n",
301
- "\n",
302
- "\n",
303
- "def save_text(text, name):\n",
304
- "    \"\"\"Save text as <name>.txt in the same folders that receive the tables.\n",
305
- "\n",
306
- "    Raises RuntimeError if the file could not be written anywhere.\n",
307
- "    \"\"\"\n",
308
- "    text_folders = [TABLES, OUTPUT_TABLES, OUTPUTS, ARTIFACT_TABLES, ARTIFACTS]\n",
309
- "    saved = _save_everywhere(\n",
310
- "        text_folders,\n",
311
- "        f\"{name}.txt\",\n",
312
- "        lambda path: safe_write_text(text, path),\n",
313
- "    )\n",
314
- "    if not saved:\n",
315
- "        raise RuntimeError(f\"Could not save text {name}.txt\")\n",
316
- "    print(f\"Saved text everywhere: {name}.txt\")"
317
- ]
318
- },
319
- {
320
- "cell_type": "code",
321
- "execution_count": 7,
322
- "id": "a99949ac",
323
- "metadata": {
324
- "colab": {
325
- "base_uri": "https://localhost:8080/"
326
- },
327
- "id": "a99949ac",
328
- "outputId": "33b9f5b0-67b0-4a44-8eef-b572cb8f7492"
329
- },
330
- "outputs": [
331
- {
332
- "output_type": "stream",
333
- "name": "stdout",
334
- "text": [
335
- "Saved table everywhere: rating_distribution.csv\n",
336
- "Saved figure everywhere: rating_distribution.png\n",
337
- "Saved table everywhere: recommendation_by_class.csv\n",
338
- "Saved figure everywhere: recommendation_by_class.png\n",
339
- "Saved table everywhere: average_rating_by_age.csv\n",
340
- "Saved figure everywhere: average_rating_by_age.png\n",
341
- "Saved table everywhere: negative_keyword_counts.csv\n",
342
- "Saved figure everywhere: negative_keyword_counts.png\n",
343
- "Saved table everywhere: category_return_rate.csv\n",
344
- "Saved figure everywhere: category_return_rate.png\n",
345
- "Saved table everywhere: monthly_return_rate.csv\n",
346
- "Saved figure everywhere: monthly_return_rate.png\n",
347
- "Saved table everywhere: feature_importance.csv\n",
348
- "Saved figure everywhere: feature_importance.png\n",
349
- "Saved text everywhere: classification_report.txt\n",
350
- "Artifact creation section finished.\n"
351
- ]
352
- }
353
- ],
354
- "source": [
355
- "# ==================================================\n",
356
- "# CREATE DASHBOARD ARTIFACTS\n",
357
- "# ==================================================\n",
358
- "\n",
359
- "created_figures = []\n",
360
- "created_tables = []\n",
361
- "\n",
362
- "# 1) Rating distribution\n",
363
- "if \"Rating\" in reviews_df.columns:\n",
364
- " rating_distribution = reviews_df[\"Rating\"].dropna().value_counts().sort_index().to_frame(\"count\")\n",
365
- " save_table(rating_distribution, \"rating_distribution\")\n",
366
- " created_tables.append(\"rating_distribution.csv\")\n",
367
- "\n",
368
- " plt.figure(figsize=(7, 4))\n",
369
- " plt.bar(rating_distribution.index.astype(str), rating_distribution[\"count\"])\n",
370
- " plt.title(\"Distribution of Customer Ratings\")\n",
371
- " plt.xlabel(\"Rating\")\n",
372
- " plt.ylabel(\"Number of Reviews\")\n",
373
- " plt.tight_layout()\n",
374
- " save_figure(\"rating_distribution\")\n",
375
- " created_figures.append(\"rating_distribution.png\")\n",
376
- " plt.close()\n",
377
- "\n",
378
- "# 2) Recommendation rate by clothing class\n",
379
- "if {\"Class Name\", \"Recommended IND\"}.issubset(reviews_df.columns):\n",
380
- " recommendation_by_class = (\n",
381
- " reviews_df.groupby(\"Class Name\")[\"Recommended IND\"]\n",
382
- " .mean()\n",
383
- " .sort_values(ascending=False)\n",
384
- " .head(10)\n",
385
- " .to_frame(\"recommendation_rate\")\n",
386
- " )\n",
387
- " save_table(recommendation_by_class, \"recommendation_by_class\")\n",
388
- " created_tables.append(\"recommendation_by_class.csv\")\n",
389
- "\n",
390
- " plt.figure(figsize=(10, 5))\n",
391
- " plt.bar(recommendation_by_class.index.astype(str), recommendation_by_class[\"recommendation_rate\"])\n",
392
- " plt.title(\"Top 10 Most Recommended Clothing Classes\")\n",
393
- " plt.xlabel(\"Class Name\")\n",
394
- " plt.ylabel(\"Recommendation Rate\")\n",
395
- " plt.xticks(rotation=75)\n",
396
- " plt.tight_layout()\n",
397
- " save_figure(\"recommendation_by_class\")\n",
398
- " created_figures.append(\"recommendation_by_class.png\")\n",
399
- " plt.close()\n",
400
- "\n",
401
- "# 3) Average rating by age\n",
402
- "if {\"Age\", \"Rating\"}.issubset(reviews_df.columns):\n",
403
- " average_rating_by_age = (\n",
404
- " reviews_df.groupby(\"Age\")[\"Rating\"]\n",
405
- " .mean()\n",
406
- " .dropna()\n",
407
- " .to_frame(\"average_rating\")\n",
408
- " )\n",
409
- " save_table(average_rating_by_age, \"average_rating_by_age\")\n",
410
- " created_tables.append(\"average_rating_by_age.csv\")\n",
411
- "\n",
412
- " plt.figure(figsize=(10, 4))\n",
413
- " plt.plot(average_rating_by_age.index, average_rating_by_age[\"average_rating\"])\n",
414
- " plt.title(\"Average Rating by Customer Age\")\n",
415
- " plt.xlabel(\"Age\")\n",
416
- " plt.ylabel(\"Average Rating\")\n",
417
- " plt.tight_layout()\n",
418
- " save_figure(\"average_rating_by_age\")\n",
419
- " created_figures.append(\"average_rating_by_age.png\")\n",
420
- " plt.close()\n",
421
- "\n",
422
- "# 4) Complaint / return-risk keyword counts\n",
423
- "review_text_column = None\n",
424
- "for candidate in [\"Review Text\", \"review_text\", \"review_comment_message\"]:\n",
425
- "    if candidate in reviews_df.columns:\n",
426
- "        review_text_column = candidate\n",
427
- "        break\n",
428
- "\n",
429
- "if review_text_column is not None:\n",
430
- "    keywords = [\n",
431
- "        \"bad\", \"poor\", \"cheap\", \"small\", \"large\", \"tight\", \"loose\",\n",
432
- "        \"scratchy\", \"thin\", \"return\", \"returned\", \"disappointed\",\n",
433
- "        \"quality\", \"fit\", \"sizing\", \"fabric\", \"uncomfortable\"\n",
434
- "    ]\n",
435
- "    text_series = reviews_df[review_text_column].fillna(\"\").astype(str).str.lower()\n",
436
- "    # Match whole words only: a substring search would also count 'thin' inside 'thing' and 'fit' inside 'outfit'.\n",
437
- "    # The keywords above are plain lowercase letters, so they need no regex escaping.\n",
438
- "    keyword_counts = {w: int(text_series.str.contains(rf\"\\b{w}\\b\", regex=True).sum()) for w in keywords}\n",
439
- "\n",
440
- "    negative_keyword_counts = (\n",
441
- "        pd.DataFrame(keyword_counts.items(), columns=[\"keyword\", \"review_count\"])\n",
442
- "        .sort_values(\"review_count\", ascending=False)\n",
443
- "        .set_index(\"keyword\")\n",
444
- "    )\n",
445
- "    save_table(negative_keyword_counts, \"negative_keyword_counts\")\n",
446
- "    created_tables.append(\"negative_keyword_counts.csv\")\n",
447
- "\n",
448
- "    top_keywords = negative_keyword_counts.head(10)\n",
449
- "    plt.figure(figsize=(9, 4))\n",
450
- "    plt.bar(top_keywords.index.astype(str), top_keywords[\"review_count\"])\n",
451
- "    plt.title(\"Most Common Return-Risk Keywords in Reviews\")\n",
452
- "    plt.xlabel(\"Keyword\")\n",
453
- "    plt.ylabel(\"Number of Reviews\")\n",
454
- "    plt.xticks(rotation=45)\n",
455
- "    plt.tight_layout()\n",
456
- "    save_figure(\"negative_keyword_counts\")\n",
457
- "    created_figures.append(\"negative_keyword_counts.png\")\n",
458
- "    plt.close()\n",
459
- "\n",
460
- "# 5) Product category return rate\n",
461
- "if {\"product_category_name\", \"likely_return\"}.issubset(returns_df.columns):\n",
462
- " category_return_rate = (\n",
463
- " returns_df.groupby(\"product_category_name\")[\"likely_return\"]\n",
464
- " .mean()\n",
465
- " .sort_values(ascending=False)\n",
466
- " .head(15)\n",
467
- " .to_frame(\"return_rate\")\n",
468
- " )\n",
469
- " save_table(category_return_rate, \"category_return_rate\")\n",
470
- " created_tables.append(\"category_return_rate.csv\")\n",
471
- "\n",
472
- " plt.figure(figsize=(11, 5))\n",
473
- " plt.bar(category_return_rate.index.astype(str), category_return_rate[\"return_rate\"])\n",
474
- " plt.title(\"Top Product Categories by Estimated Return Rate\")\n",
475
- " plt.xlabel(\"Product Category\")\n",
476
- " plt.ylabel(\"Return Rate\")\n",
477
- " plt.xticks(rotation=75)\n",
478
- " plt.tight_layout()\n",
479
- " save_figure(\"category_return_rate\")\n",
480
- " created_figures.append(\"category_return_rate.png\")\n",
481
- " plt.close()\n",
482
- "\n",
483
- "# 6) Monthly return rate\n",
484
- "if {\"order_purchase_timestamp\", \"likely_return\"}.issubset(returns_df.columns):\n",
485
- " monthly_df = returns_df.copy()\n",
486
- " monthly_df[\"order_purchase_timestamp\"] = pd.to_datetime(monthly_df[\"order_purchase_timestamp\"], errors=\"coerce\")\n",
487
- " monthly_df = monthly_df.dropna(subset=[\"order_purchase_timestamp\"])\n",
488
- "\n",
489
- " if len(monthly_df) > 0:\n",
490
- " monthly_return_rate = (\n",
491
- " monthly_df.set_index(\"order_purchase_timestamp\")\n",
492
- " .resample(\"M\")[\"likely_return\"]\n",
493
- " .mean()\n",
494
- " .dropna()\n",
495
- " .to_frame(\"return_rate\")\n",
496
- " )\n",
497
- " save_table(monthly_return_rate, \"monthly_return_rate\")\n",
498
- " created_tables.append(\"monthly_return_rate.csv\")\n",
499
- "\n",
500
- " plt.figure(figsize=(10, 4))\n",
501
- " plt.plot(monthly_return_rate.index, monthly_return_rate[\"return_rate\"])\n",
502
- " plt.title(\"Monthly Estimated Return Rate\")\n",
503
- " plt.xlabel(\"Month\")\n",
504
- " plt.ylabel(\"Return Rate\")\n",
505
- " plt.tight_layout()\n",
506
- " save_figure(\"monthly_return_rate\")\n",
507
- " created_figures.append(\"monthly_return_rate.png\")\n",
508
- " plt.close()\n",
509
- "\n",
510
- "# 7) Simple feature importance if sklearn is available\n",
511
- "try:\n",
512
- " from sklearn.ensemble import RandomForestClassifier\n",
513
- " from sklearn.model_selection import train_test_split\n",
514
- " from sklearn.metrics import accuracy_score, classification_report\n",
515
- "\n",
516
- " feature_columns = [c for c in [\"Age\", \"Rating\", \"Positive Feedback Count\"] if c in reviews_df.columns]\n",
517
- " if \"Recommended IND\" in reviews_df.columns and len(feature_columns) > 0:\n",
518
- " model_df = reviews_df[feature_columns + [\"Recommended IND\"]].dropna().copy()\n",
519
- " if model_df[\"Recommended IND\"].nunique() >= 2:\n",
520
- " X = model_df[feature_columns]\n",
521
- " y = model_df[\"Recommended IND\"].astype(int)\n",
522
- " X_train, X_test, y_train, y_test = train_test_split(\n",
523
- " X, y, test_size=0.2, random_state=42, stratify=y\n",
524
- " )\n",
525
- "\n",
526
- " clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
527
- " clf.fit(X_train, y_train)\n",
528
- " predictions = clf.predict(X_test)\n",
529
- " accuracy = accuracy_score(y_test, predictions)\n",
530
- "\n",
531
- " feature_importance = (\n",
532
- " pd.Series(clf.feature_importances_, index=feature_columns)\n",
533
- " .sort_values(ascending=False)\n",
534
- " .to_frame(\"importance\")\n",
535
- " )\n",
536
- " save_table(feature_importance, \"feature_importance\")\n",
537
- " created_tables.append(\"feature_importance.csv\")\n",
538
- "\n",
539
- " plt.figure(figsize=(7, 4))\n",
540
- " plt.bar(feature_importance.index.astype(str), feature_importance[\"importance\"])\n",
541
- " plt.title(\"Feature Importance for Recommendation Prediction\")\n",
542
- " plt.xlabel(\"Feature\")\n",
543
- " plt.ylabel(\"Importance\")\n",
544
- " plt.tight_layout()\n",
545
- " save_figure(\"feature_importance\")\n",
546
- " created_figures.append(\"feature_importance.png\")\n",
547
- " plt.close()\n",
548
- "\n",
549
- " report = \"Model accuracy: {:.4f}\\n\\n{}\".format(\n",
550
- " accuracy,\n",
551
- " classification_report(y_test, predictions)\n",
552
- " )\n",
553
- " save_text(report, \"classification_report\")\n",
554
- "except Exception as e:\n",
555
- " print(\"ML section skipped:\", repr(e))\n",
556
- "\n",
557
- "print(\"Artifact creation section finished.\")"
558
- ]
559
- },
560
- {
561
- "cell_type": "code",
562
- "execution_count": 8,
563
- "id": "c4bbc916",
564
- "metadata": {
565
- "colab": {
566
- "base_uri": "https://localhost:8080/"
567
- },
568
- "id": "c4bbc916",
569
- "outputId": "1dc63b01-ed81-47cd-cf56-3e193b2f87f2"
570
- },
571
- "outputs": [
572
- {
573
- "output_type": "stream",
574
- "name": "stdout",
575
- "text": [
576
- "Saved table everywhere: dashboard_summary.csv\n",
577
- "Saved text everywhere: business_insights_report.txt\n",
578
- "STEP 2 COMPLETE.\n",
579
- "Figures: ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n",
580
- "Tables: ['average_rating_by_age.csv', 'category_return_rate.csv', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n",
581
- "Outputs: ['average_rating_by_age.csv', 'average_rating_by_age.png', 'business_insights_report.txt', 'category_return_rate.csv', 'category_return_rate.png', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'feature_importance.png', 'monthly_return_rate.csv', 'monthly_return_rate.png', 'negative_keyword_counts.csv', 'negative_keyword_counts.png', 'rating_distribution.csv', 'rating_distribution.png', 'recommendation_by_class.csv', 'recommendation_by_class.png']\n"
582
- ]
583
- }
584
- ],
585
- "source": [
586
- "# ==================================================\n",
587
- "# FINAL REPORT + MANIFEST\n",
588
- "# ==================================================\n",
589
- "\n",
590
- "summary_rows = [\n",
591
- " {\"metric\": \"reviews_rows\", \"value\": int(len(reviews_df))},\n",
592
- " {\"metric\": \"returns_rows\", \"value\": int(len(returns_df))},\n",
593
- " {\"metric\": \"figures_created\", \"value\": int(len(list(FIGURES.glob(\"*.png\"))))},\n",
594
- " {\"metric\": \"tables_created\", \"value\": int(len(list(TABLES.glob(\"*.csv\"))))},\n",
595
- "]\n",
596
- "\n",
597
- "summary_df = pd.DataFrame(summary_rows).set_index(\"metric\")\n",
598
- "save_table(summary_df, \"dashboard_summary\")\n",
599
- "\n",
600
- "insights = \"\"\"\n",
601
- "FINAL BUSINESS INSIGHTS\n",
602
- "=======================\n",
603
- "\n",
604
- "This analysis supports an e-commerce return prediction and review intelligence assistant.\n",
605
- "\n",
606
- "Main findings:\n",
607
- "- Customer ratings and recommendation behavior are useful signals for product satisfaction.\n",
608
- "- Review text reveals return-risk themes such as fit, sizing, fabric, quality, and discomfort.\n",
609
- "- Product categories with higher estimated return rates should be prioritized for improvement.\n",
610
- "- Monthly return-rate tracking can help the business monitor operational or seasonal changes.\n",
611
- "\n",
612
- "Recommended automations:\n",
613
- "1. Automatically scan new reviews for return-risk keywords.\n",
614
- "2. Automatically rank products and categories by estimated return risk.\n",
615
- "3. Automatically generate business recommendations for product pages, sizing guidance, and quality control.\n",
616
- "\"\"\"\n",
617
- "\n",
618
- "save_text(insights, \"business_insights_report\")\n",
619
- "\n",
620
- "manifest = {\n",
621
- " \"base_path\": str(BASE_PATH),\n",
622
- " \"figures\": sorted([p.name for p in FIGURES.glob(\"*.png\")]),\n",
623
- " \"tables\": sorted([p.name for p in TABLES.glob(\"*.csv\")]),\n",
624
- " \"outputs\": sorted([p.name for p in OUTPUTS.iterdir() if p.is_file()]),\n",
625
- "}\n",
626
- "\n",
627
- "for folder in [OUTPUTS, ARTIFACTS, TABLES]:\n",
628
- " try:\n",
629
- " with open(folder / \"artifacts_manifest.json\", \"w\", encoding=\"utf-8\") as f:\n",
630
- " json.dump(manifest, f, indent=2)\n",
631
- " except Exception as e:\n",
632
- " print(f\"Could not save manifest in {folder}: {e}\")\n",
633
- "\n",
634
- "print(\"STEP 2 COMPLETE.\")\n",
635
- "print(\"Figures:\", manifest[\"figures\"])\n",
636
- "print(\"Tables:\", manifest[\"tables\"])\n",
637
- "print(\"Outputs:\", manifest[\"outputs\"])"
638
- ]
639
- },
640
- {
641
- "cell_type": "code",
642
- "source": [
643
- "print(\"\\nFINAL ARTIFACT CHECK\")\n",
644
- "\n",
645
- "check_dirs = {\n",
646
- " \"FIGURES\": FIGURES,\n",
647
- " \"TABLES\": TABLES,\n",
648
- " \"OUTPUTS\": OUTPUTS,\n",
649
- " \"OUTPUT_FIGURES\": OUTPUT_FIGURES,\n",
650
- " \"OUTPUT_TABLES\": OUTPUT_TABLES,\n",
651
- " \"ARTIFACTS\": ARTIFACTS,\n",
652
- " \"ARTIFACT_FIGURES\": ARTIFACT_FIGURES,\n",
653
- " \"ARTIFACT_TABLES\": ARTIFACT_TABLES,\n",
654
- "}\n",
655
- "\n",
656
- "for label, folder in check_dirs.items():\n",
657
- " files = sorted([p.name for p in folder.iterdir() if p.is_file()])\n",
658
- " print(label, \"=\", files)"
659
- ],
660
- "metadata": {
661
- "colab": {
662
- "base_uri": "https://localhost:8080/"
663
- },
664
- "id": "fexa62gDM2c7",
665
- "outputId": "e84626f3-e126-43f8-a408-665ccd7eb914"
666
- },
667
- "id": "fexa62gDM2c7",
668
- "execution_count": 9,
669
- "outputs": [
670
- {
671
- "output_type": "stream",
672
- "name": "stdout",
673
- "text": [
674
- "\n",
675
- "FINAL ARTIFACT CHECK\n",
676
- "FIGURES = ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n",
677
- "TABLES = ['artifacts_manifest.json', 'average_rating_by_age.csv', 'business_insights_report.txt', 'category_return_rate.csv', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n",
678
- "OUTPUTS = ['artifacts_manifest.json', 'average_rating_by_age.csv', 'average_rating_by_age.png', 'business_insights_report.txt', 'category_return_rate.csv', 'category_return_rate.png', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'feature_importance.png', 'monthly_return_rate.csv', 'monthly_return_rate.png', 'negative_keyword_counts.csv', 'negative_keyword_counts.png', 'rating_distribution.csv', 'rating_distribution.png', 'recommendation_by_class.csv', 'recommendation_by_class.png']\n",
679
- "OUTPUT_FIGURES = ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n",
680
- "OUTPUT_TABLES = ['average_rating_by_age.csv', 'business_insights_report.txt', 'category_return_rate.csv', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n",
681
- "ARTIFACTS = ['artifacts_manifest.json', 'average_rating_by_age.csv', 'average_rating_by_age.png', 'business_insights_report.txt', 'category_return_rate.csv', 'category_return_rate.png', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'feature_importance.png', 'monthly_return_rate.csv', 'monthly_return_rate.png', 'negative_keyword_counts.csv', 'negative_keyword_counts.png', 'rating_distribution.csv', 'rating_distribution.png', 'recommendation_by_class.csv', 'recommendation_by_class.png']\n",
682
- "ARTIFACT_FIGURES = ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n",
683
- "ARTIFACT_TABLES = ['average_rating_by_age.csv', 'business_insights_report.txt', 'category_return_rate.csv', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n"
684
- ]
685
- }
686
- ]
687
- },
688
- {
689
- "cell_type": "code",
690
- "source": [
691
- "# ==================================================\n",
692
- "# FORCE DASHBOARD ARTIFACTS FOR SE21 HUGGING FACE APP\n",
693
- "# Put this as the VERY LAST CELL of pythonanalysis.ipynb\n",
694
- "# ==================================================\n",
695
- "\n",
696
- "import os\n",
697
- "import json\n",
698
- "from pathlib import Path\n",
699
- "\n",
700
- "import pandas as pd\n",
701
- "import numpy as np\n",
702
- "\n",
703
- "import matplotlib\n",
704
- "matplotlib.use(\"Agg\")\n",
705
- "import matplotlib.pyplot as plt\n",
706
- "\n",
707
- "# Detect runtime\n",
708
- "if Path(\"/app\").exists():\n",
709
- " BASE_PATH = Path(\"/app\")\n",
710
- "elif Path(\"/content\").exists():\n",
711
- " BASE_PATH = Path(\"/content\")\n",
712
- "else:\n",
713
- " BASE_PATH = Path.cwd()\n",
714
- "\n",
715
- "# THESE ARE THE EXACT FOLDERS app.py READS\n",
716
- "PY_FIG_DIR = BASE_PATH / \"artifacts\" / \"py\" / \"figures\"\n",
717
- "PY_TAB_DIR = BASE_PATH / \"artifacts\" / \"py\" / \"tables\"\n",
718
- "\n",
719
- "PY_FIG_DIR.mkdir(parents=True, exist_ok=True)\n",
720
- "PY_TAB_DIR.mkdir(parents=True, exist_ok=True)\n",
721
- "\n",
722
- "print(\"Saving dashboard artifacts to:\")\n",
723
- "print(\"Figures:\", PY_FIG_DIR)\n",
724
- "print(\"Tables:\", PY_TAB_DIR)\n",
725
- "\n",
726
- "# Find CSV files (sorted so the matches[0] fallbacks below do not depend on directory-scan order)\n",
727
- "csv_paths = sorted(\n",
728
- "    p for p in BASE_PATH.rglob(\"*.csv\")\n",
729
- "    if \"sample_data\" not in str(p)\n",
730
- "    and \"artifacts\" not in str(p)\n",
731
- "    and \"outputs\" not in str(p)\n",
732
- "    and \"figures\" not in str(p)\n",
733
- "    and \"tables\" not in str(p)\n",
734
- ")\n",
735
- "\n",
736
- "print(\"CSV files found:\")\n",
737
- "for p in csv_paths:\n",
738
- " print(\"-\", p)\n",
739
- "\n",
740
- "# Find reviews dataset\n",
741
- "reviews_candidates = [\n",
742
- " BASE_PATH / \"data_processed\" / \"reviews_cleaned.csv\",\n",
743
- " BASE_PATH / \"Womens Clothing E-Commerce Reviews.csv\",\n",
744
- "]\n",
745
- "\n",
746
- "reviews_path = next((p for p in reviews_candidates if p.exists()), None)\n",
747
- "\n",
748
- "if reviews_path is None:\n",
749
- " matches = [\n",
750
- " p for p in csv_paths\n",
751
- " if \"clothing\" in p.name.lower() or \"review\" in p.name.lower()\n",
752
- " ]\n",
753
- " reviews_path = matches[0] if matches else None\n",
754
- "\n",
755
- "# Find returns dataset\n",
756
- "returns_candidates = [\n",
757
- " BASE_PATH / \"data_processed\" / \"returns_input.csv\",\n",
758
- " BASE_PATH / \"data_processed\" / \"returns_cleaned.csv\",\n",
759
- " BASE_PATH / \"ecommerce_returns_cleaned.csv\",\n",
760
- " BASE_PATH / \"data_processed\" / \"synthetic_return_risk.csv\",\n",
761
- "]\n",
762
- "\n",
763
- "returns_path = next((p for p in returns_candidates if p.exists()), None)\n",
764
- "\n",
765
- "if returns_path is None:\n",
766
- " matches = [\n",
767
- " p for p in csv_paths\n",
768
- " if \"return\" in p.name.lower()\n",
769
- " ]\n",
770
- " returns_path = matches[0] if matches else None\n",
771
- "\n",
772
- "if reviews_path is None:\n",
773
- " raise FileNotFoundError(\"Could not find reviews CSV.\")\n",
774
- "\n",
775
- "if returns_path is None:\n",
776
- " raise FileNotFoundError(\"Could not find returns CSV.\")\n",
777
- "\n",
778
- "print(\"Using reviews:\", reviews_path)\n",
779
- "print(\"Using returns:\", returns_path)\n",
780
- "\n",
781
- "reviews_df = pd.read_csv(reviews_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
782
- "returns_df = pd.read_csv(returns_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
783
- "\n",
784
- "print(\"Reviews shape:\", reviews_df.shape)\n",
785
- "print(\"Returns shape:\", returns_df.shape)\n",
786
- "\n",
787
- "# --------------------------------------------------\n",
788
- "# 1. Rating distribution\n",
789
- "# --------------------------------------------------\n",
790
- "if \"Rating\" in reviews_df.columns:\n",
791
- " rating_distribution = (\n",
792
- " reviews_df[\"Rating\"]\n",
793
- " .dropna()\n",
794
- " .value_counts()\n",
795
- " .sort_index()\n",
796
- " .reset_index()\n",
797
- " )\n",
798
- " rating_distribution.columns = [\"rating\", \"count\"]\n",
799
- "\n",
800
- " rating_distribution.to_csv(PY_TAB_DIR / \"rating_distribution.csv\", index=False)\n",
801
- "\n",
802
- " plt.figure(figsize=(7, 4))\n",
803
- " plt.bar(rating_distribution[\"rating\"].astype(str), rating_distribution[\"count\"])\n",
804
- " plt.title(\"Distribution of Customer Ratings\")\n",
805
- " plt.xlabel(\"Rating\")\n",
806
- " plt.ylabel(\"Number of Reviews\")\n",
807
- " plt.tight_layout()\n",
808
- " plt.savefig(PY_FIG_DIR / \"rating_distribution.png\", dpi=150, bbox_inches=\"tight\")\n",
809
- " plt.close()\n",
810
- "\n",
811
- "# --------------------------------------------------\n",
812
- "# 2. Sentiment counts for app's sentiment chart\n",
813
- "# The app specifically looks for sentiment_counts_sampled.csv\n",
814
- "# --------------------------------------------------\n",
815
- "if \"Rating\" in reviews_df.columns:\n",
816
- " temp = reviews_df.copy()\n",
817
- "\n",
818
- "    def rating_to_sentiment(r):\n",
819
- "        \"\"\"Map a rating to 'negative' (<= 2), 'neutral' (3, missing or unparseable) or 'positive'.\"\"\"\n",
820
- "        try:\n",
821
- "            r = float(r)\n",
822
- "        except (TypeError, ValueError, OverflowError):\n",
823
- "            return \"neutral\"\n",
824
- "        if np.isnan(r):  # float(NaN) succeeds and every comparison with NaN is False\n",
825
- "            return \"neutral\"\n",
826
- "        if r <= 2:\n",
827
- "            return \"negative\"\n",
828
- "        return \"neutral\" if r == 3 else \"positive\"\n",
829
- "\n",
830
- " temp[\"sentiment\"] = temp[\"Rating\"].apply(rating_to_sentiment)\n",
831
- "\n",
832
- " group_col = \"Class Name\" if \"Class Name\" in temp.columns else None\n",
833
- "\n",
834
- " if group_col:\n",
835
- " sentiment_counts = (\n",
836
- " temp.groupby([group_col, \"sentiment\"])\n",
837
- " .size()\n",
838
- " .unstack(fill_value=0)\n",
839
- " .reset_index()\n",
840
- " .head(15)\n",
841
- " )\n",
842
- " sentiment_counts = sentiment_counts.rename(columns={group_col: \"title\"})\n",
843
- " else:\n",
844
- " sentiment_counts = (\n",
845
- " temp[\"sentiment\"]\n",
846
- " .value_counts()\n",
847
- " .to_frame()\n",
848
- " .T\n",
849
- " .reset_index(drop=True)\n",
850
- " )\n",
851
- " sentiment_counts.insert(0, \"title\", \"All Reviews\")\n",
852
- "\n",
853
- " for col in [\"negative\", \"neutral\", \"positive\"]:\n",
854
- " if col not in sentiment_counts.columns:\n",
855
- " sentiment_counts[col] = 0\n",
856
- "\n",
857
- " sentiment_counts[[\"title\", \"negative\", \"neutral\", \"positive\"]].to_csv(\n",
858
- " PY_TAB_DIR / \"sentiment_counts_sampled.csv\",\n",
859
- " index=False\n",
860
- " )\n",
861
- "\n",
862
- " # Also save a normal figure\n",
863
- " sentiment_total = temp[\"sentiment\"].value_counts().reindex(\n",
864
- " [\"negative\", \"neutral\", \"positive\"],\n",
865
- " fill_value=0\n",
866
- " )\n",
867
- "\n",
868
- " plt.figure(figsize=(7, 4))\n",
869
- " plt.bar(sentiment_total.index, sentiment_total.values)\n",
870
- " plt.title(\"Review Sentiment Distribution\")\n",
871
- " plt.xlabel(\"Sentiment\")\n",
872
- " plt.ylabel(\"Number of Reviews\")\n",
873
- " plt.tight_layout()\n",
874
- " plt.savefig(PY_FIG_DIR / \"sentiment_distribution.png\", dpi=150, bbox_inches=\"tight\")\n",
875
- " plt.close()\n",
876
- "\n",
877
- "# --------------------------------------------------\n",
878
- "# 3. Category return rate\n",
879
- "# --------------------------------------------------\n",
880
- "return_col = None\n",
881
- "for candidate in [\"likely_return\", \"synthetic_return_risk\", \"returned\", \"return_flag\"]:\n",
882
- " if candidate in returns_df.columns:\n",
883
- " return_col = candidate\n",
884
- " break\n",
885
- "\n",
886
- "category_col = None\n",
887
- "for candidate in [\"product_category_name\", \"category\", \"Class Name\", \"product_id\"]:\n",
888
- " if candidate in returns_df.columns:\n",
889
- " category_col = candidate\n",
890
- " break\n",
891
- "\n",
892
- "if return_col is not None:\n",
893
- " returns_df[return_col] = pd.to_numeric(returns_df[return_col], errors=\"coerce\")\n",
894
- "\n",
895
- "if return_col is not None and category_col is not None:\n",
896
- " category_return_rate = (\n",
897
- " returns_df.groupby(category_col)[return_col]\n",
898
- " .mean()\n",
899
- " .sort_values(ascending=False)\n",
900
- " .head(15)\n",
901
- " .reset_index()\n",
902
- " )\n",
903
- " category_return_rate.columns = [\"category\", \"return_rate\"]\n",
904
- "\n",
905
- " category_return_rate.to_csv(PY_TAB_DIR / \"category_return_rate.csv\", index=False)\n",
906
- "\n",
907
- " plt.figure(figsize=(11, 5))\n",
908
- " plt.bar(category_return_rate[\"category\"].astype(str), category_return_rate[\"return_rate\"])\n",
909
- " plt.title(\"Highest Return-Rate Categories\")\n",
910
- " plt.xlabel(\"Category\")\n",
911
- " plt.ylabel(\"Return Rate\")\n",
912
- " plt.xticks(rotation=75)\n",
913
- " plt.tight_layout()\n",
914
- " plt.savefig(PY_FIG_DIR / \"category_return_rate.png\", dpi=150, bbox_inches=\"tight\")\n",
915
- " plt.close()\n",
916
- "\n",
917
- "    # The app template's AI fallback looks for this filename when answering \"top\" questions,\n",
918
- "    # so we reuse it for the highest return-rate categories (the \"units_sold\" column holds return rates).\n",
919
- " top_titles_by_units_sold = category_return_rate.copy()\n",
920
- " top_titles_by_units_sold.columns = [\"title\", \"units_sold\"]\n",
921
- " top_titles_by_units_sold.to_csv(PY_TAB_DIR / \"top_titles_by_units_sold.csv\", index=False)\n",
922
- "\n",
923
- "# --------------------------------------------------\n",
924
- "# 4. Dashboard time-series file\n",
925
- "# The app's dashboard chart specifically looks for df_dashboard.csv\n",
926
- "# --------------------------------------------------\n",
927
- "if \"order_purchase_timestamp\" in returns_df.columns and return_col is not None:\n",
928
- " ts = returns_df.copy()\n",
929
- " ts[\"order_purchase_timestamp\"] = pd.to_datetime(\n",
930
- " ts[\"order_purchase_timestamp\"],\n",
931
- " errors=\"coerce\"\n",
932
- " )\n",
933
- " ts = ts.dropna(subset=[\"order_purchase_timestamp\"])\n",
934
- "\n",
935
- " if not ts.empty:\n",
936
- " dashboard_df = (\n",
937
- " ts.set_index(\"order_purchase_timestamp\")\n",
938
- " .resample(\"M\")\n",
939
- " .agg(\n",
940
- " return_rate=(return_col, \"mean\"),\n",
941
- " orders=(return_col, \"count\")\n",
942
- " )\n",
943
- " .reset_index()\n",
944
- " )\n",
945
- " dashboard_df = dashboard_df.rename(columns={\"order_purchase_timestamp\": \"month\"})\n",
946
- " else:\n",
947
- " dashboard_df = pd.DataFrame({\n",
948
- " \"month\": pd.date_range(\"2024-01-01\", periods=3, freq=\"M\"),\n",
949
- " \"return_rate\": [0, 0, 0],\n",
950
- " \"orders\": [0, 0, 0],\n",
951
- " })\n",
952
- "else:\n",
953
- " dashboard_df = pd.DataFrame({\n",
954
- " \"month\": pd.date_range(\"2024-01-01\", periods=3, freq=\"M\"),\n",
955
- " \"return_rate\": [0, 0, 0],\n",
956
- " \"orders\": [0, 0, 0],\n",
957
- " })\n",
958
- "\n",
959
- "dashboard_df.to_csv(PY_TAB_DIR / \"df_dashboard.csv\", index=False)\n",
960
- "\n",
961
- "plt.figure(figsize=(9, 4))\n",
962
- "plt.plot(pd.to_datetime(dashboard_df[\"month\"]), dashboard_df[\"return_rate\"], marker=\"o\")\n",
963
- "plt.title(\"Monthly Estimated Return Rate\")\n",
964
- "plt.xlabel(\"Month\")\n",
965
- "plt.ylabel(\"Return Rate\")\n",
966
- "plt.tight_layout()\n",
967
- "plt.savefig(PY_FIG_DIR / \"monthly_return_rate.png\", dpi=150, bbox_inches=\"tight\")\n",
968
- "plt.close()\n",
969
- "\n",
970
- "# --------------------------------------------------\n",
971
- "# 5. KPIs\n",
972
- "# --------------------------------------------------\n",
973
- "return_rate_mean = returns_df[return_col].mean() if return_col is not None else None\n",
974
- "kpis = {\n",
975
- "    \"reviews_rows\": int(len(reviews_df)),\n",
976
- "    \"returns_rows\": int(len(returns_df)),\n",
977
- "    \"n_titles\": int(reviews_df[\"Clothing ID\"].nunique()) if \"Clothing ID\" in reviews_df.columns else int(len(reviews_df)),\n",
978
- "    \"n_months\": int(len(dashboard_df)),\n",
979
- "    \"total_units_sold\": int(len(returns_df)),\n",
980
- "    \"estimated_return_rate\": float(return_rate_mean) if pd.notna(return_rate_mean) else None,  # NaN is not valid JSON; export null\n",
981
- "}\n",
982
- "with open(PY_TAB_DIR / \"kpis.json\", \"w\", encoding=\"utf-8\") as f:\n",
983
- "    json.dump(kpis, f, indent=2)\n",
984
- "\n",
985
- "# --------------------------------------------------\n",
986
- "# Final verification\n",
987
- "# --------------------------------------------------\n",
988
- "print(\"\\nFORCE ARTIFACT CELL RAN SUCCESSFULLY\")\n",
989
- "print(\"Figures now in app-readable folder:\")\n",
990
- "print(sorted([p.name for p in PY_FIG_DIR.glob(\"*\")]))\n",
991
- "\n",
992
- "print(\"Tables now in app-readable folder:\")\n",
993
- "print(sorted([p.name for p in PY_TAB_DIR.glob(\"*\")]))"
994
- ],
995
- "metadata": {
996
- "id": "G-jXRriWP1TW",
997
- "outputId": "23349a23-0bdc-476f-fb72-8e388be9630c",
998
- "colab": {
999
- "base_uri": "https://localhost:8080/"
1000
- }
1001
- },
1002
- "id": "G-jXRriWP1TW",
1003
- "execution_count": 10,
1004
- "outputs": [
1005
- {
1006
- "output_type": "stream",
1007
- "name": "stdout",
1008
- "text": [
1009
- "Saving dashboard artifacts to:\n",
1010
- "Figures: /content/artifacts/py/figures\n",
1011
- "Tables: /content/artifacts/py/tables\n",
1012
- "CSV files found:\n",
1013
- "- /content/Womens Clothing E-Commerce Reviews.csv\n",
1014
- "- /content/ecommerce_returns_cleaned.csv\n",
1015
- "Using reviews: /content/Womens Clothing E-Commerce Reviews.csv\n",
1016
- "Using returns: /content/ecommerce_returns_cleaned.csv\n",
1017
- "Reviews shape: (23486, 10)\n",
1018
- "Returns shape: (113314, 29)\n",
1019
- "\n",
1020
- "FORCE ARTIFACT CELL RAN SUCCESSFULLY\n",
1021
- "Figures now in app-readable folder:\n",
1022
- "['category_return_rate.png', 'monthly_return_rate.png', 'rating_distribution.png', 'sentiment_distribution.png']\n",
1023
- "Tables now in app-readable folder:\n",
1024
- "['category_return_rate.csv', 'df_dashboard.csv', 'kpis.json', 'rating_distribution.csv', 'sentiment_counts_sampled.csv', 'top_titles_by_units_sold.csv']\n"
1025
- ]
1026
- }
1027
- ]
1028
- }
1029
- ],
1030
- "metadata": {
1031
- "kernelspec": {
1032
- "display_name": "Python 3",
1033
- "language": "python",
1034
- "name": "python3"
1035
- },
1036
- "language_info": {
1037
- "name": "python",
1038
- "version": "3.10"
1039
- },
1040
- "colab": {
1041
- "provenance": []
1042
- }
1043
- },
1044
- "nbformat": 4,
1045
- "nbformat_minor": 5
1046
- }