Jbdddsai committed on
Commit
e7ab06c
1 Parent(s): 2e09c0b

initial commit

Browse files
Files changed (4) hide show
  1. Dockerfile +22 -0
  2. README.md +4 -3
  3. app.py +322 -0
  4. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10.14-slim

# Install the pinned dependencies before copying the application sources.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Create an unprivileged user (UID 1000) and run everything below as that user.
RUN useradd -m -u 1000 user
USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Single copy of the sources, owned by the runtime user. CMD runs relative to
# WORKDIR ($HOME/app), so no additional root-owned copy under /app is needed.
COPY --chown=user . $HOME/app

EXPOSE 7860

CMD ["python", "app.py"]
README.md CHANGED
@@ -6,6 +6,7 @@ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  short_description: Interactive web application to get insights from reviews
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
6
  sdk: docker
7
  pinned: false
8
  short_description: Interactive web application to get insights from reviews
9
+ datasets:
10
+ - Jbddai/customer_reviews
11
+ app_port: 7860
12
+ ---
app.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dash import Dash, dcc, html, Input, Output, State
2
+ import dash.dependencies as dd
3
+ import plotly.express as px
4
+ import pandas as pd
5
+
6
# Load the clustered review tables (one CSV per sentiment) from the dataset repo.
df_result_bad_distil_2 = pd.read_csv(
    "hf://datasets/Jbddai/customer_reviews/bad_distil_2_with_cluster_labels_cleaned_company.csv"
)
df = df_result_bad_distil_2

df_result_good_distil_2 = pd.read_csv(
    "hf://datasets/Jbddai/customer_reviews/good_distil_2_with_cluster_labels_cleaned_company.csv"
)
# The module-level alias `df` ends up bound to the "good" frame.
df = df_result_good_distil_2
13
+
14
+
15
def preprocess_data_for_slider_marks(df):
    """Compute slider bounds and tick marks for ``labels`` and ``cluster_rank``.

    For each of the two columns the minimum and maximum are taken, and a mark
    is placed every 10 steps starting at the minimum. The text of each mark is
    the mark position plus one.

    Args:
        df: DataFrame with integer-valued ``labels`` and ``cluster_rank`` columns.

    Returns:
        Tuple ``(label_marks, min_label, max_label, cluster_rank_marks,
        min_cluster_rank, max_cluster_rank)``.
    """

    def _marks_and_bounds(column):
        # Bounds are returned exactly as pandas reports them.
        lowest = df[column].min()
        highest = df[column].max()
        ticks = {}
        for position in range(lowest, highest + 1, 10):
            ticks[position] = str(position + 1)
        return ticks, lowest, highest

    label_ticks, label_low, label_high = _marks_and_bounds("labels")
    rank_ticks, rank_low, rank_high = _marks_and_bounds("cluster_rank")

    return label_ticks, label_low, label_high, rank_ticks, rank_low, rank_high
25
+
26
+
27
# Initial slider bounds and tick marks are taken from the "good" reviews,
# which matches the dropdown's default value ("gut").
label_marks, min_label, max_label, cluster_rank_marks, min_cluster_rank, max_cluster_rank = (
    preprocess_data_for_slider_marks(df_result_good_distil_2)
)

sentiment_options = [
    {"label": "gut", "value": "gut"},
    {"label": "schlecht", "value": "schlecht"},
]

app = Dash(__name__)

# Page structure: a block of controls (sentiment dropdown, two range sliders
# with reset buttons, label toggle), the scatter plot, and the LLM prompt
# generator (button plus an initially hidden text area).
app.layout = html.Div(
    [
        html.Div(
            [
                html.H4("Interactive Plot of Customer Reviews"),
                html.Div(
                    [
                        html.P("Select sentiment:"),
                        dcc.Dropdown(
                            id="sentiment-dropdown",
                            options=sentiment_options,
                            value="gut",
                            style={"width": "50%", "margin": "auto"},
                            clearable=False,
                            multi=False,
                        ),
                    ],
                    style={"width": "50%", "margin": "auto"},
                ),
                html.Div(
                    [
                        html.P("Select range of labels:"),
                        # Reuse the precomputed bounds/marks instead of
                        # recomputing them inline for every property.
                        dcc.RangeSlider(
                            id="label-range-slider",
                            min=min_label,
                            max=max_label,
                            step=1,
                            value=[min_label, max_label],
                            marks=label_marks,
                            tooltip={"always_visible": True, "placement": "bottom"},
                        ),
                        html.Button("Reset Range", id="reset-button", n_clicks=0),
                    ],
                    style={"width": "50%", "margin": "auto"},
                ),
                html.Div(
                    [
                        html.P("Select range of cluster by rank/popularity (by number of reviews descending):"),
                        dcc.RangeSlider(
                            id="cluster-rank-slider",
                            min=min_cluster_rank,
                            max=max_cluster_rank,
                            step=1,
                            value=[min_cluster_rank, max_cluster_rank],
                            marks=cluster_rank_marks,
                            tooltip={"always_visible": True, "placement": "bottom"},
                        ),
                        html.Button("Reset Cluster Rank", id="reset-cluster-button", n_clicks=0),
                    ],
                    style={"width": "50%", "margin": "auto"},
                ),
                html.Div(
                    [
                        html.P("Show Cluster Labels:"),
                        dcc.Checklist(
                            id="show-cluster-labels",
                            options=[{"label": "Show", "value": "on"}],
                            # Unchecked by default: an empty selection rather
                            # than a value that is not among the options.
                            value=[],
                        ),
                    ],
                    style={"width": "50%", "margin": "auto"},
                ),
            ],
            style={"position": "relative", "zIndex": "1001", "marginBottom": "20px"},
        ),
        dcc.Graph(
            id="scatter-plot",
            style={
                "height": "80vh",
                "width": "90vw",
                "position": "relative",
                "zIndex": "999",
            },
        ),
        html.Div(
            [html.Button("Generate LLM Prompt from current selection", id="generate-cluster-button", n_clicks=0)],
            style={"width": "50%", "margin": "auto"},
        ),
        html.Div(
            [
                html.P("Prompt for LLM:"),
                # Hidden until the generate button has been clicked.
                dcc.Textarea(
                    id="cluster-text-output",
                    style={"width": "100%", "height": "200px", "display": "none"},
                    value="",
                ),
            ],
            style={"width": "50%", "margin": "auto"},
        ),
    ]
)
149
+
150
+
151
@app.callback(
    Output("scatter-plot", "figure"),
    [
        Input("label-range-slider", "value"),
        Input("cluster-rank-slider", "value"),
        Input("sentiment-dropdown", "value"),
        Input("show-cluster-labels", "value"),
    ],
)
def update_scatter_plot(label_range, cluster_rank_range, selected_sentiment, show_cluster_labels):
    """Rebuild the scatter figure for the current control values.

    Args:
        label_range: ``[low, high]`` bounds (inclusive) on the ``labels`` column.
        cluster_rank_range: ``[low, high]`` bounds (inclusive) on ``cluster_rank``.
        selected_sentiment: ``"gut"`` selects the good-review frame; any other
            value selects the bad-review frame.
        show_cluster_labels: Checklist selection; centroid annotations are
            drawn when it contains ``"on"``.

    Returns:
        A Plotly figure with the clustered points coloured by label and the
        rows labelled ``-1`` drawn as a separate light-gray trace.
    """
    annotate_centroids = "on" in show_cluster_labels

    source = df_result_good_distil_2 if selected_sentiment == "gut" else df_result_bad_distil_2

    within_labels = source["labels"].between(label_range[0], label_range[1])
    within_ranks = source["cluster_rank"].between(cluster_rank_range[0], cluster_rank_range[1])
    visible = source[within_labels & within_ranks]

    # Label -1 marks rows that belong to no cluster.
    outliers = visible[visible.labels == -1]
    clustered = visible[visible.labels != -1]

    hover_columns = [
        "summary_good_bad",
        "sentiment",
        "cluster_rank",
        "cluster_count",
        "clean_review_br",
    ]
    fig = px.scatter(
        clustered,
        x="x",
        y="y",
        hover_data=hover_columns,
        hover_name="cluster_label",
        color="labels",
        color_continuous_scale="rainbow",
        opacity=0.7,
    )

    if annotate_centroids:
        # One annotation per cluster, placed at the mean position of its points.
        centroid_spec = {
            "x": "mean",
            "y": "mean",
            "cluster_label": "first",
            "cluster_count": "count",
        }
        centroids = clustered.groupby("labels", sort=False).agg(centroid_spec)
        for center_x, center_y, title, n_reviews in zip(
            centroids["x"], centroids["y"], centroids["cluster_label"], centroids["cluster_count"]
        ):
            fig.add_annotation(
                x=center_x,
                y=center_y,
                text=f"{title}, #reviews: {n_reviews}",
                showarrow=False,
            )

    outlier_marker = dict(color="lightgray", opacity=0.5, size=5.0)
    fig.add_scatter(
        x=outliers["x"],
        y=outliers["y"],
        mode="markers",
        marker=outlier_marker,
        name="No cluster",
        selectedpoints=False,
        hoverinfo="skip",
    )

    fig.update_layout(coloraxis_colorbar=dict(len=0.9, x=1.0), height=600)
    # Applies to every trace whose mode is "markers".
    fig.update_traces(marker=dict(size=3), selector=dict(mode="markers"))

    return fig
224
+
225
+
226
@app.callback(
    [
        Output("label-range-slider", "marks"),
        Output("label-range-slider", "min"),
        Output("label-range-slider", "max"),
        Output("cluster-rank-slider", "marks"),
        Output("cluster-rank-slider", "min"),
        Output("cluster-rank-slider", "max"),
    ],
    [Input("sentiment-dropdown", "value")],
)
def update_slider_marks(selected_sentiment):
    """Refresh both sliders' marks and bounds for the selected sentiment.

    ``"gut"`` selects the good-review frame; any other value selects the
    bad-review frame. The six returned values line up with the six outputs
    declared above, in order.
    """
    source = df_result_good_distil_2 if selected_sentiment == "gut" else df_result_bad_distil_2

    (
        label_ticks,
        label_low,
        label_high,
        rank_ticks,
        rank_low,
        rank_high,
    ) = preprocess_data_for_slider_marks(source)

    return label_ticks, label_low, label_high, rank_ticks, rank_low, rank_high
255
+
256
+
257
@app.callback(
    Output("label-range-slider", "value"),
    [Input("reset-button", "n_clicks")],
    [State("label-range-slider", "min"), State("label-range-slider", "max")],
)
def reset_label_slider(n_clicks, min_val, max_val):
    """Set the label slider's selection to its full ``[min, max]`` span.

    ``n_clicks`` only triggers the callback; its value is not read.
    """
    full_span = [min_val, max_val]
    return full_span
264
+
265
+
266
@app.callback(
    Output("cluster-rank-slider", "value"),
    [Input("reset-cluster-button", "n_clicks")],
    [State("cluster-rank-slider", "min"), State("cluster-rank-slider", "max")],
)
def reset_cluster_slider(n_clicks, min_val, max_val):
    """Set the cluster-rank slider's selection to its full ``[min, max]`` span.

    ``n_clicks`` only triggers the callback; its value is not read.
    """
    full_span = [min_val, max_val]
    return full_span
273
+
274
+
275
@app.callback(
    Output("cluster-text-output", "style"),
    [Input("generate-cluster-button", "n_clicks")],
)
def show_cluster_text_output(n_clicks):
    """Return the prompt text area's style: visible once the button was clicked."""
    display_mode = "block" if n_clicks > 0 else "none"
    return {"width": "100%", "height": "200px", "display": display_mode}
284
+
285
+
286
@app.callback(
    Output("cluster-text-output", "value"),
    [Input("generate-cluster-button", "n_clicks")],
    [State("cluster-rank-slider", "value"), State("sentiment-dropdown", "value")],
)
def update_cluster_text_output(n_clicks, cluster_rank_range, selected_sentiment):
    """Build the LLM prompt text from the currently selected cluster-rank range.

    Args:
        n_clicks: Click count of the generate button; nothing is generated
            while it is zero or unset.
        cluster_rank_range: ``[low, high]`` bounds (inclusive) on ``cluster_rank``.
        selected_sentiment: ``"gut"`` selects the good-review frame; any other
            value selects the bad-review frame.

    Returns:
        The instruction text followed by one ``CLUSTER - <label>`` section per
        cluster found in a fixed-seed 10% sample of the selection, or ``""``
        before the first click.
    """
    if not n_clicks:
        return ""

    source = df_result_good_distil_2 if selected_sentiment == "gut" else df_result_bad_distil_2

    selection = source[source["cluster_rank"].between(cluster_rank_range[0], cluster_rank_range[1])]

    # assign() returns a new frame rather than writing into the filtered
    # slice, which avoids pandas' SettingWithCopyWarning.
    selection = selection.assign(
        summary_good_bad=selection["summary_good_bad"].fillna("").astype(str)
    )

    # Fixed seed: the same selection always produces the same prompt.
    sampled_data = selection.sample(frac=0.1, random_state=42)

    grouped_data = (
        sampled_data.groupby("cluster_label", sort=False)["summary_good_bad"].agg("\n".join).reset_index()
    )

    prompt_instruction = """Analysiere die nach ### folgenden CLUSTER "Clustertitel", die einzelene Bestandteile von Bewertungen erhalten\nund leite pro Cluster eine Business Massnahme ab um das Hauptproblem des Clusters zu lösen oder zu Verbessern.\nGib das Cluster mit seinem "Clustertitel" sowie die dazugehörige Maßnahme zurück.\n###"""

    cluster_sections = [
        f"\nCLUSTER - {cluster_title}\n{summaries}"
        for cluster_title, summaries in zip(grouped_data["cluster_label"], grouped_data["summary_good_bad"])
    ]
    return prompt_instruction + "\n\n".join(cluster_sections)
319
+
320
+
321
if __name__ == "__main__":
    import os

    # debug=True enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute code. Because the app listens on
    # 0.0.0.0, debugging stays off unless explicitly requested via DASH_DEBUG.
    debug_enabled = os.environ.get("DASH_DEBUG", "").strip().lower() in {"1", "true", "yes"}
    app.run_server(debug=debug_enabled, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ dash==2.15.0
2
+ fsspec==2024.2.0
3
+ huggingface-hub==0.20.3
4
+ pandas==2.2.0
5
+ plotly==5.18.0