greco committed
Commit 5219889
1 Parent(s): 87c94b8

update codes

app.py CHANGED
@@ -18,6 +18,8 @@ from scipy.stats import zscore
 
 # nlp
 from bertopic import BERTopic
+from transformers import pipeline
+import transformers
 
 # custom
 import survey_analytics_library as LIB
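Note: the two pipelines imported here are instantiated further down from local model folders. A minimal standalone sketch of the same calls, assuming the public Hub checkpoints of the same models (`valhalla/distilbart-mnli-12-1` and `distilbert-base-uncased-finetuned-sst-2-english`; weights download on first use):

```python
from transformers import pipeline

# zero-shot classification: scores each candidate label against the input text
classifier_zero_shot = pipeline(
    task='zero-shot-classification',
    model='valhalla/distilbart-mnli-12-1',
)
result = classifier_zero_shot(
    'Polar bears are losing their habitat to melting sea ice.',
    candidate_labels=['finance', 'politics', 'technology', 'wildlife'],
)
# result is a dict whose 'labels' are sorted by descending 'scores'
print(result['labels'][0], round(result['scores'][0], 2))

# sentiment analysis: NEGATIVE/POSITIVE scores for the same text
classifier_sentiment = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    return_all_scores=True,
)
# exact nesting of the output varies across transformers versions
print(classifier_sentiment('Polar bears are losing their habitat to melting sea ice.'))
```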
@@ -61,6 +63,14 @@ def read_topic_results():
     return topic_results
 topic_results = read_topic_results()
 
+@st.cache
+def read_climate_change_results():
+    sentiment_results = pd.read_csv(data_path+'sentiment_results.csv')
+    zero_shot_results = pd.read_csv(data_path+'zero_shot_results.csv')
+    return sentiment_results, zero_shot_results
+sentiment_results, zero_shot_results = read_climate_change_results()
+
+
 # write title of app
 st.title('DACoP - Survey Analytics')
 st.markdown('''---''')
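The loader above leans on Streamlit's legacy `st.cache`: the CSVs are read once and the returned DataFrames are reused across script reruns. The pipeline loader later in this diff adds `allow_output_mutation=True` because `st.cache` hashes cached return values to detect mutation, which is slow or impossible for model objects. A small sketch of the two variants (file and model paths are placeholders):

```python
import pandas as pd
import streamlit as st
from transformers import pipeline

@st.cache  # plain data: Streamlit hashes the returned DataFrame to detect mutation
def read_results():
    return pd.read_csv('data/sentiment_results.csv')  # path assumed

@st.cache(allow_output_mutation=True)  # skip hashing: pipeline objects are not hashable
def load_sentiment_pipeline():
    return pipeline(task='sentiment-analysis',
                    model='distilbert-base-uncased-finetuned-sst-2-english')
```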
@@ -366,9 +376,184 @@ st.markdown('''---''')
 
 
 st.header('Classifying Text Responses and Sentiment Analysis')
-st.write('''
+st.write(f'''
 With survey responses, sometimes as a business user, we already have a general idea of what responders are talking about and we want to categorise or classify the responses accordingly.
-E.g.
+As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
+Using **Zero-shot Classification**, we can classify responses into one of these four categories.
+As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
+We'll use a different set of 10,000 tweets related to climate change.
+''')
+st.write('\n')
+
+# rename column
+sentiment_results = sentiment_results.rename(columns={'sequence':'Tweet'})
+st.dataframe(sentiment_results[['Tweet']])
+
+@st.cache(allow_output_mutation=True)
+def load_transformer_pipelines():
+    classifier_zero_shot = pipeline(
+        task='zero-shot-classification',
+        model=model_path+'distilbart-mnli-12-1',
+        return_all_scores=True
+    )
+    classifier_sentiment = pipeline(
+        task='sentiment-analysis',
+        model=model_path+'distilbert-base-uncased-finetuned-sst-2-english',
+        return_all_scores=True
+    )
+    return classifier_zero_shot, classifier_sentiment
+classifier_zero_shot, classifier_sentiment = load_transformer_pipelines()
+
+# define candidate labels
+candidate_labels = [
+    'finance',
+    'politics',
+    'technology',
+    'wildlife',
+]
+
+# define sample tweet
+sample_tweet_index = 5000
+
+# define the first and last tweet number
+# create range of index
+tweet_index = sentiment_results.index
+first_tweet = tweet_index[0]
+last_tweet = tweet_index[-1]
 
+st.write(f'''
+As a demonstration, we'll define some categories and pick a tweet to classify and determine its sentiment.
+Feel free to add your own categories or even input your own text!
 ''')
+
+# interactive input for user to define candidate labels and tweet index for analysis
+with st.form('classify_tweets'):
+    # input for labels
+    user_defined_labels = st.text_input('Enter categories (separate categories by comma):', ', '.join(candidate_labels))
+    candidate_labels = user_defined_labels
+    # input for tweet index
+    user_define_tweet = st.number_input(f'Enter tweet index (from {first_tweet} to {last_tweet}) to classify:', min_value=first_tweet, max_value=last_tweet, value=sample_tweet_index)
+    sample_tweet_index = user_define_tweet
+    sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
+    # input for user-defined text
+    user_defined_input = st.text_input('Enter custom text (optional, leave blank to use Tweets):', '')
+    # check if the user has entered any custom text
+    # if user_defined_input is not blank, then override sample_tweet
+    if user_defined_input:
+        sample_tweet = user_defined_input
+
+    # submit form
+    submit = st.form_submit_button('Classify Tweet')
+
 st.write('\n')
+st.write(f'''
+Here are the results:
+''')
+st.write(f'Input Text: *\'{sample_tweet}\'*')
+
+# get predictions from models
+zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
+sentiment_sample = classifier_sentiment(sample_tweet)
+
+# get sentiment
+sentiment_sample = sentiment_sample[1].get('score')
+sentiment_label = 'positive'
+if sentiment_sample < 0.5:
+    sentiment_label = 'negative'
+
+st.write(f'''
+The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}.
+The main category score ranges from 0 to 1, with 1 being very likely.
+
+The full set of scores is: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}.
+The full set of scores adds up to 1.
+
+The sentiment is: **{sentiment_label}** with a score of {round(sentiment_sample, 2)}.
+The sentiment score ranges from 0 to 1, with 1 being very positive.
+''')
+st.write('\n')
+st.write('\n')
+
+# drop unused columns and rename columns
+zero_shot_results = zero_shot_results.drop('labels_scores', axis=1)
+zero_shot_results = zero_shot_results.rename(columns={'sequence':'tweet', 'label':'category'})
+st.write(f'''
+Let's review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
+''')
+
+st.dataframe(zero_shot_results)
+
+st.write(f'''
+We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
+It is likely that those tweets do not naturally fall into one of the defined categories.
+Before performing further analysis on our results, we can set a score threshold to keep only the predictions that we're confident in.
+''')
+st.write('\n')
+
+# interactive input for user to define a score threshold for filtering
+with st.form('classification_score_threshold'):
+    user_defined_threshold = st.number_input('Enter score threshold (between 0.01 and 0.99):', min_value=0.01, max_value=0.99, value=0.7, step=0.05)
+    # submit form
+    submit = st.form_submit_button('Set Threshold')
+st.write('\n')
+
+# filter and keep results with score above defined threshold
+zero_shot_results_clean = zero_shot_results.loc[(zero_shot_results['score'] >= user_defined_threshold)].copy()
+
+# rename columns
+sentiment_results.columns = ['tweet', 'sentiment']
+
+st.write(f'''
+The predictions get better with a higher threshold, but this reduces the final number of tweets available for further analysis.
+Out of the 10,000 tweets, we are now left with {len(zero_shot_results_clean)}.
+We also add on the sentiment score for the tweets; the score here ranges from 0 (most negative) to 1 (most positive).
+''')
+
+# merge in sentiment score on index
+# drop unused columns
+classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
+classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
+st.dataframe(classification_sentiment_df)
+
+st.write(f'''
+The difficult part of zero-shot classification is defining the right set of categories for each business case.
+Some trial and error is required to find the appropriate words that return the optimal results.
+''')
+st.write('\n')
+
+# group by category, count tweets and get mean of sentiment
+classification_sentiment_agg = classification_sentiment_df.groupby(['category']).agg({'tweet':'count', 'sentiment':'mean'}).reset_index()
+classification_sentiment_agg = classification_sentiment_agg.rename(columns={'tweet':'count'})
+
+st.write(f'''
+Finally, we can visualise the percentage of tweets in each category and the respective average sentiment scores.
+''')
+
+fig = px.pie(
+    classification_sentiment_agg,
+    values='count',
+    names='category',
+    hole=0.35,
+    title='Percentage of Tweets in Each Category',
+    template='simple_white',
+    width=1000,
+    height=600
+)
+fig.update_traces(textposition='inside', textinfo='percent+label')
+st.plotly_chart(fig)
+
+fig = px.bar(
+    classification_sentiment_agg,
+    x='category',
+    y='sentiment',
+    title='Average Sentiment of Tweets in Each Category <br><sup>Overall, the sentiment of the tweets is on the negative side.</sup>',
+    template='simple_white',
+    width=1000,
+    height=600
+)
+fig.update_yaxes(range=[0, 1])
+fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
+st.plotly_chart(fig)
+
+st.write('\n')
+st.markdown('''---''')
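The post-processing in this hunk (threshold filter, index-aligned merge, per-category aggregation) can be checked on a toy frame; a self-contained sketch with made-up rows:

```python
import pandas as pd

# stand-ins for zero_shot_results and sentiment_results (values made up)
zero_shot = pd.DataFrame({
    'tweet': ['tweet a', 'tweet b', 'tweet c'],
    'category': ['wildlife', 'politics', 'finance'],
    'score': [0.92, 0.41, 0.78],
})
sentiment = pd.DataFrame({'sentiment': [0.10, 0.55, 0.30]})

# keep only confident predictions, as the app does with user_defined_threshold
confident = zero_shot.loc[zero_shot['score'] >= 0.7].copy()

# merge sentiment on the shared row index, then average sentiment per category
merged = pd.merge(confident, sentiment, how='left', left_index=True, right_index=True)
agg = merged.groupby('category').agg(count=('tweet', 'count'), sentiment=('sentiment', 'mean')).reset_index()
print(agg)
#    category  count  sentiment
# 0   finance      1        0.3
# 1  wildlife      1        0.1
```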
data/climate_change_tweets.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/imdb.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/sentiment_results.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/zero_shot_results.csv ADDED
The diff for this file is too large to render. See raw diff
 
survey_analytics_library.py CHANGED
@@ -18,126 +18,6 @@ from nltk.corpus import stopwords
 
 
 
-# # create elbow plot with kmeans to find optimal number of clusters
-# def create_elbow_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
-#     '''
-#     create elbow plot with kmeans to find optimal number of clusters based on inertia
-#     where the clusters strikes a balance between being not segmented enough and being too fragmented
-
-#     we look for the point of diminishing returns (also known as the 'elbow') in terms of the inertia,
-#     where inertia is how close the data points are to their respective centers or centroids
-
-#     arguments:
-#     df (df): a dataframe of data to cluster
-#     num_clusters (int): number of clusters to plot
-#     init_method (str): default to 'k-means++', other option is 'random'
-#     n_init (int): default to 10, number of times to run model, cost from the best run will be used
-#     random_state (int): default to 42, random seed used to initialise the model
-#     plot (bool): default to True, option to turn off plots
-#     template (str): default to 'simple_white', change as desired
-#     save (bool): default to False, if True save plot as .html file
-
-#     returns:
-#     a list of inertia for each run
-#     '''
-
-#     # create empty list to store inertia for each run
-#     inertia = []
-#     # define range of clusters to try
-#     k = range(2, num_clusters+1)
-
-#     # loop through number of clusters
-#     for num_clusters in tqdm(k):
-#         # define model
-#         kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
-#         # fit and predict data
-#         kmeans.fit_predict(df)
-#         # get predicted labels
-#         predicted_labels = kmeans.labels_
-#         # append score to list of scores
-#         inertia.append(kmeans.inertia_)
-
-#     # plot elbow plot
-#     if plot:
-#         fig = px.line(
-#             pd.DataFrame({'num_clusters':list(k), 'inertia':inertia}),
-#             x='num_clusters',
-#             y='inertia',
-#             title='Elbow Plot for Optimal Number of Clusters with '+init_method,
-#             markers=True,
-#             template=template,
-#             width=800,
-#             height=500,
-#         )
-#         st.plotly_chart(fig, use_container_width=True)
-#         if save:
-#             fig.write_html('Elbow Plot for Optimal Number of Clusters with '+init_method+'.html')
-
-#     # return
-#     return inertia
-
-
-# # create plot of silhouette scores with sklearn model to find optimal number of clusters
-# def silhouette_score_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
-#     '''
-#     create plot of silhouette score with kmeans to find optimal number of clusters
-#     where the clusters strikes a balance between being not segmented enough and being too fragmented
-#     the closer the score is to 1, the more easily distinguishable are the clusters from each other
-
-#     arguments:
-#     df (df): a dataframe of data to cluster
-#     num_clusters (int): number of clusters to plot
-#     init_method (str): default to 'k-means++', other option is 'random'
-#     n_init (int): default to 10, number of times to run model, cost from the best run will be used
-#     random_state (int): default to 42, random seed used to initialise the model
-#     plot (bool): default to True, option to turn off plots
-#     template (str): default to 'simple_white', change as desired
-#     save (bool): default to False, if True save plot as .html file
-
-#     returns:
-#     a list of silhouette scores for each run
-#     '''
-
-#     # create empty list to store silhoutte scores for each run
-#     silhouette_scores = []
-#     # define range of clusters to try
-#     k = range(2, num_clusters+1)
-
-#     # loop through number of clusters
-#     for num_clusters in tqdm(k):
-#         # define model
-#         kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
-#         # fit and predict data
-#         kmeans.fit_predict(df)
-#         # get predicted labels
-#         predicted_labels = kmeans.labels_
-#         # get silhoutte score
-#         score = silhouette_score(df, predicted_labels)
-#         # append score to list of scores
-#         silhouette_scores.append(score)
-
-#     # plot silhouette scores
-#     if plot:
-#         fig = px.line(
-#             pd.DataFrame({'num_clusters':list(k), 'silhouette_scores':silhouette_scores}),
-#             x='num_clusters',
-#             y='silhouette_scores',
-#             title='Silhouette Scores for Optimal Number of Clusters with '+init_method,
-#             markers=True,
-#             template=template,
-#             width=800,
-#             height=500,
-#         )
-#         st.plotly_chart(fig, use_container_width=True)
-#         if save:
-#             fig.write_html('Silhouette Scores for Optimal Number of Clusters with '+init_method+'.html')
-
-#     # return
-#     return silhouette_scores
-
-
 # replace text with multiple replacements
 def replace_text(string, dict_of_replacements):
     '''
@@ -379,5 +259,41 @@ def convert_zero_shot_classification_output_to_dataframe(model_output):
     # drop unused columns
     results = results.drop(['labels', 'scores'], axis=1)
 
+    # return
+    return results
+
+
+# convert transformer model sentiment classification prediction into dataframe
+def convert_sentiment_classification_output_to_dataframe(text_input, model_output):
+    '''
+    convert sentiment classification output into a dataframe
+
+    the model used, distilbert-base-uncased-finetuned-sst-2-english, outputs a list of lists with two dictionaries;
+    within each dictionary is a label, negative or positive, and the respective score
+    [
+        [
+            {'label': 'NEGATIVE', 'score': 0.18449656665325165},
+            {'label': 'POSITIVE', 'score': 0.8155034780502319}
+        ],
+        ...
+    ]
+    the scores sum up to 1; we extract only the positive score in this function,
+    append the scores to the model's input, and return a dataframe
+
+    arguments:
+    text_input (list): a list of sequences that is input for the model
+    model_output (list): a list of labels and scores
+
+    returns:
+    a dataframe of sequences and sentiment scores
+
+    '''
+    # store model positive scores as dataframe
+    results = pd.DataFrame(model_output)[[1]]
+    # get score from column
+    results = results[1].apply(lambda x: x.get('score'))
+    # store input sequences and scores as dataframe
+    results = pd.DataFrame({'sequence':text_input, 'score':results})
+
     # return
     return results
 
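A quick usage sketch of the new helper, fed the output shape documented in its docstring (scores made up; assumes the module imports cleanly as in app.py):

```python
import survey_analytics_library as LIB

texts = ['first tweet', 'second tweet']
# one [NEGATIVE, POSITIVE] pair per input sequence
model_output = [
    [{'label': 'NEGATIVE', 'score': 0.184}, {'label': 'POSITIVE', 'score': 0.816}],
    [{'label': 'NEGATIVE', 'score': 0.702}, {'label': 'POSITIVE', 'score': 0.298}],
]

df = LIB.convert_sentiment_classification_output_to_dataframe(texts, model_output)
print(df)
#        sequence  score
# 0   first tweet  0.816
# 1  second tweet  0.298
```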