greco committed
Commit 5219889
1 Parent(s): 87c94b8

update codes

app.py CHANGED
@@ -18,6 +18,8 @@ from scipy.stats import zscore
 
 # nlp
 from bertopic import BERTopic
+from transformers import pipeline
+import transformers
 
 # custom
 import survey_analytics_library as LIB
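Note: the two pipelines imported here are instantiated further down from local model folders. A minimal standalone sketch of the same calls, assuming the public Hub checkpoints of the same models (`valhalla/distilbart-mnli-12-1` and `distilbert-base-uncased-finetuned-sst-2-english`; weights download on first use):

```python
from transformers import pipeline

# zero-shot classification: scores each candidate label against the input text
classifier_zero_shot = pipeline(
    task='zero-shot-classification',
    model='valhalla/distilbart-mnli-12-1',
)
result = classifier_zero_shot(
    'Polar bears are losing their habitat to melting sea ice.',
    candidate_labels=['finance', 'politics', 'technology', 'wildlife'],
)
# result is a dict whose 'labels' are sorted by descending 'scores'
print(result['labels'][0], round(result['scores'][0], 2))

# sentiment analysis: NEGATIVE/POSITIVE scores for the same text
classifier_sentiment = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    return_all_scores=True,
)
# exact nesting of the output varies across transformers versions
print(classifier_sentiment('Polar bears are losing their habitat to melting sea ice.'))
```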
@@ -61,6 +63,14 @@ def read_topic_results():
     return topic_results
 topic_results = read_topic_results()
 
+@st.cache
+def read_climate_change_results():
+    sentiment_results = pd.read_csv(data_path+'sentiment_results.csv')
+    zero_shot_results = pd.read_csv(data_path+'zero_shot_results.csv')
+    return sentiment_results, zero_shot_results
+sentiment_results, zero_shot_results = read_climate_change_results()
+
+
 # write title of app
 st.title('DACoP - Survey Analytics')
 st.markdown('''---''')
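The loader above leans on Streamlit's legacy `st.cache`: the CSVs are read once and the returned DataFrames are reused across script reruns. The pipeline loader later in this diff adds `allow_output_mutation=True` because `st.cache` hashes cached return values to detect mutation, which is slow or impossible for model objects. A small sketch of the two variants (file and model paths are placeholders):

```python
import pandas as pd
import streamlit as st
from transformers import pipeline

@st.cache  # plain data: Streamlit hashes the returned DataFrame to detect mutation
def read_results():
    return pd.read_csv('data/sentiment_results.csv')  # path assumed

@st.cache(allow_output_mutation=True)  # skip hashing: pipeline objects are not hashable
def load_sentiment_pipeline():
    return pipeline(task='sentiment-analysis',
                    model='distilbert-base-uncased-finetuned-sst-2-english')
```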
@@ -366,9 +376,184 @@ st.markdown('''---''')
 
 
 st.header('Classifying Text Responses and Sentiment Analysis')
-st.write('''
+st.write(f'''
 With survey responses, sometimes as a business user, we already have a general idea of what responders are talking about and we want to categorise or classify the responses accordingly.
-E.g.
+As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
+Using **Zero-shot Classification**, we can classify responses into one of these four categories.
+As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
+We'll use a different set of 10,000 tweets related to climate change.
+''')
+st.write('\n')
+
+# rename column
+sentiment_results = sentiment_results.rename(columns={'sequence':'Tweet'})
+st.dataframe(sentiment_results[['Tweet']])
+
+@st.cache(allow_output_mutation=True)
+def load_transformer_pipelines():
+    classifier_zero_shot = pipeline(
+        task='zero-shot-classification',
+        model=model_path+'distilbart-mnli-12-1',
+        return_all_scores=True
+    )
+    classifier_sentiment = pipeline(
+        task='sentiment-analysis',
+        model=model_path+'distilbert-base-uncased-finetuned-sst-2-english',
+        return_all_scores=True
+    )
+    return classifier_zero_shot, classifier_sentiment
+classifier_zero_shot, classifier_sentiment = load_transformer_pipelines()
+
+# define candidate labels
+candidate_labels = [
+    'finance',
+    'politics',
+    'technology',
+    'wildlife',
+]
+
+# define sample tweet
+sample_tweet_index = 5000
+
+# define the first and last tweet number
+# create range of index
+tweet_index = sentiment_results.index
+first_tweet = tweet_index[0]
+last_tweet = tweet_index[-1]
 
+st.write(f'''
+As a demonstration, we'll define some categories and pick a tweet to classify and determine its sentiment.
+Feel free to add your own categories or even input your own text!
 ''')
+
+# interactive input for user to define candidate labels and tweet index for analysis
+with st.form('classify_tweets'):
+    # input for labels
+    user_defined_labels = st.text_input('Enter categories (separate categories by comma):', ', '.join(candidate_labels))
+    candidate_labels = user_defined_labels
+    # input for tweet index
+    user_define_tweet = st.number_input(f'Enter tweet index (from {first_tweet} to {last_tweet}) to classify:', min_value=first_tweet, max_value=last_tweet, value=sample_tweet_index)
+    sample_tweet_index = user_define_tweet
+    sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
+    # input for user-defined text
+    user_defined_input = st.text_input('Enter custom text (optional, leave blank to use Tweets):', '')
+    # check if the user has entered any custom text
+    # if user_defined_input is not blank, then override sample_tweet
+    if user_defined_input:
+        sample_tweet = user_defined_input
+
+    # submit form
+    submit = st.form_submit_button('Classify Tweet')
+
 st.write('\n')
+st.write(f'''
+Here are the results:
+''')
+st.write(f'Input Text: *\'{sample_tweet}\'*')
+
+# get predictions from models
+zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
+sentiment_sample = classifier_sentiment(sample_tweet)
+
+# get sentiment
+sentiment_sample = sentiment_sample[1].get('score')
+sentiment_label = 'positive'
+if sentiment_sample < 0.5:
+    sentiment_label = 'negative'
+
+st.write(f'''
+The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}.
+The main category score ranges from 0 to 1, with 1 being very likely.
+
+The full set of scores is: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}.
+The full set of scores adds up to 1.
+
+The sentiment is: **{sentiment_label}** with a score of {round(sentiment_sample, 2)}.
+The sentiment score ranges from 0 to 1, with 1 being very positive.
+''')
+st.write('\n')
+st.write('\n')
+
+# drop unused columns and rename columns
+zero_shot_results = zero_shot_results.drop('labels_scores', axis=1)
+zero_shot_results = zero_shot_results.rename(columns={'sequence':'tweet', 'label':'category'})
+st.write(f'''
+Let's review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
+''')
+
+st.dataframe(zero_shot_results)
+
+st.write(f'''
+We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
+It is likely that those tweets do not naturally fall into one of the defined categories.
+Before performing further analysis on our results, we can set a score threshold to keep only the predictions that we're confident in.
+''')
+st.write('\n')
+
+# interactive input for user to define a score threshold for filtering
+with st.form('classification_score_threshold'):
+    user_defined_threshold = st.number_input('Enter score threshold (between 0.01 and 0.99):', min_value=0.01, max_value=0.99, value=0.7, step=0.05)
+    # submit form
+    submit = st.form_submit_button('Set Threshold')
+st.write('\n')
+
+# filter and keep results with score above defined threshold
+zero_shot_results_clean = zero_shot_results.loc[(zero_shot_results['score'] >= user_defined_threshold)].copy()
+
+# rename columns
+sentiment_results.columns = ['tweet', 'sentiment']
+
+st.write(f'''
+The predictions get better with a higher threshold, but this reduces the final number of tweets available for further analysis.
+Out of the 10,000 tweets, we are now left with {len(zero_shot_results_clean)}.
+We also add on the sentiment score for the tweets; the score here ranges from 0 (most negative) to 1 (most positive).
+''')
+
+# merge in sentiment score on index
+# drop unused columns
+classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
+classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
+st.dataframe(classification_sentiment_df)
+
+st.write(f'''
+The difficult part of zero-shot classification is defining the right set of categories for each business case.
+Some trial and error is required to find the appropriate words that return the optimal results.
+''')
+st.write('\n')
+
+# group by category, count tweets and get mean of sentiment
+classification_sentiment_agg = classification_sentiment_df.groupby(['category']).agg({'tweet':'count', 'sentiment':'mean'}).reset_index()
+classification_sentiment_agg = classification_sentiment_agg.rename(columns={'tweet':'count'})
+
+st.write(f'''
+Finally, we can visualise the percentage of tweets in each category and the respective average sentiment scores.
+''')
+
+fig = px.pie(
+    classification_sentiment_agg,
+    values='count',
+    names='category',
+    hole=0.35,
+    title='Percentage of Tweets in Each Category',
+    template='simple_white',
+    width=1000,
+    height=600
+)
+fig.update_traces(textposition='inside', textinfo='percent+label')
+st.plotly_chart(fig)
+
+fig = px.bar(
+    classification_sentiment_agg,
+    x='category',
+    y='sentiment',
+    title='Average Sentiment of Tweets in Each Category <br><sup>Overall, the sentiment of the tweets is on the negative side.</sup>',
+    template='simple_white',
+    width=1000,
+    height=600
+)
+fig.update_yaxes(range=[0, 1])
+fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
+st.plotly_chart(fig)
+
+st.write('\n')
+st.markdown('''---''')
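The post-processing in this hunk (threshold filter, index-aligned merge, per-category aggregation) can be checked on a toy frame; a self-contained sketch with made-up rows:

```python
import pandas as pd

# stand-ins for zero_shot_results and sentiment_results (values made up)
zero_shot = pd.DataFrame({
    'tweet': ['tweet a', 'tweet b', 'tweet c'],
    'category': ['wildlife', 'politics', 'finance'],
    'score': [0.92, 0.41, 0.78],
})
sentiment = pd.DataFrame({'sentiment': [0.10, 0.55, 0.30]})

# keep only confident predictions, as the app does with user_defined_threshold
confident = zero_shot.loc[zero_shot['score'] >= 0.7].copy()

# merge sentiment on the shared row index, then average sentiment per category
merged = pd.merge(confident, sentiment, how='left', left_index=True, right_index=True)
agg = merged.groupby('category').agg(count=('tweet', 'count'), sentiment=('sentiment', 'mean')).reset_index()
print(agg)
#    category  count  sentiment
# 0   finance      1        0.3
# 1  wildlife      1        0.1
```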
data/climate_change_tweets.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/imdb.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/sentiment_results.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/zero_shot_results.csv ADDED
The diff for this file is too large to render. See raw diff
 
survey_analytics_library.py CHANGED
@@ -18,126 +18,6 @@ from nltk.corpus import stopwords
 
 
 
-# # create elbow plot with kmeans to find optimal number of clusters
-# def create_elbow_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
-#     '''
-#     create elbow plot with kmeans to find optimal number of clusters based on inertia
-#     where the clusters strikes a balance between being not segmented enough and being too fragmented
-
-#     we look for the point of diminishing returns (also known as the 'elbow') in terms of the inertia,
-#     where inertia is how close the data points are to their respective centers or centroids
-
-#     arguments:
-#     df (df): a dataframe of data to cluster
-#     num_clusters (int): number of clusters to plot
-#     init_method (str): default to 'k-means++', other option is 'random'
-#     n_init (int): default to 10, number of times to run model, cost from the best run will be used
-#     random_state (int): default to 42, random seed used to initialise the model
-#     plot (bool): default to True, option to turn off plots
-#     template (str): default to 'simple_white', change as desired
-#     save (bool): default to False, if True save plot as .html file
-
-#     returns:
-#     a list of inertia for each run
-#     '''
-
-#     # create empty list to store inertia for each run
-#     inertia = []
-#     # define range of clusters to try
-#     k = range(2, num_clusters+1)
-
-#     # loop through number of clusters
-#     for num_clusters in tqdm(k):
-#         # define model
-#         kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
-#         # fit and predict data
-#         kmeans.fit_predict(df)
-#         # get predicted labels
-#         predicted_labels = kmeans.labels_
-#         # append score to list of scores
-#         inertia.append(kmeans.inertia_)
-
-#     # plot elbow plot
-#     if plot:
-#         fig = px.line(
-#             pd.DataFrame({'num_clusters':list(k), 'inertia':inertia}),
-#             x='num_clusters',
-#             y='inertia',
-#             title='Elbow Plot for Optimal Number of Clusters with '+init_method,
-#             markers=True,
-#             template=template,
-#             width=800,
-#             height=500,
-#         )
-#         st.plotly_chart(fig, use_container_width=True)
-#         if save:
-#             fig.write_html('Elbow Plot for Optimal Number of Clusters with '+init_method+'.html')
-
-#     # return
-#     return inertia
-
-
-# # create plot of silhouette scores with sklearn model to find optimal number of clusters
-# def silhouette_score_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
-#     '''
-#     create plot of silhouette score with kmeans to find optimal number of clusters
-#     where the clusters strikes a balance between being not segmented enough and being too fragmented
-#     the closer the score is to 1, the more easily distinguishable are the clusters from each other
-
-#     arguments:
-#     df (df): a dataframe of data to cluster
-#     num_clusters (int): number of clusters to plot
-#     init_method (str): default to 'k-means++', other option is 'random'
-#     n_init (int): default to 10, number of times to run model, cost from the best run will be used
-#     random_state (int): default to 42, random seed used to initialise the model
-#     plot (bool): default to True, option to turn off plots
-#     template (str): default to 'simple_white', change as desired
-#     save (bool): default to False, if True save plot as .html file
-
-#     returns:
-#     a list of silhouette scores for each run
-#     '''
-
-#     # create empty list to store silhoutte scores for each run
-#     silhouette_scores = []
-#     # define range of clusters to try
-#     k = range(2, num_clusters+1)
-
-#     # loop through number of clusters
-#     for num_clusters in tqdm(k):
-#         # define model
-#         kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
-#         # fit and predict data
-#         kmeans.fit_predict(df)
-#         # get predicted labels
-#         predicted_labels = kmeans.labels_
-#         # get silhoutte score
-#         score = silhouette_score(df, predicted_labels)
-#         # append score to list of scores
-#         silhouette_scores.append(score)
-
-#     # plot silhouette scores
-#     if plot:
-#         fig = px.line(
-#             pd.DataFrame({'num_clusters':list(k), 'silhouette_scores':silhouette_scores}),
-#             x='num_clusters',
-#             y='silhouette_scores',
-#             title='Silhouette Scores for Optimal Number of Clusters with '+init_method,
-#             markers=True,
-#             template=template,
-#             width=800,
-#             height=500,
-#         )
-#         st.plotly_chart(fig, use_container_width=True)
-#         if save:
-#             fig.write_html('Silhouette Scores for Optimal Number of Clusters with '+init_method+'.html')
-
-#     # return
-#     return silhouette_scores
-
-
 # replace text with multiple replacements
 def replace_text(string, dict_of_replacements):
     '''
@@ -379,5 +259,41 @@ def convert_zero_shot_classification_output_to_dataframe(model_output):
     # drop unused columns
     results = results.drop(['labels', 'scores'], axis=1)
 
+    # return
+    return results
+
+
+# convert transformer model sentiment classification prediction into dataframe
+def convert_sentiment_classification_output_to_dataframe(text_input, model_output):
+    '''
+    convert sentiment classification output into a dataframe
+
+    the model used, distilbert-base-uncased-finetuned-sst-2-english, outputs a list of lists with two dictionaries;
+    within each dictionary is a label, negative or positive, and the respective score
+    [
+        [
+            {'label': 'NEGATIVE', 'score': 0.18449656665325165},
+            {'label': 'POSITIVE', 'score': 0.8155034780502319}
+        ],
+        ...
+    ]
+    the scores sum up to 1; we extract only the positive score in this function,
+    append the scores to the model's input, and return a dataframe
+
+    arguments:
+    text_input (list): a list of sequences that is input for the model
+    model_output (list): a list of labels and scores
+
+    returns:
+    a dataframe of sequences and sentiment scores
+
+    '''
+    # store model positive scores as dataframe
+    results = pd.DataFrame(model_output)[[1]]
+    # get score from column
+    results = results[1].apply(lambda x: x.get('score'))
+    # store input sequences and scores as dataframe
+    results = pd.DataFrame({'sequence':text_input, 'score':results})
+
     # return
     return results
 
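A quick usage sketch of the new helper, fed the output shape documented in its docstring (scores made up; assumes the module imports cleanly as in app.py):

```python
import survey_analytics_library as LIB

texts = ['first tweet', 'second tweet']
# one [NEGATIVE, POSITIVE] pair per input sequence
model_output = [
    [{'label': 'NEGATIVE', 'score': 0.184}, {'label': 'POSITIVE', 'score': 0.816}],
    [{'label': 'NEGATIVE', 'score': 0.702}, {'label': 'POSITIVE', 'score': 0.298}],
]

df = LIB.convert_sentiment_classification_output_to_dataframe(texts, model_output)
print(df)
#        sequence  score
# 0   first tweet  0.816
# 1  second tweet  0.298
```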