Spaces:
Runtime error
Runtime error
update codes
Browse files- app.py +187 -2
- data/climate_change_tweets.csv +0 -0
- data/imdb.csv +0 -0
- data/sentiment_results.csv +0 -0
- data/zero_shot_results.csv +0 -0
- survey_analytics_library.py +36 -120
app.py
CHANGED
|
@@ -18,6 +18,8 @@ from scipy.stats import zscore
|
|
| 18 |
|
| 19 |
# nlp
|
| 20 |
from bertopic import BERTopic
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# custom
|
| 23 |
import survey_analytics_library as LIB
|
|
@@ -61,6 +63,14 @@ def read_topic_results():
|
|
| 61 |
return topic_results
|
| 62 |
topic_results = read_topic_results()
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# write title of app
|
| 65 |
st.title('DACoP - Survey Analytics')
|
| 66 |
st.markdown('''---''')
|
|
@@ -366,9 +376,184 @@ st.markdown('''---''')
|
|
| 366 |
|
| 367 |
|
| 368 |
st.header('Classifiying Text Responses and Sentiment Analysis')
|
| 369 |
-
st.write('''
|
| 370 |
With survey responses, sometimes as a business user, we already have an general idea of what responders are talking about and we want to categorise or classify the responses accordingly.
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
|
|
|
|
|
|
|
|
|
| 373 |
''')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
st.write('\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# nlp
|
| 20 |
from bertopic import BERTopic
|
| 21 |
+
from transformers import pipeline
|
| 22 |
+
import transformers
|
| 23 |
|
| 24 |
# custom
|
| 25 |
import survey_analytics_library as LIB
|
|
|
|
| 63 |
return topic_results
|
| 64 |
topic_results = read_topic_results()
|
| 65 |
|
| 66 |
+
@st.cache
def read_climate_change_results():
    '''
    load the precomputed sentiment and zero-shot classification results

    both files are read from the folder given by data_path

    returns:
        a tuple of two dataframes, sentiment results first and zero-shot results second
    '''
    # list the files in the order they are returned
    results_files = (
        data_path+'sentiment_results.csv',
        data_path+'zero_shot_results.csv',
    )
    # read each csv into its own dataframe
    sentiment_df, zero_shot_df = (pd.read_csv(results_file) for results_file in results_files)
    return sentiment_df, zero_shot_df
|
| 71 |
+
sentiment_results, zero_shot_results = read_climate_change_results()
|
| 72 |
+
|
| 73 |
+
|
| 74 |
# write title of app
|
| 75 |
st.title('DACoP - Survey Analytics')
|
| 76 |
st.markdown('''---''')
|
|
|
|
| 376 |
|
| 377 |
|
| 378 |
st.header('Classifying Text Responses and Sentiment Analysis')
# plain string, there are no placeholders to format in this text
st.write('''
With survey responses, sometimes as a business user, we already have a general idea of what responders are talking about and we want to categorise or classify the responses accordingly.
As an example, within the topic of 'Climate Change', we are interested in finance, politics, technology, and wildlife.
Using **Zero-shot Classification**, we can classify responses into one of these four categories.
As an added bonus, we can also find out how responders feel about the categories using **Sentiment Analysis**.
We'll use a different set of 10,000 tweets related to climate change.
''')
st.write('\n')

# rename column so the table shown to the user has a readable header
sentiment_results = sentiment_results.rename(columns={'sequence':'Tweet'})
# show only the tweet text, the remaining columns are used further below
st.dataframe(sentiment_results[['Tweet']])
|
| 391 |
+
|
| 392 |
+
@st.cache(allow_output_mutation=True)
def load_transfomer_pipelines():
    '''
    build the zero-shot classification and sentiment analysis pipelines

    both models are loaded from folders under model_path

    returns:
        a tuple of two pipelines, zero-shot classifier first and sentiment classifier second
    '''
    # task name and model folder for each pipeline, in the order they are returned
    pipeline_specs = (
        ('zero-shot-classification', 'distilbart-mnli-12-1'),
        ('sentiment-analysis', 'distilbert-base-uncased-finetuned-sst-2-english'),
    )
    # both pipelines are created with the same return_all_scores setting
    zero_shot, sentiment = (
        pipeline(task=task_name, model=model_path+model_folder, return_all_scores=True)
        for task_name, model_folder in pipeline_specs
    )
    return zero_shot, sentiment
|
| 405 |
+
# get the zero-shot and sentiment pipelines
classifier_zero_shot, classifier_sentiment = load_transfomer_pipelines()

# default categories offered to the user
candidate_labels = ['finance', 'politics', 'technology', 'wildlife']

# default tweet used in the demonstration
sample_tweet_index = 5000

# first and last position of the tweet index, used as bounds for the user's choice
tweet_index = sentiment_results.index
first_tweet, last_tweet = tweet_index[0], tweet_index[-1]

st.write('''
As a demonstration, we'll define some categories and pick a tweet to classify and determine its sentiment.
Feel free to add your own categories or even input your own text!
''')
|
| 428 |
+
|
| 429 |
+
# interactive input for user to define candidate labels and tweet index for analysis
with st.form('classify_tweets'):
    # input for labels
    user_defined_labels = st.text_input('Enter categories (separate categories by comma):', ', '.join(candidate_labels))
    # split the comma separated text into a clean list of labels
    # keep the default labels if the field was cleared, so the classifier always gets at least one label
    parsed_labels = [label.strip() for label in user_defined_labels.split(',') if label.strip()]
    candidate_labels = parsed_labels or candidate_labels
    # input for tweet index
    # clamp the default so it stays within bounds when the data has fewer rows than the default index
    default_tweet_index = max(first_tweet, min(sample_tweet_index, last_tweet))
    user_define_tweet = st.number_input(f'Enter tweet index (from {first_tweet} to {last_tweet}) to classify:', min_value=first_tweet, max_value=last_tweet, value=default_tweet_index)
    sample_tweet_index = user_define_tweet
    sample_tweet = sentiment_results['Tweet'].iloc[sample_tweet_index]
    # input for user defined text
    user_defined_input = st.text_input('Enter custom text (optional, leave blank to use Tweets):', '')
    # check if user has entered any custom text
    # if user_defined_input is not blank, then override sample_tweet
    if user_defined_input:
        sample_tweet = user_defined_input

    # submit form
    submit = st.form_submit_button('Classify Tweet')
|
| 447 |
+
|
| 448 |
st.write('\n')
st.write('''
Here are the results:
''')
st.write(f'Input Text: *\'{sample_tweet}\'*')

# get predictions from models
zero_shot_sample = classifier_zero_shot(sample_tweet, candidate_labels)
sentiment_sample = classifier_sentiment(sample_tweet)

# get sentiment
# a single text input comes back as a list holding one list of label and score dictionaries,
# unwrap the outer list first so that indexing reaches the dictionaries instead of raising IndexError
if sentiment_sample and isinstance(sentiment_sample[0], list):
    sentiment_sample = sentiment_sample[0]
# the second dictionary is taken as the positive score (negative is listed first) - TODO confirm for other models
sentiment_sample = sentiment_sample[1].get('score')
# a positive score below 0.5 means the negative label has the higher score
sentiment_label = 'positive'
if sentiment_sample < 0.5:
    sentiment_label = 'negative'

st.write(f'''
The main category is: **{zero_shot_sample['labels'][0]}** with a score of {round(zero_shot_sample['scores'][0], 2)}
Main category score ranges from 0 to 1, with 1 being very likely.

The full set of scores are: {dict(zip(zero_shot_sample['labels'], [round(score, 2) for score in zero_shot_sample['scores']]))}
Full set of scores add up to 1.

The sentiment is: **{sentiment_label}** with a score of {round(sentiment_sample, 2)}
Sentiment score ranges from 0 to 1, with 1 being very positive.
''')
st.write('\n')
st.write('\n')
|
| 476 |
+
|
| 477 |
+
# drop unused columns and rename columns
zero_shot_results = zero_shot_results.drop('labels_scores', axis=1)
zero_shot_results = zero_shot_results.rename(columns={'sequence':'tweet', 'label':'category'})
st.write('''
Let's review all the tweets and how they fall into the categories of finance, politics, technology, and wildlife.
''')

# show every tweet with its predicted category and score
st.dataframe(zero_shot_results)

st.write('''
We can observe that the model does not have strong confidence in predicting the categories for some of the tweets.
It is likely that the tweet does not naturally fall into one of the defined categories.
Before performing further analysis on our results, we can set a score threshold to only keep predictions that we're confident in.
''')
st.write('\n')
|
| 492 |
+
|
| 493 |
+
# interactive input for user to define the minimum score a prediction needs to be kept
with st.form('classification_score_threshold'):
    user_defined_threshold = st.number_input(
        'Enter score threshold (between 0.01 and 0.99):',
        min_value=0.01,
        max_value=0.99,
        value=0.7,
        step=0.05,
    )
    # submit form
    submit = st.form_submit_button('Set Threshold')
|
| 498 |
+
st.write('\n')
|
| 499 |
+
|
| 500 |
+
# filter and keep results with score above defined threshold
# copy so the filtered table is independent of the full results
zero_shot_results_clean = zero_shot_results.loc[(zero_shot_results['score'] >= user_defined_threshold)].copy()

# rename columns
sentiment_results.columns = ['tweet', 'sentiment']

# report the real number of tweets instead of a fixed figure, so the text stays correct if the data changes
st.write(f'''
The predictions get better with a higher threshold, but reduces the final number of tweets available for further analysis.
Out of the {len(zero_shot_results):,} tweets, we are now left with {len(zero_shot_results_clean):,}.
We also add on the sentiment score for the tweets, the score here ranges from 0 (most negative) to 1 (most positive).
''')

# merge in sentiment score on index
# left join keeps only the tweets that passed the threshold
classification_sentiment_df = pd.merge(zero_shot_results_clean, sentiment_results[['sentiment']], how='left', left_index=True, right_index=True)
# drop unused columns
classification_sentiment_df = classification_sentiment_df[['tweet', 'category', 'score', 'sentiment']]
st.dataframe(classification_sentiment_df)
|
| 517 |
+
|
| 518 |
+
st.write('''
The difficult part for zero-shot classification is defining the right set of categories for each business case.
Some trial and error is required to find the appropriate words that can return the optimal results.
''')
st.write('\n')

# per category, count the tweets and average the sentiment score
classification_sentiment_agg = (
    classification_sentiment_df
    .groupby(['category'])
    .agg({'tweet':'count', 'sentiment':'mean'})
    .reset_index()
    .rename(columns={'tweet':'count'})
)

st.write('''
Finally, we can visualise the percentage of tweets in each category and the respective average sentiment scores.
''')

# layout settings shared by both charts
chart_layout = dict(template='simple_white', width=1000, height=600)

# donut chart of the share of tweets per category
fig = px.pie(
    classification_sentiment_agg,
    values='count',
    names='category',
    hole=0.35,
    title='Percentage of Tweets in Each Category',
    **chart_layout
)
fig.update_traces(textposition='inside', textinfo='percent+label')
st.plotly_chart(fig)

# bar chart of the average sentiment per category
fig = px.bar(
    classification_sentiment_agg,
    x='category',
    y='sentiment',
    title='Average Sentiment of Tweets in Each Category <br><sup>Overall, the sentiment of the tweets are on the negative side.</sup>',
    **chart_layout
)
# fix the axis to the full score range and mark the midpoint between negative and positive
fig.update_yaxes(range=[0, 1])
fig.add_hline(y=0.5, line_width=3, line_color='darkgreen')
st.plotly_chart(fig)

st.write('\n')
st.markdown('''---''')
|
data/climate_change_tweets.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/imdb.csv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/sentiment_results.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/zero_shot_results.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
survey_analytics_library.py
CHANGED
|
@@ -18,126 +18,6 @@ from nltk.corpus import stopwords
|
|
| 18 |
|
| 19 |
|
| 20 |
|
| 21 |
-
# # create elbow plot with kmeans to find optimal number of clusters
|
| 22 |
-
# def create_elbow_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
|
| 23 |
-
# '''
|
| 24 |
-
# create elbow plot with kmeans to find optimal number of clusters based on inertia
|
| 25 |
-
# where the clusters strikes a balance between being not segmented enough and being too fragmented
|
| 26 |
-
|
| 27 |
-
# we look for the point of diminishing returns (also known as the 'elbow') in terms of the inertia,
|
| 28 |
-
# where inertia is how close the data points are to their respective centers or centroids
|
| 29 |
-
|
| 30 |
-
# arguments:
|
| 31 |
-
# df (df): a dataframe of data to cluster
|
| 32 |
-
# num_clusters (int): number of clusters to plot
|
| 33 |
-
# init_method (str): default to 'k-means++', other option is 'random'
|
| 34 |
-
# n_init (int): default to 10, number of times to run model, cost from the best run will be used
|
| 35 |
-
# random_state (int): default to 42, random seed used to initialise the model
|
| 36 |
-
# plot (bool): default to True, option to turn off plots
|
| 37 |
-
# template (str): default to 'simple_white', change as desired
|
| 38 |
-
# save (bool): default to False, if True save plot as .html file
|
| 39 |
-
|
| 40 |
-
# returns:
|
| 41 |
-
# a list of inertia for each run
|
| 42 |
-
# '''
|
| 43 |
-
|
| 44 |
-
# # create empty list to store inertia for each run
|
| 45 |
-
# inertia = []
|
| 46 |
-
# # define range of clusters to try
|
| 47 |
-
# k = range(2, num_clusters+1)
|
| 48 |
-
|
| 49 |
-
# # loop through number of clusters
|
| 50 |
-
# for num_clusters in tqdm(k):
|
| 51 |
-
# # define model
|
| 52 |
-
# kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
|
| 53 |
-
# # fit and predict data
|
| 54 |
-
# kmeans.fit_predict(df)
|
| 55 |
-
# # get predicted labels
|
| 56 |
-
# predicted_labels = kmeans.labels_
|
| 57 |
-
# # append score to list of scores
|
| 58 |
-
# inertia.append(kmeans.inertia_)
|
| 59 |
-
|
| 60 |
-
# # plot elbow plot
|
| 61 |
-
# if plot:
|
| 62 |
-
# fig = px.line(
|
| 63 |
-
# pd.DataFrame({'num_clusters':list(k), 'inertia':inertia}),
|
| 64 |
-
# x='num_clusters',
|
| 65 |
-
# y='inertia',
|
| 66 |
-
# title='Elbow Plot for Optimal Number of Clusters with '+init_method,
|
| 67 |
-
# markers=True,
|
| 68 |
-
# template=template,
|
| 69 |
-
# width=800,
|
| 70 |
-
# height=500,
|
| 71 |
-
# )
|
| 72 |
-
# st.plotly_chart(fig, use_container_width=True)
|
| 73 |
-
# if save:
|
| 74 |
-
# fig.write_html('Elbow Plot for Optimal Number of Clusters with '+init_method+'.html')
|
| 75 |
-
|
| 76 |
-
# # return
|
| 77 |
-
# return inertia
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
# # create plot of silhouette scores with sklearn model to find optimal number of clusters
|
| 82 |
-
# def silhouette_score_plot_kmeans(df, num_clusters, init_method='k-means++', n_init=10, random_state=42, plot=True, template='simple_white', save=False):
|
| 83 |
-
# '''
|
| 84 |
-
# create plot of silhouette score with kmeans to find optimal number of clusters
|
| 85 |
-
# where the clusters strikes a balance between being not segmented enough and being too fragmented
|
| 86 |
-
# the closer the score is to 1, the more easily distinguishable are the clusters from each other
|
| 87 |
-
|
| 88 |
-
# arguments:
|
| 89 |
-
# df (df): a dataframe of data to cluster
|
| 90 |
-
# num_clusters (int): number of clusters to plot
|
| 91 |
-
# init_method (str): default to 'k-means++', other option is 'random'
|
| 92 |
-
# n_init (int): default to 10, number of times to run model, cost from the best run will be used
|
| 93 |
-
# random_state (int): default to 42, random seed used to initialise the model
|
| 94 |
-
# plot (bool): default to True, option to turn off plots
|
| 95 |
-
# template (str): default to 'simple_white', change as desired
|
| 96 |
-
# save (bool): default to False, if True save plot as .html file
|
| 97 |
-
|
| 98 |
-
# returns:
|
| 99 |
-
# a list of silhouette scores for each run
|
| 100 |
-
# '''
|
| 101 |
-
|
| 102 |
-
# # create empty list to store silhoutte scores for each run
|
| 103 |
-
# silhouette_scores = []
|
| 104 |
-
# # define range of clusters to try
|
| 105 |
-
# k = range(2, num_clusters+1)
|
| 106 |
-
|
| 107 |
-
# # loop through number of clusters
|
| 108 |
-
# for num_clusters in tqdm(k):
|
| 109 |
-
# # define model
|
| 110 |
-
# kmeans = KMeans(n_clusters=num_clusters, init=init_method, n_init=n_init, random_state=random_state)
|
| 111 |
-
# # fit and predict data
|
| 112 |
-
# kmeans.fit_predict(df)
|
| 113 |
-
# # get predicted labels
|
| 114 |
-
# predicted_labels = kmeans.labels_
|
| 115 |
-
# # get silhoutte score
|
| 116 |
-
# score = silhouette_score(df, predicted_labels)
|
| 117 |
-
# # append score to list of scores
|
| 118 |
-
# silhouette_scores.append(score)
|
| 119 |
-
|
| 120 |
-
# # plot silhouette scores
|
| 121 |
-
# if plot:
|
| 122 |
-
# fig = px.line(
|
| 123 |
-
# pd.DataFrame({'num_clusters':list(k), 'silhouette_scores':silhouette_scores}),
|
| 124 |
-
# x='num_clusters',
|
| 125 |
-
# y='silhouette_scores',
|
| 126 |
-
# title='Silhouette Scores for Optimal Number of Clusters with '+init_method,
|
| 127 |
-
# markers=True,
|
| 128 |
-
# template=template,
|
| 129 |
-
# width=800,
|
| 130 |
-
# height=500,
|
| 131 |
-
# )
|
| 132 |
-
# st.plotly_chart(fig, use_container_width=True)
|
| 133 |
-
# if save:
|
| 134 |
-
# fig.write_html('Silhouette Scores for Optimal Number of Clusters with '+init_method+'.html')
|
| 135 |
-
|
| 136 |
-
# # return
|
| 137 |
-
# return silhouette_scores
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
# replace text with multiple replacements
|
| 142 |
def replace_text(string, dict_of_replacements):
|
| 143 |
'''
|
|
@@ -379,5 +259,41 @@ def convert_zero_shot_classification_output_to_dataframe(model_output):
|
|
| 379 |
# drop unused columns
|
| 380 |
results = results.drop(['labels', 'scores'], axis=1)
|
| 381 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
# return
|
| 383 |
return results
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
# replace text with multiple replacements
|
| 22 |
def replace_text(string, dict_of_replacements):
|
| 23 |
'''
|
|
|
|
| 259 |
# drop unused columns
|
| 260 |
results = results.drop(['labels', 'scores'], axis=1)
|
| 261 |
|
| 262 |
+
# return
|
| 263 |
+
return results
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# convert transformer model sentiment classification prediction into dataframe
|
| 267 |
+
def convert_sentiment_classification_output_to_dataframe(text_input, model_output):
    '''
    convert sentiment classification output into a dataframe

    the model used distilbert-base-uncased-finetuned-sst-2-english outputs a list of lists with two dictionaries,
    within each dictionary is a label negative or positive and the respective score
    [
        [
            {'label': 'NEGATIVE', 'score': 0.18449656665325165},
            {'label': 'POSITIVE', 'score': 0.8155034780502319}
        ],
        ...
    ]
    the scores sum up to 1, and we extract only the positive score in this function,
    append the scores to the model's input and return a dataframe

    the positive score is looked up by its label, if no entry is labelled positive
    the second entry is used instead

    arguments:
    text_input (list): a list of sequences that is input for the model
    model_output (list): a list of labels and scores

    return:
    a dataframe of sequences and sentiment score, empty if there are no predictions

    '''
    # collect the positive score of every prediction
    positive_scores = []
    for prediction in model_output:
        # look up by label so the result does not depend on the order of the dictionaries
        positive_score = next(
            (entry.get('score') for entry in prediction if str(entry.get('label')).upper() == 'POSITIVE'),
            None,
        )
        # fall back to the second dictionary when no label matches
        if positive_score is None:
            positive_score = prediction[1].get('score')
        positive_scores.append(positive_score)

    # store input sequences and scores as dataframe
    # explicit float dtype keeps the score column numeric even when there are no predictions
    results = pd.DataFrame({'sequence':text_input, 'score':pd.Series(positive_scores, dtype='float64')})

    # return
    return results
|