Adding the Data Viz

* Adding helper functions for making the data viz
* Making plotting functions
* Adding the necessary imports
* Adding the Gradio-related code for data viz
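In short, the commit adds a set of plotting helpers behind a single entry point, `call_data_viz_func(plot_type)`, and wires it to a new radio-button/plot row in the existing Gradio Blocks app. A minimal sketch of the intended wiring, with names taken from the diff below and assuming the sentiment step has already written `sentiment.csv` with the `sentiment` and `content` columns the helpers expect:

# Hypothetical usage sketch (full code in the diff below)
# call_data_viz_func reads sentiment.csv and returns a matplotlib object for gr.Plot to render
fig_pct = call_data_viz_func('percentage_plot')   # pie chart of sentiment percentages
fig_wc = call_data_viz_func('word_count_plot')    # word clouds per sentiment class

# On the Gradio side, a Radio picks the plot type and a Plot component renders the result
ui_plot_type = gr.Radio(choices=["percentage_plot", "word_count_plot"], value='percentage_plot')
plt_output = gr.Plot(label="Data Vizualizer for the News App")
data_viz_bt = gr.Button("Vizualize data")
data_viz_bt.click(call_data_viz_func, inputs=ui_plot_type, outputs=plt_output)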
app.py
CHANGED
@@ -4,7 +4,7 @@ import os
 import json
 import pandas as pd
 
-#
+# ----------------imports for Sentiment Analyzer----------------------
 import re
 
 from sklearn.pipeline import Pipeline
@@ -20,6 +20,19 @@ from nltk.stem import RSLPStemmer
 
 import joblib
 
+# --------------------------------imports for Data Vizualisation
+from wordcloud import WordCloud
+from collections import Counter
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+# %matplotlib inline  # IPython notebook magic, not valid in a plain .py script, so kept commented out
+from matplotlib.gridspec import GridSpec
+import plotly.offline as py
+import plotly.express as px
+import plotly.graph_objs as go
+
+
 #--------------------------------------------------------------------------------------
 #------------------------ NEWS DATA RETRIEVER------------------------------------------
 #--------------------------------------------------------------------------------------
@@ -318,9 +331,153 @@ def sentiment_analyzer(csv_file_name='combined_news_response.csv'):
 
 
 
+#----------------------------------------------------------------------------------------------
+#----------------------------------DATA VIZUALIZER---------------------------------------------
+#----------------------------------------------------------------------------------------------
+
+
+def get_senti_pct_distribution(expt_df):
+    sentiment_counts = expt_df['sentiment'].value_counts()
+    labels = sentiment_counts.index
+    sizes = sentiment_counts.values
+    colors = ['lightblue', 'limegreen', 'lightcoral']
+
+    # Create a pie chart
+    plt.figure(figsize=(8, 8))
+    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
+
+    # Equal aspect ratio ensures that pie is drawn as a circle
+    plt.axis('equal')
+    plt.title('Sentiment Distribution for Labelled Data')
+    # plt.show()
+
+    return plt
+
+def preprocessing_data(expt_df):
+    # Creating a list of news content strings
+    news = list(expt_df['content'].values)
+
+    # Applying RegEx
+    news_breakline = re_breakline(news)
+    expt_df['re_breakline'] = news_breakline
+
+    # Applying RegEx
+    news_hyperlinks = re_hyperlinks(news_breakline)
+    expt_df['re_hyperlinks'] = news_hyperlinks
+
+    # Applying RegEx
+    news_dates = re_dates(news_hyperlinks)
+    expt_df['re_dates'] = news_dates
+
+    # Applying RegEx
+    news_money = re_money(news_dates)
+    expt_df['re_money'] = news_money
+
+    # Applying RegEx
+    news_numbers = re_numbers(news_money)
+    expt_df['re_numbers'] = news_numbers
+
+    # Applying RegEx
+    news_negation = re_negation(news_numbers)
+    expt_df['re_negation'] = news_negation
+
+    # Applying RegEx
+    news_special_chars = re_special_chars(news_negation)
+    expt_df['re_special_chars'] = news_special_chars
+
+    # Applying RegEx
+    news_whitespaces = re_whitespaces(news_special_chars)
+    expt_df['re_whitespaces'] = news_whitespaces
+
+    # Removing stopwords and looking at some examples
+    news_stopwords = [' '.join(stopwords_removal(news)) for news in news_whitespaces]
+    expt_df['stopwords_removed'] = news_stopwords
+
+    return expt_df
+
+def generate_wc(processed_expt_df):
+    # Generating words
+    pos_news = list(processed_expt_df.query('sentiment == "positive"')['stopwords_removed'].values)
+    positive_words = ' '.join(pos_news).split(' ')
+    neg_news = list(processed_expt_df.query('sentiment == "negative"')['stopwords_removed'].values)
+    negative_words = ' '.join(neg_news).split(' ')
+    neu_news = list(processed_expt_df.query('sentiment == "neutral"')['stopwords_removed'].values)
+    neutral_words = ' '.join(neu_news).split(' ')
+
+    # Using Counter for creating word-frequency dictionaries
+    positive_dict = Counter(positive_words)
+    negative_dict = Counter(negative_words)
+    neutral_dict = Counter(neutral_words)
+
+    # Generating wordclouds for news
+    positive_wc = WordCloud(width=1280,
+                            height=720,
+                            collocations=False,
+                            random_state=42,
+                            # mask=transf_like_mask,
+                            colormap='Blues', background_color='white',
+                            max_words=50).generate_from_frequencies(positive_dict)
+
+    negative_wc = WordCloud(width=1280,
+                            height=720,
+                            collocations=False,
+                            random_state=42,
+                            # mask=transf_bomb_mask,
+                            colormap='Reds',
+                            background_color='white',
+                            max_words=50).generate_from_frequencies(negative_dict)
+
+    neutral_wc = WordCloud(width=1280,
+                           height=720,
+                           collocations=False,
+                           random_state=42,
+                           # mask=transf_bomb_mask,
+                           colormap='Greens',
+                           background_color='white',
+                           max_words=50).generate_from_frequencies(neutral_dict)
+
+    return positive_wc, negative_wc, neutral_wc
+
+
+def plot_news_wc(positive_wc, negative_wc, neutral_wc):
+    fig, axs = plt.subplots(1, 3, figsize=(20, 20))
+    ax1 = axs[0]
+    ax2 = axs[1]
+    ax3 = axs[2]
+
+    ax1.imshow(positive_wc)
+    ax1.axis('off')
+    ax1.set_title('WordCloud for Positive Words in News', size=18, pad=20)
+
+    ax2.imshow(negative_wc)
+    ax2.axis('off')
+    ax2.set_title('WordCloud for Negative Words in News', size=18, pad=20)
+
+    ax3.imshow(neutral_wc)
+    ax3.axis('off')
+    ax3.set_title('WordCloud for Neutral Words in News', size=18, pad=20)
+
+    return fig
+
+def get_news_wc(expt_df):
+    processed_expt_df = preprocessing_data(expt_df)
+    positive_wc, negative_wc, neutral_wc = generate_wc(processed_expt_df)
+    return plot_news_wc(positive_wc, negative_wc, neutral_wc)
+
+def call_data_viz_func(plot_type):
+    senti_csv_file_name = 'sentiment.csv'
+    expt_df = pd.read_csv(senti_csv_file_name)
+
+    if plot_type == 'percentage_plot':
+        return get_senti_pct_distribution(expt_df)
+    elif plot_type == 'word_count_plot':
+        return get_news_wc(expt_df)
+    else:
+        raise ValueError("Unknown plot type selected")
+
 
 
-
+#---------------------- GRADIO APP --------------------
 
 with gr.Blocks() as demo:
     gr.Markdown("# Welcome to News Retrieval and Sentiment Analyzer App a.k.a InfoMood Tracker")
@@ -329,8 +486,11 @@ with gr.Blocks() as demo:
     gr.Markdown("1. Select the Domain from which you want to retrieve the news")
     gr.Markdown("2. Click on the `Retrieve news` to retrieve the news from the domain. You Should see that the result displayed in the form of Table")
     gr.Markdown("3. Click on the `Analyze Sentiment` to analyze the sentiments of the news retrieved.")
-
+    gr.Markdown("4. Select the radio button `percentage_plot` or `word_count_plot`. Click on the `Vizualize data` button to view the respective visualization. If needed, click the `Clear` button to clear the plot.")
+    gr.Markdown("NOTE: Each step depends on the file saved by the previous step, so the sequence is important. For example, you can't get the data viz unless you have the sentiment-analyzed file.")
+
 
+    # GRADIO ROW FOR NEWS COLLECTOR
     with gr.Row():
         with gr.Column(scale=1, min_width=600):
             ui_domain = gr.Dropdown(["bbc", "forbes", "businessinsider_us"], label="Select Domain")
@@ -339,6 +499,7 @@
 
     retrieve_button.click(call_functions, inputs=ui_domain, outputs=df_output)
 
+    # GRADIO ROW FOR ANALYSING SENTIMENT
     with gr.Row():
         with gr.Column(scale=1, min_width=600):
             ui_input = gr.Textbox(value='combined_news_response.csv' , visible=False)
@@ -347,5 +508,17 @@
 
     view_sentiment_bttn.click(sentiment_analyzer, inputs=ui_input, outputs=df_output)
 
+    with gr.Row():
+        with gr.Column(scale=1, min_width=600):
+            ui_plot_type = gr.Radio(label="Plot type",
+                                    choices=["percentage_plot", "word_count_plot"],
+                                    value='percentage_plot')
+
+            data_viz_bt = gr.Button("Vizualize data")
+
+            plt_output = gr.Plot(label="Data Vizualizer for the News App", show_label=True)
+            gr.ClearButton(plt_output)
+            data_viz_bt.click(call_data_viz_func, inputs=ui_plot_type, outputs=plt_output)
+
 
 demo.launch(debug=True)