pknayak committed on
Commit
b386cb4
·
1 Parent(s): c1f7d31

Adding the Data Viz

Browse files

* Adding helper functions for making the data viz
* Making plotting functions
* Adding the necessary imports
* adding the gradio related code for data viz

Files changed (1) hide show
  1. app.py +176 -3
app.py CHANGED
@@ -4,7 +4,7 @@ import os
4
  import json
5
  import pandas as pd
6
 
7
- # -----imports for Sentiment Analyzer
8
  import re
9
 
10
  from sklearn.pipeline import Pipeline
@@ -20,6 +20,19 @@ from nltk.stem import RSLPStemmer
20
 
21
  import joblib
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  #--------------------------------------------------------------------------------------
24
  #------------------------ NEWS DATA RETRIEVER------------------------------------------
25
  #--------------------------------------------------------------------------------------
@@ -318,9 +331,153 @@ def sentiment_analyzer(csv_file_name='combined_news_response.csv'):
318
 
319
 
320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
 
323
- # Creating the app for both
324
 
325
  with gr.Blocks() as demo:
326
  gr.Markdown("# Welcome to News Retrieval and Sentiment Analyzer App a.k.a InfoMood Tracker")
@@ -329,8 +486,11 @@ with gr.Blocks() as demo:
329
  gr.Markdown("1. Select the Domain from which you want to retrieve the news")
330
  gr.Markdown("2. Click on the `Retrieve news` to retrieve the news from the domain. You Should see that the result displayed in the form of Table")
331
  gr.Markdown("3. Click on the `Analyze Sentiment` to analyze the sentiments of the news retrieved.")
332
- # gr.Markdown("4. ")
 
 
333
 
 
334
  with gr.Row():
335
  with gr.Column(scale=1, min_width=600):
336
  ui_domain = gr.Dropdown(["bbc", "forbes", "businessinsider_us"], label="Select Domain")
@@ -339,6 +499,7 @@ with gr.Blocks() as demo:
339
 
340
  retrieve_button.click(call_functions, inputs=ui_domain, outputs=df_output)
341
 
 
342
  with gr.Row():
343
  with gr.Column(scale=1, min_width=600):
344
  ui_input = gr.Textbox(value='combined_news_response.csv' , visible=False)
@@ -347,5 +508,17 @@ with gr.Blocks() as demo:
347
 
348
  view_sentiment_bttn.click(sentiment_analyzer, inputs=ui_input, outputs=df_output)
349
 
 
 
 
 
 
 
 
 
 
 
 
 
350
 
351
  demo.launch(debug=True)
 
4
  import json
5
  import pandas as pd
6
 
7
+ # ----------------imports for Sentiment Analyzer----------------------
8
  import re
9
 
10
  from sklearn.pipeline import Pipeline
 
20
 
21
  import joblib
22
 
23
# ---------------- imports for Data Visualization ----------------------
from wordcloud import WordCloud
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
# NOTE: the IPython magic "%matplotlib inline" used to sit here. It is only
# valid inside a notebook / IPython session; in a plain .py module it is a
# SyntaxError that prevents the whole app from starting, so it was removed.
from matplotlib.gridspec import GridSpec
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
34
+
35
+
36
  #--------------------------------------------------------------------------------------
37
  #------------------------ NEWS DATA RETRIEVER------------------------------------------
38
  #--------------------------------------------------------------------------------------
 
331
 
332
 
333
 
334
+ #----------------------------------------------------------------------------------------------
335
+ #----------------------------------DATA VISUALIZER---------------------------------------------
336
+ #----------------------------------------------------------------------------------------------
337
+
338
+
339
def get_senti_pct_distribution(expt_df):
    """Draw a pie chart showing the share of each sentiment label.

    Args:
        expt_df: DataFrame with a ``sentiment`` column.

    Returns:
        The matplotlib ``Figure`` that holds the pie chart (the same kind of
        object ``plot_news_wc`` returns).
    """
    sentiment_counts = expt_df['sentiment'].value_counts()
    labels = sentiment_counts.index
    sizes = sentiment_counts.values

    # value_counts() orders labels by frequency, so a positional colour list
    # would give a sentiment a different colour depending on the data. Look the
    # colour up by label instead, using the same hues as the word clouds
    # (positive -> blue, negative -> red, neutral -> green).
    label_colors = {
        'positive': 'lightblue',
        'negative': 'lightcoral',
        'neutral': 'limegreen',
    }
    colors = [label_colors.get(str(label).lower(), 'lightgray') for label in labels]

    # Build the chart on its own Figure/Axes rather than on pyplot's implicit
    # "current figure", and hand that Figure back to the caller.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)

    # Equal aspect ratio ensures that the pie is drawn as a circle
    ax.axis('equal')
    ax.set_title('Sentiment Distribution for Labelled Data')

    return fig
355
+
356
def preprocessing_data(expt_df):
    """Run the text-cleaning pipeline over the ``content`` column.

    Every stage consumes the output of the previous one and its result is
    stored in a column named after the stage; the last stage removes
    stopwords and stores the re-joined text in ``stopwords_removed``.
    The ``re_*`` helpers and ``stopwords_removal`` are defined elsewhere in
    the file (presumably each ``re_*`` maps a list of strings to a list of
    strings -- confirm against their definitions).

    Args:
        expt_df: DataFrame with a ``content`` column. It is modified in place.

    Returns:
        The same DataFrame, with one extra column per cleaning stage.
    """
    # (output column, cleaning function) pairs, applied in this exact order
    regex_stages = (
        ('re_breakline', re_breakline),
        ('re_hyperlinks', re_hyperlinks),
        ('re_dates', re_dates),
        ('re_money', re_money),
        ('re_numbers', re_numbers),
        ('re_negation', re_negation),
        ('re_special_chars', re_special_chars),
        ('re_whitespaces', re_whitespaces),
    )

    texts = list(expt_df['content'].values)
    for column_name, clean in regex_stages:
        texts = clean(texts)
        expt_df[column_name] = texts

    # Final stage: drop stopwords from each cleaned text and re-join the tokens
    expt_df['stopwords_removed'] = [
        ' '.join(stopwords_removal(text)) for text in texts
    ]

    return expt_df
397
+
398
def generate_wc(processed_expt_df):
    """Build one word cloud per sentiment from the stopword-free news text.

    Args:
        processed_expt_df: DataFrame with ``sentiment`` and
            ``stopwords_removed`` columns.

    Returns:
        Tuple ``(positive_wc, negative_wc, neutral_wc)`` of generated
        ``WordCloud`` objects.
    """
    def _cloud_for(sentiment_query, colormap):
        # Collect the cleaned texts of one sentiment and count their words
        texts = list(processed_expt_df.query(sentiment_query)['stopwords_removed'].values)
        word_counts = Counter(' '.join(texts).split(' '))

        cloud = WordCloud(
            width=1280,
            height=720,
            collocations=False,
            random_state=42,
            colormap=colormap,
            background_color='white',
            max_words=50,
        )
        return cloud.generate_from_frequencies(word_counts)

    positive_wc = _cloud_for('sentiment == "positive"', 'Blues')
    negative_wc = _cloud_for('sentiment == "negative"', 'Reds')
    neutral_wc = _cloud_for('sentiment == "neutral"', 'Greens')

    return positive_wc, negative_wc, neutral_wc
440
+
441
+
442
def plot_news_wc(positive_wc, negative_wc, neutral_wc):
    """Show the three sentiment word clouds side by side.

    Args:
        positive_wc: word cloud image for positive news.
        negative_wc: word cloud image for negative news.
        neutral_wc: word cloud image for neutral news.

    Returns:
        The matplotlib ``Figure`` containing the three panels.
    """
    fig, axes = plt.subplots(1, 3, figsize=(20, 20))

    panels = (
        ('Positive', positive_wc),
        ('Negative', negative_wc),
        ('Neutral', neutral_wc),
    )
    for axis, (sentiment_label, cloud) in zip(axes, panels):
        axis.imshow(cloud)
        axis.axis('off')  # hide ticks and frame around the image
        axis.set_title(f'WordCloud for {sentiment_label} Words in News', size=18, pad=20)

    return fig
461
+
462
def get_news_wc(expt_df):
    """Clean the news text, build the per-sentiment word clouds and plot them.

    Args:
        expt_df: DataFrame passed straight to ``preprocessing_data``.

    Returns:
        Whatever ``plot_news_wc`` returns for the three generated clouds.
    """
    cleaned_df = preprocessing_data(expt_df)
    clouds = generate_wc(cleaned_df)
    return plot_news_wc(*clouds)
466
+
467
def call_data_viz_func(plot_type, senti_csv_file_name='sentiment.csv'):
    """Load the sentiment CSV and build the requested visualization.

    Args:
        plot_type: ``'percentage_plot'`` for the sentiment pie chart or
            ``'word_count_plot'`` for the per-sentiment word clouds.
        senti_csv_file_name: path of the CSV to visualize. Defaults to
            ``'sentiment.csv'``, the file this function always read before.

    Returns:
        The plot object produced by the selected plotting function.

    Raises:
        ValueError: if ``plot_type`` is not one of the supported values.
        FileNotFoundError: if the sentiment CSV does not exist yet.
    """
    # Validate the selection first so a bad choice fails fast, before any
    # file I/O is attempted.
    if plot_type not in ('percentage_plot', 'word_count_plot'):
        raise ValueError("Unknown plot type selected")

    try:
        expt_df = pd.read_csv(senti_csv_file_name)
    except FileNotFoundError as err:
        # Same exception type as before, with a hint about the required step.
        raise FileNotFoundError(
            f"Sentiment file '{senti_csv_file_name}' not found; "
            "run `Analyze Sentiment` before visualizing the data."
        ) from err

    if plot_type == 'percentage_plot':
        return get_senti_pct_distribution(expt_df)
    return get_news_wc(expt_df)
477
+
478
 
479
 
480
+ #---------------------- GRADIO APP --------------------
481
 
482
  with gr.Blocks() as demo:
483
  gr.Markdown("# Welcome to News Retrieval and Sentiment Analyzer App a.k.a InfoMood Tracker")
 
486
  gr.Markdown("1. Select the Domain from which you want to retrieve the news")
487
  gr.Markdown("2. Click on the `Retrieve news` to retrieve the news from the domain. You Should see that the result displayed in the form of Table")
488
  gr.Markdown("3. Click on the `Analyze Sentiment` to analyze the sentiments of the news retrieved.")
489
+ gr.Markdown("4. Select the radio button `percentage_plot` or `word_count_plot`. Click on the `Vizualize data` to view the respective Vizualization. If needed click the `Clear` Button to clear the plot ")
490
+ gr.Markdown("NOTE: Each depends on the file saved the it's previous step, so the sequence is important. For example, you can't get the data viz until an unless you have the Sentiment Analyzed File ")
491
+
492
 
493
+ # GRADIO ROW FOR NEWS COLLECTOR
494
  with gr.Row():
495
  with gr.Column(scale=1, min_width=600):
496
  ui_domain = gr.Dropdown(["bbc", "forbes", "businessinsider_us"], label="Select Domain")
 
499
 
500
  retrieve_button.click(call_functions, inputs=ui_domain, outputs=df_output)
501
 
502
+ # GRADIO ROW FOR ANALYSING SENTIMENT
503
  with gr.Row():
504
  with gr.Column(scale=1, min_width=600):
505
  ui_input = gr.Textbox(value='combined_news_response.csv' , visible=False)
 
508
 
509
  view_sentiment_bttn.click(sentiment_analyzer, inputs=ui_input, outputs=df_output)
510
 
511
+ with gr.Row():
512
+ with gr.Column(scale=1, min_width=600):
513
+ ui_plot_type = gr.Radio(label="Plot type",
514
+ choices=["percentage_plot", "word_count_plot"],
515
+ value='percentage_plot')
516
+
517
+ data_viz_bt = gr.Button("Vizualize data")
518
+
519
+ plt_output = gr.Plot(label="Data Vizualizer for the News App", show_label=True,)
520
+ gr.ClearButton(plt_output)
521
+ data_viz_bt.click(call_data_viz_func, inputs=ui_plot_type, outputs=plt_output)
522
+
523
 
524
  demo.launch(debug=True)