Krittaprot committed on
Commit
84cdcf9
β€’
1 Parent(s): f2c6a14

Update app.py

Browse files

Add a more efficient text summarization method.

Files changed (1) hide show
  1. app.py +26 -11
app.py CHANGED
@@ -13,6 +13,8 @@ import time
13
 
14
 
15
  sentiment_task = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")
 
 
16
 
17
  def extract_youtube_video_id(url_or_id):
18
  """
@@ -142,7 +144,7 @@ def comments_analyzer(comments_df):
142
  for i in range(0, len(comments_df), batch_size):
143
  batch = comments_df['content'][i:i+batch_size].tolist()
144
  batch_results = sentiment_task(batch)
145
-
146
  # Extracting both sentiment labels and scores
147
  batch_sentiments = [item['label'] for item in batch_results]
148
  batch_scores = [item['score'] for item in batch_results]
@@ -156,24 +158,37 @@ def comments_analyzer(comments_df):
156
  end_time = time.time()
157
  print(f"Time taken for batch sentiment analysis: {end_time - start_time} seconds")
158
 
159
- def get_top_comments(comments, sentiment_type, top_n=3):
160
  filtered_comments = comments[comments['sentiment'] == sentiment_type]
161
  top_comments = filtered_comments.nlargest(top_n, 'score')
162
 
163
  if not top_comments.empty:
164
- return '\n\n'.join(f"{row['content']} - {row['author']}" for _, row in top_comments.iterrows())
165
  else:
166
  return f"No {sentiment_type} comments available."
167
 
168
  start_time = time.time()
169
  # Get top positive comments
170
  top_positive_comments = get_top_comments(comments_df, 'positive')
171
-
172
  # Get top negative comments
173
  top_negative_comments = get_top_comments(comments_df, 'negative')
174
  end_time = time.time()
175
  print(f"Time taken for finding top n positive/negative comments: {end_time - start_time} seconds")
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  data = {}
178
  #Categorize the comments by sentiment and count them
179
  data['total_comments'] = len(comments_df)
@@ -185,7 +200,7 @@ def comments_analyzer(comments_df):
185
  data['blended_comments'] = comments_df['content'].str.cat(sep=' ')
186
  data['pct_positive'] = 100 * round(data['num_positive']/data['total_comments'], 2)
187
 
188
- return data, top_positive_comments, top_negative_comments
189
 
190
  def generate_wordcloud(long_text, additional_stopwords=['Timestamps', 'timestamps']):
191
  # This function generates a word cloud image from a given text and returns it as a PIL image object.
@@ -283,14 +298,14 @@ def process_youtube_comments(youtube_link, max_comments, stop_words):
283
 
284
  end_time = time.time()
285
  print(f"Time taken for loading comments: {end_time - start_time} seconds")
286
-
287
  # Analyze
288
- analysis_dict, top_positive_comments, top_negative_comments = comments_analyzer(comments_df)
289
 
290
  long_text = analysis_dict['blended_comments']
291
 
292
  start_time = time.time()
293
-
294
  # Generate word cloud
295
  word_cloud_img = generate_wordcloud(long_text, additional_stopwords=['Timestamps', 'timestamps'])
296
 
@@ -306,7 +321,7 @@ def process_youtube_comments(youtube_link, max_comments, stop_words):
306
  print(f"Time taken for creating sentiment chart: {end_time - start_time} seconds")
307
 
308
  # Return the generated word cloud image, summary text, and sentiment analysis chart
309
- return word_cloud_img, top_positive_comments, top_negative_comments, sentiment_chart
310
 
311
  ############################################################################################################################################
312
  # Gradio interface
@@ -319,8 +334,8 @@ interface = gr.Interface(
319
  ],
320
  outputs=[
321
  gr.Image(label="Word Cloud ☁️"),
322
- gr.Textbox(label="Top 3 Positive Comments πŸ‘πŸ»"),
323
- gr.Textbox(label="Top 3 Negative Comments πŸ‘ŽπŸ»"),
324
  gr.Image(label="Sentiment Analysis Chart πŸ“Š")
325
  ],
326
  title="YouTube Comments Analyzer πŸ“ˆ",
 
13
 
14
 
15
  sentiment_task = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")
16
+ summarization_task = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
17
+
18
 
19
  def extract_youtube_video_id(url_or_id):
20
  """
 
144
  for i in range(0, len(comments_df), batch_size):
145
  batch = comments_df['content'][i:i+batch_size].tolist()
146
  batch_results = sentiment_task(batch)
147
+
148
  # Extracting both sentiment labels and scores
149
  batch_sentiments = [item['label'] for item in batch_results]
150
  batch_scores = [item['score'] for item in batch_results]
 
158
  end_time = time.time()
159
  print(f"Time taken for batch sentiment analysis: {end_time - start_time} seconds")
160
 
161
+ def get_top_comments(comments, sentiment_type, top_n=5):
162
  filtered_comments = comments[comments['sentiment'] == sentiment_type]
163
  top_comments = filtered_comments.nlargest(top_n, 'score')
164
 
165
  if not top_comments.empty:
166
+ return '\n\n'.join(f"{row['content']}" for _, row in top_comments.iterrows())
167
  else:
168
  return f"No {sentiment_type} comments available."
169
 
170
  start_time = time.time()
171
  # Get top positive comments
172
  top_positive_comments = get_top_comments(comments_df, 'positive')
 
173
  # Get top negative comments
174
  top_negative_comments = get_top_comments(comments_df, 'negative')
175
  end_time = time.time()
176
  print(f"Time taken for finding top n positive/negative comments: {end_time - start_time} seconds")
177
 
178
+ #Summarize the texts from positive and negative comments
179
+ start_time = time.time()
180
+ if top_positive_comments == "No positive comments available.":
181
+ top_positive_comments_summary = top_positive_comments
182
+ else:
183
+ top_positive_comments_summary = summarization_task(top_positive_comments)[0]['summary_text']
184
+
185
+ if top_negative_comments == "No negative comments available.":
186
+ top_negative_comments_summary = top_negative_comments
187
+ else:
188
+ top_negative_comments_summary = summarization_task(top_negative_comments)[0]['summary_text']
189
+ end_time = time.time()
190
+ print(f"Time taken for summarizing the top n positive/negative comments: {end_time - start_time} seconds")
191
+
192
  data = {}
193
  #Categorize the comments by sentiment and count them
194
  data['total_comments'] = len(comments_df)
 
200
  data['blended_comments'] = comments_df['content'].str.cat(sep=' ')
201
  data['pct_positive'] = 100 * round(data['num_positive']/data['total_comments'], 2)
202
 
203
+ return data, top_positive_comments_summary, top_negative_comments_summary
204
 
205
  def generate_wordcloud(long_text, additional_stopwords=['Timestamps', 'timestamps']):
206
  # This function generates a word cloud image from a given text and returns it as a PIL image object.
 
298
 
299
  end_time = time.time()
300
  print(f"Time taken for loading comments: {end_time - start_time} seconds")
301
+
302
  # Analyze
303
+ analysis_dict, top_positive_comments_summary, top_negative_comments_summary = comments_analyzer(comments_df)
304
 
305
  long_text = analysis_dict['blended_comments']
306
 
307
  start_time = time.time()
308
+
309
  # Generate word cloud
310
  word_cloud_img = generate_wordcloud(long_text, additional_stopwords=['Timestamps', 'timestamps'])
311
 
 
321
  print(f"Time taken for creating sentiment chart: {end_time - start_time} seconds")
322
 
323
  # Return the generated word cloud image, summary text, and sentiment analysis chart
324
+ return word_cloud_img, top_positive_comments_summary, top_negative_comments_summary, sentiment_chart
325
 
326
  ############################################################################################################################################
327
  # Gradio interface
 
334
  ],
335
  outputs=[
336
  gr.Image(label="Word Cloud ☁️"),
337
+ gr.Textbox(label="Summary of the Top 5 Positive Comments πŸ‘πŸ»"),
338
+ gr.Textbox(label="Summary of the Top 5 Negative Comments πŸ‘ŽπŸ»"),
339
  gr.Image(label="Sentiment Analysis Chart πŸ“Š")
340
  ],
341
  title="YouTube Comments Analyzer πŸ“ˆ",