Krittaprot committed on
Commit
84cdcf9
β€’
1 Parent(s): f2c6a14

Update app.py

Browse files

Add a more efficient text summarization method.

Files changed (1) hide show
  1. app.py +26 -11
app.py CHANGED
@@ -13,6 +13,8 @@ import time
13
 
14
 
15
  sentiment_task = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")
 
 
16
 
17
  def extract_youtube_video_id(url_or_id):
18
  """
@@ -142,7 +144,7 @@ def comments_analyzer(comments_df):
142
  for i in range(0, len(comments_df), batch_size):
143
  batch = comments_df['content'][i:i+batch_size].tolist()
144
  batch_results = sentiment_task(batch)
145
-
146
  # Extracting both sentiment labels and scores
147
  batch_sentiments = [item['label'] for item in batch_results]
148
  batch_scores = [item['score'] for item in batch_results]
@@ -156,24 +158,37 @@ def comments_analyzer(comments_df):
156
  end_time = time.time()
157
  print(f"Time taken for batch sentiment analysis: {end_time - start_time} seconds")
158
 
159
- def get_top_comments(comments, sentiment_type, top_n=3):
160
  filtered_comments = comments[comments['sentiment'] == sentiment_type]
161
  top_comments = filtered_comments.nlargest(top_n, 'score')
162
 
163
  if not top_comments.empty:
164
- return '\n\n'.join(f"{row['content']} - {row['author']}" for _, row in top_comments.iterrows())
165
  else:
166
  return f"No {sentiment_type} comments available."
167
 
168
  start_time = time.time()
169
  # Get top positive comments
170
  top_positive_comments = get_top_comments(comments_df, 'positive')
171
-
172
  # Get top negative comments
173
  top_negative_comments = get_top_comments(comments_df, 'negative')
174
  end_time = time.time()
175
  print(f"Time taken for finding top n positive/negative comments: {end_time - start_time} seconds")
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  data = {}
178
  #Categorize the comments by sentiment and count them
179
  data['total_comments'] = len(comments_df)
@@ -185,7 +200,7 @@ def comments_analyzer(comments_df):
185
  data['blended_comments'] = comments_df['content'].str.cat(sep=' ')
186
  data['pct_positive'] = 100 * round(data['num_positive']/data['total_comments'], 2)
187
 
188
- return data, top_positive_comments, top_negative_comments
189
 
190
  def generate_wordcloud(long_text, additional_stopwords=['Timestamps', 'timestamps']):
191
  # This function generates a word cloud image from a given text and returns it as a PIL image object.
@@ -283,14 +298,14 @@ def process_youtube_comments(youtube_link, max_comments, stop_words):
283
 
284
  end_time = time.time()
285
  print(f"Time taken for loading comments: {end_time - start_time} seconds")
286
-
287
  # Analyze
288
- analysis_dict, top_positive_comments, top_negative_comments = comments_analyzer(comments_df)
289
 
290
  long_text = analysis_dict['blended_comments']
291
 
292
  start_time = time.time()
293
-
294
  # Generate word cloud
295
  word_cloud_img = generate_wordcloud(long_text, additional_stopwords=['Timestamps', 'timestamps'])
296
 
@@ -306,7 +321,7 @@ def process_youtube_comments(youtube_link, max_comments, stop_words):
306
  print(f"Time taken for creating sentiment chart: {end_time - start_time} seconds")
307
 
308
  # Return the generated word cloud image, summary text, and sentiment analysis chart
309
- return word_cloud_img, top_positive_comments, top_negative_comments, sentiment_chart
310
 
311
  ############################################################################################################################################
312
  # Gradio interface
@@ -319,8 +334,8 @@ interface = gr.Interface(
319
  ],
320
  outputs=[
321
  gr.Image(label="Word Cloud ☁️"),
322
- gr.Textbox(label="Top 3 Positive Comments πŸ‘πŸ»"),
323
- gr.Textbox(label="Top 3 Negative Comments πŸ‘ŽπŸ»"),
324
  gr.Image(label="Sentiment Analysis Chart πŸ“Š")
325
  ],
326
  title="YouTube Comments Analyzer πŸ“ˆ",
 
13
 
14
 
15
  sentiment_task = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")
16
+ summarization_task = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
17
+
18
 
19
  def extract_youtube_video_id(url_or_id):
20
  """
 
144
  for i in range(0, len(comments_df), batch_size):
145
  batch = comments_df['content'][i:i+batch_size].tolist()
146
  batch_results = sentiment_task(batch)
147
+
148
  # Extracting both sentiment labels and scores
149
  batch_sentiments = [item['label'] for item in batch_results]
150
  batch_scores = [item['score'] for item in batch_results]
 
158
  end_time = time.time()
159
  print(f"Time taken for batch sentiment analysis: {end_time - start_time} seconds")
160
 
161
+ def get_top_comments(comments, sentiment_type, top_n=5):
162
  filtered_comments = comments[comments['sentiment'] == sentiment_type]
163
  top_comments = filtered_comments.nlargest(top_n, 'score')
164
 
165
  if not top_comments.empty:
166
+ return '\n\n'.join(f"{row['content']}" for _, row in top_comments.iterrows())
167
  else:
168
  return f"No {sentiment_type} comments available."
169
 
170
  start_time = time.time()
171
  # Get top positive comments
172
  top_positive_comments = get_top_comments(comments_df, 'positive')
 
173
  # Get top negative comments
174
  top_negative_comments = get_top_comments(comments_df, 'negative')
175
  end_time = time.time()
176
  print(f"Time taken for finding top n positive/negative comments: {end_time - start_time} seconds")
177
 
178
+ #Summarize the texts from positive and negative comments
179
+ start_time = time.time()
180
+ if top_positive_comments == "No positive comments available.":
181
+ top_positive_comments_summary = top_positive_comments
182
+ else:
183
+ top_positive_comments_summary = summarization_task(top_positive_comments)[0]['summary_text']
184
+
185
+ if top_negative_comments == "No negative comments available.":
186
+ top_negative_comments_summary = top_negative_comments
187
+ else:
188
+ top_negative_comments_summary = summarization_task(top_negative_comments)[0]['summary_text']
189
+ end_time = time.time()
190
+ print(f"Time taken for summarizing the top n positive/negative comments: {end_time - start_time} seconds")
191
+
192
  data = {}
193
  #Categorize the comments by sentiment and count them
194
  data['total_comments'] = len(comments_df)
 
200
  data['blended_comments'] = comments_df['content'].str.cat(sep=' ')
201
  data['pct_positive'] = 100 * round(data['num_positive']/data['total_comments'], 2)
202
 
203
+ return data, top_positive_comments_summary, top_negative_comments_summary
204
 
205
  def generate_wordcloud(long_text, additional_stopwords=['Timestamps', 'timestamps']):
206
  # This function generates a word cloud image from a given text and returns it as a PIL image object.
 
298
 
299
  end_time = time.time()
300
  print(f"Time taken for loading comments: {end_time - start_time} seconds")
301
+
302
  # Analyze
303
+ analysis_dict, top_positive_comments_summary, top_negative_comments_summary = comments_analyzer(comments_df)
304
 
305
  long_text = analysis_dict['blended_comments']
306
 
307
  start_time = time.time()
308
+
309
  # Generate word cloud
310
  word_cloud_img = generate_wordcloud(long_text, additional_stopwords=['Timestamps', 'timestamps'])
311
 
 
321
  print(f"Time taken for creating sentiment chart: {end_time - start_time} seconds")
322
 
323
  # Return the generated word cloud image, summary text, and sentiment analysis chart
324
+ return word_cloud_img, top_positive_comments_summary, top_negative_comments_summary, sentiment_chart
325
 
326
  ############################################################################################################################################
327
  # Gradio interface
 
334
  ],
335
  outputs=[
336
  gr.Image(label="Word Cloud ☁️"),
337
+ gr.Textbox(label="Summary of the Top 5 Positive Comments πŸ‘πŸ»"),
338
+ gr.Textbox(label="Summary of the Top 5 Negative Comments πŸ‘ŽπŸ»"),
339
  gr.Image(label="Sentiment Analysis Chart πŸ“Š")
340
  ],
341
  title="YouTube Comments Analyzer πŸ“ˆ",