Krittaprot commited on
Commit
f9632a1
1 Parent(s): f14d844

Create app.py

Browse files

initial commit

Files changed (1) hide show
  1. app.py +279 -0
app.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ from wordcloud import WordCloud, STOPWORDS
4
+ from youtubesearchpython import *
5
+ import pandas as pd
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+ from PIL import Image
9
+ import re
10
+ import io
11
+ from io import BytesIO
12
+
13
# Sentiment classifier: CardiffNLP Twitter RoBERTa; emits one of the labels
# 'positive' / 'neutral' / 'negative' (consumed by comments_analyzer below).
# NOTE: both pipelines download model weights on first run.
sentiment_task = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")
# Abstractive summarizer used to condense all blended comments into a short paragraph.
text_summarization_task = pipeline("summarization", model="facebook/bart-large-cnn")
15
+
16
def extract_youtube_video_id(url_or_id):
    """
    Extract the YouTube video ID from a URL, or pass through a bare ID.

    Args:
        url_or_id (str): A YouTube URL in any common format (watch, youtu.be,
            embed, /v/, shorts) or an 11-character video ID.

    Returns:
        str: The 11-character video ID, or the sentinel string
        "Invalid YouTube URL or ID" when nothing matches.
    """
    # A bare ID is exactly 11 characters drawn from [0-9A-Za-z_-].
    if len(url_or_id) == 11 and re.fullmatch(r'[0-9A-Za-z_-]{11}', url_or_id):
        return url_or_id

    # Every supported URL shape captures the ID in group 1.
    id_group = r'([0-9A-Za-z_-]{11})'
    url_shapes = (
        r'(?:https?://)?www\.youtube\.com/watch\?v=' + id_group,
        r'(?:https?://)?youtu\.be/' + id_group,
        r'(?:https?://)?www\.youtube\.com/embed/' + id_group,
        r'(?:https?://)?www\.youtube\.com/v/' + id_group,
        r'(?:https?://)?www\.youtube\.com/shorts/' + id_group,
    )

    for shape in url_shapes:
        found = re.search(shape, url_or_id)
        if found:
            return found.group(1)

    # Nothing matched: report failure as a sentinel string (legacy contract
    # relied on by callers, so no exception is raised here).
    return "Invalid YouTube URL or ID"
47
+
48
def _parse_like_count(label):
    """Convert a raw vote label (e.g. None, '12', '1.2K likes') to an int."""
    if label is None:
        return 0
    count = label.split(' ')[0]
    if 'K' in count:
        return int(float(count.replace('K', '')) * 1000)
    # BUG FIX: plain counts like '12' previously stayed as str, leaving the
    # 'likes' column with mixed int/str values; normalize everything to int.
    return int(count)


def comments_collector(video_link, max_comments=100):
    """
    Collect top-level comments from a YouTube video into a DataFrame.

    Pages through the comment threads via youtubesearchpython's Comments,
    skipping comments written by the channel owner.

    Args:
        video_link (str): A YouTube URL or bare 11-character video ID.
        max_comments (int, optional): Upper bound on comments to fetch.
            Defaults to 100.

    Returns:
        pandas.DataFrame: Columns replyCount, comment_id, author, content,
        likes — or None if any step fails (best-effort contract).
    """
    video_id = extract_youtube_video_id(video_link)
    # The paging loop below uses <=, so shift the limit down by one to keep
    # the original inclusive bound (stop once max_comments are loaded).
    max_comments -= 1

    try:
        # First call retrieves the initial batch of comments.
        comments = Comments(video_id)
        print('Comments Retrieved and Loading...')

        # Keep paging until the thread is exhausted or the limit is reached.
        while comments.hasMoreComments and (len(comments.comments["result"]) <= max_comments):
            comments.getNextComments()
        print(f'Found all the {len(comments.comments["result"])} comments.')

        data = []
        for item in comments.comments['result']:
            # Skip the uploader's own comments — we only want audience ones.
            if item['authorIsChannelOwner']:
                continue
            data.append({
                # No replies is reported as None; log it as 0.
                'replyCount': 0 if item['replyCount'] is None else int(item['replyCount']),
                'comment_id': item['id'],
                'author': item['author']['name'],
                'content': item['content'],
                'likes': _parse_like_count(item['votes']['label']),
            })

        print(f'Excluding author comments, we ended up with {len(data)} comments')
        return pd.DataFrame(data)
    except Exception as e:
        # Best-effort: log and return None so callers can degrade gracefully.
        print(e)
        return None
123
+
124
def comments_analyzer(comments_df):
    """
    Run sentiment analysis over a DataFrame of YouTube comments.

    Labels every comment positive/neutral/negative with the module-level
    RoBERTa pipeline and aggregates the counts.

    Args:
        comments_df (pandas.DataFrame): Comments with a 'content' column,
            typically produced by comments_collector (may be None).

    Returns:
        dict: total_comments, num_positive/num_neutral/num_negative counts,
        pct_positive, and all comments blended into one string under
        'blended_comments' — or None when comments_df is None.
    """
    if comments_df is None:
        return None

    # Classify each comment; pipeline returns [{'label': ..., 'score': ...}].
    comments_df['sentiment'] = comments_df['content'].apply(
        lambda text: sentiment_task(text)[0]['label']
    )

    sentiment_counts = comments_df['sentiment'].value_counts()

    data = {}
    # BUG FIX: the original set data['video_link'] = video_link, but no such
    # name exists in this scope — every call raised NameError. The key was
    # therefore never observable and has been removed.
    data['total_comments'] = len(comments_df)
    data['num_positive'] = sentiment_counts.get('positive', 0)
    data['num_neutral'] = sentiment_counts.get('neutral', 0)
    data['num_negative'] = sentiment_counts.get('negative', 0)

    # One long string of all comments, used for the word cloud and summary.
    data['blended_comments'] = comments_df['content'].str.cat(sep=' ')

    # Guard the empty-DataFrame case to avoid ZeroDivisionError.
    if data['total_comments']:
        data['pct_positive'] = 100 * round(data['num_positive'] / data['total_comments'], 2)
    else:
        data['pct_positive'] = 0.0

    return data
151
+
152
def generate_wordcloud(long_text, additional_stopwords=['Timestamps', 'timestamps']):
    """
    Render a word cloud for the given text and return it as a PIL image.

    Args:
        long_text (str): The text to visualize.
        additional_stopwords (list, optional): Extra words excluded from the
            cloud on top of wordcloud's built-in STOPWORDS.

    Returns:
        PIL.Image: The rendered word cloud.
    """
    # Merge the library's default stopword set with the caller's extras.
    excluded_words = set(STOPWORDS).union(additional_stopwords)

    # Build the cloud: at most 20 words, black background, plasma colormap.
    cloud = WordCloud(
        max_font_size=50,
        max_words=20,
        background_color="black",
        stopwords=excluded_words,
        colormap='plasma',
    ).generate(long_text)

    # Draw onto a matplotlib figure with no axes and no padding.
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)

    # Capture the figure as PNG bytes, then close it so nothing is displayed.
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png', bbox_inches='tight', pad_inches=0)
    png_buffer.seek(0)
    plt.close()

    # Hand the in-memory PNG to PIL for the caller.
    return Image.open(png_buffer)
189
+
190
def create_sentiment_analysis_chart(data):
    """
    Build a bar chart of sentiment counts and return it as a PIL image.

    Args:
        data (dict): Must contain 'num_positive', 'num_negative' and
            'num_neutral' counts (as produced by comments_analyzer).

    Returns:
        PIL.Image: The rendered bar chart.
    """
    # One-row DataFrame so pandas draws one bar per sentiment column.
    df = pd.DataFrame(
        {
            'num_positive': data['num_positive'],
            'num_negative': data['num_negative'],
            'num_neutral': data['num_neutral'],
        },
        index=[0],
    )

    # BUG FIX: the original called plt.figure() and then df.plot(), which
    # opens a *second* figure — the first was leaked because plt.close()
    # only closes the current one. Create one figure and plot into its axes.
    fig, ax = plt.subplots(figsize=(8, 6))
    bar_colors = ['green', 'red', 'blue']  # positive, negative, neutral
    df.plot(kind='bar', color=bar_colors, legend=True, ax=ax)

    # Titles, labels and legend for clarity.
    ax.set_title('Sentiment Analysis Results')
    ax.set_xlabel('Sentiment Types')
    ax.set_ylabel('Number of Comments')
    ax.set_xticks([0])
    ax.set_xticklabels(['Sentiments'], rotation=0)
    ax.legend(['Positive', 'Negative', 'Neutral'])

    # Serialize the figure to PNG in memory, then close it explicitly.
    buf = BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    plt.close(fig)

    # Hand the in-memory PNG to PIL for the caller.
    return Image.open(buf)
232
+
233
+
234
+ ############################################################################################################################################
235
+ # The code for processing the YouTube link, generating the word cloud, summary, and sentiment analysis
236
+ # should be defined here (using your existing functions).
237
+
238
def process_youtube_comments(youtube_link, max_comments, stop_words):
    """
    End-to-end pipeline behind the Gradio UI.

    Fetches comments, analyzes sentiment, and builds the three outputs.

    Args:
        youtube_link (str): YouTube URL or bare video ID.
        max_comments (int | float): Maximum comments to fetch (gr.Number
            delivers a float; coerced to int here).
        stop_words (str): Comma-separated words to exclude from the word
            cloud (may be empty).

    Returns:
        tuple: (word-cloud PIL.Image, summary str, sentiment-chart PIL.Image)
    """
    # Pull comments from the YouTube video and analyze them.
    comments_df = comments_collector(video_link=youtube_link, max_comments=int(max_comments))
    analysis_dict = comments_analyzer(comments_df)
    long_text = analysis_dict['blended_comments']

    # BUG FIX: the user-supplied "Excluded Words" textbox was accepted but
    # completely ignored; parse the comma-separated value and merge it with
    # the original hard-coded defaults (backward compatible for empty input).
    extra_stopwords = ['Timestamps', 'timestamps']
    if stop_words:
        extra_stopwords += [word.strip() for word in stop_words.split(',') if word.strip()]

    # Generate word cloud.
    word_cloud_img = generate_wordcloud(long_text, additional_stopwords=extra_stopwords)

    # Abstractive summary of the blended comments.
    summarized_text = text_summarization_task(
        long_text, min_length=100, max_length=200, truncation=True
    )[0]['summary_text']

    # Sentiment bar chart.
    sentiment_chart = create_sentiment_analysis_chart(analysis_dict)

    return word_cloud_img, summarized_text, sentiment_chart
258
+
259
############################################################################################################################################
# Gradio interface
# Wires process_youtube_comments to a three-input / three-output web UI.
interface = gr.Interface(
    fn=process_youtube_comments,
    inputs=[
        gr.Textbox(label="YouTube Video Link"),
        gr.Number(label="Maximum Comments", value=100),  # NOTE: gr.Number delivers a float to the handler
        gr.Textbox(label="Excluded Words (comma-separated)")
    ],
    outputs=[
        gr.Image(label="Word Cloud"),
        gr.Textbox(label="Summary of Comments"),
        gr.Image(label="Sentiment Analysis Chart")
    ],
    title="YouTube Comments Analyzer",
    description="Enter a YouTube link to generate a word cloud, summary, and sentiment analysis of the comments."
)

# Run the interface (blocking call; serves the app until interrupted).
interface.launch()
############################################################################################################################################