Spaces:
Sleeping
Sleeping
import os | |
import re | |
import streamlit as st | |
import googleapiclient.discovery | |
import pandas as pd | |
from transformers import pipeline | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
st.title('Анализатор комментариев :red[YouTube] :sunglasses:') | |
# Инициализируем модель Hugging Face для анализа тональности текста | |
# Кэшируем ресурс для одной загрузки модели на все сессии | |
#@st.cache_resource | |
def load_model(): | |
""" | |
Loads the 'blanchefort/rubert-base-cased-sentiment' model from HuggingFace | |
and saves to cache for consecutive loads. | |
""" | |
model = pipeline( | |
"sentiment-analysis", | |
"blanchefort/rubert-base-cased-sentiment") | |
return model | |
def extract_video_id(url: str) -> str: | |
""" | |
Extracts the video ID from a YouTube video URL. | |
Args: url (str): The YouTube video URL. | |
Returns: str: The extracted video ID, | |
or an empty string if the URL is not valid. | |
""" | |
pattern = r"(?<=v=)[\w-]+(?=&|\b)" | |
match = re.search(pattern, url) | |
if match: | |
return match.group() | |
else: | |
return "" | |
def download_comments(video_id: str) -> pd.DataFrame: | |
""" | |
Downloads comments from a YouTube video based on the provided video ID | |
and returns them as a DataFrame. | |
Args: video_id (str): The video ID of the YouTube video. | |
Returns: DataFrame: A DataFrame containing the downloaded comments from the video. | |
""" | |
DEV_KEY = os.getenv('API_KEY_YOUTUBE') | |
youtube = googleapiclient.discovery.build("youtube", | |
"v3", | |
developerKey=DEV_KEY) | |
request = youtube.commentThreads().list(part="snippet", | |
videoId=video_id, | |
maxResults=100) | |
response = request.execute() | |
comments = [] | |
for item in response['items']: | |
comment = item['snippet']['topLevelComment']['snippet'] | |
comments.append([comment['authorDisplayName'], | |
comment['publishedAt'], | |
comment['updatedAt'], | |
comment['likeCount'], | |
comment['textDisplay'],]) | |
return pd.DataFrame(comments, | |
columns=['author', | |
'published_at', | |
'updated_at', | |
'like_count', | |
'text',]) | |
def analyze_emotions_in_comments(df: pd.DataFrame) -> tuple: | |
""" | |
Takes a DataFrame with comments, | |
processes the emotional sentiment of each comment in the DataFrame | |
Args: dataframe (pandas.DataFrame): DataFrame containing comments to analyze. | |
Returns: tuple: containing the updated DataFrame with the added 'Emotional Sentiment' column | |
and the total count of processed comments. | |
""" | |
model = load_model() | |
selected_columns = ['text', 'author', 'published_at'] | |
df = df[selected_columns] | |
res_list = [] | |
res_list = model(df['text'][:513].to_list()) | |
full_df = pd.concat([pd.DataFrame(res_list), df], axis=1) | |
return (full_df, len(res_list)) | |
def plot_heatmap_from_dataframe(df: pd.DataFrame) -> plt: | |
""" | |
Visualizes the data from the input DataFrame and returns a matplotlib plot object. | |
Args: df (DataFrame): The input DataFrame containing the data to be visualized. | |
Returns: plt: A matplotlib plot object showing the visualization of the data. | |
""" | |
df['published_at'] = pd.to_datetime(df['published_at']) | |
df['Date'] = df['published_at'].dt.date | |
df['Hour'] = df['published_at'].dt.hour | |
pivot_table = df.pivot_table(index='Hour', | |
columns='Date', | |
values='text', | |
aggfunc='count') | |
plt.figure(figsize=(10, 6)) | |
sns.heatmap(pivot_table, | |
cmap='YlGnBu') | |
plt.title('Количество комментариев по часам и датам') | |
plt.xlabel('Дата') | |
plt.ylabel('Час') | |
return plt | |
def visualize_data(df: pd.DataFrame): | |
""" | |
Visualizes the data from the input DataFrame and returns a matplotlib figure object. | |
Args: df (DataFrame): The input DataFrame containing the data to be visualized. | |
Returns: fig: A matplotlib figure object | |
""" | |
data = df['label'].value_counts() | |
fig, ax = plt.subplots() | |
plt.title("Эмоциональная окраска комментариев на YouTube") | |
label = data.index | |
ax.pie(data, labels=label, autopct='%1.1f%%') | |
return fig | |
def change_url(): | |
st.session_state.start = False | |
if "start" not in st.session_state: | |
st.session_state.start = False | |
# Получаем id видеоролика из URL для отправки запроса | |
url = st.text_input(label="Enter URL from YouTube", on_change=change_url) | |
video_id = extract_video_id(url) | |
if video_id != "": | |
if btn_start := st.button('Загрузить комментарии'): | |
st.session_state.start = True | |
if st.session_state.start: | |
# Выводим таблицу с результатами на странице | |
comments_df = download_comments(video_id) | |
with st.spinner('Analyzing comments...'): | |
full_df, num_comments = analyze_emotions_in_comments(comments_df) | |
st.success(f'Готово! Обработано {num_comments} комментариев.') | |
st.write(full_df) | |
st.markdown('***') | |
# Выводим heatmap комментариев по часам и датам | |
st.pyplot(plot_heatmap_from_dataframe(full_df)) | |
st.markdown('***') | |
# Выводим круговую диаграмму | |
st.pyplot(visualize_data(full_df)) | |