import os
import re

import streamlit as st
import googleapiclient.discovery
import pandas as pd
from transformers import pipeline
import matplotlib.pyplot as plt
import seaborn as sns

st.title('Анализатор комментариев :red[YouTube] :sunglasses:')


# Initialize the Hugging Face model for sentiment analysis of comment text.
# Cache the resource so the model is loaded only once and shared across sessions.
@st.cache_resource
def load_model():
    """
    Loads the 'blanchefort/rubert-base-cased-sentiment' model from HuggingFace
    and caches it for subsequent loads.
    """
    model = pipeline(
        "sentiment-analysis",
        "blanchefort/rubert-base-cased-sentiment")
    return model


def extract_video_id(url: str) -> str:
    """
    Extracts the video ID from a YouTube video URL.

    Args:
        url (str): The YouTube video URL.

    Returns:
        str: The extracted video ID, or an empty string if the URL is not valid.
    """
    pattern = r"(?<=v=)[\w-]+(?=&|\b)"
    match = re.search(pattern, url)
    if match:
        return match.group()
    else:
        return ""


def download_comments(video_id: str) -> pd.DataFrame:
    """
    Downloads comments from a YouTube video based on the provided video ID
    and returns them as a DataFrame.

    Args:
        video_id (str): The video ID of the YouTube video.

    Returns:
        DataFrame: A DataFrame containing the downloaded comments from the video.
    """
    DEV_KEY = os.getenv('API_KEY_YOUTUBE')
    youtube = googleapiclient.discovery.build("youtube", "v3",
                                              developerKey=DEV_KEY)
    request = youtube.commentThreads().list(part="snippet",
                                            videoId=video_id,
                                            maxResults=100)
    response = request.execute()

    comments = []
    for item in response['items']:
        comment = item['snippet']['topLevelComment']['snippet']
        comments.append([comment['authorDisplayName'],
                         comment['publishedAt'],
                         comment['updatedAt'],
                         comment['likeCount'],
                         comment['textDisplay']])

    return pd.DataFrame(comments, columns=['author',
                                           'published_at',
                                           'updated_at',
                                           'like_count',
                                           'text'])


def analyze_emotions_in_comments(df: pd.DataFrame) -> tuple:
    """
    Takes a DataFrame with comments and determines the emotional sentiment
    of each comment.

    Args:
        df (pd.DataFrame): DataFrame containing comments to analyze.

    Returns:
        tuple: the updated DataFrame with the added sentiment 'label' and
        'score' columns, and the total count of processed comments.
    """
    model = load_model()

    selected_columns = ['text', 'author', 'published_at']
    df = df[selected_columns]

    # Truncate long comments so they fit within the model's maximum input length.
    res_list = model(df['text'].to_list(), truncation=True)

    full_df = pd.concat([pd.DataFrame(res_list), df], axis=1)
    return (full_df, len(res_list))


def plot_heatmap_from_dataframe(df: pd.DataFrame) -> plt:
    """
    Visualizes the data from the input DataFrame and returns a matplotlib
    plot object.

    Args:
        df (DataFrame): The input DataFrame containing the data to be visualized.

    Returns:
        plt: A matplotlib plot object showing the visualization of the data.
    """
    df['published_at'] = pd.to_datetime(df['published_at'])
    df['Date'] = df['published_at'].dt.date
    df['Hour'] = df['published_at'].dt.hour

    pivot_table = df.pivot_table(index='Hour',
                                 columns='Date',
                                 values='text',
                                 aggfunc='count')

    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot_table, cmap='YlGnBu')
    plt.title('Количество комментариев по часам и датам')
    plt.xlabel('Дата')
    plt.ylabel('Час')
    return plt


def visualize_data(df: pd.DataFrame):
    """
    Visualizes the data from the input DataFrame and returns a matplotlib
    figure object.

    Args:
        df (DataFrame): The input DataFrame containing the data to be visualized.

    Returns:
        fig: A matplotlib figure object.
    """
    data = df['label'].value_counts()

    fig, ax = plt.subplots()
    plt.title("Эмоциональная окраска комментариев на YouTube")
    label = data.index
    ax.pie(data, labels=label, autopct='%1.1f%%')
    return fig


def change_url():
    st.session_state.start = False


if "start" not in st.session_state:
    st.session_state.start = False

# Get the video ID from the URL to build the API request.
url = st.text_input(label="Enter URL from YouTube", on_change=change_url)
video_id = extract_video_id(url)

if video_id != "":
    if st.button('Загрузить комментарии'):
        st.session_state.start = True

    if st.session_state.start:
        # Display the results table on the page.
        comments_df = download_comments(video_id)
        with st.spinner('Analyzing comments...'):
            full_df, num_comments = analyze_emotions_in_comments(comments_df)
        st.success(f'Готово! Обработано {num_comments} комментариев.')
        st.write(full_df)
        st.markdown('***')
        # Display a heatmap of comments by hour and date.
        st.pyplot(plot_heatmap_from_dataframe(full_df))
        st.markdown('***')
        # Display a pie chart of the sentiment distribution.
        st.pyplot(visualize_data(full_df))
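# Optional sketch, not wired into the app above: commentThreads.list returns at
# most 100 comment threads per request, so download_comments only covers the
# first page. The function below shows one way to follow `nextPageToken` and
# collect additional pages. The name `download_all_comments` and the
# `max_comments` cap are illustrative assumptions, not part of the original app.
def download_all_comments(video_id: str, max_comments: int = 500) -> pd.DataFrame:
    """Collects top-level comments across several pages of API results."""
    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=os.getenv('API_KEY_YOUTUBE'))

    comments = []
    page_token = None
    while len(comments) < max_comments:
        # Only pass pageToken after the first request.
        request_kwargs = dict(part="snippet", videoId=video_id, maxResults=100)
        if page_token:
            request_kwargs['pageToken'] = page_token
        response = youtube.commentThreads().list(**request_kwargs).execute()

        for item in response['items']:
            snippet = item['snippet']['topLevelComment']['snippet']
            comments.append([snippet['authorDisplayName'],
                             snippet['publishedAt'],
                             snippet['updatedAt'],
                             snippet['likeCount'],
                             snippet['textDisplay']])

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return pd.DataFrame(comments, columns=['author', 'published_at',
                                           'updated_at', 'like_count', 'text'])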