File size: 5,924 Bytes
03f7867
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38abd5a
 
03f7867
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf33f7d
d76fb4c
03f7867
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf33f7d
03f7867
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf33f7d
03f7867
 
cf33f7d
03f7867
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import os
import re
import streamlit as st
import googleapiclient.discovery
import pandas as pd
from transformers import pipeline
import matplotlib.pyplot as plt
import seaborn as sns

# Page heading. The `:red[...]` and `:sunglasses:` fragments are presumably
# Streamlit markdown directives (coloured text / emoji shortcode) — confirm
# against the Streamlit version in use.
st.title('Анализатор комментариев :red[YouTube] :sunglasses:')


# Initialise the Hugging Face model used for text sentiment analysis.
# The resource is cached so the model is loaded once for all sessions.
@st.cache_resource
def load_model():
    """
    Build the sentiment-analysis pipeline backed by the
    'blanchefort/rubert-base-cased-sentiment' checkpoint from HuggingFace.

    Returns: the pipeline object; thanks to ``st.cache_resource`` consecutive
    calls reuse the cached instance instead of loading the model again.
    """
    sentiment_pipeline = pipeline(
        task="sentiment-analysis",
        model="blanchefort/rubert-base-cased-sentiment",
    )
    return sentiment_pipeline


def extract_video_id(url: str) -> str:
    """
    Extracts the video ID from a YouTube video URL.

    Two URL shapes are recognised:
      * watch links, where the ID is the value of the ``v=`` query
        parameter (``https://www.youtube.com/watch?v=<id>&t=10s``);
      * path-style links, where the ID is a path segment
        (``https://youtu.be/<id>``, ``.../shorts/<id>``, ``.../embed/<id>``,
        ``.../live/<id>``, ``.../v/<id>``).

    Args:       url (str): The YouTube video URL.
    Returns:    str: The extracted video ID,
                or an empty string if the URL is not valid.
    """
    # The query-parameter form is tried first so that every URL that was
    # recognised before keeps yielding exactly the same ID.
    query_match = re.search(r"(?<=v=)[\w-]+(?=&|\b)", url)
    if query_match:
        return query_match.group()

    # Fallback: share/short/embed links carry the ID in the path instead.
    path_match = re.search(
        r"(?:youtu\.be/|youtube(?:-nocookie)?\.com/(?:shorts|embed|live|v)/)"
        r"([\w-]+)",
        url,
    )
    if path_match:
        return path_match.group(1)

    return ""


def download_comments(video_id: str, max_comments: int = 100) -> pd.DataFrame:
    """
    Downloads top-level comments from a YouTube video based on the provided
    video ID and returns them as a DataFrame.

    Result pages are followed through ``nextPageToken`` until
    ``max_comments`` rows have been collected or the API reports no further
    page.

    Args:
        video_id (str): The video ID of the YouTube video.
        max_comments (int): Upper bound on the number of comments to fetch.
            Defaults to 100. A value <= 0 performs no request and yields an
            empty DataFrame.
    Returns: DataFrame: A DataFrame with the columns 'author',
        'published_at', 'updated_at', 'like_count' and 'text'.
    """
    # The API key is read from the environment; the request is attempted
    # even when it is missing, exactly as before.
    DEV_KEY = os.getenv('API_KEY_YOUTUBE')
    if DEV_KEY:
        print("Key is ok")
    youtube = googleapiclient.discovery.build("youtube",
                                              "v3",
                                              developerKey=DEV_KEY)
    comments = []
    page_token = None
    while len(comments) < max_comments:
        remaining = max_comments - len(comments)
        params = {
            "part": "snippet",
            "videoId": video_id,
            # The original request asked for 100 results per page; never ask
            # for more than is still needed.
            "maxResults": min(100, remaining),
        }
        if page_token:
            params["pageToken"] = page_token
        response = youtube.commentThreads().list(**params).execute()

        # `.get` keeps a response without an 'items' key from raising.
        for item in response.get('items', []):
            comment = item['snippet']['topLevelComment']['snippet']
            comments.append([comment['authorDisplayName'],
                             comment['publishedAt'],
                             comment['updatedAt'],
                             comment['likeCount'],
                             comment['textDisplay'],])
            if len(comments) >= max_comments:
                break

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return pd.DataFrame(comments,
                        columns=['author',
                                 'published_at',
                                 'updated_at',
                                 'like_count',
                                 'text',])


def analyze_emotions_in_comments(df: pd.DataFrame) -> tuple:
    """
    Takes a DataFrame with comments and runs the sentiment model on the
    text of each comment.

    Only the first 513 comments are processed (the cap used by the original
    implementation); the returned frame contains exactly those rows, so every
    row has a prediction.

    Args: df (pandas.DataFrame): DataFrame containing comments to analyze;
        must provide the columns 'text', 'author' and 'published_at'.
    Returns: tuple: (DataFrame with the model's 'label' and 'score' columns
        followed by 'text', 'author', 'published_at';
        number of processed comments).
    """
    model = load_model()
    selected_columns = ['text', 'author', 'published_at']
    max_comments = 513
    # Restrict to the analysed rows and renumber them 0..n-1 so the
    # column-wise concat below pairs each prediction with its own comment
    # (concat aligns on index, not on position).
    subset = df[selected_columns].head(max_comments).reset_index(drop=True)
    texts = subset['text'].to_list()
    if texts:
        # Truncate long comments: inputs beyond the model's 512-token limit
        # would otherwise make the forward pass fail.
        res_list = model(texts, truncation=True, max_length=512)
    else:
        # Nothing to classify; skip the model call entirely.
        res_list = []
    # Explicit columns guarantee 'label'/'score' exist even for empty input.
    predictions = pd.DataFrame(res_list, columns=['label', 'score'])
    full_df = pd.concat([predictions, subset], axis=1)
    return (full_df, len(res_list))


def plot_heatmap_from_dataframe(df: pd.DataFrame) -> plt:
    """
    Draws a heatmap of the number of comments per hour of day and date.

    The input DataFrame is left untouched: the derived 'Date' and 'Hour'
    columns are added to a copy.

    Args: df (DataFrame): The input DataFrame; must provide the columns
        'published_at' (anything ``pd.to_datetime`` can parse) and 'text'.
    Returns: plt: The ``matplotlib.pyplot`` module, whose current figure
        holds the heatmap.
    """
    timestamps = pd.to_datetime(df['published_at'])
    # `assign` returns a new frame, so the caller's DataFrame is not mutated.
    frame = df.assign(published_at=timestamps,
                      Date=timestamps.dt.date,
                      Hour=timestamps.dt.hour)
    # Rows: hour of day, columns: calendar date, cells: comment count.
    pivot_table = frame.pivot_table(index='Hour',
                                    columns='Date',
                                    values='text',
                                    aggfunc='count')
    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot_table,
                cmap='YlGnBu')
    plt.title('Количество комментариев по часам и датам')
    plt.xlabel('Дата')
    plt.ylabel('Час')
    return plt


def visualize_data(df: pd.DataFrame):
    """
    Draws a pie chart with the share of each sentiment label.

    Args: df (DataFrame): The input DataFrame; must provide a 'label' column.
    Returns: fig: A matplotlib figure object containing the pie chart.
    """
    counts = df['label'].value_counts()
    fig, ax = plt.subplots()
    ax.set_title("Эмоциональная окраска комментариев на YouTube")
    # Take the wedge labels from the counts' own index: `value_counts()` is
    # ordered by frequency whereas `unique()` is ordered by first appearance,
    # so pairing the two could attach a label to the wrong wedge.
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%')
    return fig


def change_url():
    """
    Callback for the URL text input: clears the 'start' flag in the session
    state so that results are not rendered until the button is pressed again.
    """
    st.session_state["start"] = False


# Make sure the 'start' flag exists on the first run of the script.
if "start" not in st.session_state:
    st.session_state.start = False

# Extract the video id from the URL; it is needed for the API request.
url = st.text_input(label="Enter URL from YouTube", on_change=change_url)
video_id = extract_video_id(url)
# The button is offered only once a video id has been recognised.
if video_id and st.button('Загрузить комментарии'):
    st.session_state.start = True

if st.session_state.start:
    comments_df = download_comments(video_id)
    if comments_df.empty:
        # Without comments there is nothing to classify or plot; stop here
        # instead of passing an empty frame to the model and the charts.
        st.warning('Комментарии не найдены.')
    else:
        # Show the table with the results on the page.
        with st.spinner('Analyzing comments...'):
            full_df, num_comments = analyze_emotions_in_comments(comments_df)
            st.success(f'Готово! Обработано {num_comments} комментариев.')
        st.write(full_df)
        st.markdown('***')

        # Show the heatmap of comments by hour and date.
        st.pyplot(plot_heatmap_from_dataframe(full_df))
        st.markdown('***')

        # Show the pie chart.
        st.pyplot(visualize_data(full_df))