molokhovdmitry committed on
Commit
1f47c3b
1 Parent(s): ecaa557

Move from FastAPI to Streamlit web app

Browse files
Files changed (10) hide show
  1. .env.example +3 -0
  2. Dockerfile +1 -1
  3. requirements.txt +6 -5
  4. src/__init__.py +0 -0
  5. src/app.py +367 -0
  6. src/main.py +0 -54
  7. src/models.py +0 -10
  8. src/test_main.py +0 -27
  9. src/yt_api.py +4 -0
  10. vm_startup.sh +0 -6
.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ YT_API_KEY=""
2
+ PRED_BATCH_SIZE=512
3
+ MAX_COMMENT_SIZE=300
Dockerfile CHANGED
@@ -5,4 +5,4 @@ RUN python -m pip install --upgrade pip
5
  RUN pip install --no-cache-dir -r requirements.txt
6
  COPY . .
7
  EXPOSE 8000
8
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
 
5
  RUN pip install --no-cache-dir -r requirements.txt
6
  COPY . .
7
  EXPOSE 8000
8
+ CMD ["streamlit", "run", "src/app.py", "--server.port", "8000"]
requirements.txt CHANGED
@@ -1,11 +1,12 @@
1
  requests
2
- fastapi
3
- uvicorn
4
- pydantic_settings
5
  torch
6
  torchvision
7
  torchaudio
8
  transformers
 
9
  pandas
10
- pytest
11
- httpx
 
 
 
1
  requests
2
+ python-dotenv
 
 
3
  torch
4
  torchvision
5
  torchaudio
6
  transformers
7
+ sentence-transformers
8
  pandas
9
+ seaborn
10
+ plotly
11
+ nbformat
12
+ streamlit
src/__init__.py DELETED
File without changes
src/app.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from transformers import pipeline
4
+ from sentence_transformers import SentenceTransformer
5
+ import streamlit as st
6
+ import pandas as pd
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+ from plotly.subplots import make_subplots
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.decomposition import NMF
12
+ from sklearn.manifold import TSNE
13
+
14
+ from yt_api import YouTubeAPI
15
+
16
+
17
# Load app settings from the environment (`load_dotenv` also reads a local
# `.env` file when one is present).
load_dotenv()
YT_API_KEY = os.getenv('YT_API_KEY')
# Fall back to the values shipped in `.env.example` so that a missing variable
# does not crash the app at import time with `int(None)`.
MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE', '300'))
PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE', '512'))
22
+
23
+
24
@st.cache_resource
def init_emotions_model():
    """
    Builds the `SamLowe/roberta-base-go_emotions` text-classification pipeline
    and returns it. `top_k=None` is passed through to the pipeline.
    """
    return pipeline(
        task="text-classification",
        model="SamLowe/roberta-base-go_emotions",
        top_k=None,
    )
32
+
33
+
34
@st.cache_resource
def init_embedding_model():
    """
    Loads the `sentence-transformers/all-MiniLM-L6-v2` sentence encoder and
    returns it.
    """
    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
38
+
39
+
40
def predict_emotions(df, clf, batch_size=None):
    """
    Predicts emotions for every `text_original` in a DataFrame `df` with a
    classifier `clf`.

    `clf` is called with a list of texts and must return, for each text, an
    iterable of `{'label': ..., 'score': ...}` dicts.
    `batch_size` is the number of texts passed to `clf` per call; when it is
    `None` the module-level `PRED_BATCH_SIZE` is used.

    Returns a new DataFrame: `df` with one extra column per emotion label.
    The row order and index of `df` are preserved.
    """
    if batch_size is None:
        batch_size = PRED_BATCH_SIZE

    # Predict emotions in batches
    text_list = df['text_original'].to_list()
    preds = []
    for start in range(0, len(text_list), batch_size):
        preds.extend(clf(text_list[start:start + batch_size]))

    # Add predictions to DataFrame. The predictions are in the positional
    # order of `df`, so they must carry `df`'s index: `pd.concat(axis=1)`
    # aligns on index labels and would otherwise attach scores to the wrong
    # rows whenever `df` has been sorted or filtered.
    preds_df = pd.DataFrame(
        [{emotion['label']: emotion['score'] for emotion in pred}
         for pred in preds],
        index=df.index,
    )

    return pd.concat([df, preds_df], axis=1)
61
+
62
+
63
def emotion_dist_plot(df, emotion_cols):
    """
    Builds a bar chart of the summed `emotion_cols` of `df`, ordered from the
    largest total to the smallest, and returns the plotly figure.
    """
    emotion_totals = df[emotion_cols].sum()
    ranked_totals = emotion_totals.sort_values(ascending=False)

    figure = px.bar(ranked_totals)
    figure.update_layout(title_text="Emotion Distribution", width=2000)
    return figure
73
+
74
+
75
def nmf_plots(df,
              nmf_components,
              tfidf_max_features,
              tfidf_stop_words='english',
              top_words=None,
              ):
    """
    Converts all `text_original` values of `df` DataFrame to TF-IDF features and
    performs Non-negative matrix factorization on them.

    `nmf_components` is the number of topics, `tfidf_max_features` and
    `tfidf_stop_words` are passed to `TfidfVectorizer`. `top_words` is the
    number of highest-weighted words plotted per topic; when it is `None` the
    module-level `top_words_in_topic` setting is used.

    `df` is modified in place: `topic_N`, `topic_N_cumsum` and
    `topic_N_percentage` columns are added for every topic.

    Returns a tuple of the modified DataFrame with NMF values and a list of
    plotly figures (`df`, [plotly figures]).
    """
    if top_words is None:
        # Backward compatible fallback to the value chosen in the input form.
        top_words = top_words_in_topic

    # Convert to TF-IDF features
    vectorizer = TfidfVectorizer(max_features=tfidf_max_features,
                                 stop_words=tfidf_stop_words)
    tfidf_features = vectorizer.fit_transform(df['text_original'])

    # Get feature_names (words) from the vectorizer
    feature_names = vectorizer.get_feature_names_out()

    # Perform NMF; transpose so that row `i` holds the values of topic `i`
    nmf = NMF(n_components=nmf_components)
    nmf_embeddings = nmf.fit_transform(tfidf_features).T
    topic_cols = [f'topic_{topic_num+1}'
                  for topic_num in range(nmf_components)]

    # Add NMF values to the DataFrame (positional assignment of arrays)
    for i, col in enumerate(topic_cols):
        df[col] = nmf_embeddings[i]

    # Get word values for every topic
    word_df = pd.DataFrame(
        nmf.components_.T,
        columns=topic_cols,
        index=feature_names
    )

    # Plot word distributions of each topic
    topic_words_fig = make_subplots(
        rows=1, cols=nmf_components,
        subplot_titles=topic_cols)

    for i, col in enumerate(topic_cols):
        topic_words = word_df[col].sort_values(ascending=False)
        top_topic_words = topic_words.iloc[:top_words]
        topic_words_fig.add_trace(go.Bar(y=top_topic_words.index,
                                         x=top_topic_words.values,
                                         orientation='h',
                                         base=0),
                                  row=1, col=i+1)
    topic_words_fig.update_layout(title_text="Topic Word Distributions")

    # Plot topic contribution for the dataset: running total of every topic
    # divided by the running total of all topics.
    cumsum_cols = [col + '_cumsum' for col in topic_cols]
    for col, cumsum_col in zip(topic_cols, cumsum_cols):
        df[cumsum_col] = df[col].cumsum()
    # The denominator is the same for every topic, so compute it once.
    cumsum_total = df[cumsum_cols].sum(axis=1)
    for col, cumsum_col in zip(topic_cols, cumsum_cols):
        df[col + '_percentage'] = df[cumsum_col] / cumsum_total
    contributions_fig = stacked_area_plot(
        x=df['published_at'],
        y_list=[df[col + '_percentage'] for col in topic_cols],
        names=topic_cols)

    return df, [topic_words_fig, contributions_fig]
139
+
140
+
141
def tsne_plots(df, encoder, emotion_cols, color_emotion, tsne_perplexity):
    """
    Encodes all `text_original` values of `df` DataFrame with `encoder`,
    uses t-SNE algorithm for visualization on these embeddings and on
    predicted emotions if they were predicted.

    `emotion_cols` is the list of emotion score columns in `df` (empty when
    emotions were not predicted). `color_emotion` is the column used to color
    the points when emotions are available. `tsne_perplexity` is the requested
    t-SNE perplexity; it is lowered when `df` has too few rows for it.

    Returns a tuple (`df`, [2D figure, 3D figure]) where `df` is a new
    DataFrame with `embedding_N` and `tsne_N` columns added.
    """
    TSNE_COMPONENTS = 2

    # Encode and add embeddings to the DataFrame. A plain list is passed to
    # the encoder and the result is given `df`'s own index, so the embeddings
    # stay attached to the right rows even when `df` was sorted beforehand
    # (`pd.concat(axis=1)` aligns on index labels, not on position).
    embeddings = encoder.encode(df['text_original'].to_list())
    embedding_cols = [f'embedding_{i+1}' for i in range(embeddings.shape[1])]
    embeddings_df = pd.DataFrame(embeddings,
                                 columns=embedding_cols,
                                 index=df.index)
    df = pd.concat([df, embeddings_df], axis=1)

    # t-SNE. scikit-learn requires the perplexity to be smaller than the
    # number of samples, so cap it for videos with very few comments.
    perplexity = min(tsne_perplexity, max(len(df) - 1, 1))
    tsne = TSNE(
        n_components=TSNE_COMPONENTS,
        perplexity=perplexity,
    )

    # Also use predicted emotions
    if emotion_cols:
        tsne_cols = embedding_cols + emotion_cols
        color = color_emotion
        hover_data = ['first_emotion', 'second_emotion', 'text_original']
    else:
        tsne_cols = embedding_cols
        color = None
        # Always a list: a bare string is not accepted as a column list by
        # every plotly version.
        hover_data = ['text_original']

    tsne_results = pd.DataFrame(
        tsne.fit_transform(df[tsne_cols]),
        columns=[f'tsne_{i+1}' for i in range(TSNE_COMPONENTS)],
        index=df.index,
    )

    df = pd.concat([df, tsne_results], axis=1)

    # 2D Visualization
    fig2d = px.scatter(
        df,
        x='tsne_1',
        y='tsne_2',
        color=color,
        hover_data=hover_data
    )
    fig2d.update_layout(
        title_text="t-SNE Visualization"
    )

    # 3D Visualization with date as the third axis
    fig3d = px.scatter_3d(
        df,
        x='published_at',
        y='tsne_1',
        z='tsne_2',
        color=color,
        hover_data=hover_data
    )
    fig3d.update_layout(
        title_text="t-SNE Visualization Over Time"
    )

    return df, [fig2d, fig3d]
204
+
205
+
206
def stacked_area_plot(x, y_list, names):
    """
    Creates a plotly stacked area plot and returns its figure.

    Every series of `y_list` is multiplied by 100 and drawn over `x` as one
    stacked trace labelled with the matching entry of `names`. The y axis is
    fixed to the 0-100 range with a `%` tick suffix.
    """
    figure = go.Figure()

    for series, label in zip(y_list, names):
        trace = go.Scatter(
            x=x,
            y=series * 100,
            mode='lines',
            line=dict(width=0.5),
            stackgroup='one',
            name=label,
        )
        figure.add_trace(trace)

    y_axis = dict(type='linear', range=[0, 100], ticksuffix='%')
    figure.update_layout(
        showlegend=True,
        xaxis_type='category',
        yaxis=y_axis,
        title_text="Topic Contribution",
    )

    return figure
230
+
231
+
232
def add_top_2_emotions(row, cols=None):
    """
    Adds `first_emotion` and `second_emotion` to `row`: the names of the two
    columns among `cols` with the highest values, in that order.

    `cols` is the list of emotion score columns to rank. When it is `None`
    the module-level `emotion_cols` is used, which keeps
    `df.apply(add_top_2_emotions, axis=1)` working unchanged.

    Returns the modified `row`.
    """
    if cols is None:
        cols = emotion_cols

    ranked = row[cols].sort_values(ascending=False)
    row['first_emotion'] = ranked.index[0]
    row['second_emotion'] = ranked.index[1]
    return row
237
+
238
+
239
# Page setup. The statements below run top to bottom on every Streamlit
# rerun, so their order is also the order of the widgets on the page.
st.set_page_config(layout='wide')
st.title("Social-Stat")

# Load models (both loaders are wrapped in `st.cache_resource`)
emotions_clf = init_emotions_model()
sentence_encoder = init_embedding_model()

# Init YouTube API
yt_api = YouTubeAPI(
    api_key=YT_API_KEY,
    max_comment_size=MAX_COMMENT_SIZE
)

# Input form: all analysis options are collected here and read after the
# "Analyze" button is pressed.
with st.form(key='input'):
    video_id = st.text_input("Video ID")

    # Emotions
    emotions_checkbox = st.checkbox(
        "Predict Emotions",
        value=True,
    )

    # NMF
    nmf_checkbox = st.checkbox(
        "Non-Negative Matrix Factorization",
        value=True,
    )

    # Number of topics passed to `nmf_plots`
    nmf_components = st.slider(
        "Topics (NMF Components)",
        min_value=2,
        max_value=20,
        value=10,
        step=1,
    )

    # Vocabulary size for the TF-IDF vectorizer; `None` is offered as the
    # last option after 10..500
    tfidf_max_features = st.select_slider(
        "Words (TF-IDF Vectorizer Max Features)",
        options=list(range(10, 501)) + [None],
        value=100,
    )

    # Read by `nmf_plots` as a module-level name
    top_words_in_topic = st.slider(
        "Top Topic Words",
        min_value=1,
        max_value=50,
        value=10,
        step=1,
    )

    # t-SNE
    tsne_checkbox = st.checkbox(
        "t-SNE Visualization",
        value=True,
    )

    tsne_perplexity = st.slider(
        "t-SNE Perplexity",
        min_value=5,
        max_value=50,
        value=10,
        step=1,
    )

    # Column used to color the t-SNE points when emotions are predicted
    tsne_color_emotion = st.selectbox(
        "Emotion For The Plot Color",
        options=['first_emotion', 'second_emotion']
    )

    submit = st.form_submit_button("Analyze")
310
+
311
+
312
# Run the analysis once the form has been submitted.
if submit:
    # Get comments
    try:
        comments = yt_api.get_comments(video_id)
    except (KeyError, AssertionError):
        # KeyError was the original "not found" signal. AssertionError is
        # raised by the bad-request assert in `yt_api`, presumably on the
        # same code path (TODO confirm); without catching it a malformed ID
        # ends in a traceback instead of a message.
        st.write("Video not found.")
    else:
        if comments is None or len(comments) == 0:
            # An empty result has no `published_at` column to sort by.
            st.write("No comments found.")
        else:
            plots = []

            # Convert to pandas DataFrame and sort by publishing date. The
            # index is reset so that positional results (predictions,
            # embeddings) line up with the rows when concatenated.
            df = (pd.DataFrame(comments)
                  .sort_values('published_at')
                  .reset_index(drop=True))

            emotion_cols = []
            if emotions_checkbox:
                # Predict emotions. The emotion columns are whatever the
                # prediction step added, instead of a hard-coded column
                # offset that breaks when the comment schema changes.
                base_cols = set(df.columns)
                df = predict_emotions(df, emotions_clf)
                emotion_cols = [col for col in df.columns
                                if col not in base_cols]

                # Get emotion distribution figure
                emotion_fig = emotion_dist_plot(df, emotion_cols)

                # TODO: Get emotion contribution figure

                # Get top 2 emotions
                df = df.apply(add_top_2_emotions, axis=1)

            if nmf_checkbox:
                # NMF
                df, nmf_figs = nmf_plots(df,
                                         nmf_components,
                                         tfidf_max_features)
                plots.extend(nmf_figs)

            if tsne_checkbox:
                # t-SNE visualization
                df, tsne_figs = tsne_plots(df,
                                           sentence_encoder,
                                           emotion_cols,
                                           tsne_color_emotion,
                                           tsne_perplexity)
                plots.extend(tsne_figs)

            # Show the final DataFrame
            st.dataframe(df)

            # Plot all figures: the emotion distribution at full width, the
            # remaining figures alternating between two columns.
            if emotions_checkbox:
                st.plotly_chart(emotion_fig, use_container_width=True)

            cols = st.columns(2)
            for i, plot in enumerate(plots):
                cols[i % 2].plotly_chart(
                    plot, sharing='streamlit',
                    theme='streamlit',
                    use_container_width=True)
src/main.py DELETED
@@ -1,54 +0,0 @@
1
- from fastapi import FastAPI, Response
2
- from pydantic_settings import BaseSettings, SettingsConfigDict
3
- import pandas as pd
4
-
5
- from src.yt_api import YouTubeAPI
6
- from src.models import init_emotions_model
7
-
8
-
9
- class Settings(BaseSettings):
10
- YT_API_KEY: str
11
- PRED_BATCH_SIZE: int = 512
12
- MAX_COMMENT_SIZE: int = 300
13
- model_config = SettingsConfigDict(env_file='.env')
14
-
15
-
16
- settings = Settings()
17
- app = FastAPI(title='social-stat')
18
-
19
- emotions_clf = init_emotions_model()
20
- yt_api = YouTubeAPI(
21
- api_key=settings.YT_API_KEY,
22
- max_comment_size=settings.MAX_COMMENT_SIZE
23
- )
24
-
25
-
26
- @app.get('/')
27
- def home():
28
- return 'social-stat'
29
-
30
-
31
- @app.get('/predict')
32
- def predict(video_id):
33
- # Get comments
34
- comments = yt_api.get_comments(video_id)
35
- comments_df = pd.DataFrame(comments)
36
-
37
- # Predict emotions in batches
38
- text_list = comments_df['text_display'].to_list()
39
- batch_size = settings.PRED_BATCH_SIZE
40
- text_batches = [text_list[i:i + batch_size]
41
- for i in range(0, len(text_list), batch_size)]
42
- preds = [comment_emotions
43
- for text_batch in text_batches
44
- for comment_emotions in emotions_clf(text_batch)]
45
-
46
- # Add predictions to DataFrame
47
- preds_df = pd.DataFrame([{emotion['label']: emotion['score']
48
- for emotion in pred} for pred in preds])
49
- comments_df = pd.concat([comments_df, preds_df], axis=1)
50
-
51
- # Return DataFrame as a JSON file
52
- return Response(
53
- content=comments_df.to_json(orient='records'),
54
- media_type='application/json')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models.py DELETED
@@ -1,10 +0,0 @@
1
- from transformers import pipeline
2
-
3
-
4
- def init_emotions_model():
5
- classifier = pipeline(
6
- task="text-classification",
7
- model="SamLowe/roberta-base-go_emotions",
8
- top_k=None)
9
-
10
- return classifier
 
 
 
 
 
 
 
 
 
 
 
src/test_main.py DELETED
@@ -1,27 +0,0 @@
1
- from fastapi.testclient import TestClient
2
- from src.main import app
3
- import pandas as pd
4
-
5
-
6
- client = TestClient(app)
7
-
8
-
9
- def test_home():
10
- """Test home page."""
11
- response = client.get("/")
12
- assert response.status_code == 200
13
-
14
-
15
- def test_predict():
16
- """Test predict method on an example video."""
17
- TEST_VIDEO_ID = "0peXnOnDgQ8"
18
- response = client.get(
19
- "/predict/",
20
- params={"video_id": TEST_VIDEO_ID}
21
- )
22
- df = pd.read_json(response, orient='records')
23
-
24
- # Ensure the DataFrame has the right amount of columns
25
- assert df.shape[1] == 39
26
- # Ensure there are no NaN values
27
- assert df.isna().sum().sum() == 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/yt_api.py CHANGED
@@ -34,6 +34,10 @@ class YouTubeAPI():
34
  'pageToken': page_token,
35
  }
36
  response = requests.get(url, params=payload)
 
 
 
 
37
  return response.json()
38
 
39
  def response_to_comments(self, response):
 
34
  'pageToken': page_token,
35
  }
36
  response = requests.get(url, params=payload)
37
+
38
+ # Ensure it's not a bad request
39
+ assert response.status_code != 400
40
+
41
  return response.json()
42
 
43
  def response_to_comments(self, response):
vm_startup.sh DELETED
@@ -1,6 +0,0 @@
1
- # Script for an automatic startup on a virtual machine.
2
- . /home/user/python_venv/social-stat/bin/activate
3
- cd /home/user/social-stat
4
- git pull
5
- pip install -r requirements.txt
6
- uvicorn src.main:app --host 0.0.0.0 --port 8000 > /home/user/log.txt 2>&1