molokhovdmitry committed
Commit 0d1ee8d
Parent: 47d6fb9

Add language map

Files changed (4):
  1. .env.example +2 -1
  2. data/countries.geo.json +0 -0
  3. src/app.py +50 -4
  4. src/maps.py +129 -0
.env.example CHANGED
@@ -1,3 +1,4 @@
 YT_API_KEY=""
 PRED_BATCH_SIZE=512
-MAX_COMMENT_SIZE=300
+MAX_COMMENT_SIZE=300
+LANG_DETECTION_CONF=0.5
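The new LANG_DETECTION_CONF value joins the other settings that src/app.py reads at startup (see the app.py diff below). A minimal sketch of how a .env file like this is typically loaded with python-dotenv; the fallback defaults are illustrative only and not part of the commit:

    import os
    from dotenv import load_dotenv

    # Read KEY=value pairs from a local .env file into the process environment.
    load_dotenv()

    # Variable names match .env.example; the fallback defaults are illustrative.
    YT_API_KEY = os.getenv('YT_API_KEY', '')
    PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE', '512'))
    MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE', '300'))
    LANG_DETECTION_CONF = float(os.getenv('LANG_DETECTION_CONF', '0.5'))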
data/countries.geo.json ADDED
The diff for this file is too large to render.
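Although the GeoJSON itself is too large to show, lang_map() in src/maps.py only relies on the standard FeatureCollection layout, reading each feature's properties.name. A quick sketch for inspecting the country names the map will match against (it assumes the file is read from the repo root, as in maps.py):

    import json

    # Load the world-countries FeatureCollection added in this commit and list
    # the names that lang_map() matches against COUNTRY_TO_LANG_CODE.
    with open('data/countries.geo.json') as f:
        countries = json.load(f)

    names = [feature['properties']['name'] for feature in countries['features']]
    print(len(names), names[:5])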
 
src/app.py CHANGED
@@ -12,6 +12,7 @@ from sklearn.decomposition import NMF
 from sklearn.manifold import TSNE
 
 from yt_api import YouTubeAPI
+from maps import lang_map
 
 
 # Load app settings
@@ -19,6 +20,7 @@ load_dotenv()
 YT_API_KEY = os.getenv('YT_API_KEY')
 MAX_COMMENT_SIZE = int(os.getenv('MAX_COMMENT_SIZE'))
 PRED_BATCH_SIZE = int(os.getenv('PRED_BATCH_SIZE'))
+LANG_DETECTION_CONF = float(os.getenv('LANG_DETECTION_CONF'))
 
 
 @st.cache_resource
@@ -37,6 +39,13 @@ def init_embedding_model():
     return model
 
 
+@st.cache_resource
+def init_lang_model():
+    model_ckpt = "papluca/xlm-roberta-base-language-detection"
+    pipe = pipeline("text-classification", model=model_ckpt)
+    return pipe
+
+
 def predict_emotions(df, clf):
     """
     Predicts emotions for every `text_original` in a DataFrame `df` with a
@@ -60,6 +69,29 @@ def predict_emotions(df, clf):
     return df
 
 
+def detect_languages(df, clf):
+    """
+    Detects languages for every `text_original` in a DataFrame `df` with a
+    classifier `clf`. Takes the language with the highest score.
+    Returns a DataFrame with `predicted_language` column.
+    """
+    # Detect languages in batches
+    text_list = df['text_original'].to_list()
+    batch_size = PRED_BATCH_SIZE
+    text_batches = [text_list[i:i + batch_size]
+                    for i in range(0, len(text_list), batch_size)]
+    preds = [batch_preds[0]['label']
+             if batch_preds[0]['score'] > LANG_DETECTION_CONF
+             else None
+             for text_batch in text_batches
+             for batch_preds in clf(text_batch, top_k=1, truncation=True)]
+
+    # Add predictions to DataFrame
+    df['predicted_language'] = preds
+
+    return df
+
+
 def emotion_dist_plot(df, emotion_cols):
     """
     Creates an emotion distribution plotly figure from `df` DataFrame
@@ -78,8 +110,8 @@ def nmf_plots(df,
               tfidf_stop_words='english'
               ):
     """
-    Converts all `text_original` values of `df` DataFrame to TF-IDF features and
-    performs Non-negative matrix factorization on them.
+    Converts all `text_original` values of `df` DataFrame to TF-IDF features
+    and performs Non-negative matrix factorization on them.
 
     Returns a tuple of the modified DataFrame with NMF values and a list of
     plotly figures (`df`, [plotly figures]).
@@ -242,6 +274,7 @@ st.title("Social-Stat")
 # Load models
 emotions_clf = init_emotions_model()
 sentence_encoder = init_embedding_model()
+lang_model = init_lang_model()
 
 # Init YouTube API
 yt_api = YouTubeAPI(
@@ -306,6 +339,12 @@ with st.form(key='input'):
         options=['first_emotion', 'second_emotion']
     )
 
+    # Language Map
+    map_checkbox = st.checkbox(
+        "Language Map",
+        value=True,
+    )
+
    submit = st.form_submit_button("Analyze")
 
 
@@ -352,16 +391,23 @@ if submit:
                               tsne_perplexity)
         plots.extend(tsne_figs)
 
-    # Show the final DataFrame
-    st.dataframe(df)
+    if map_checkbox:
+        df = detect_languages(df, lang_model)
+        map_figure = lang_map(df)
 
     # Plot all figures
     if emotions_checkbox:
         st.plotly_chart(emotion_fig, use_container_width=True)
 
+    if map_checkbox:
+        st.plotly_chart(map_figure, use_container_width=True)
+
     cols = st.columns(2)
     for i, plot in enumerate(plots):
         cols[i % 2].plotly_chart(
             plot, sharing='streamlit',
             theme='streamlit',
             use_container_width=True)
+
+    # Show the final DataFrame
+    st.dataframe(df)
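The detect_languages helper added above wraps a Hugging Face text-classification pipeline. Below is a standalone sketch of the same thresholding idea outside Streamlit; the model checkpoint and the top_k/truncation arguments come from the diff, while the sample texts and the 0.5 threshold (the .env.example default) are illustrative:

    from transformers import pipeline

    # Language-identification checkpoint used by init_lang_model() in the diff.
    clf = pipeline("text-classification",
                   model="papluca/xlm-roberta-base-language-detection")

    texts = ["What a great video!", "Qué buen video", "Super Video, danke!"]

    # With top_k=1, each input yields a one-element list of {'label', 'score'}.
    preds = []
    for result in clf(texts, top_k=1, truncation=True):
        top = result[0]
        # Keep the label only when the model is confident enough.
        preds.append(top['label'] if top['score'] > 0.5 else None)

    print(preds)  # e.g. ['en', 'es', 'de']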
src/maps.py ADDED
@@ -0,0 +1,129 @@
+import json
+import pandas as pd
+import plotly.express as px
+
+# Language codes predicted by language detection model
+LANG_CODES = ['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
+              'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh']
+
+COUNTRY_TO_LANG_CODE = {
+    'Algeria': 'ar',
+    'Chad': 'ar',
+    'Djibouti': 'ar',
+    'Egypt': 'ar',
+    'Iraq': 'ar',
+    'Jordan': 'ar',
+    'Kuwait': 'ar',
+    'Lebanon': 'ar',
+    'Libya': 'ar',
+    'Mali': 'ar',
+    'Mauritania': 'ar',
+    'Morocco': 'ar',
+    'Oman': 'ar',
+    'Palestine': 'ar',
+    'Qatar': 'ar',
+    'Saudi Arabia': 'ar',
+    'Somalia': 'ar',
+    'Sudan': 'ar',
+    'Syria': 'ar',
+    'Tunisia': 'ar',
+    'United Arab Emirates': 'ar',
+    'Yemen': 'ar',
+    'Bulgaria': 'bg',
+    'Germany': 'de',
+    'Greece': 'el',
+    'Cyprus': 'el',
+    'United States of America': 'en',
+    'Ireland': 'en',
+    'United Kingdom': 'en',
+    'Canada': 'en',
+    'Australia': 'en',
+    'Mexico': 'es',
+    'Mexico': 'es',
+    'Colombia': 'es',
+    'Spain': 'es',
+    'Argentina': 'es',
+    'Peru': 'es',
+    'Venezuela': 'es',
+    'Chile': 'es',
+    'Guatemala': 'es',
+    'Ecuador': 'es',
+    'Bolivia': 'es',
+    'Cuba': 'es',
+    'Dominican Rep.': 'es',
+    'Honduras': 'es',
+    'Paraguay': 'es',
+    'El Salvador': 'es',
+    'Nicaragua': 'es',
+    'Costa Rica': 'es',
+    'Panama': 'es',
+    'Uruguay': 'es',
+    'Guinea': 'es',
+    'France': 'fr',
+    'India': 'hi',
+    'Italy': 'it',
+    'Japan': 'ja',
+    'Netherlands': 'nl',
+    'Belgium': 'nl',
+    'Poland': 'pl',
+    'Portugal': 'pt',
+    'Russia': 'ru',
+    'Uganda': 'sw',
+    'Kenya': 'sw',
+    'Tanzania': 'sw',
+    'Thailand': 'th',
+    'Turkey': 'tr',
+    'Pakistan': 'ur',
+    'Vietnam': 'vi',
+    'China': 'zh'
+}
+
+
+def lang_map(df):
+    with open('data/countries.geo.json') as f:
+        countries = json.load(f)
+    country_list = [country['properties']['name']
+                    for country in dict(countries)['features']]
+    LANG_CODES = df.value_counts('predicted_language')
+
+    countries_data = []
+    lang_count_data = []
+    lang_code_data = []
+    for country in country_list:
+        if country in COUNTRY_TO_LANG_CODE:
+            country_lang = COUNTRY_TO_LANG_CODE[country]
+            if country_lang in LANG_CODES.index:
+                countries_data.append(country)
+                lang_count = LANG_CODES.loc[COUNTRY_TO_LANG_CODE[country]]
+                lang_count_data.append(lang_count)
+                lang_code_data.append(country_lang)
+    lang_df = pd.DataFrame({
+        'country': countries_data,
+        'count': lang_count_data,
+        'lang_code': lang_code_data
+    })
+
+    fig = px.choropleth(
+        lang_df,
+        geojson=countries,
+        locations='country',
+        locationmode='country names',
+        color='count',
+        color_continuous_scale=[
+            [0, "rgb(45,45,48)"],
+            [0.33, "rgb(116,173,209)"],
+            [0.66, "rgb(255,255,0)"],
+            [1, "rgb(255,94,5)"]
+        ],
+        scope='world',
+        hover_data=['lang_code'],
+        labels={'count': "Language Count"},
+        template='plotly_dark'
+    )
+    fig.update_geos(showcountries=True)
+    fig.update_layout(
+        title_text="Language Map",
+        margin={"r": 0, "t": 20, "l": 0, "b": 0}
+    )
+
+    return fig
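As a quick check of the new module, lang_map() only needs a DataFrame with a predicted_language column and must be run from the repo root so data/countries.geo.json resolves. A sketch with made-up predictions such as detect_languages() would produce:

    import pandas as pd
    from maps import lang_map

    # Fake language predictions; real values come from detect_languages().
    df = pd.DataFrame({
        'text_original': ['great video', 'qué buen video', 'super Video'],
        'predicted_language': ['en', 'es', 'de'],
    })

    fig = lang_map(df)  # plotly Figure; comment counts aggregated per language code
    fig.show()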