joshdavham committed on
Commit
360a122
·
1 Parent(s): 391b919

modify comments

Browse files
Files changed (1) hide show
  1. app.py +99 -307
app.py CHANGED
@@ -9,16 +9,6 @@ st.set_page_config(
9
  page_icon='favicon.svg'
10
  )
11
 
12
-
13
- #st.markdown("""
14
- #<link href="https://fonts.googleapis.com/css2?family=Urbanist:wght@400;700&display=swap" rel="stylesheet">
15
- #<style>
16
- # .vega-embed * {
17
- # font-family: 'Urbanist', sans-serif;
18
- # }
19
- #</style>
20
- #""", unsafe_allow_html=True)
21
-
22
  @st.cache_data
23
  def load_dataframes():
24
 
@@ -49,20 +39,18 @@ st.markdown("To answer this question, I'll be analyzing the videos on \
49
  [cijapanese.com](https://cijapanese.com/) (CIJ), a \
50
  video platform for learning Japanese.")
51
 
52
- # Plot the WPM histogram
53
-
 
54
  st.markdown("## How fast is CI?")
55
 
56
  st.markdown("If we measure how fast the teachers speak on CIJ, we find that \
57
  they speak more slowly in videos meant for beginners and more quickly \
58
  for advanced learners.")
59
 
60
- #st.markdown("### Rate of speech in words per minute (WPM)")
61
-
62
  @st.cache_data
63
  def get_wpm_chart(show_medians=False):
64
 
65
- # Data for vertical lines corresponding to each level
66
  line_data = pd.DataFrame({
67
  'x': [75, 91, 124, 149],
68
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -88,10 +76,8 @@ def get_wpm_chart(show_medians=False):
88
  axis=alt.Axis(
89
  labelFontSize=14,
90
  titleFontSize=18,
91
- #titleFont='Urbanist',
92
  titleColor='black',
93
  titleFontWeight='normal',
94
- #titleFontStyle='italic',
95
  titlePadding=20
96
  )
97
  ),
@@ -101,10 +87,8 @@ def get_wpm_chart(show_medians=False):
101
  axis=alt.Axis(
102
  labelFontSize=14,
103
  titleFontSize=18,
104
- #titleFont='Urbanist',
105
  titleColor='black',
106
  titleFontWeight='normal',
107
- #titleFontStyle='italic',
108
  titlePadding=20,
109
  tickCount=5
110
  ),
@@ -116,11 +100,9 @@ def get_wpm_chart(show_medians=False):
116
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
117
  legend=alt.Legend(
118
  title='CIJ Level',
119
- #titleFont='Urbanist',
120
  titleFontSize=18,
121
  titleFontWeight='bolder',
122
  labelFontSize=16,
123
- #labelFont='Urbanist',
124
  symbolType='circle',
125
  symbolSize=200,
126
  symbolStrokeWidth=0,
@@ -132,24 +114,17 @@ def get_wpm_chart(show_medians=False):
132
  )
133
  ),
134
  tooltip=[
135
- alt.Tooltip('wpm:Q', title='Words per minute:', bin=True), # Properly indicate that `wpm` is binned
136
  alt.Tooltip('level:N', title='Level:'),
137
  alt.Tooltip('count()', title='Video count:')
138
  ],
139
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
140
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
141
  ).properties(
142
- #width=750,
143
- #width='container',
144
- #height='container',
145
  height=500,
146
- #background='beige',
147
- #padding=50,
148
  title=alt.TitleParams(
149
  text='Rate of speech in words per minute (WPM)',
150
  offset=20,
151
- #subtitle='(clickable)',
152
- #font='Urbanist',
153
  fontSize=24,
154
  fontWeight='normal',
155
  anchor='middle',
@@ -162,25 +137,23 @@ def get_wpm_chart(show_medians=False):
162
  highlight
163
  )
164
 
165
- # Vertical lines corresponding to each level
166
  vertical_lines = alt.Chart(line_data).mark_rule(
167
  color='red',
168
  strokeWidth=6,
169
- strokeDash = [10, 2], # first arg is length, second is gap
170
  ).encode(
171
  x='x:Q',
172
  tooltip=[
173
  alt.Tooltip('x:N', title='Median WPM:'),
174
  alt.Tooltip('level:N', title='Level:')
175
  ],
176
- #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
177
  color=alt.Color(
178
  'level:N',
179
- scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
180
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
181
- legend=None # No legend for lines, it is already shown in the histogram
182
  ),
183
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
184
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
185
  ).add_params(
186
  selection,
@@ -188,22 +161,22 @@ def get_wpm_chart(show_medians=False):
188
  )
189
 
190
  text_labels = alt.Chart(line_data).mark_text(
191
- align='center', # Align text to the left of the line
192
- dx=0, # Offset the text to the right by 5 pixels
193
- dy=-10, # Adjust vertical positioning
194
  fontSize=16,
195
  fontWeight='bold'
196
  ).encode(
197
  x='x:Q',
198
- y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
199
- text=alt.Text('x:Q', format='.0f'), # Display the x value, formatted as an integer
200
  color=alt.Color(
201
  'level:N',
202
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
203
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
204
  legend=None
205
  ),
206
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
207
  )
208
 
209
 
@@ -229,8 +202,6 @@ st.markdown("To put this data into perspective, native Japanese speakers \
229
  tend to speak at rates of over 200 wpm, meaning that most of the videos \
230
  on CIJ have been adapted to be a lot slower than that!")
231
 
232
- # wpm vs sps chart
233
-
234
  @st.cache_data
235
  def get_wpm_vs_sps_chart(interactive=False):
236
 
@@ -238,7 +209,6 @@ def get_wpm_vs_sps_chart(interactive=False):
238
 
239
  highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)
240
 
241
- # Create the scatter plot
242
  scatter_plot = alt.Chart(video_df).mark_circle(
243
  cursor='pointer',
244
  size=80,
@@ -250,10 +220,8 @@ def get_wpm_vs_sps_chart(interactive=False):
250
  axis=alt.Axis(
251
  labelFontSize=14,
252
  titleFontSize=18,
253
- #titleFont='Urbanist',
254
  titleColor='black',
255
  titleFontWeight='normal',
256
- #titleFontStyle='italic',
257
  titlePadding=20
258
  )
259
  ),
@@ -263,12 +231,9 @@ def get_wpm_vs_sps_chart(interactive=False):
263
  axis=alt.Axis(
264
  labelFontSize=14,
265
  titleFontSize=18,
266
- #titleFont='Urbanist',
267
  titleColor='black',
268
  titleFontWeight='normal',
269
- #titleFontStyle='italic',
270
  titlePadding=20,
271
- #tickCount=5
272
  ),
273
  ),
274
  color=alt.Color(
@@ -282,10 +247,8 @@ def get_wpm_vs_sps_chart(interactive=False):
282
  labelFontSize=16,
283
  symbolType='circle',
284
  symbolSize=200,
285
- #symbolStrokeWidth=3,
286
  orient='right',
287
  direction='vertical',
288
- #fillColor='black',
289
  padding=10,
290
  cornerRadius=5,
291
  )
@@ -298,15 +261,12 @@ def get_wpm_vs_sps_chart(interactive=False):
298
 
299
  ],
300
  opacity=alt.condition(selection, alt.value(1.0), alt.value(0.2)),
301
- #strokeWidth=alt.condition(selection | highlight, alt.value(6), alt.value(2))
302
  ).properties(
303
  width='container',
304
  height=500,
305
  title=alt.TitleParams(
306
  text='Rate of speech: Syllables per second vs. words per minute',
307
  offset=20,
308
- #subtitle='(clickable)',
309
- #font='Urbanist',
310
  fontSize=24,
311
  fontWeight='normal',
312
  anchor='middle',
@@ -321,7 +281,6 @@ def get_wpm_vs_sps_chart(interactive=False):
321
  background='white'
322
  )
323
 
324
- # Display the plot
325
  if interactive:
326
  return scatter_plot.interactive()
327
  else:
@@ -340,6 +299,9 @@ st.markdown("We can also measure the rate of speech in syllables per second (SPS
340
  st.markdown("(Also, FYI, most of these **graphs are \
341
  interactive** so please click around.)")
342
 
 
 
 
343
  st.markdown("## A quick statistics lesson")
344
 
345
  st.markdown("Before we continue this analysis, there's some basic things you should know.")
@@ -381,7 +343,6 @@ st.markdown("Videos meant for beginners tend to have shorter sentences on averag
381
  @st.cache_data
382
  def get_sentence_length_hist(show_medians=False):
383
 
384
- # Data for vertical lines corresponding to each level
385
  line_data = pd.DataFrame({
386
  'x': [7.60, 10.45, 16.17, 19.39],
387
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -407,10 +368,8 @@ def get_sentence_length_hist(show_medians=False):
407
  axis=alt.Axis(
408
  labelFontSize=14,
409
  titleFontSize=18,
410
- #titleFont='Urbanist',
411
  titleColor='black',
412
  titleFontWeight='normal',
413
- #titleFontStyle='italic',
414
  titlePadding=20
415
  )
416
  ),
@@ -420,10 +379,8 @@ def get_sentence_length_hist(show_medians=False):
420
  axis=alt.Axis(
421
  labelFontSize=14,
422
  titleFontSize=18,
423
- #titleFont='Urbanist',
424
  titleColor='black',
425
  titleFontWeight='normal',
426
- #titleFontStyle='italic',
427
  titlePadding=20,
428
  tickCount=5
429
  ),
@@ -435,11 +392,9 @@ def get_sentence_length_hist(show_medians=False):
435
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
436
  legend=alt.Legend(
437
  title='CIJ Level',
438
- #titleFont='Urbanist',
439
  titleFontSize=18,
440
  titleFontWeight='bolder',
441
  labelFontSize=16,
442
- #labelFont='Urbanist',
443
  symbolType='circle',
444
  symbolSize=200,
445
  symbolStrokeWidth=0,
@@ -451,24 +406,18 @@ def get_sentence_length_hist(show_medians=False):
451
  )
452
  ),
453
  tooltip=[
454
- alt.Tooltip('mean_sentence_length:Q', title='Average sentence length:', bin=True), # Properly indicate that `wpm` is binned
455
  alt.Tooltip('level:N', title='Level:'),
456
  alt.Tooltip('count()', title='Video count:')
457
  ],
458
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
459
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
460
  ).properties(
461
- #width=750,
462
  width='container',
463
- #height='container',
464
  height=500,
465
- #background='beige',
466
- #padding=50,
467
  title=alt.TitleParams(
468
  text='Average number of words per sentence (sentence length)',
469
  offset=20,
470
- #subtitle='(clickable)',
471
- #font='Urbanist',
472
  fontSize=24,
473
  fontWeight='normal',
474
  anchor='middle',
@@ -481,25 +430,23 @@ def get_sentence_length_hist(show_medians=False):
481
  highlight
482
  )
483
 
484
- # Vertical lines corresponding to each level
485
  vertical_lines = alt.Chart(line_data).mark_rule(
486
  color='red',
487
  strokeWidth=6,
488
- strokeDash = [10, 2], # first arg is length, second is gap
489
  ).encode(
490
  x='x:Q',
491
  tooltip=[
492
  alt.Tooltip('x:N', title='Median average sentence length:'),
493
  alt.Tooltip('level:N', title='Level:')
494
  ],
495
- #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
496
  color=alt.Color(
497
  'level:N',
498
- scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
499
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
500
- legend=None # No legend for lines, it is already shown in the histogram
501
  ),
502
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
503
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
504
  ).add_params(
505
  selection,
@@ -507,22 +454,22 @@ def get_sentence_length_hist(show_medians=False):
507
  )
508
 
509
  text_labels = alt.Chart(line_data).mark_text(
510
- align='center', # Align text to the left of the line
511
- dx=0, # Offset the text to the right by 5 pixels
512
- dy=-10, # Adjust vertical positioning
513
  fontSize=16,
514
  fontWeight='bold'
515
  ).encode(
516
  x='x:Q',
517
- y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
518
- text=alt.Text('x:Q', format='.2f'), # Display the x value, formatted as an integer
519
  color=alt.Color(
520
  'level:N',
521
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
522
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
523
  legend=None
524
  ),
525
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
526
  )
527
 
528
  if show_medians:
@@ -560,14 +507,8 @@ def get_repetition_hist(show_medians=False):
560
 
561
  video_df['average_rel_reps_perc'] = 100.0 * video_df['average_rel_reps']
562
 
563
- #if show_medians:
564
- # sub_video_df = video_df[video_df['average_rel_reps_perc'] <= 2.0]
565
- #else:
566
- # sub_video_df = video_df
567
- # take the sub data frame for easier viewing
568
  sub_video_df = video_df[video_df['average_rel_reps_perc'] <= 2.0]
569
 
570
- # Data for vertical lines corresponding to each level
571
  line_data = pd.DataFrame({
572
  'x': [0.99, 0.62, 0.37, 0.23],
573
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -593,12 +534,9 @@ def get_repetition_hist(show_medians=False):
593
  axis=alt.Axis(
594
  labelFontSize=14,
595
  titleFontSize=18,
596
- #titleFont='Urbanist',
597
  titleColor='black',
598
  titleFontWeight='normal',
599
- #titleFontStyle='italic',
600
  titlePadding=20,
601
- #format='.1f%'
602
  ),
603
  ),
604
  alt.Y(
@@ -607,10 +545,8 @@ def get_repetition_hist(show_medians=False):
607
  axis=alt.Axis(
608
  labelFontSize=14,
609
  titleFontSize=18,
610
- #titleFont='Urbanist',
611
  titleColor='black',
612
  titleFontWeight='normal',
613
- #titleFontStyle='italic',
614
  titlePadding=20,
615
  tickCount=5
616
  ),
@@ -622,11 +558,9 @@ def get_repetition_hist(show_medians=False):
622
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
623
  legend=alt.Legend(
624
  title='CIJ Level',
625
- #titleFont='Urbanist',
626
  titleFontSize=18,
627
  titleFontWeight='bolder',
628
  labelFontSize=16,
629
- #labelFont='Urbanist',
630
  symbolType='circle',
631
  symbolSize=200,
632
  symbolStrokeWidth=0,
@@ -638,24 +572,18 @@ def get_repetition_hist(show_medians=False):
638
  )
639
  ),
640
  tooltip=[
641
- alt.Tooltip('average_rel_reps:Q', title='Average relative repetitions:', bin=True), # Properly indicate that `wpm` is binned
642
  alt.Tooltip('level:N', title='Level:'),
643
  alt.Tooltip('count()', title='Video count:')
644
  ],
645
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
646
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
647
  ).properties(
648
- #width=750,
649
  width='container',
650
- #height='container',
651
  height=500,
652
- #background='beige',
653
- #padding=50,
654
  title=alt.TitleParams(
655
  text='Relative repetitions of words',
656
  offset=20,
657
- #subtitle='(clickable)',
658
- #font='Urbanist',
659
  fontSize=24,
660
  fontWeight='normal',
661
  anchor='middle',
@@ -668,11 +596,10 @@ def get_repetition_hist(show_medians=False):
668
  highlight
669
  )
670
 
671
- # Vertical lines corresponding to each level
672
  vertical_lines = alt.Chart(line_data).mark_rule(
673
  color='red',
674
  strokeWidth=6,
675
- strokeDash = [10, 2], # first arg is length, second is gap
676
  ).encode(
677
  alt.X(
678
  'x:Q'
@@ -681,14 +608,13 @@ def get_repetition_hist(show_medians=False):
681
  alt.Tooltip('x:N', title='Median average relative repetitions:'),
682
  alt.Tooltip('level:N', title='Level:')
683
  ],
684
- #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
685
  color=alt.Color(
686
  'level:N',
687
- scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
688
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
689
- legend=None # No legend for lines, it is already shown in the histogram
690
  ),
691
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
692
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1)),
693
  ).add_params(
694
  selection,
@@ -696,24 +622,24 @@ def get_repetition_hist(show_medians=False):
696
  )
697
 
698
  text_labels = alt.Chart(line_data).mark_text(
699
- align='center', # Align text to the left of the line
700
- dx=0, # Offset the text to the right by 5 pixels
701
- dy=-10, # Adjust vertical positioning
702
  fontSize=16,
703
  fontWeight='bold'
704
  ).encode(
705
  alt.X(
706
  'x:Q'
707
  ),
708
- y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
709
- text=alt.Text('x:Q', format='.2f'), # Display the x value, formatted as an integer
710
  color=alt.Color(
711
  'level:N',
712
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
713
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
714
  legend=None
715
  ),
716
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
717
  )
718
 
719
  if show_medians:
@@ -756,8 +682,6 @@ st.markdown("If we take all the words in CIJ, count them then order them from mo
756
  For example, if we learn the top 500 words from CIJ, then we'll know around 80% of the words in the \
757
  Complete Beginner videos. And if we learn the top 4,295 words, then we'll know 98% of the words in that category.")
758
 
759
- # word coverage chart
760
-
761
  @st.cache_data
762
  def get_word_coverage_chart(zoom=False):
763
 
@@ -766,7 +690,6 @@ def get_word_coverage_chart(zoom=False):
766
  else:
767
  word_coverage_df_sub = word_coverage_df
768
 
769
- # Data for vertical lines corresponding to each level
770
  line_data = pd.DataFrame({
771
  'x': [4295, 5606, 6853, 9085],
772
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -788,10 +711,8 @@ def get_word_coverage_chart(zoom=False):
788
  axis=alt.Axis(
789
  labelFontSize=14,
790
  titleFontSize=18,
791
- #titleFont='Urbanist',
792
  titleColor='black',
793
  titleFontWeight='normal',
794
- #titleFontStyle='italic',
795
  titlePadding=20
796
  )
797
  ),
@@ -802,10 +723,8 @@ def get_word_coverage_chart(zoom=False):
802
  axis=alt.Axis(
803
  labelFontSize=14,
804
  titleFontSize=18,
805
- #titleFont='Urbanist',
806
  titleColor='black',
807
  titleFontWeight='normal',
808
- #titleFontStyle='italic',
809
  titlePadding=20,
810
  tickCount=5
811
  ),
@@ -821,10 +740,8 @@ def get_word_coverage_chart(zoom=False):
821
  labelFontSize=16,
822
  symbolType='circle',
823
  symbolSize=200,
824
- #symbolStrokeWidth=3,
825
  orient='right',
826
  direction='vertical',
827
- #fillColor='black',
828
  padding=10,
829
  cornerRadius=5,
830
  )
@@ -843,8 +760,6 @@ def get_word_coverage_chart(zoom=False):
843
  title=alt.TitleParams(
844
  text='Word coverage curves',
845
  offset=20,
846
- #subtitle='(clickable)',
847
- #font='Urbanist',
848
  fontSize=24,
849
  fontWeight='normal',
850
  anchor='middle',
@@ -857,48 +772,46 @@ def get_word_coverage_chart(zoom=False):
857
  highlight
858
  )
859
 
860
- # Vertical lines corresponding to each level
861
  vertical_lines = alt.Chart(line_data).mark_rule(
862
  color='red',
863
  strokeWidth=4,
864
- strokeDash = [10, 2], # first arg is length, second is gap
865
  ).encode(
866
  x='x:Q',
867
  tooltip=[
868
  alt.Tooltip('x:N', title='Words needed to reach 98%:'),
869
  alt.Tooltip('level:N', title='Level:')
870
  ],
871
- #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
872
  color=alt.Color(
873
  'level:N',
874
- scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
875
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
876
- legend=None # No legend for lines, it is already shown in the histogram
877
  ),
878
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
879
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
880
  ).add_params(
881
  selection,
882
  highlight
883
- )#.interactive()
884
 
885
  text_labels = alt.Chart(line_data).mark_text(
886
- align='center', # Align text to the left of the line
887
- dx=0, # Offset the text to the right by 5 pixels
888
- dy=-10, # Adjust vertical positioning
889
  fontSize=16,
890
  fontWeight='bold'
891
  ).encode(
892
  x='x:Q',
893
- y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
894
- text=alt.Text('x:Q', format='.0f'), # Display the x value, formatted as an integer
895
  color=alt.Color(
896
  'level:N',
897
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
898
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
899
  legend=None
900
  ),
901
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
902
  )
903
 
904
  layered_chart = alt.layer(line_chart, vertical_lines, text_labels, background='white')
@@ -922,7 +835,6 @@ st.markdown("Using the same method of calculating word coverage as before, \
922
  @st.cache_data
923
  def get_ne_spot_hist(show_medians=False):
924
 
925
- # Data for vertical lines corresponding to each level
926
  line_data = pd.DataFrame({
927
  'x': [3859, 5229, 6698, 7925],
928
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -948,12 +860,9 @@ def get_ne_spot_hist(show_medians=False):
948
  axis=alt.Axis(
949
  labelFontSize=14,
950
  titleFontSize=18,
951
- #titleFont='Urbanist',
952
  titleColor='black',
953
  titleFontWeight='normal',
954
- #titleFontStyle='italic',
955
  titlePadding=20,
956
- #format='.1f%'
957
  )
958
  ),
959
  alt.Y(
@@ -962,10 +871,8 @@ def get_ne_spot_hist(show_medians=False):
962
  axis=alt.Axis(
963
  labelFontSize=14,
964
  titleFontSize=18,
965
- #titleFont='Urbanist',
966
  titleColor='black',
967
  titleFontWeight='normal',
968
- #titleFontStyle='italic',
969
  titlePadding=20,
970
  tickCount=5
971
  ),
@@ -977,11 +884,9 @@ def get_ne_spot_hist(show_medians=False):
977
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
978
  legend=alt.Legend(
979
  title='CIJ Level',
980
- #titleFont='Urbanist',
981
  titleFontSize=18,
982
  titleFontWeight='bolder',
983
  labelFontSize=16,
984
- #labelFont='Urbanist',
985
  symbolType='circle',
986
  symbolSize=200,
987
  symbolStrokeWidth=0,
@@ -993,24 +898,18 @@ def get_ne_spot_hist(show_medians=False):
993
  )
994
  ),
995
  tooltip=[
996
- alt.Tooltip('ne_spot:Q', title='Vocab size needed for 98% cov:', bin=True), # Properly indicate that `wpm` is binned
997
  alt.Tooltip('level:N', title='Level:'),
998
  alt.Tooltip('count()', title='Video count:')
999
  ],
1000
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
1001
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
1002
  ).properties(
1003
- #width=750,
1004
  width='container',
1005
- #height='container',
1006
  height=500,
1007
- #background='beige',
1008
- #padding=50,
1009
  title=alt.TitleParams(
1010
  text='Vocab size needed for 98% coverage',
1011
  offset=20,
1012
- #subtitle='(clickable)',
1013
- #font='Urbanist',
1014
  fontSize=24,
1015
  fontWeight='normal',
1016
  anchor='middle',
@@ -1023,25 +922,23 @@ def get_ne_spot_hist(show_medians=False):
1023
  highlight
1024
  )
1025
 
1026
- # Vertical lines corresponding to each level
1027
  vertical_lines = alt.Chart(line_data).mark_rule(
1028
  color='red',
1029
  strokeWidth=6,
1030
- strokeDash = [10, 2], # first arg is length, second is gap
1031
  ).encode(
1032
  x='x:Q',
1033
  tooltip=[
1034
  alt.Tooltip('x:N', title='Median vocab size needed for 98% cov:'),
1035
  alt.Tooltip('level:N', title='Level:')
1036
  ],
1037
- #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
1038
  color=alt.Color(
1039
  'level:N',
1040
- scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
1041
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1042
- legend=None # No legend for lines, it is already shown in the histogram
1043
  ),
1044
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
1045
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
1046
  ).add_params(
1047
  selection,
@@ -1049,22 +946,22 @@ def get_ne_spot_hist(show_medians=False):
1049
  )
1050
 
1051
  text_labels = alt.Chart(line_data).mark_text(
1052
- align='center', # Align text to the left of the line
1053
- dx=0, # Offset the text to the right by 5 pixels
1054
- dy=-10, # Adjust vertical positioning
1055
  fontSize=16,
1056
  fontWeight='bold'
1057
  ).encode(
1058
  x='x:Q',
1059
- y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
1060
- text=alt.Text('x:Q', format='.0f'), # Display the x value, formatted as an integer
1061
  color=alt.Color(
1062
  'level:N',
1063
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
1064
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1065
  legend=None
1066
  ),
1067
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
1068
  )
1069
 
1070
 
@@ -1097,7 +994,6 @@ st.markdown("More advanced videos tend to use rare/uncommon words more often tha
1097
  @st.cache_data
1098
  def get_tfplr_hist(show_medians=False):
1099
 
1100
- # Data for vertical lines corresponding to each level
1101
  line_data = pd.DataFrame({
1102
  'x': [3.82, 4.30, 4.76, 5.21],
1103
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -1123,12 +1019,9 @@ def get_tfplr_hist(show_medians=False):
1123
  axis=alt.Axis(
1124
  labelFontSize=14,
1125
  titleFontSize=18,
1126
- #titleFont='Urbanist',
1127
  titleColor='black',
1128
  titleFontWeight='normal',
1129
- #titleFontStyle='italic',
1130
  titlePadding=30,
1131
- #format='.1f%'
1132
  )
1133
  ),
1134
  alt.Y(
@@ -1137,10 +1030,8 @@ def get_tfplr_hist(show_medians=False):
1137
  axis=alt.Axis(
1138
  labelFontSize=14,
1139
  titleFontSize=18,
1140
- #titleFont='Urbanist',
1141
  titleColor='black',
1142
  titleFontWeight='normal',
1143
- #titleFontStyle='italic',
1144
  titlePadding=20,
1145
  tickCount=5
1146
  ),
@@ -1152,11 +1043,9 @@ def get_tfplr_hist(show_medians=False):
1152
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1153
  legend=alt.Legend(
1154
  title='CIJ Level',
1155
- #titleFont='Urbanist',
1156
  titleFontSize=18,
1157
  titleFontWeight='bolder',
1158
  labelFontSize=16,
1159
- #labelFont='Urbanist',
1160
  symbolType='circle',
1161
  symbolSize=200,
1162
  symbolStrokeWidth=0,
@@ -1175,17 +1064,11 @@ def get_tfplr_hist(show_medians=False):
1175
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
1176
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
1177
  ).properties(
1178
- #width=750,
1179
  width='container',
1180
- #height='container',
1181
  height=500,
1182
- #background='beige',
1183
- #padding=50,
1184
  title=alt.TitleParams(
1185
  text='25th percentile word-frequency log ranks',
1186
  offset=20,
1187
- #subtitle='(clickable)',
1188
- #font='Urbanist',
1189
  fontSize=24,
1190
  fontWeight='normal',
1191
  anchor='middle',
@@ -1198,25 +1081,23 @@ def get_tfplr_hist(show_medians=False):
1198
  highlight
1199
  )
1200
 
1201
- # Vertical lines corresponding to each level
1202
  vertical_lines = alt.Chart(line_data).mark_rule(
1203
  color='red',
1204
  strokeWidth=6,
1205
- strokeDash = [10, 2], # first arg is length, second is gap
1206
  ).encode(
1207
  x='x:Q',
1208
  tooltip=[
1209
  alt.Tooltip('x:N', title='Median 25th percentile word-frequency log rank:'),
1210
  alt.Tooltip('level:N', title='Level:')
1211
  ],
1212
- #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
1213
  color=alt.Color(
1214
  'level:N',
1215
- scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
1216
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1217
- legend=None # No legend for lines, it is already shown in the histogram
1218
  ),
1219
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
1220
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
1221
  ).add_params(
1222
  selection,
@@ -1224,25 +1105,24 @@ def get_tfplr_hist(show_medians=False):
1224
  )
1225
 
1226
  text_labels = alt.Chart(line_data).mark_text(
1227
- align='center', # Align text to the left of the line
1228
- dx=0, # Offset the text to the right by 5 pixels
1229
- dy=-10, # Adjust vertical positioning
1230
  fontSize=16,
1231
  fontWeight='bold'
1232
  ).encode(
1233
  x='x:Q',
1234
- y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
1235
- text=alt.Text('x:Q', format='.2f'), # Display the x value, formatted as an integer
1236
  color=alt.Color(
1237
  'level:N',
1238
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
1239
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1240
  legend=None
1241
  ),
1242
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
1243
  )
1244
 
1245
- #layered_chart = alt.layer(histogram, background='white')
1246
  if show_medians:
1247
  layered_chart = alt.layer(histogram, vertical_lines, text_labels, background='white')
1248
  else:
@@ -1274,8 +1154,6 @@ st.markdown("(It's okay if the above didn't quite make sense to you - just know
1274
  demonstrates that easier videos tend to use more common words whereas \
1275
  advanced videos tend to use more rare words!)")
1276
 
1277
- # grammar table
1278
-
1279
  ###
1280
  # GRAMMAR
1281
  ###
@@ -1288,7 +1166,6 @@ def get_sconj_hist(show_medians=False):
1288
 
1289
  video_df['sconj_props_perc'] = 100.0 * video_df['sconj_props']
1290
 
1291
- # Data for vertical lines corresponding to each level
1292
  line_data = pd.DataFrame({
1293
  'x': [2.64, 4.73, 6.63, 7.67],
1294
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -1314,12 +1191,9 @@ def get_sconj_hist(show_medians=False):
1314
  axis=alt.Axis(
1315
  labelFontSize=14,
1316
  titleFontSize=18,
1317
- #titleFont='Urbanist',
1318
  titleColor='black',
1319
  titleFontWeight='normal',
1320
- #titleFontStyle='italic',
1321
  titlePadding=30,
1322
- #format='.1f%'
1323
  )
1324
  ),
1325
  alt.Y(
@@ -1328,10 +1202,8 @@ def get_sconj_hist(show_medians=False):
1328
  axis=alt.Axis(
1329
  labelFontSize=14,
1330
  titleFontSize=18,
1331
- #titleFont='Urbanist',
1332
  titleColor='black',
1333
  titleFontWeight='normal',
1334
- #titleFontStyle='italic',
1335
  titlePadding=20,
1336
  tickCount=5
1337
  ),
@@ -1343,11 +1215,9 @@ def get_sconj_hist(show_medians=False):
1343
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1344
  legend=alt.Legend(
1345
  title='CIJ Level',
1346
- #titleFont='Urbanist',
1347
  titleFontSize=18,
1348
  titleFontWeight='bolder',
1349
  labelFontSize=16,
1350
- #labelFont='Urbanist',
1351
  symbolType='circle',
1352
  symbolSize=200,
1353
  symbolStrokeWidth=0,
@@ -1359,24 +1229,18 @@ def get_sconj_hist(show_medians=False):
1359
  )
1360
  ),
1361
  tooltip=[
1362
- alt.Tooltip('sconj_props_perc:Q', title='Percentage of subordinating conjunctions:', bin=True), # Properly indicate that `wpm` is binned
1363
  alt.Tooltip('level:N', title='Level:'),
1364
  alt.Tooltip('count()', title='Video count:')
1365
  ],
1366
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
1367
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
1368
  ).properties(
1369
- #width=750,
1370
  width='container',
1371
- #height='container',
1372
  height=500,
1373
- #background='beige',
1374
- #padding=50,
1375
  title=alt.TitleParams(
1376
  text='Percentages of subordinating conjunctions',
1377
  offset=20,
1378
- #subtitle='(clickable)',
1379
- #font='Urbanist',
1380
  fontSize=24,
1381
  fontWeight='normal',
1382
  anchor='middle',
@@ -1389,25 +1253,23 @@ def get_sconj_hist(show_medians=False):
1389
  highlight
1390
  )
1391
 
1392
- # Vertical lines corresponding to each level
1393
  vertical_lines = alt.Chart(line_data).mark_rule(
1394
  color='red',
1395
  strokeWidth=6,
1396
- strokeDash = [10, 2], # first arg is length, second is gap
1397
  ).encode(
1398
  x='x:Q',
1399
  tooltip=[
1400
  alt.Tooltip('x:N', title='Median percentage of subordinating conjunctions:'),
1401
  alt.Tooltip('level:N', title='Level:')
1402
  ],
1403
- #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
1404
  color=alt.Color(
1405
  'level:N',
1406
- scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
1407
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1408
- legend=None # No legend for lines, it is already shown in the histogram
1409
  ),
1410
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
1411
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
1412
  ).add_params(
1413
  selection,
@@ -1415,22 +1277,22 @@ def get_sconj_hist(show_medians=False):
1415
  )
1416
 
1417
  text_labels = alt.Chart(line_data).mark_text(
1418
- align='center', # Align text to the left of the line
1419
- dx=0, # Offset the text to the right by 5 pixels
1420
- dy=-10, # Adjust vertical positioning
1421
  fontSize=16,
1422
  fontWeight='bold'
1423
  ).encode(
1424
  x='x:Q',
1425
- y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
1426
- text=alt.Text('x:Q', format='.2f'), # Display the x value, formatted as an integer
1427
  color=alt.Color(
1428
  'level:N',
1429
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
1430
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1431
  legend=None
1432
  ),
1433
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
1434
  )
1435
 
1436
 
@@ -1464,7 +1326,6 @@ df = pd.DataFrame(data)
1464
  row_labels = ['Median Perc. Subordinating Conjunctions', 'Median Perc. Adverbs', 'Median Perc. Determiners', 'Median Perc. Nouns']
1465
  df.index = row_labels
1466
 
1467
- # Apply header-specific styling using set_table_styles
1468
  styled_df = df.style.set_table_styles(
1469
  {
1470
  'Complete Beginner': [
@@ -1482,14 +1343,9 @@ styled_df = df.style.set_table_styles(
1482
  'Advanced': [
1483
  {'selector': 'th.col_heading.level0', 'props': [('background-color', 'rgba(221, 158, 158, 0.45)')]},
1484
  {'selector': 'td:hover', 'props': [('background-color', '#e0f7fa')]}
1485
- ],
1486
- # This is where we target the top-left index column reader
1487
- '': [
1488
- {'selector': '.index_name', 'props': [('color', 'green'), ('font-weight', 'bold')]}
1489
  ]
1490
  }).set_properties(**{'background-color': 'white'}).format("{:.2%}")
1491
 
1492
- # Inject CSS to ensure the background is white in the markdown section
1493
  st.markdown(
1494
  """
1495
  <style>
@@ -1500,7 +1356,6 @@ st.markdown(
1500
  """, unsafe_allow_html=True
1501
  )
1502
 
1503
- # Display the styled DataFrame
1504
  st.markdown(
1505
  '<div class="dataframe-divv">' + styled_df.to_html() + "</div>"
1506
  , unsafe_allow_html=True)
@@ -1521,7 +1376,6 @@ def get_kango_hist(show_medians=False):
1521
 
1522
  video_df['kan_props_perc'] = 100.0 * video_df['kan_props']
1523
 
1524
- # Data for vertical lines corresponding to each level
1525
  line_data = pd.DataFrame({
1526
  'x': [7.00, 9.55, 11.66, 13.03],
1527
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -1547,12 +1401,9 @@ def get_kango_hist(show_medians=False):
1547
  axis=alt.Axis(
1548
  labelFontSize=14,
1549
  titleFontSize=18,
1550
- #titleFont='Urbanist',
1551
  titleColor='black',
1552
  titleFontWeight='normal',
1553
- #titleFontStyle='italic',
1554
  titlePadding=30,
1555
- #format='.1f%'
1556
  )
1557
  ),
1558
  alt.Y(
@@ -1561,10 +1412,8 @@ def get_kango_hist(show_medians=False):
1561
  axis=alt.Axis(
1562
  labelFontSize=14,
1563
  titleFontSize=18,
1564
- #titleFont='Urbanist',
1565
  titleColor='black',
1566
  titleFontWeight='normal',
1567
- #titleFontStyle='italic',
1568
  titlePadding=20,
1569
  tickCount=5
1570
  ),
@@ -1576,11 +1425,9 @@ def get_kango_hist(show_medians=False):
1576
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1577
  legend=alt.Legend(
1578
  title='CIJ Level',
1579
- #titleFont='Urbanist',
1580
  titleFontSize=18,
1581
  titleFontWeight='bolder',
1582
  labelFontSize=16,
1583
- #labelFont='Urbanist',
1584
  symbolType='circle',
1585
  symbolSize=200,
1586
  symbolStrokeWidth=0,
@@ -1592,24 +1439,18 @@ def get_kango_hist(show_medians=False):
1592
  )
1593
  ),
1594
  tooltip=[
1595
- alt.Tooltip('kan_props_perc:Q', title='Percentage of kango:', bin=True), # Properly indicate that `wpm` is binned
1596
  alt.Tooltip('level:N', title='Level:'),
1597
  alt.Tooltip('count()', title='Video count:')
1598
  ],
1599
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
1600
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
1601
  ).properties(
1602
- #width=750,
1603
  width='container',
1604
- #height='container',
1605
  height=500,
1606
- #background='beige',
1607
- #padding=50,
1608
  title=alt.TitleParams(
1609
  text='Percentages of kango (漢語)',
1610
  offset=20,
1611
- #subtitle='(clickable)',
1612
- #font='Urbanist',
1613
  fontSize=24,
1614
  fontWeight='normal',
1615
  anchor='middle',
@@ -1622,25 +1463,23 @@ def get_kango_hist(show_medians=False):
1622
  highlight
1623
  )
1624
 
1625
- # Vertical lines corresponding to each level
1626
  vertical_lines = alt.Chart(line_data).mark_rule(
1627
  color='red',
1628
  strokeWidth=6,
1629
- strokeDash = [10, 2], # first arg is length, second is gap
1630
  ).encode(
1631
  x='x:Q',
1632
  tooltip=[
1633
  alt.Tooltip('x:N', title='Median percentage of kango:'),
1634
  alt.Tooltip('level:N', title='Level:')
1635
  ],
1636
- #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
1637
  color=alt.Color(
1638
  'level:N',
1639
- scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
1640
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1641
- legend=None # No legend for lines, it is already shown in the histogram
1642
  ),
1643
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
1644
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
1645
  ).add_params(
1646
  selection,
@@ -1648,22 +1487,22 @@ def get_kango_hist(show_medians=False):
1648
  )
1649
 
1650
  text_labels = alt.Chart(line_data).mark_text(
1651
- align='center', # Align text to the left of the line
1652
- dx=0, # Offset the text to the right by 5 pixels
1653
- dy=-10, # Adjust vertical positioning
1654
  fontSize=16,
1655
  fontWeight='bold'
1656
  ).encode(
1657
  x='x:Q',
1658
- y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
1659
- text=alt.Text('x:Q', format='.0f'), # Display the x value, formatted as an integer
1660
  color=alt.Color(
1661
  'level:N',
1662
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
1663
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1664
  legend=None
1665
  ),
1666
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
1667
  )
1668
 
1669
  if show_medians:
@@ -1688,8 +1527,6 @@ st.markdown("In Japanese, Kango are somewhat analogous to French words in Englis
1688
 
1689
  st.markdown("We also notice orderings when counting the percentage of Wago and Gairaigo as well.")
1690
 
1691
- # word origin table
1692
-
1693
  data = {
1694
  'Complete Beginner': [0.06999874574159035, 0.8578043261266064, 0.03301790801790795],
1695
  'Beginner': [0.0955284552845528, 0.8399311531841652, 0.0279441117764471],
@@ -1701,7 +1538,6 @@ df = pd.DataFrame(data)
1701
  row_labels = ['Median Perc. Kango (漢語)', 'Median Perc. Wago (和語)', 'Median Perc. Garaigo (外来語)']
1702
  df.index = row_labels
1703
 
1704
- # Apply header-specific styling using set_table_styles
1705
  styled_df = df.style.set_table_styles(
1706
  {
1707
  'Complete Beginner': [
@@ -1722,13 +1558,13 @@ styled_df = df.style.set_table_styles(
1722
  ],
1723
  }).set_properties(**{'background-color': 'white'}).format("{:.2%}")
1724
 
1725
- # Display the styled DataFrame
1726
  st.markdown(
1727
  '<div class="dataframe-divv">' + styled_df.to_html() + "</div>"
1728
  , unsafe_allow_html=True)
1729
 
1730
- # heatmap
1731
-
 
1732
  st.markdown("## Which factors matter the most?")
1733
 
1734
  st.markdown("We've just found a number of statistics that lead to orderings in the data \
@@ -1740,24 +1576,17 @@ st.markdown("To answer this, we can look at a correlation heatmap between each o
1740
  @st.cache_data
1741
  def render_vanilla_heatmap():
1742
 
1743
- # Compute the correlation matrix
1744
  corr_matrix = num_video_df.corr()
1745
 
1746
- # Specify the variable of interest (e.g., 'target_variable')
1747
  variable_of_interest = 'Level'
1748
 
1749
- # Sort the variables based on correlation with the variable of interest
1750
  sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
1751
 
1752
- # Reorder the correlation matrix
1753
  sorted_corr_matrix = corr_matrix.loc[sorted_vars, sorted_vars]
1754
 
1755
- # Create a heatmap using seaborn with the sorted correlation matrix
1756
  plt.figure(figsize=(10, 8))
1757
  sns.heatmap(sorted_corr_matrix, annot=True, cmap='coolwarm', fmt=".3f")
1758
 
1759
- # Display the heatmap
1760
- #plt.show()
1761
  st.pyplot(plt.gcf())
1762
 
1763
  render_vanilla_heatmap()
@@ -1774,59 +1603,41 @@ st.markdown("Using a statistics rule of thumb and removing all variables that ha
1774
  @st.cache_data
1775
  def render_level_row_unordered():
1776
 
1777
- # Compute the correlation matrix
1778
  corr_matrix = num_video_df.drop(['Proportion of determiners', 'Proportion of nouns', 'Proportion of wago', 'Proportion of gairaigo'], axis=1).corr()
1779
 
1780
- # Specify the variable of interest (e.g., 'Level')
1781
  variable_of_interest = 'Level'
1782
 
1783
- # Sort the variables based on correlation with the variable of interest
1784
  sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
1785
 
1786
- # Remove 'Level' from the sorted variables to exclude the self-correlation
1787
  sorted_vars = sorted_vars.drop(variable_of_interest)
1788
 
1789
- # Reorder the correlation matrix and exclude 'Level' column from the first row
1790
  first_row_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
1791
 
1792
- # Create a heatmap using seaborn with the single row of the correlation matrix
1793
- plt.figure(figsize=(10, 1)) # Adjust the figure size to make it more appropriate for a single row
1794
  sns.heatmap(first_row_matrix, annot=True, cmap='coolwarm', fmt=".3f", cbar_kws={'label': 'Correlation'})
1795
 
1796
- # Display the heatmap
1797
- #plt.show()
1798
  st.pyplot(plt.gcf())
1799
 
1800
  @st.cache_data
1801
  def render_level_col_ordered():
1802
 
1803
- # Compute the correlation matrix
1804
  corr_matrix = num_video_df.drop(['Proportion of determiners', 'Proportion of nouns', 'Proportion of wago', 'Proportion of gairaigo'], axis=1).corr()
1805
 
1806
- # Specify the variable of interest (e.g., 'Level')
1807
  variable_of_interest = 'Level'
1808
 
1809
- # Get the correlations of the variable of interest
1810
  correlations = corr_matrix[variable_of_interest]
1811
 
1812
- # Sort the variables based on the absolute value of the correlation with the variable of interest
1813
  sorted_vars = correlations.abs().sort_values(ascending=False).index
1814
 
1815
- # Remove 'Level' from the sorted variables (to exclude the self-correlation)
1816
  sorted_vars = sorted_vars.drop(variable_of_interest)
1817
 
1818
- # Reorder the correlation matrix, excluding the self-correlation
1819
  sorted_corr_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
1820
 
1821
- # Transpose the matrix to make it vertical
1822
  transposed_corr_matrix = sorted_corr_matrix.T
1823
 
1824
- # Create a heatmap using seaborn with the transposed correlation matrix
1825
- plt.figure(figsize=(2, 3)) # Adjust the figure size to make it more appropriate for a vertical layout
1826
  sns.heatmap(transposed_corr_matrix, annot=True, cmap='coolwarm', fmt=".3f", cbar_kws={'label': 'Correlation'})
1827
 
1828
- # Display the heatmap
1829
- #plt.show()
1830
  st.pyplot(plt.gcf())
1831
 
1832
  if st.checkbox('Flip and sort'):
@@ -1848,23 +1659,4 @@ st.markdown("8. Amount of Chinese words")
1848
 
1849
  st.markdown("### Thanks for reading ✌️")
1850
 
1851
- st.markdown("---")
1852
-
1853
- #st.markdown("In the unlikely chance that you happen to be a CI instructor or a CI content creator, I want to talk to you! \
1854
- # I can be reached at hamiltonjoshuadavid@gmail.com and I'm interested in learning \
1855
- # more about what you do. Please also add a link to your work if you decide to reach out.")
1856
-
1857
- #st.markdown("Special thanks to [CIJ](https://cijapanese.com/). I'm a happy subscriber and I recommend you also pick up a \
1858
- # a membership if you're a Japanese learner!")
1859
-
1860
- #st.markdown("---")
1861
- #st.markdown("**Some extra notes:**")
1862
- #st.markdown("1. No statistical tests of significance were conducted. This was just meant to be a light and unrigorous EDA.")
1863
- #st.markdown("2. It should be noted that the levels of the videos were determined by experts, and not by learners. They do not reflect objective difficulty.")
1864
- #st.markdown("3. While I stated that Japanese learners tend to speak at rates of over 200 wpm, I unfortunately haven't been able to find any good sources on this. \
1865
- # The actual average Japanese WPM is likely even higher than 200 wpm, but unfortunately I haven't found any good research on this.")
1866
- #st.markdown("4. Technically, I didn't actually compute syllables per second, but rather moras per second which served as an approximation for syllables. \
1867
- # I understand that this is linguistically incorrect, but I didn't want to confuse the reader who might not know any Japanese or linguistics.")
1868
- #st.markdown("5. More data cleaning could've been done to create better frequency lists, however, this was unnecessary in order to establish statistical patterns in a one-off analysis.")
1869
- #st.markdown("6. As a disclaimer, I do not think that CI instructors should base how they create their content off of the findings in this analysis. \
1870
- # They should only use these findings for inspiration and to get them thinking more analytically about what they're doing.")
 
9
  page_icon='favicon.svg'
10
  )
11
 
 
 
 
 
 
 
 
 
 
 
12
  @st.cache_data
13
  def load_dataframes():
14
 
 
39
  [cijapanese.com](https://cijapanese.com/) (CIJ), a \
40
  video platform for learning Japanese.")
41
 
42
+ ###
43
+ # RATE OF SPEECH
44
+ ###
45
  st.markdown("## How fast is CI?")
46
 
47
  st.markdown("If we measure how fast the teachers speak on CIJ, we find that \
48
  they speak more slowly in videos meant for beginners and more quickly \
49
  for advanced learners.")
50
 
 
 
51
  @st.cache_data
52
  def get_wpm_chart(show_medians=False):
53
 
 
54
  line_data = pd.DataFrame({
55
  'x': [75, 91, 124, 149],
56
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
 
76
  axis=alt.Axis(
77
  labelFontSize=14,
78
  titleFontSize=18,
 
79
  titleColor='black',
80
  titleFontWeight='normal',
 
81
  titlePadding=20
82
  )
83
  ),
 
87
  axis=alt.Axis(
88
  labelFontSize=14,
89
  titleFontSize=18,
 
90
  titleColor='black',
91
  titleFontWeight='normal',
 
92
  titlePadding=20,
93
  tickCount=5
94
  ),
 
100
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
101
  legend=alt.Legend(
102
  title='CIJ Level',
 
103
  titleFontSize=18,
104
  titleFontWeight='bolder',
105
  labelFontSize=16,
 
106
  symbolType='circle',
107
  symbolSize=200,
108
  symbolStrokeWidth=0,
 
114
  )
115
  ),
116
  tooltip=[
117
+ alt.Tooltip('wpm:Q', title='Words per minute:', bin=True),
118
  alt.Tooltip('level:N', title='Level:'),
119
  alt.Tooltip('count()', title='Video count:')
120
  ],
121
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
122
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
123
  ).properties(
 
 
 
124
  height=500,
 
 
125
  title=alt.TitleParams(
126
  text='Rate of speech in words per minute (WPM)',
127
  offset=20,
 
 
128
  fontSize=24,
129
  fontWeight='normal',
130
  anchor='middle',
 
137
  highlight
138
  )
139
 
 
140
  vertical_lines = alt.Chart(line_data).mark_rule(
141
  color='red',
142
  strokeWidth=6,
143
+ strokeDash = [10, 2],
144
  ).encode(
145
  x='x:Q',
146
  tooltip=[
147
  alt.Tooltip('x:N', title='Median WPM:'),
148
  alt.Tooltip('level:N', title='Level:')
149
  ],
 
150
  color=alt.Color(
151
  'level:N',
152
+ scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
153
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
154
+ legend=None
155
  ),
156
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
157
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
158
  ).add_params(
159
  selection,
 
161
  )
162
 
163
  text_labels = alt.Chart(line_data).mark_text(
164
+ align='center',
165
+ dx=0,
166
+ dy=-10,
167
  fontSize=16,
168
  fontWeight='bold'
169
  ).encode(
170
  x='x:Q',
171
+ y=alt.value(0),
172
+ text=alt.Text('x:Q', format='.0f'),
173
  color=alt.Color(
174
  'level:N',
175
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
176
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
177
  legend=None
178
  ),
179
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
180
  )
181
 
182
 
 
202
  tend to speak at rates of over 200 wpm, meaning that most of the videos \
203
  on CIJ have been adapted to be a lot slower than that!")
204
 
 
 
205
  @st.cache_data
206
  def get_wpm_vs_sps_chart(interactive=False):
207
 
 
209
 
210
  highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)
211
 
 
212
  scatter_plot = alt.Chart(video_df).mark_circle(
213
  cursor='pointer',
214
  size=80,
 
220
  axis=alt.Axis(
221
  labelFontSize=14,
222
  titleFontSize=18,
 
223
  titleColor='black',
224
  titleFontWeight='normal',
 
225
  titlePadding=20
226
  )
227
  ),
 
231
  axis=alt.Axis(
232
  labelFontSize=14,
233
  titleFontSize=18,
 
234
  titleColor='black',
235
  titleFontWeight='normal',
 
236
  titlePadding=20,
 
237
  ),
238
  ),
239
  color=alt.Color(
 
247
  labelFontSize=16,
248
  symbolType='circle',
249
  symbolSize=200,
 
250
  orient='right',
251
  direction='vertical',
 
252
  padding=10,
253
  cornerRadius=5,
254
  )
 
261
 
262
  ],
263
  opacity=alt.condition(selection, alt.value(1.0), alt.value(0.2)),
 
264
  ).properties(
265
  width='container',
266
  height=500,
267
  title=alt.TitleParams(
268
  text='Rate of speech: Syllables per second vs. words per minute',
269
  offset=20,
 
 
270
  fontSize=24,
271
  fontWeight='normal',
272
  anchor='middle',
 
281
  background='white'
282
  )
283
 
 
284
  if interactive:
285
  return scatter_plot.interactive()
286
  else:
 
299
  st.markdown("(Also, FYI, most of these **graphs are \
300
  interactive** so please click around.)")
301
 
302
+ ###
303
+ # STATISTICS LESSON
304
+ ###
305
  st.markdown("## A quick statistics lesson")
306
 
307
  st.markdown("Before we continue this analysis, there's some basic things you should know.")
 
343
  @st.cache_data
344
  def get_sentence_length_hist(show_medians=False):
345
 
 
346
  line_data = pd.DataFrame({
347
  'x': [7.60, 10.45, 16.17, 19.39],
348
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
 
368
  axis=alt.Axis(
369
  labelFontSize=14,
370
  titleFontSize=18,
 
371
  titleColor='black',
372
  titleFontWeight='normal',
 
373
  titlePadding=20
374
  )
375
  ),
 
379
  axis=alt.Axis(
380
  labelFontSize=14,
381
  titleFontSize=18,
 
382
  titleColor='black',
383
  titleFontWeight='normal',
 
384
  titlePadding=20,
385
  tickCount=5
386
  ),
 
392
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
393
  legend=alt.Legend(
394
  title='CIJ Level',
 
395
  titleFontSize=18,
396
  titleFontWeight='bolder',
397
  labelFontSize=16,
 
398
  symbolType='circle',
399
  symbolSize=200,
400
  symbolStrokeWidth=0,
 
406
  )
407
  ),
408
  tooltip=[
409
+ alt.Tooltip('mean_sentence_length:Q', title='Average sentence length:', bin=True),
410
  alt.Tooltip('level:N', title='Level:'),
411
  alt.Tooltip('count()', title='Video count:')
412
  ],
413
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
414
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
415
  ).properties(
 
416
  width='container',
 
417
  height=500,
 
 
418
  title=alt.TitleParams(
419
  text='Average number of words per sentence (sentence length)',
420
  offset=20,
 
 
421
  fontSize=24,
422
  fontWeight='normal',
423
  anchor='middle',
 
430
  highlight
431
  )
432
 
 
433
  vertical_lines = alt.Chart(line_data).mark_rule(
434
  color='red',
435
  strokeWidth=6,
436
+ strokeDash = [10, 2],
437
  ).encode(
438
  x='x:Q',
439
  tooltip=[
440
  alt.Tooltip('x:N', title='Median average sentence length:'),
441
  alt.Tooltip('level:N', title='Level:')
442
  ],
 
443
  color=alt.Color(
444
  'level:N',
445
+ scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
446
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
447
+ legend=None
448
  ),
449
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
450
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
451
  ).add_params(
452
  selection,
 
454
  )
455
 
456
  text_labels = alt.Chart(line_data).mark_text(
457
+ align='center',
458
+ dx=0,
459
+ dy=-10,
460
  fontSize=16,
461
  fontWeight='bold'
462
  ).encode(
463
  x='x:Q',
464
+ y=alt.value(0),
465
+ text=alt.Text('x:Q', format='.2f'),
466
  color=alt.Color(
467
  'level:N',
468
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
469
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
470
  legend=None
471
  ),
472
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
473
  )
474
 
475
  if show_medians:
 
507
 
508
  video_df['average_rel_reps_perc'] = 100.0 * video_df['average_rel_reps']
509
 
 
 
 
 
 
510
  sub_video_df = video_df[video_df['average_rel_reps_perc'] <= 2.0]
511
 
 
512
  line_data = pd.DataFrame({
513
  'x': [0.99, 0.62, 0.37, 0.23],
514
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
 
534
  axis=alt.Axis(
535
  labelFontSize=14,
536
  titleFontSize=18,
 
537
  titleColor='black',
538
  titleFontWeight='normal',
 
539
  titlePadding=20,
 
540
  ),
541
  ),
542
  alt.Y(
 
545
  axis=alt.Axis(
546
  labelFontSize=14,
547
  titleFontSize=18,
 
548
  titleColor='black',
549
  titleFontWeight='normal',
 
550
  titlePadding=20,
551
  tickCount=5
552
  ),
 
558
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
559
  legend=alt.Legend(
560
  title='CIJ Level',
 
561
  titleFontSize=18,
562
  titleFontWeight='bolder',
563
  labelFontSize=16,
 
564
  symbolType='circle',
565
  symbolSize=200,
566
  symbolStrokeWidth=0,
 
572
  )
573
  ),
574
  tooltip=[
575
+ alt.Tooltip('average_rel_reps:Q', title='Average relative repetitions:', bin=True),
576
  alt.Tooltip('level:N', title='Level:'),
577
  alt.Tooltip('count()', title='Video count:')
578
  ],
579
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
580
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
581
  ).properties(
 
582
  width='container',
 
583
  height=500,
 
 
584
  title=alt.TitleParams(
585
  text='Relative repetitions of words',
586
  offset=20,
 
 
587
  fontSize=24,
588
  fontWeight='normal',
589
  anchor='middle',
 
596
  highlight
597
  )
598
 
 
599
  vertical_lines = alt.Chart(line_data).mark_rule(
600
  color='red',
601
  strokeWidth=6,
602
+ strokeDash = [10, 2],
603
  ).encode(
604
  alt.X(
605
  'x:Q'
 
608
  alt.Tooltip('x:N', title='Median average relative repetitions:'),
609
  alt.Tooltip('level:N', title='Level:')
610
  ],
 
611
  color=alt.Color(
612
  'level:N',
613
+ scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
614
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
615
+ legend=None
616
  ),
617
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
618
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1)),
619
  ).add_params(
620
  selection,
 
622
  )
623
 
624
  text_labels = alt.Chart(line_data).mark_text(
625
+ align='center',
626
+ dx=0,
627
+ dy=-10,
628
  fontSize=16,
629
  fontWeight='bold'
630
  ).encode(
631
  alt.X(
632
  'x:Q'
633
  ),
634
+ y=alt.value(0),
635
+ text=alt.Text('x:Q', format='.2f'),
636
  color=alt.Color(
637
  'level:N',
638
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
639
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
640
  legend=None
641
  ),
642
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
643
  )
644
 
645
  if show_medians:
 
682
  For example, if we learn the top 500 words from CIJ, then we'll know around 80% of the words in the \
683
  Complete Beginner videos. And if we learn the top 4,295 words, then we'll know 98% of the words in that category.")
684
 
 
 
685
  @st.cache_data
686
  def get_word_coverage_chart(zoom=False):
687
 
 
690
  else:
691
  word_coverage_df_sub = word_coverage_df
692
 
 
693
  line_data = pd.DataFrame({
694
  'x': [4295, 5606, 6853, 9085],
695
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
 
711
  axis=alt.Axis(
712
  labelFontSize=14,
713
  titleFontSize=18,
 
714
  titleColor='black',
715
  titleFontWeight='normal',
 
716
  titlePadding=20
717
  )
718
  ),
 
723
  axis=alt.Axis(
724
  labelFontSize=14,
725
  titleFontSize=18,
 
726
  titleColor='black',
727
  titleFontWeight='normal',
 
728
  titlePadding=20,
729
  tickCount=5
730
  ),
 
740
  labelFontSize=16,
741
  symbolType='circle',
742
  symbolSize=200,
 
743
  orient='right',
744
  direction='vertical',
 
745
  padding=10,
746
  cornerRadius=5,
747
  )
 
760
  title=alt.TitleParams(
761
  text='Word coverage curves',
762
  offset=20,
 
 
763
  fontSize=24,
764
  fontWeight='normal',
765
  anchor='middle',
 
772
  highlight
773
  )
774
 
 
775
  vertical_lines = alt.Chart(line_data).mark_rule(
776
  color='red',
777
  strokeWidth=4,
778
+ strokeDash = [10, 2],
779
  ).encode(
780
  x='x:Q',
781
  tooltip=[
782
  alt.Tooltip('x:N', title='Words needed to reach 98%:'),
783
  alt.Tooltip('level:N', title='Level:')
784
  ],
 
785
  color=alt.Color(
786
  'level:N',
787
+ scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
788
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
789
+ legend=None
790
  ),
791
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
792
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
793
  ).add_params(
794
  selection,
795
  highlight
796
+ )
797
 
798
  text_labels = alt.Chart(line_data).mark_text(
799
+ align='center',
800
+ dx=0,
801
+ dy=-10,
802
  fontSize=16,
803
  fontWeight='bold'
804
  ).encode(
805
  x='x:Q',
806
+ y=alt.value(0),
807
+ text=alt.Text('x:Q', format='.0f'),
808
  color=alt.Color(
809
  'level:N',
810
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
811
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
812
  legend=None
813
  ),
814
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
815
  )
816
 
817
  layered_chart = alt.layer(line_chart, vertical_lines, text_labels, background='white')
 
835
  @st.cache_data
836
  def get_ne_spot_hist(show_medians=False):
837
 
 
838
  line_data = pd.DataFrame({
839
  'x': [3859, 5229, 6698, 7925],
840
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
 
860
  axis=alt.Axis(
861
  labelFontSize=14,
862
  titleFontSize=18,
 
863
  titleColor='black',
864
  titleFontWeight='normal',
 
865
  titlePadding=20,
 
866
  )
867
  ),
868
  alt.Y(
 
871
  axis=alt.Axis(
872
  labelFontSize=14,
873
  titleFontSize=18,
 
874
  titleColor='black',
875
  titleFontWeight='normal',
 
876
  titlePadding=20,
877
  tickCount=5
878
  ),
 
884
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
885
  legend=alt.Legend(
886
  title='CIJ Level',
 
887
  titleFontSize=18,
888
  titleFontWeight='bolder',
889
  labelFontSize=16,
 
890
  symbolType='circle',
891
  symbolSize=200,
892
  symbolStrokeWidth=0,
 
898
  )
899
  ),
900
  tooltip=[
901
+ alt.Tooltip('ne_spot:Q', title='Vocab size needed for 98% cov:', bin=True),
902
  alt.Tooltip('level:N', title='Level:'),
903
  alt.Tooltip('count()', title='Video count:')
904
  ],
905
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
906
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
907
  ).properties(
 
908
  width='container',
 
909
  height=500,
 
 
910
  title=alt.TitleParams(
911
  text='Vocab size needed for 98% coverage',
912
  offset=20,
 
 
913
  fontSize=24,
914
  fontWeight='normal',
915
  anchor='middle',
 
922
  highlight
923
  )
924
 
 
925
  vertical_lines = alt.Chart(line_data).mark_rule(
926
  color='red',
927
  strokeWidth=6,
928
+ strokeDash = [10, 2],
929
  ).encode(
930
  x='x:Q',
931
  tooltip=[
932
  alt.Tooltip('x:N', title='Median vocab size needed for 98% cov:'),
933
  alt.Tooltip('level:N', title='Level:')
934
  ],
 
935
  color=alt.Color(
936
  'level:N',
937
+ scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
938
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
939
+ legend=None
940
  ),
941
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
942
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
943
  ).add_params(
944
  selection,
 
946
  )
947
 
948
  text_labels = alt.Chart(line_data).mark_text(
949
+ align='center',
950
+ dx=0,
951
+ dy=-10,
952
  fontSize=16,
953
  fontWeight='bold'
954
  ).encode(
955
  x='x:Q',
956
+ y=alt.value(0),
957
+ text=alt.Text('x:Q', format='.0f'),
958
  color=alt.Color(
959
  'level:N',
960
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
961
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
962
  legend=None
963
  ),
964
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
965
  )
966
 
967
 
 
994
  @st.cache_data
995
  def get_tfplr_hist(show_medians=False):
996
 
 
997
  line_data = pd.DataFrame({
998
  'x': [3.82, 4.30, 4.76, 5.21],
999
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
 
1019
  axis=alt.Axis(
1020
  labelFontSize=14,
1021
  titleFontSize=18,
 
1022
  titleColor='black',
1023
  titleFontWeight='normal',
 
1024
  titlePadding=30,
 
1025
  )
1026
  ),
1027
  alt.Y(
 
1030
  axis=alt.Axis(
1031
  labelFontSize=14,
1032
  titleFontSize=18,
 
1033
  titleColor='black',
1034
  titleFontWeight='normal',
 
1035
  titlePadding=20,
1036
  tickCount=5
1037
  ),
 
1043
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1044
  legend=alt.Legend(
1045
  title='CIJ Level',
 
1046
  titleFontSize=18,
1047
  titleFontWeight='bolder',
1048
  labelFontSize=16,
 
1049
  symbolType='circle',
1050
  symbolSize=200,
1051
  symbolStrokeWidth=0,
 
1064
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
1065
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
1066
  ).properties(
 
1067
  width='container',
 
1068
  height=500,
 
 
1069
  title=alt.TitleParams(
1070
  text='25th percentile word-frequency log ranks',
1071
  offset=20,
 
 
1072
  fontSize=24,
1073
  fontWeight='normal',
1074
  anchor='middle',
 
1081
  highlight
1082
  )
1083
 
 
1084
  vertical_lines = alt.Chart(line_data).mark_rule(
1085
  color='red',
1086
  strokeWidth=6,
1087
+ strokeDash = [10, 2],
1088
  ).encode(
1089
  x='x:Q',
1090
  tooltip=[
1091
  alt.Tooltip('x:N', title='Median 25th percentile word-frequency log rank:'),
1092
  alt.Tooltip('level:N', title='Level:')
1093
  ],
 
1094
  color=alt.Color(
1095
  'level:N',
1096
+ scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
1097
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1098
+ legend=None
1099
  ),
1100
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
1101
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
1102
  ).add_params(
1103
  selection,
 
1105
  )
1106
 
1107
  text_labels = alt.Chart(line_data).mark_text(
1108
+ align='center',
1109
+ dx=0,
1110
+ dy=-10,
1111
  fontSize=16,
1112
  fontWeight='bold'
1113
  ).encode(
1114
  x='x:Q',
1115
+ y=alt.value(0),
1116
+ text=alt.Text('x:Q', format='.2f'),
1117
  color=alt.Color(
1118
  'level:N',
1119
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
1120
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1121
  legend=None
1122
  ),
1123
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
1124
  )
1125
 
 
1126
  if show_medians:
1127
  layered_chart = alt.layer(histogram, vertical_lines, text_labels, background='white')
1128
  else:
 
1154
  demonstrates that easier videos tend to use more common words whereas \
1155
  advanced videos tend to use more rare words!)")
1156
 
 
 
1157
  ###
1158
  # GRAMMAR
1159
  ###
 
1166
 
1167
  video_df['sconj_props_perc'] = 100.0 * video_df['sconj_props']
1168
 
 
1169
  line_data = pd.DataFrame({
1170
  'x': [2.64, 4.73, 6.63, 7.67],
1171
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
 
1191
  axis=alt.Axis(
1192
  labelFontSize=14,
1193
  titleFontSize=18,
 
1194
  titleColor='black',
1195
  titleFontWeight='normal',
 
1196
  titlePadding=30,
 
1197
  )
1198
  ),
1199
  alt.Y(
 
1202
  axis=alt.Axis(
1203
  labelFontSize=14,
1204
  titleFontSize=18,
 
1205
  titleColor='black',
1206
  titleFontWeight='normal',
 
1207
  titlePadding=20,
1208
  tickCount=5
1209
  ),
 
1215
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1216
  legend=alt.Legend(
1217
  title='CIJ Level',
 
1218
  titleFontSize=18,
1219
  titleFontWeight='bolder',
1220
  labelFontSize=16,
 
1221
  symbolType='circle',
1222
  symbolSize=200,
1223
  symbolStrokeWidth=0,
 
1229
  )
1230
  ),
1231
  tooltip=[
1232
+ alt.Tooltip('sconj_props_perc:Q', title='Percentage of subordinating conjunctions:', bin=True),
1233
  alt.Tooltip('level:N', title='Level:'),
1234
  alt.Tooltip('count()', title='Video count:')
1235
  ],
1236
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
1237
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
1238
  ).properties(
 
1239
  width='container',
 
1240
  height=500,
 
 
1241
  title=alt.TitleParams(
1242
  text='Percentages of subordinating conjunctions',
1243
  offset=20,
 
 
1244
  fontSize=24,
1245
  fontWeight='normal',
1246
  anchor='middle',
 
1253
  highlight
1254
  )
1255
 
 
1256
  vertical_lines = alt.Chart(line_data).mark_rule(
1257
  color='red',
1258
  strokeWidth=6,
1259
+ strokeDash = [10, 2],
1260
  ).encode(
1261
  x='x:Q',
1262
  tooltip=[
1263
  alt.Tooltip('x:N', title='Median percentage of subordinating conjunctions:'),
1264
  alt.Tooltip('level:N', title='Level:')
1265
  ],
 
1266
  color=alt.Color(
1267
  'level:N',
1268
+ scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
1269
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1270
+ legend=None
1271
  ),
1272
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
1273
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
1274
  ).add_params(
1275
  selection,
 
1277
  )
1278
 
1279
  text_labels = alt.Chart(line_data).mark_text(
1280
+ align='center',
1281
+ dx=0,
1282
+ dy=-10,
1283
  fontSize=16,
1284
  fontWeight='bold'
1285
  ).encode(
1286
  x='x:Q',
1287
+ y=alt.value(0),
1288
+ text=alt.Text('x:Q', format='.2f'),
1289
  color=alt.Color(
1290
  'level:N',
1291
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
1292
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1293
  legend=None
1294
  ),
1295
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
1296
  )
1297
 
1298
 
 
1326
  row_labels = ['Median Perc. Subordinating Conjunctions', 'Median Perc. Adverbs', 'Median Perc. Determiners', 'Median Perc. Nouns']
1327
  df.index = row_labels
1328
 
 
1329
  styled_df = df.style.set_table_styles(
1330
  {
1331
  'Complete Beginner': [
 
1343
  'Advanced': [
1344
  {'selector': 'th.col_heading.level0', 'props': [('background-color', 'rgba(221, 158, 158, 0.45)')]},
1345
  {'selector': 'td:hover', 'props': [('background-color', '#e0f7fa')]}
 
 
 
 
1346
  ]
1347
  }).set_properties(**{'background-color': 'white'}).format("{:.2%}")
1348
 
 
1349
  st.markdown(
1350
  """
1351
  <style>
 
1356
  """, unsafe_allow_html=True
1357
  )
1358
 
 
1359
  st.markdown(
1360
  '<div class="dataframe-divv">' + styled_df.to_html() + "</div>"
1361
  , unsafe_allow_html=True)
 
1376
 
1377
  video_df['kan_props_perc'] = 100.0 * video_df['kan_props']
1378
 
 
1379
  line_data = pd.DataFrame({
1380
  'x': [7.00, 9.55, 11.66, 13.03],
1381
  'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
 
1401
  axis=alt.Axis(
1402
  labelFontSize=14,
1403
  titleFontSize=18,
 
1404
  titleColor='black',
1405
  titleFontWeight='normal',
 
1406
  titlePadding=30,
 
1407
  )
1408
  ),
1409
  alt.Y(
 
1412
  axis=alt.Axis(
1413
  labelFontSize=14,
1414
  titleFontSize=18,
 
1415
  titleColor='black',
1416
  titleFontWeight='normal',
 
1417
  titlePadding=20,
1418
  tickCount=5
1419
  ),
 
1425
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1426
  legend=alt.Legend(
1427
  title='CIJ Level',
 
1428
  titleFontSize=18,
1429
  titleFontWeight='bolder',
1430
  labelFontSize=16,
 
1431
  symbolType='circle',
1432
  symbolSize=200,
1433
  symbolStrokeWidth=0,
 
1439
  )
1440
  ),
1441
  tooltip=[
1442
+ alt.Tooltip('kan_props_perc:Q', title='Percentage of kango:', bin=True),
1443
  alt.Tooltip('level:N', title='Level:'),
1444
  alt.Tooltip('count()', title='Video count:')
1445
  ],
1446
  opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
1447
  strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
1448
  ).properties(
 
1449
  width='container',
 
1450
  height=500,
 
 
1451
  title=alt.TitleParams(
1452
  text='Percentages of kango (漢語)',
1453
  offset=20,
 
 
1454
  fontSize=24,
1455
  fontWeight='normal',
1456
  anchor='middle',
 
1463
  highlight
1464
  )
1465
 
 
1466
  vertical_lines = alt.Chart(line_data).mark_rule(
1467
  color='red',
1468
  strokeWidth=6,
1469
+ strokeDash = [10, 2],
1470
  ).encode(
1471
  x='x:Q',
1472
  tooltip=[
1473
  alt.Tooltip('x:N', title='Median percentage of kango:'),
1474
  alt.Tooltip('level:N', title='Level:')
1475
  ],
 
1476
  color=alt.Color(
1477
  'level:N',
1478
+ scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
1479
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1480
+ legend=None
1481
  ),
1482
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
1483
  strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
1484
  ).add_params(
1485
  selection,
 
1487
  )
1488
 
1489
  text_labels = alt.Chart(line_data).mark_text(
1490
+ align='center',
1491
+ dx=0,
1492
+ dy=-10,
1493
  fontSize=16,
1494
  fontWeight='bold'
1495
  ).encode(
1496
  x='x:Q',
1497
+ y=alt.value(0),
1498
+ text=alt.Text('x:Q', format='.0f'),
1499
  color=alt.Color(
1500
  'level:N',
1501
  scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
1502
  sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
1503
  legend=None
1504
  ),
1505
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
1506
  )
1507
 
1508
  if show_medians:
 
1527
 
1528
  st.markdown("We also notice orderings when counting the percentage of Wago and Gairaigo as well.")
1529
 
 
 
1530
  data = {
1531
  'Complete Beginner': [0.06999874574159035, 0.8578043261266064, 0.03301790801790795],
1532
  'Beginner': [0.0955284552845528, 0.8399311531841652, 0.0279441117764471],
 
1538
  row_labels = ['Median Perc. Kango (漢語)', 'Median Perc. Wago (和語)', 'Median Perc. Garaigo (外来語)']
1539
  df.index = row_labels
1540
 
 
1541
  styled_df = df.style.set_table_styles(
1542
  {
1543
  'Complete Beginner': [
 
1558
  ],
1559
  }).set_properties(**{'background-color': 'white'}).format("{:.2%}")
1560
 
 
1561
  st.markdown(
1562
  '<div class="dataframe-divv">' + styled_df.to_html() + "</div>"
1563
  , unsafe_allow_html=True)
1564
 
1565
+ ###
1566
+ # MOST IMPORTANT FACTORS
1567
+ ###
1568
  st.markdown("## Which factors matter the most?")
1569
 
1570
  st.markdown("We've just found a number of statistics that lead to orderings in the data \
 
1576
  @st.cache_data
1577
  def render_vanilla_heatmap():
1578
 
 
1579
  corr_matrix = num_video_df.corr()
1580
 
 
1581
  variable_of_interest = 'Level'
1582
 
 
1583
  sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
1584
 
 
1585
  sorted_corr_matrix = corr_matrix.loc[sorted_vars, sorted_vars]
1586
 
 
1587
  plt.figure(figsize=(10, 8))
1588
  sns.heatmap(sorted_corr_matrix, annot=True, cmap='coolwarm', fmt=".3f")
1589
 
 
 
1590
  st.pyplot(plt.gcf())
1591
 
1592
  render_vanilla_heatmap()
 
1603
  @st.cache_data
1604
  def render_level_row_unordered():
1605
 
 
1606
  corr_matrix = num_video_df.drop(['Proportion of determiners', 'Proportion of nouns', 'Proportion of wago', 'Proportion of gairaigo'], axis=1).corr()
1607
 
 
1608
  variable_of_interest = 'Level'
1609
 
 
1610
  sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
1611
 
 
1612
  sorted_vars = sorted_vars.drop(variable_of_interest)
1613
 
 
1614
  first_row_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
1615
 
1616
+ plt.figure(figsize=(10, 1))
 
1617
  sns.heatmap(first_row_matrix, annot=True, cmap='coolwarm', fmt=".3f", cbar_kws={'label': 'Correlation'})
1618
 
 
 
1619
  st.pyplot(plt.gcf())
1620
 
1621
  @st.cache_data
1622
  def render_level_col_ordered():
1623
 
 
1624
  corr_matrix = num_video_df.drop(['Proportion of determiners', 'Proportion of nouns', 'Proportion of wago', 'Proportion of gairaigo'], axis=1).corr()
1625
 
 
1626
  variable_of_interest = 'Level'
1627
 
 
1628
  correlations = corr_matrix[variable_of_interest]
1629
 
 
1630
  sorted_vars = correlations.abs().sort_values(ascending=False).index
1631
 
 
1632
  sorted_vars = sorted_vars.drop(variable_of_interest)
1633
 
 
1634
  sorted_corr_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
1635
 
 
1636
  transposed_corr_matrix = sorted_corr_matrix.T
1637
 
1638
+ plt.figure(figsize=(2, 3))
 
1639
  sns.heatmap(transposed_corr_matrix, annot=True, cmap='coolwarm', fmt=".3f", cbar_kws={'label': 'Correlation'})
1640
 
 
 
1641
  st.pyplot(plt.gcf())
1642
 
1643
  if st.checkbox('Flip and sort'):
 
1659
 
1660
  st.markdown("### Thanks for reading ✌️")
1661
 
1662
+ st.markdown("---")