Spaces:

joshdavham
/

Comprehensible-Input-Analysis

Sleeping

App Files Community

joshdavham committed on Oct 11, 2024

Commit

360a122

1 Parent(s): 391b919

modify comments

Browse files

Files changed (1) hide show

app.py +99 -307

app.py CHANGED Viewed

@@ -9,16 +9,6 @@ st.set_page_config(
     page_icon='favicon.svg'
 )
-#st.markdown("""
-#<link href="https://fonts.googleapis.com/css2?family=Urbanist:wght@400;700&display=swap" rel="stylesheet">
-#<style>
-#    .vega-embed * {
-#        font-family: 'Urbanist', sans-serif;
-#    }
-#</style>
-#""", unsafe_allow_html=True)
 @st.cache_data
 def load_dataframes():
@@ -49,20 +39,18 @@ st.markdown("To answer this question, I'll be analyzing the videos on \
             [cijapanese.com](https://cijapanese.com/) (CIJ), a \
             video platform for learning Japanese.")
-# Plot the WPM histogram
 st.markdown("## How fast is CI?")
 st.markdown("If we measure how fast the teachers speak on CIJ, we find that \
             they speak more slowly in videos meant for beginners and more quickly \
             for advanced learners.")
-#st.markdown("### Rate of speech in words per minute (WPM)")
 @st.cache_data
 def get_wpm_chart(show_medians=False):
-    # Data for vertical lines corresponding to each level
     line_data = pd.DataFrame({
         'x': [75, 91, 124, 149],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -88,10 +76,8 @@ def get_wpm_chart(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20
             )
         ),
@@ -101,10 +87,8 @@ def get_wpm_chart(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
                 tickCount=5
             ),
@@ -116,11 +100,9 @@ def get_wpm_chart(show_medians=False):
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
-                #titleFont='Urbanist',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
-                #labelFont='Urbanist',
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
@@ -132,24 +114,17 @@ def get_wpm_chart(show_medians=False):
             )
         ),
         tooltip=[
-            alt.Tooltip('wpm:Q', title='Words per minute:', bin=True),  # Properly indicate that `wpm` is binned
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
-        #width=750,
-        #width='container',
-        #height='container',
         height=500,
-        #background='beige',
-        #padding=50,
         title=alt.TitleParams(
             text='Rate of speech in words per minute (WPM)',
             offset=20,
-            #subtitle='(clickable)',
-            #font='Urbanist',
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
@@ -162,25 +137,23 @@ def get_wpm_chart(show_medians=False):
         highlight
     )
-    # Vertical lines corresponding to each level
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
-        strokeDash = [10, 2], # first arg is length, second is gap
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median WPM:'),
             alt.Tooltip('level:N', title='Level:')
         ],
-        #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
         color=alt.Color(
             'level:N',
-            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),  # Use the same color scale as the histogram
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-            legend=None  # No legend for lines, it is already shown in the histogram
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
@@ -188,22 +161,22 @@ def get_wpm_chart(show_medians=False):
     )
     text_labels = alt.Chart(line_data).mark_text(
-        align='center',  # Align text to the left of the line
-        dx=0,  # Offset the text to the right by 5 pixels
-        dy=-10, # Adjust vertical positioning
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
-        y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
-        text=alt.Text('x:Q', format='.0f'),  # Display the x value, formatted as an integer
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
     )
@@ -229,8 +202,6 @@ st.markdown("To put this data into perspective, native Japanese speakers \
             tend to speak at rates of over 200 wpm, meaning that most of the videos \
             on CIJ have been adapted to be a lot slower than that!")
-# wpm vs sps chart
 @st.cache_data
 def get_wpm_vs_sps_chart(interactive=False):
@@ -238,7 +209,6 @@ def get_wpm_vs_sps_chart(interactive=False):
     highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)
-    # Create the scatter plot
     scatter_plot = alt.Chart(video_df).mark_circle(
         cursor='pointer',
         size=80,
@@ -250,10 +220,8 @@ def get_wpm_vs_sps_chart(interactive=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20
             )
         ),
@@ -263,12 +231,9 @@ def get_wpm_vs_sps_chart(interactive=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
-                #tickCount=5
             ),
         ),
         color=alt.Color(
@@ -282,10 +247,8 @@ def get_wpm_vs_sps_chart(interactive=False):
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
-                #symbolStrokeWidth=3,
                 orient='right',
                 direction='vertical',
-                #fillColor='black',
                 padding=10,
                 cornerRadius=5,
             )
@@ -298,15 +261,12 @@ def get_wpm_vs_sps_chart(interactive=False):
         ],
         opacity=alt.condition(selection, alt.value(1.0), alt.value(0.2)),
-        #strokeWidth=alt.condition(selection | highlight, alt.value(6), alt.value(2))
     ).properties(
         width='container',
         height=500,
         title=alt.TitleParams(
             text='Rate of speech: Syllables per second vs. words per minute',
             offset=20,
-            #subtitle='(clickable)',
-            #font='Urbanist',
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
@@ -321,7 +281,6 @@ def get_wpm_vs_sps_chart(interactive=False):
         background='white'
     )
-    # Display the plot
     if interactive:
         return scatter_plot.interactive()
     else:
@@ -340,6 +299,9 @@ st.markdown("We can also measure the rate of speech in syllables per second (SPS
 st.markdown("(Also, FYI, most of these **graphs are \
             interactive** so please click around.)")
 st.markdown("## A quick statistics lesson")
 st.markdown("Before we continue this analysis, there's some basic things you should know.")
@@ -381,7 +343,6 @@ st.markdown("Videos meant for beginners tend to have shorter sentences on averag
 @st.cache_data
 def get_sentence_length_hist(show_medians=False):
-    # Data for vertical lines corresponding to each level
     line_data = pd.DataFrame({
         'x': [7.60, 10.45, 16.17, 19.39],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -407,10 +368,8 @@ def get_sentence_length_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20
             )
         ),
@@ -420,10 +379,8 @@ def get_sentence_length_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
                 tickCount=5
             ),
@@ -435,11 +392,9 @@ def get_sentence_length_hist(show_medians=False):
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
-                #titleFont='Urbanist',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
-                #labelFont='Urbanist',
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
@@ -451,24 +406,18 @@ def get_sentence_length_hist(show_medians=False):
             )
         ),
         tooltip=[
-            alt.Tooltip('mean_sentence_length:Q', title='Average sentence length:', bin=True),  # Properly indicate that `wpm` is binned
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
-        #width=750,
         width='container',
-        #height='container',
         height=500,
-        #background='beige',
-        #padding=50,
         title=alt.TitleParams(
             text='Average number of words per sentence (sentence length)',
             offset=20,
-            #subtitle='(clickable)',
-            #font='Urbanist',
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
@@ -481,25 +430,23 @@ def get_sentence_length_hist(show_medians=False):
         highlight
     )
-    # Vertical lines corresponding to each level
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
-        strokeDash = [10, 2], # first arg is length, second is gap
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median average sentence length:'),
             alt.Tooltip('level:N', title='Level:')
         ],
-        #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
         color=alt.Color(
             'level:N',
-            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),  # Use the same color scale as the histogram
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-            legend=None  # No legend for lines, it is already shown in the histogram
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
@@ -507,22 +454,22 @@ def get_sentence_length_hist(show_medians=False):
     )
     text_labels = alt.Chart(line_data).mark_text(
-        align='center',  # Align text to the left of the line
-        dx=0,  # Offset the text to the right by 5 pixels
-        dy=-10, # Adjust vertical positioning
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
-        y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
-        text=alt.Text('x:Q', format='.2f'),  # Display the x value, formatted as an integer
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
     )
     if show_medians:
@@ -560,14 +507,8 @@ def get_repetition_hist(show_medians=False):
     video_df['average_rel_reps_perc'] = 100.0 * video_df['average_rel_reps']
-    #if show_medians:
-    #    sub_video_df = video_df[video_df['average_rel_reps_perc'] <= 2.0]
-    #else:
-    #    sub_video_df = video_df
-    # take the sub data frame for easier viewing
     sub_video_df = video_df[video_df['average_rel_reps_perc'] <= 2.0]
-    # Data for vertical lines corresponding to each level
     line_data = pd.DataFrame({
         'x': [0.99, 0.62, 0.37, 0.23],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -593,12 +534,9 @@ def get_repetition_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
-                #format='.1f%'
             ),
         ),
         alt.Y(
@@ -607,10 +545,8 @@ def get_repetition_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
                 tickCount=5
             ),
@@ -622,11 +558,9 @@ def get_repetition_hist(show_medians=False):
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
-                #titleFont='Urbanist',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
-                #labelFont='Urbanist',
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
@@ -638,24 +572,18 @@ def get_repetition_hist(show_medians=False):
             )
         ),
         tooltip=[
-            alt.Tooltip('average_rel_reps:Q', title='Average relative repetitions:', bin=True),  # Properly indicate that `wpm` is binned
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
-        #width=750,
         width='container',
-        #height='container',
         height=500,
-        #background='beige',
-        #padding=50,
         title=alt.TitleParams(
             text='Relative repetitions of words',
             offset=20,
-            #subtitle='(clickable)',
-            #font='Urbanist',
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
@@ -668,11 +596,10 @@ def get_repetition_hist(show_medians=False):
         highlight
     )
-    # Vertical lines corresponding to each level
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
-        strokeDash = [10, 2], # first arg is length, second is gap
     ).encode(
         alt.X(
             'x:Q'
@@ -681,14 +608,13 @@ def get_repetition_hist(show_medians=False):
             alt.Tooltip('x:N', title='Median average relative repetitions:'),
             alt.Tooltip('level:N', title='Level:')
         ],
-        #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
         color=alt.Color(
             'level:N',
-            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),  # Use the same color scale as the histogram
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-            legend=None  # No legend for lines, it is already shown in the histogram
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1)),
     ).add_params(
         selection,
@@ -696,24 +622,24 @@ def get_repetition_hist(show_medians=False):
     )
     text_labels = alt.Chart(line_data).mark_text(
-        align='center',  # Align text to the left of the line
-        dx=0,  # Offset the text to the right by 5 pixels
-        dy=-10, # Adjust vertical positioning
         fontSize=16,
         fontWeight='bold'
     ).encode(
         alt.X(
             'x:Q'
         ),
-        y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
-        text=alt.Text('x:Q', format='.2f'),  # Display the x value, formatted as an integer
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
     )
     if show_medians:
@@ -756,8 +682,6 @@ st.markdown("If we take all the words in CIJ, count them then order them from mo
             For example, if we learn the top 500 words from CIJ, then we'll know around 80% of the words in the \
             Complete Beginner videos. And if we learn the top 4,295 words, then we'll know 98% of the words in that category.")
-# word coverage chart
 @st.cache_data
 def get_word_coverage_chart(zoom=False):
@@ -766,7 +690,6 @@ def get_word_coverage_chart(zoom=False):
     else:
         word_coverage_df_sub = word_coverage_df
-    # Data for vertical lines corresponding to each level
     line_data = pd.DataFrame({
         'x': [4295, 5606, 6853, 9085],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -788,10 +711,8 @@ def get_word_coverage_chart(zoom=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20
             )
         ),
@@ -802,10 +723,8 @@ def get_word_coverage_chart(zoom=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
                 tickCount=5
             ),
@@ -821,10 +740,8 @@ def get_word_coverage_chart(zoom=False):
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
-                #symbolStrokeWidth=3,
                 orient='right',
                 direction='vertical',
-                #fillColor='black',
                 padding=10,
                 cornerRadius=5,
             )
@@ -843,8 +760,6 @@ def get_word_coverage_chart(zoom=False):
         title=alt.TitleParams(
             text='Word coverage curves',
             offset=20,
-            #subtitle='(clickable)',
-            #font='Urbanist',
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
@@ -857,48 +772,46 @@ def get_word_coverage_chart(zoom=False):
         highlight
     )
-    # Vertical lines corresponding to each level
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=4,
-        strokeDash = [10, 2], # first arg is length, second is gap
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Words needed to reach 98%:'),
             alt.Tooltip('level:N', title='Level:')
         ],
-        #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
         color=alt.Color(
             'level:N',
-            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),  # Use the same color scale as the histogram
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-            legend=None  # No legend for lines, it is already shown in the histogram
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
         highlight
-    )#.interactive()
     text_labels = alt.Chart(line_data).mark_text(
-        align='center',  # Align text to the left of the line
-        dx=0,  # Offset the text to the right by 5 pixels
-        dy=-10, # Adjust vertical positioning
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
-        y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
-        text=alt.Text('x:Q', format='.0f'),  # Display the x value, formatted as an integer
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
     )
     layered_chart = alt.layer(line_chart, vertical_lines, text_labels, background='white')
@@ -922,7 +835,6 @@ st.markdown("Using the same method of calculating word coverage as before, \
 @st.cache_data
 def get_ne_spot_hist(show_medians=False):
-    # Data for vertical lines corresponding to each level
     line_data = pd.DataFrame({
         'x': [3859, 5229, 6698, 7925],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -948,12 +860,9 @@ def get_ne_spot_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
-                #format='.1f%'
             )
         ),
         alt.Y(
@@ -962,10 +871,8 @@ def get_ne_spot_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
                 tickCount=5
             ),
@@ -977,11 +884,9 @@ def get_ne_spot_hist(show_medians=False):
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
-                #titleFont='Urbanist',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
-                #labelFont='Urbanist',
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
@@ -993,24 +898,18 @@ def get_ne_spot_hist(show_medians=False):
             )
         ),
         tooltip=[
-            alt.Tooltip('ne_spot:Q', title='Vocab size needed for 98% cov:', bin=True),  # Properly indicate that `wpm` is binned
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
-        #width=750,
         width='container',
-        #height='container',
         height=500,
-        #background='beige',
-        #padding=50,
         title=alt.TitleParams(
             text='Vocab size needed for 98% coverage',
             offset=20,
-            #subtitle='(clickable)',
-            #font='Urbanist',
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
@@ -1023,25 +922,23 @@ def get_ne_spot_hist(show_medians=False):
         highlight
     )
-    # Vertical lines corresponding to each level
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
-        strokeDash = [10, 2], # first arg is length, second is gap
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median vocab size needed for 98% cov:'),
             alt.Tooltip('level:N', title='Level:')
         ],
-        #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
         color=alt.Color(
             'level:N',
-            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),  # Use the same color scale as the histogram
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-            legend=None  # No legend for lines, it is already shown in the histogram
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
@@ -1049,22 +946,22 @@ def get_ne_spot_hist(show_medians=False):
     )
     text_labels = alt.Chart(line_data).mark_text(
-        align='center',  # Align text to the left of the line
-        dx=0,  # Offset the text to the right by 5 pixels
-        dy=-10, # Adjust vertical positioning
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
-        y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
-        text=alt.Text('x:Q', format='.0f'),  # Display the x value, formatted as an integer
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
     )
@@ -1097,7 +994,6 @@ st.markdown("More advanced videos tend to use rare/uncommon words more often tha
 @st.cache_data
 def get_tfplr_hist(show_medians=False):
-    # Data for vertical lines corresponding to each level
     line_data = pd.DataFrame({
         'x': [3.82, 4.30, 4.76, 5.21],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -1123,12 +1019,9 @@ def get_tfplr_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=30,
-                #format='.1f%'
             )
         ),
         alt.Y(
@@ -1137,10 +1030,8 @@ def get_tfplr_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
                 tickCount=5
             ),
@@ -1152,11 +1043,9 @@ def get_tfplr_hist(show_medians=False):
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
-                #titleFont='Urbanist',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
-                #labelFont='Urbanist',
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
@@ -1175,17 +1064,11 @@ def get_tfplr_hist(show_medians=False):
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
-        #width=750,
         width='container',
-        #height='container',
         height=500,
-        #background='beige',
-        #padding=50,
         title=alt.TitleParams(
             text='25th percentile word-frequency log ranks',
             offset=20,
-            #subtitle='(clickable)',
-            #font='Urbanist',
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
@@ -1198,25 +1081,23 @@ def get_tfplr_hist(show_medians=False):
         highlight
     )
-    # Vertical lines corresponding to each level
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
-        strokeDash = [10, 2], # first arg is length, second is gap
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median 25th percentile word-frequency log rank:'),
             alt.Tooltip('level:N', title='Level:')
         ],
-        #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
         color=alt.Color(
             'level:N',
-            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),  # Use the same color scale as the histogram
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-            legend=None  # No legend for lines, it is already shown in the histogram
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
@@ -1224,25 +1105,24 @@ def get_tfplr_hist(show_medians=False):
     )
     text_labels = alt.Chart(line_data).mark_text(
-        align='center',  # Align text to the left of the line
-        dx=0,  # Offset the text to the right by 5 pixels
-        dy=-10, # Adjust vertical positioning
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
-        y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
-        text=alt.Text('x:Q', format='.2f'),  # Display the x value, formatted as an integer
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
     )
-    #layered_chart = alt.layer(histogram, background='white')
     if show_medians:
         layered_chart = alt.layer(histogram, vertical_lines, text_labels, background='white')
     else:
@@ -1274,8 +1154,6 @@ st.markdown("(It's okay if the above didn't quite make sense to you - just know
             demonstrates that easier videos tend to use more common words whereas \
             advanced videos tend to use more rare words!)")
-# grammar table
 ###
 # GRAMMAR
 ###
@@ -1288,7 +1166,6 @@ def get_sconj_hist(show_medians=False):
     video_df['sconj_props_perc'] = 100.0 * video_df['sconj_props']
-    # Data for vertical lines corresponding to each level
     line_data = pd.DataFrame({
         'x': [2.64, 4.73, 6.63, 7.67],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -1314,12 +1191,9 @@ def get_sconj_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=30,
-                #format='.1f%'
             )
         ),
         alt.Y(
@@ -1328,10 +1202,8 @@ def get_sconj_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
                 tickCount=5
             ),
@@ -1343,11 +1215,9 @@ def get_sconj_hist(show_medians=False):
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
-                #titleFont='Urbanist',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
-                #labelFont='Urbanist',
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
@@ -1359,24 +1229,18 @@ def get_sconj_hist(show_medians=False):
             )
         ),
         tooltip=[
-            alt.Tooltip('sconj_props_perc:Q', title='Percentage of subordinating conjunctions:', bin=True),  # Properly indicate that `sconj_props_perc` is binned
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
-        #width=750,
         width='container',
-        #height='container',
         height=500,
-        #background='beige',
-        #padding=50,
         title=alt.TitleParams(
             text='Percentages of subordinating conjunctions',
             offset=20,
-            #subtitle='(clickable)',
-            #font='Urbanist',
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
@@ -1389,25 +1253,23 @@ def get_sconj_hist(show_medians=False):
         highlight
     )
-    # Vertical lines corresponding to each level
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
-        strokeDash = [10, 2], # first arg is length, second is gap
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median percentage of subordinating conjunctions:'),
             alt.Tooltip('level:N', title='Level:')
         ],
-        #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
         color=alt.Color(
             'level:N',
-            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),  # Use the same color scale as the histogram
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-            legend=None  # No legend for lines, it is already shown in the histogram
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
@@ -1415,22 +1277,22 @@ def get_sconj_hist(show_medians=False):
     )
     text_labels = alt.Chart(line_data).mark_text(
-        align='center',  # Center the text horizontally on the line
-        dx=0,  # No horizontal offset from the line
-        dy=-10, # Shift the text up by 10 pixels
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
-        y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
-        text=alt.Text('x:Q', format='.2f'),  # Display the x value, formatted to two decimal places
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
     )
@@ -1464,7 +1326,6 @@ df = pd.DataFrame(data)
 row_labels = ['Median Perc. Subordinating Conjunctions', 'Median Perc. Adverbs', 'Median Perc. Determiners', 'Median Perc. Nouns']
 df.index = row_labels
-# Apply header-specific styling using set_table_styles
 styled_df = df.style.set_table_styles(
     {
         'Complete Beginner': [
@@ -1482,14 +1343,9 @@ styled_df = df.style.set_table_styles(
         'Advanced': [
             {'selector': 'th.col_heading.level0', 'props': [('background-color', 'rgba(221, 158, 158, 0.45)')]},
             {'selector': 'td:hover', 'props': [('background-color', '#e0f7fa')]}
-        ],
-        # This is where we target the top-left index column header
-        '': [
-            {'selector': '.index_name', 'props': [('color', 'green'), ('font-weight', 'bold')]}
         ]
 }).set_properties(**{'background-color': 'white'}).format("{:.2%}")
-# Inject CSS to ensure the background is white in the markdown section
 st.markdown(
     """
     <style>
@@ -1500,7 +1356,6 @@ st.markdown(
     """, unsafe_allow_html=True
 )
-# Display the styled DataFrame
 st.markdown(
     '<div class="dataframe-divv">' + styled_df.to_html() + "</div>"
     , unsafe_allow_html=True)
@@ -1521,7 +1376,6 @@ def get_kango_hist(show_medians=False):
     video_df['kan_props_perc'] = 100.0 * video_df['kan_props']
-    # Data for vertical lines corresponding to each level
     line_data = pd.DataFrame({
         'x': [7.00, 9.55, 11.66, 13.03],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
@@ -1547,12 +1401,9 @@ def get_kango_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=30,
-                #format='.1f%'
             )
         ),
         alt.Y(
@@ -1561,10 +1412,8 @@ def get_kango_hist(show_medians=False):
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
-                #titleFont='Urbanist',
                 titleColor='black',
                 titleFontWeight='normal',
-                #titleFontStyle='italic',
                 titlePadding=20,
                 tickCount=5
             ),
@@ -1576,11 +1425,9 @@ def get_kango_hist(show_medians=False):
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
-                #titleFont='Urbanist',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
-                #labelFont='Urbanist',
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
@@ -1592,24 +1439,18 @@ def get_kango_hist(show_medians=False):
             )
         ),
         tooltip=[
-            alt.Tooltip('kan_props_perc:Q', title='Percentage of kango:', bin=True),  # Properly indicate that `kan_props_perc` is binned
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
-        #width=750,
         width='container',
-        #height='container',
         height=500,
-        #background='beige',
-        #padding=50,
         title=alt.TitleParams(
             text='Percentages of kango (漢語)',
             offset=20,
-            #subtitle='(clickable)',
-            #font='Urbanist',
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
@@ -1622,25 +1463,23 @@ def get_kango_hist(show_medians=False):
         highlight
     )
-    # Vertical lines corresponding to each level
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
-        strokeDash = [10, 2], # first arg is length, second is gap
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median percentage of kango:'),
             alt.Tooltip('level:N', title='Level:')
         ],
-        #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
         color=alt.Color(
             'level:N',
-            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),  # Use the same color scale as the histogram
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-            legend=None  # No legend for lines, it is already shown in the histogram
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
@@ -1648,22 +1487,22 @@ def get_kango_hist(show_medians=False):
     )
     text_labels = alt.Chart(line_data).mark_text(
-        align='center',  # Center the text horizontally on the line
-        dx=0,  # No horizontal offset from the line
-        dy=-10, # Shift the text up by 10 pixels
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
-        y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
-        text=alt.Text('x:Q', format='.0f'),  # Display the x value, formatted as an integer
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
-        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
     )
     if show_medians:
@@ -1688,8 +1527,6 @@ st.markdown("In Japanese, Kango are somewhat analogous to French words in Englis
 st.markdown("We also notice orderings when counting the percentage of Wago and Gairaigo as well.")
-# word origin table
 data = {
     'Complete Beginner': [0.06999874574159035, 0.8578043261266064, 0.03301790801790795],
     'Beginner': [0.0955284552845528, 0.8399311531841652, 0.0279441117764471],
@@ -1701,7 +1538,6 @@ df = pd.DataFrame(data)
 row_labels = ['Median Perc. Kango (漢語)', 'Median Perc. Wago (和語)', 'Median Perc. Gairaigo (外来語)']
 df.index = row_labels
-# Apply header-specific styling using set_table_styles
 styled_df = df.style.set_table_styles(
     {
         'Complete Beginner': [
@@ -1722,13 +1558,13 @@ styled_df = df.style.set_table_styles(
         ],
 }).set_properties(**{'background-color': 'white'}).format("{:.2%}")
-# Display the styled DataFrame
 st.markdown(
     '<div class="dataframe-divv">' + styled_df.to_html() + "</div>"
     , unsafe_allow_html=True)
-# heatmap
 st.markdown("## Which factors matter the most?")
 st.markdown("We've just found a number of statistics that lead to orderings in the data \
@@ -1740,24 +1576,17 @@ st.markdown("To answer this, we can look at a correlation heatmap between each o
 @st.cache_data
 def render_vanilla_heatmap():
-    # Compute the correlation matrix
     corr_matrix = num_video_df.corr()
-    # Specify the variable of interest (e.g., 'target_variable')
     variable_of_interest = 'Level'
-    # Sort the variables based on correlation with the variable of interest
     sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
-    # Reorder the correlation matrix
     sorted_corr_matrix = corr_matrix.loc[sorted_vars, sorted_vars]
-    # Create a heatmap using seaborn with the sorted correlation matrix
     plt.figure(figsize=(10, 8))
     sns.heatmap(sorted_corr_matrix, annot=True, cmap='coolwarm', fmt=".3f")
-    # Display the heatmap
-    #plt.show()
     st.pyplot(plt.gcf())
 render_vanilla_heatmap()
@@ -1774,59 +1603,41 @@ st.markdown("Using a statistics rule of thumb and removing all variables that ha
 @st.cache_data
 def render_level_row_unordered():
-    # Compute the correlation matrix
     corr_matrix = num_video_df.drop(['Proportion of determiners', 'Proportion of nouns', 'Proportion of wago', 'Proportion of gairaigo'], axis=1).corr()
-    # Specify the variable of interest (e.g., 'Level')
     variable_of_interest = 'Level'
-    # Sort the variables based on correlation with the variable of interest
     sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
-    # Remove 'Level' from the sorted variables to exclude the self-correlation
     sorted_vars = sorted_vars.drop(variable_of_interest)
-    # Reorder the correlation matrix and exclude 'Level' column from the first row
     first_row_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
-    # Create a heatmap using seaborn with the single row of the correlation matrix
-    plt.figure(figsize=(10, 1))  # Adjust the figure size to make it more appropriate for a single row
     sns.heatmap(first_row_matrix, annot=True, cmap='coolwarm', fmt=".3f", cbar_kws={'label': 'Correlation'})
-    # Display the heatmap
-    #plt.show()
     st.pyplot(plt.gcf())
 @st.cache_data
 def render_level_col_ordered():
-    # Compute the correlation matrix
     corr_matrix = num_video_df.drop(['Proportion of determiners', 'Proportion of nouns', 'Proportion of wago', 'Proportion of gairaigo'], axis=1).corr()
-    # Specify the variable of interest (e.g., 'Level')
     variable_of_interest = 'Level'
-    # Get the correlations of the variable of interest
     correlations = corr_matrix[variable_of_interest]
-    # Sort the variables based on the absolute value of the correlation with the variable of interest
     sorted_vars = correlations.abs().sort_values(ascending=False).index
-    # Remove 'Level' from the sorted variables (to exclude the self-correlation)
     sorted_vars = sorted_vars.drop(variable_of_interest)
-    # Reorder the correlation matrix, excluding the self-correlation
     sorted_corr_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
-    # Transpose the matrix to make it vertical
     transposed_corr_matrix = sorted_corr_matrix.T
-    # Create a heatmap using seaborn with the transposed correlation matrix
-    plt.figure(figsize=(2, 3))  # Adjust the figure size to make it more appropriate for a vertical layout
     sns.heatmap(transposed_corr_matrix, annot=True, cmap='coolwarm', fmt=".3f", cbar_kws={'label': 'Correlation'})
-    # Display the heatmap
-    #plt.show()
     st.pyplot(plt.gcf())
 if st.checkbox('Flip and sort'):
@@ -1848,23 +1659,4 @@ st.markdown("8. Amount of Chinese words")
 st.markdown("### Thanks for reading ✌️")
-st.markdown("---")
-#st.markdown("In the unlikely chance that you happen to be a CI instructor or a CI content creator, I want to talk to you! \
-#            I can be reached at hamiltonjoshuadavid@gmail.com and I'm interested in learning \
-#            more about what you do. Please also add a link to your work if you decide to reach out.")
-#st.markdown("Special thanks to [CIJ](https://cijapanese.com/). I'm a happy subscriber and I recommend you also pick up a \
-#             a membership if you're a Japanese learner!")
-#st.markdown("---")
-#st.markdown("**Some extra notes:**")
-#st.markdown("1. No statistical tests of significance were conducted. This was just meant to be a light and unrigorous EDA.")
-#st.markdown("2. It should be noted that the levels of the videos were determined by experts, and not by learners. They do not reflect objective difficulty.")
-#st.markdown("3. While I stated that Japanese learners tend to speak at rates of over 200 wpm, I unfortunately haven't been able to find any good sources on this. \
-#            The actual average Japanese WPM is likely even higher than 200 wpm, but unfortunately I haven't found any good research on this.")
-#st.markdown("4. Technically, I didn't actually compute syllables per second, but rather moras per second which served as an approximation for syllables. \
-#            I understand that this is linguistically incorrect, but I didn't want to confuse the reader who might not know any Japanese or linguistics.")
-#st.markdown("5. More data cleaning could've been done to create better frequency lists, however, this was unnecessary in order to establish statistical patterns in a one-off analysis.")
-#st.markdown("6. As a disclaimer, I do not think that CI instructors should base how they create their content off of the findings in this analysis. \
-#            They should only use these findings for inspiration and to get them thinking more analytically about what they're doing.")

     page_icon='favicon.svg'
 )
 @st.cache_data
 def load_dataframes():
             [cijapanese.com](https://cijapanese.com/) (CIJ), a \
             video platform for learning Japanese.")
+###
+# RATE OF SPEECH
+###
 st.markdown("## How fast is CI?")
 st.markdown("If we measure how fast the teachers speak on CIJ, we find that \
             they speak more slowly in videos meant for beginners and more quickly \
             for advanced learners.")
 @st.cache_data
 def get_wpm_chart(show_medians=False):
     line_data = pd.DataFrame({
         'x': [75, 91, 124, 149],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20
             )
         ),
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
                 tickCount=5
             ),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
             )
         ),
         tooltip=[
+            alt.Tooltip('wpm:Q', title='Words per minute:', bin=True),
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
         height=500,
         title=alt.TitleParams(
             text='Rate of speech in words per minute (WPM)',
             offset=20,
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
         highlight
     )
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
+        strokeDash = [10, 2],
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median WPM:'),
             alt.Tooltip('level:N', title='Level:')
         ],
         color=alt.Color(
             'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
     )
     text_labels = alt.Chart(line_data).mark_text(
+        align='center',
+        dx=0,
+        dy=-10,
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
+        y=alt.value(0),
+        text=alt.Text('x:Q', format='.0f'),
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
     )
             tend to speak at rates of over 200 wpm, meaning that most of the videos \
             on CIJ have been adapted to be a lot slower than that!")
 @st.cache_data
 def get_wpm_vs_sps_chart(interactive=False):
     highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)
     scatter_plot = alt.Chart(video_df).mark_circle(
         cursor='pointer',
         size=80,
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20
             )
         ),
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
             ),
         ),
         color=alt.Color(
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
                 orient='right',
                 direction='vertical',
                 padding=10,
                 cornerRadius=5,
             )
         ],
         opacity=alt.condition(selection, alt.value(1.0), alt.value(0.2)),
     ).properties(
         width='container',
         height=500,
         title=alt.TitleParams(
             text='Rate of speech: Syllables per second vs. words per minute',
             offset=20,
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
         background='white'
     )
     if interactive:
         return scatter_plot.interactive()
     else:
 st.markdown("(Also, FYI, most of these **graphs are \
             interactive** so please click around.)")
+###
+# STATISTICS LESSON
+###
 st.markdown("## A quick statistics lesson")
 st.markdown("Before we continue this analysis, there's some basic things you should know.")
 @st.cache_data
 def get_sentence_length_hist(show_medians=False):
     line_data = pd.DataFrame({
         'x': [7.60, 10.45, 16.17, 19.39],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20
             )
         ),
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
                 tickCount=5
             ),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
             )
         ),
         tooltip=[
+            alt.Tooltip('mean_sentence_length:Q', title='Average sentence length:', bin=True),
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
         width='container',
         height=500,
         title=alt.TitleParams(
             text='Average number of words per sentence (sentence length)',
             offset=20,
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
         highlight
     )
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
+        strokeDash = [10, 2],
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median average sentence length:'),
             alt.Tooltip('level:N', title='Level:')
         ],
         color=alt.Color(
             'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
     )
     text_labels = alt.Chart(line_data).mark_text(
+        align='center',
+        dx=0,
+        dy=-10,
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
+        y=alt.value(0),
+        text=alt.Text('x:Q', format='.2f'),
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
     )
     if show_medians:
     video_df['average_rel_reps_perc'] = 100.0 * video_df['average_rel_reps']
     sub_video_df = video_df[video_df['average_rel_reps_perc'] <= 2.0]
     line_data = pd.DataFrame({
         'x': [0.99, 0.62, 0.37, 0.23],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
             ),
         ),
         alt.Y(
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
                 tickCount=5
             ),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
             )
         ),
         tooltip=[
+            alt.Tooltip('average_rel_reps:Q', title='Average relative repetitions:', bin=True),
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
         width='container',
         height=500,
         title=alt.TitleParams(
             text='Relative repetitions of words',
             offset=20,
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
         highlight
     )
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
+        strokeDash = [10, 2],
     ).encode(
         alt.X(
             'x:Q'
             alt.Tooltip('x:N', title='Median average relative repetitions:'),
             alt.Tooltip('level:N', title='Level:')
         ],
         color=alt.Color(
             'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1)),
     ).add_params(
         selection,
     )
     text_labels = alt.Chart(line_data).mark_text(
+        align='center',
+        dx=0,
+        dy=-10,
         fontSize=16,
         fontWeight='bold'
     ).encode(
         alt.X(
             'x:Q'
         ),
+        y=alt.value(0),
+        text=alt.Text('x:Q', format='.2f'),
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
     )
     if show_medians:
             For example, if we learn the top 500 words from CIJ, then we'll know around 80% of the words in the \
             Complete Beginner videos. And if we learn the top 4,295 words, then we'll know 98% of the words in that category.")
 @st.cache_data
 def get_word_coverage_chart(zoom=False):
     else:
         word_coverage_df_sub = word_coverage_df
     line_data = pd.DataFrame({
         'x': [4295, 5606, 6853, 9085],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20
             )
         ),
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
                 tickCount=5
             ),
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
                 orient='right',
                 direction='vertical',
                 padding=10,
                 cornerRadius=5,
             )
         title=alt.TitleParams(
             text='Word coverage curves',
             offset=20,
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
         highlight
     )
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=4,
+        strokeDash = [10, 2],
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Words needed to reach 98%:'),
             alt.Tooltip('level:N', title='Level:')
         ],
         color=alt.Color(
             'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
         highlight
+    )
     text_labels = alt.Chart(line_data).mark_text(
+        align='center',
+        dx=0,
+        dy=-10,
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
+        y=alt.value(0),
+        text=alt.Text('x:Q', format='.0f'),
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
     )
     layered_chart = alt.layer(line_chart, vertical_lines, text_labels, background='white')
 @st.cache_data
 def get_ne_spot_hist(show_medians=False):
     line_data = pd.DataFrame({
         'x': [3859, 5229, 6698, 7925],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
             )
         ),
         alt.Y(
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
                 tickCount=5
             ),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
             )
         ),
         tooltip=[
+            alt.Tooltip('ne_spot:Q', title='Vocab size needed for 98% cov:', bin=True),
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
         width='container',
         height=500,
         title=alt.TitleParams(
             text='Vocab size needed for 98% coverage',
             offset=20,
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
         highlight
     )
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
+        strokeDash = [10, 2],
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median vocab size needed for 98% cov:'),
             alt.Tooltip('level:N', title='Level:')
         ],
         color=alt.Color(
             'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
     )
     text_labels = alt.Chart(line_data).mark_text(
+        align='center',
+        dx=0,
+        dy=-10,
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
+        y=alt.value(0),
+        text=alt.Text('x:Q', format='.0f'),
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
     )
 @st.cache_data
 def get_tfplr_hist(show_medians=False):
     line_data = pd.DataFrame({
         'x': [3.82, 4.30, 4.76, 5.21],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=30,
             )
         ),
         alt.Y(
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
                 tickCount=5
             ),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
         width='container',
         height=500,
         title=alt.TitleParams(
             text='25th percentile word-frequency log ranks',
             offset=20,
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
         highlight
     )
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
+        strokeDash = [10, 2],
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median 25th percentile word-frequency log rank:'),
             alt.Tooltip('level:N', title='Level:')
         ],
         color=alt.Color(
             'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
     )
     text_labels = alt.Chart(line_data).mark_text(
+        align='center',
+        dx=0,
+        dy=-10,
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
+        y=alt.value(0),
+        text=alt.Text('x:Q', format='.2f'),
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
     )
     if show_medians:
         layered_chart = alt.layer(histogram, vertical_lines, text_labels, background='white')
     else:
             demonstrates that easier videos tend to use more common words whereas \
             advanced videos tend to use more rare words!)")
 ###
 # GRAMMAR
 ###
     video_df['sconj_props_perc'] = 100.0 * video_df['sconj_props']
     line_data = pd.DataFrame({
         'x': [2.64, 4.73, 6.63, 7.67],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=30,
             )
         ),
         alt.Y(
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
                 tickCount=5
             ),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
             )
         ),
         tooltip=[
+            alt.Tooltip('sconj_props_perc:Q', title='Percentage of subordinating conjunctions:', bin=True),
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
         width='container',
         height=500,
         title=alt.TitleParams(
             text='Percentages of subordinating conjunctions',
             offset=20,
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
         highlight
     )
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
+        strokeDash = [10, 2],
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median percentage of subordinating conjunctions:'),
             alt.Tooltip('level:N', title='Level:')
         ],
         color=alt.Color(
             'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
     )
     text_labels = alt.Chart(line_data).mark_text(
+        align='center',
+        dx=0,
+        dy=-10,
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
+        y=alt.value(0),
+        text=alt.Text('x:Q', format='.2f'),
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
     )
 row_labels = ['Median Perc. Subordinating Conjunctions', 'Median Perc. Adverbs', 'Median Perc. Determiners', 'Median Perc. Nouns']
 df.index = row_labels
 styled_df = df.style.set_table_styles(
     {
         'Complete Beginner': [
         'Advanced': [
             {'selector': 'th.col_heading.level0', 'props': [('background-color', 'rgba(221, 158, 158, 0.45)')]},
             {'selector': 'td:hover', 'props': [('background-color', '#e0f7fa')]}
         ]
 }).set_properties(**{'background-color': 'white'}).format("{:.2%}")
 st.markdown(
     """
     <style>
     """, unsafe_allow_html=True
 )
 st.markdown(
     '<div class="dataframe-divv">' + styled_df.to_html() + "</div>"
     , unsafe_allow_html=True)
     video_df['kan_props_perc'] = 100.0 * video_df['kan_props']
     line_data = pd.DataFrame({
         'x': [7.00, 9.55, 11.66, 13.03],
         'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=30,
             )
         ),
         alt.Y(
             axis=alt.Axis(
                 labelFontSize=14,
                 titleFontSize=18,
                 titleColor='black',
                 titleFontWeight='normal',
                 titlePadding=20,
                 tickCount=5
             ),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=alt.Legend(
                 title='CIJ Level',
                 titleFontSize=18,
                 titleFontWeight='bolder',
                 labelFontSize=16,
                 symbolType='circle',
                 symbolSize=200,
                 symbolStrokeWidth=0,
             )
         ),
         tooltip=[
+            alt.Tooltip('kan_props_perc:Q', title='Percentage of kango:', bin=True),
             alt.Tooltip('level:N', title='Level:'),
             alt.Tooltip('count()', title='Video count:')
         ],
         opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
     ).properties(
         width='container',
         height=500,
         title=alt.TitleParams(
             text='Percentages of kango (漢語)',
             offset=20,
             fontSize=24,
             fontWeight='normal',
             anchor='middle',
         highlight
     )
     vertical_lines = alt.Chart(line_data).mark_rule(
         color='red',
         strokeWidth=6,
+        strokeDash = [10, 2],
     ).encode(
         x='x:Q',
         tooltip=[
             alt.Tooltip('x:N', title='Median percentage of kango:'),
             alt.Tooltip('level:N', title='Level:')
         ],
         color=alt.Color(
             'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None
         ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
         strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
     ).add_params(
         selection,
     )
     # Numeric labels for the per-level median lines. y=alt.value(0) pins
     # them to the top edge of the plot and dy=-10 lifts them just above it.
     text_labels = alt.Chart(line_data).mark_text(
         align='center',
         dx=0,
         dy=-10,
         fontSize=16,
         fontWeight='bold'
     ).encode(
         x='x:Q',
         y=alt.value(0),
         # The medians are percentages with two decimals (e.g. 9.55); '.0f'
         # would round them to whole numbers and show 9.55 as "10".
         text=alt.Text('x:Q', format='.2f'),
         color=alt.Color(
             'level:N',
             scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
             sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
             legend=None
         ),
         # Fade the label together with its level when that level is deselected.
         opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
     )
     if show_medians:
 st.markdown("We also notice orderings when counting the percentage of Wago and Gairaigo as well.")
 data = {
     'Complete Beginner': [0.06999874574159035, 0.8578043261266064, 0.03301790801790795],
     'Beginner': [0.0955284552845528, 0.8399311531841652, 0.0279441117764471],
 # Row labels for the word-origin summary table ("Gairaigo" is the
 # romanization of 外来語; it was previously misspelled "Garaigo").
 row_labels = ['Median Perc. Kango (漢語)', 'Median Perc. Wago (和語)', 'Median Perc. Gairaigo (外来語)']
 df.index = row_labels
 styled_df = df.style.set_table_styles(
     {
         'Complete Beginner': [
         ],
 }).set_properties(**{'background-color': 'white'}).format("{:.2%}")
 st.markdown(
     '<div class="dataframe-divv">' + styled_df.to_html() + "</div>"
     , unsafe_allow_html=True)
+###
+# MOST IMPORTANT FACTORS
+###
 st.markdown("## Which factors matter the most?")
 st.markdown("We've just found a number of statistics that lead to orderings in the data \
 @st.cache_data
 def render_vanilla_heatmap():
     """Render the full correlation heatmap of ``num_video_df``.

     Rows and columns are ordered by their correlation with 'Level',
     from most positive to most negative, and every cell is annotated
     with its coefficient to three decimals.
     """
     variable_of_interest = 'Level'
     corr_matrix = num_video_df.corr()
     # Use the same ordering on both axes so the matrix stays symmetric.
     sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
     sorted_corr_matrix = corr_matrix.loc[sorted_vars, sorted_vars]
     # Draw on an explicit figure rather than pyplot's implicit "current
     # figure", and close it once Streamlit has rendered it so it does not
     # stay registered in pyplot's global state.
     fig, ax = plt.subplots(figsize=(10, 8))
     try:
         sns.heatmap(sorted_corr_matrix, annot=True, cmap='coolwarm', fmt=".3f", ax=ax)
         st.pyplot(fig)
     finally:
         plt.close(fig)
 render_vanilla_heatmap()
 @st.cache_data
 def render_level_row_unordered():
     """Render the 'Level' row of the correlation matrix as a one-row heatmap.

     Four proportion columns (determiners, nouns, wago, gairaigo) are
     dropped before the correlations are computed. The remaining variables
     are laid out left to right by their signed correlation with 'Level',
     highest first; 'Level' itself is left out.
     """
     variable_of_interest = 'Level'
     corr_matrix = num_video_df.drop(['Proportion of determiners', 'Proportion of nouns', 'Proportion of wago', 'Proportion of gairaigo'], axis=1).corr()
     sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
     # The correlation of 'Level' with itself is always 1, so it is omitted.
     sorted_vars = sorted_vars.drop(variable_of_interest)
     first_row_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
     # Draw on an explicit figure rather than pyplot's implicit "current
     # figure", and close it once Streamlit has rendered it so it does not
     # stay registered in pyplot's global state.
     fig, ax = plt.subplots(figsize=(10, 1))
     try:
         sns.heatmap(first_row_matrix, annot=True, cmap='coolwarm', fmt=".3f", cbar_kws={'label': 'Correlation'}, ax=ax)
         st.pyplot(fig)
     finally:
         plt.close(fig)
 @st.cache_data
 def render_level_col_ordered():
     """Render correlations with 'Level' as a one-column heatmap.

     Four proportion columns (determiners, nouns, wago, gairaigo) are
     dropped before the correlations are computed. The remaining variables
     are listed top to bottom by the absolute value of their correlation
     with 'Level', strongest first; 'Level' itself is left out.
     """
     variable_of_interest = 'Level'
     corr_matrix = num_video_df.drop(['Proportion of determiners', 'Proportion of nouns', 'Proportion of wago', 'Proportion of gairaigo'], axis=1).corr()
     correlations = corr_matrix[variable_of_interest]
     # Rank by magnitude so strong negative correlations sort next to
     # strong positive ones.
     sorted_vars = correlations.abs().sort_values(ascending=False).index
     sorted_vars = sorted_vars.drop(variable_of_interest)
     sorted_corr_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
     # Transpose the single 'Level' row into a column.
     transposed_corr_matrix = sorted_corr_matrix.T
     # Draw on an explicit figure rather than pyplot's implicit "current
     # figure", and close it once Streamlit has rendered it so it does not
     # stay registered in pyplot's global state.
     fig, ax = plt.subplots(figsize=(2, 3))
     try:
         sns.heatmap(transposed_corr_matrix, annot=True, cmap='coolwarm', fmt=".3f", cbar_kws={'label': 'Correlation'}, ax=ax)
         st.pyplot(fig)
     finally:
         plt.close(fig)
 if st.checkbox('Flip and sort'):
 st.markdown("### Thanks for reading ✌️")
+st.markdown("---")