"""Streamlit dashboard for learning-indicator (LI) analysis.

Ingests a question-metadata CSV and a student-marks CSV, then renders:
class-level LI remediation priorities, LI mastery scores, ridge plots of
score distributions, balanced remediation groups, and per-student LI
rankings.
"""

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import zscore
import streamlit.components.v1 as components

st.set_page_config(layout="wide")


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Drop unnamed columns and rows that are entirely NaN."""
    # Remove columns with no name (blank / whitespace-only headers)
    df = df.loc[:, df.columns.str.strip() != '']
    # Drop rows where all values are NaN
    df = df.dropna(how='all')
    return df


def categorize_marks(normalized_marks: float) -> str:
    """Bucket a 0-1 normalized score into one of three performance bands."""
    if normalized_marks >= 0.7:
        return '70-100%'
    elif normalized_marks >= 0.5:
        return '50-70%'
    else:
        return '<50%'


def analyze_student_performance_by_li(question_data: pd.DataFrame,
                                      student_performance_data: pd.DataFrame) -> pd.DataFrame:
    """Count, per learning indicator, how many students fall in each band.

    Returns a frame with columns ``learning_indicator_id``, ``'<50%'``,
    ``'50-70%'``, ``'70-100%'``, sorted so the most struggling LIs come first.
    """
    # Merge the dataframes on question number
    merged_data = pd.merge(student_performance_data, question_data, on='question_number')
    # Clamp marks to full_marks and treat missing marks as 0
    merged_data['marks_obtained'] = np.minimum(
        merged_data['marks_obtained'].fillna(0), merged_data['full_marks'])
    merged_data["maximum_marks"] = merged_data["full_marks"]
    # Aggregate per (student, LI) and normalize
    merged_data = (merged_data
                   .groupby(["student_id", "learning_indicator_id"])[["marks_obtained", "maximum_marks"]]
                   .sum()
                   .reset_index())
    merged_data['normalized_marks'] = merged_data['marks_obtained'] / merged_data['maximum_marks']
    # Categorize the normalized marks
    merged_data['category'] = merged_data['normalized_marks'].apply(categorize_marks)
    # Count students per (LI, category)
    merged_data = merged_data.groupby(['learning_indicator_id', 'category']).size().unstack(fill_value=0)
    # BUG FIX: the original assigned ['<50%', '50-70%', '70-100%'] directly to
    # merged_data.columns, but unstack() emits columns in label-sorted order
    # ('50-70%', '70-100%', '<50%'), so every count column was mislabeled —
    # and the assignment crashed whenever a band had no students. reindex()
    # selects columns by name and fills absent bands with 0.
    merged_data = merged_data.reindex(columns=['<50%', '50-70%', '70-100%'], fill_value=0)
    merged_data = (merged_data
                   .sort_values(['<50%', '50-70%', '70-100%'], ascending=[False, False, False])
                   .reset_index())
    return merged_data


def prioritize_lis(category_counts: pd.DataFrame) -> pd.DataFrame:
    """Attach a Rank and a High/Medium/Low remediation Priority to each LI.

    Assumes ``category_counts`` is already sorted most-urgent-first (as
    produced by :func:`analyze_student_performance_by_li`): the top 30% of
    LIs are High priority, the next 30% Medium, the rest Low.
    """
    # Add a rank column based on the order of rows
    category_counts['Rank'] = category_counts.index + 1
    total_lis = len(category_counts)
    # Cutoff points for high and medium priority
    high_priority_cutoff = int(total_lis * 0.3)
    medium_priority_cutoff = int(total_lis * 0.6)
    # Classify the LIs based on their rank
    category_counts['Priority'] = 'Low'
    category_counts.loc[category_counts['Rank'] <= high_priority_cutoff, 'Priority'] = 'High'
    category_counts.loc[(category_counts['Rank'] > high_priority_cutoff)
                        & (category_counts['Rank'] <= medium_priority_cutoff), 'Priority'] = 'Medium'
    return category_counts


def mean_li_level_analysis(student_data: pd.DataFrame,
                           question_data: pd.DataFrame) -> pd.DataFrame:
    """Compute the class-wide mastery percentage for each learning indicator."""
    merged_data = pd.merge(student_data, question_data, on='question_number', how="inner")
    merged_data['marks_obtained'] = np.minimum(
        merged_data['marks_obtained'].fillna(0), merged_data['full_marks'])
    merged_data["maximum_marks"] = merged_data["full_marks"]
    # Total obtained vs. total possible marks per LI across the whole class
    merged_data = (merged_data
                   .groupby(['learning_indicator_id'])[['marks_obtained', 'maximum_marks']]
                   .sum()
                   .reset_index())
    merged_data["normalised_score"] = (merged_data["marks_obtained"] / merged_data["maximum_marks"]) * 100
    merged_data["normalised_score"] = merged_data["normalised_score"].round(1)
    # Re-attach the human-readable LI text
    merged_data = pd.merge(
        merged_data,
        question_data[["learning_indicator_id", "learning_indicator_text"]].drop_duplicates(),
        on="learning_indicator_id")
    return merged_data


def student_level_analysis(student_data: pd.DataFrame,
                           question_data: pd.DataFrame,
                           prioritized_lis: pd.DataFrame) -> pd.DataFrame:
    """Produce one row per student with their LIs laid out as P1_li, P2_li, ...

    LIs where a student scored >= 0.95 are considered mastered and excluded.
    Ties in a student's normalized marks are broken by the class-level LI Rank.
    """
    # Merge the student data with question data
    merged_data = pd.merge(student_data, question_data, on='question_number', how="inner")
    merged_data['marks_obtained'] = np.minimum(
        merged_data['marks_obtained'].fillna(0), merged_data['full_marks'])
    merged_data["maximum_marks"] = merged_data["full_marks"]
    # Normalize the marks obtained for each learning indicator by each student
    merged_data = (merged_data
                   .groupby(['student_id', 'learning_indicator_id'])[['marks_obtained', 'maximum_marks']]
                   .sum()
                   .reset_index())
    merged_data['normalized_marks'] = merged_data['marks_obtained'] / merged_data['maximum_marks']
    # Drop LIs the student has effectively mastered
    merged_data = merged_data[merged_data["normalized_marks"] < 0.95]
    # Merge with prioritized_lis to get the class-level Rank
    merged_data = pd.merge(merged_data, prioritized_lis[['learning_indicator_id', 'Rank']],
                           on='learning_indicator_id', how='left')
    # NOTE(review): ascending=False puts the student's HIGHEST-scoring LI at
    # rank 1 — confirm this is the intended remediation ordering.
    merged_data['student_rank'] = (merged_data
                                   .groupby('student_id')['normalized_marks']
                                   .rank(method='dense', ascending=False))
    merged_data = merged_data.sort_values(by=['student_id', 'student_rank', 'Rank'])
    # Ensure unique ranks by numbering rows within each student
    merged_data['unique_rank'] = merged_data.groupby('student_id').cumcount() + 1
    # Pivot to one column per priority slot
    student_ranking = (merged_data
                       .pivot(index='student_id', columns='unique_rank', values='learning_indicator_id')
                       .reset_index())
    student_ranking.columns = ['student_id'] + [f'P{i + 1}_li' for i in range(student_ranking.shape[1] - 1)]
    # Replace LI ids with their readable text
    li_text_mapping = (question_data
                       .drop_duplicates(subset='learning_indicator_id')
                       .set_index('learning_indicator_id')['learning_indicator_text'])
    for col in student_ranking.columns[1:]:
        student_ranking[col] = student_ranking[col].map(li_text_mapping)
    return student_ranking


def prepare_data_for_ridge_plot(student_data: pd.DataFrame,
                                question_data: pd.DataFrame,
                                ranking_order=None) -> pd.DataFrame:
    """Build per-(student, LI) normalized scores for the ridge plot.

    ``ranking_order`` may be the prioritized-LI frame; if given (and longer
    than one row) the output is sorted by its Rank so the most urgent LIs
    plot first. (Original used a mutable default ``[]`` — replaced with the
    None sentinel; behavior is unchanged.)
    """
    # Merge the DataFrames
    merged_data = pd.merge(student_data, question_data, on='question_number', how='inner')
    # Apply minimum marks validation and fill NaN with 0
    merged_data['marks_obtained'] = np.minimum(
        merged_data['marks_obtained'].fillna(0), merged_data['full_marks'])
    merged_data["maximum_marks"] = merged_data["full_marks"]
    # Normalize the marks obtained for each learning indicator by each student
    normalized_data = (merged_data
                       .groupby(['student_id', 'learning_indicator_id'])[['marks_obtained', 'maximum_marks']]
                       .sum()
                       .reset_index())
    normalized_data['normalized_marks'] = normalized_data['marks_obtained'] / normalized_data['maximum_marks']
    # Add learning_indicator_text to normalized_data
    plot_data = pd.merge(
        normalized_data,
        question_data[['learning_indicator_id', 'learning_indicator_text']].drop_duplicates(),
        on='learning_indicator_id')
    if ranking_order is not None and len(ranking_order) > 1:
        plot_data = pd.merge(plot_data, ranking_order[["learning_indicator_id", "Rank"]], how="left")
        # LIs absent from the ranking sort last
        plot_data["Rank"] = plot_data["Rank"].fillna(10000)
        plot_data = plot_data.sort_values("Rank")
    return plot_data


def calculate_logical_quantiles(data, num_quantiles: int = 5) -> list:
    """Calculate logical quantiles for a given data set.

    Parameters:
        data (array-like): The input data for which to calculate quantiles.
        num_quantiles (int): The number of quantiles to calculate. Default is 5.

    Returns:
        list: A list of quantile values.
    """
    # Ensure there are enough unique values to calculate the quantiles
    if len(np.unique(data)) < num_quantiles:
        # If not enough unique values, use unique values as quantiles
        quantiles = np.unique(data)
    else:
        # Calculate evenly spaced quantiles
        quantiles = np.percentile(data, np.linspace(0, 100, num_quantiles))
    return quantiles.tolist()


def create_ridge_plot(plot_data: pd.DataFrame):
    """Render one KDE per learning indicator, stacked vertically.

    Each panel shows the score distribution, colored quantile bands, the
    LI-specific mean (black dot) and the class-wide mean (dashed line).
    Returns the matplotlib Figure.
    """
    unique_learning_indicators = plot_data['learning_indicator_text'].unique()
    n_indicators = len(unique_learning_indicators)
    bandwidth = 0.5  # Adjust bandwidth for smoother graphs

    darkgreen = '#9BC184'
    midgreen = '#C2D6A4'
    lightgreen = '#E7E5CB'
    colors = [lightgreen, midgreen, darkgreen, midgreen, lightgreen]

    fig, axs = plt.subplots(nrows=n_indicators, ncols=1,
                            figsize=(10, n_indicators * 1.5), sharex=True)
    # BUG FIX: with a single indicator plt.subplots returns a bare Axes,
    # which has no .flatten(); atleast_1d handles both cases.
    axs = np.atleast_1d(axs)

    # Class-wide mean is loop-invariant — compute once
    global_mean = plot_data['normalized_marks'].mean()
    palette = sns.color_palette('coolwarm', n_colors=n_indicators)

    for i, indicator in enumerate(unique_learning_indicators):
        # Subset the data for each learning indicator
        subset = plot_data[plot_data['learning_indicator_text'] == indicator]
        # Plot the distribution of normalized marks (fill= replaces the
        # deprecated shade= keyword; same rendering)
        sns.kdeplot(
            subset['normalized_marks'],
            fill=True,
            bw_adjust=bandwidth,
            ax=axs[i],
            color=palette[i]
        )
        quantiles = calculate_logical_quantiles(subset["normalized_marks"].tolist())
        # Fill the band between each pair of adjacent quantiles
        for j in range(len(quantiles) - 1):
            axs[i].fill_between(
                [quantiles[j],      # lower bound
                 quantiles[j + 1]],  # upper bound
                0.1,
                0.3,
                color=colors[j]
            )
        # LI-specific mean: total obtained / total possible
        mean = subset['marks_obtained'].sum() / subset['maximum_marks'].sum()
        axs[i].scatter([mean], [0.3], color='black', s=15)
        axs[i].axvline(global_mean, color='#525252', linestyle='--')
        axs[i].set_xlim(0, 1)
        # Add the learning indicator text as the title
        axs[i].set_title(indicator, loc='left', fontsize=12, fontweight='bold')
        # Remove y-axis label
        axs[i].set_ylabel('')
        # Add a horizontal line for the baseline
        axs[i].axhline(0, color='black', linewidth=1.3, linestyle='-')

    # Set common labels
    plt.xlabel('Normalized Marks', fontsize=12, fontweight='bold')
    plt.tight_layout()
    return fig


def remediation_groups(student_df: pd.DataFrame,
                       question_df: pd.DataFrame,
                       z_threshold: float = -1.25):
    """Assign below-threshold students to balanced per-LI remediation groups.

    A student is flagged when their z-scored normalized mark on an LI falls
    below ``z_threshold``. Each flagged student is assigned to exactly one of
    their weak LIs, always picking the currently smallest group so group
    sizes stay balanced. Returns ``(li_remediation_groups, no_remediation_df)``.
    """
    # Work on a copy: the original mutated the caller's frame via the
    # astype assignment below.
    student_df = student_df.copy()
    student_df["student_id"] = student_df["student_id"].astype(str)
    # Merge student performance with question data to get full marks
    merged_df = pd.merge(student_df, question_df, on='question_number')
    # Apply minimum marks validation and fill NaN with 0
    merged_df['marks_obtained'] = np.minimum(
        merged_df['marks_obtained'].fillna(0), merged_df['full_marks'])
    # Calculate normalized scores
    merged_df['normalized_score'] = merged_df['marks_obtained'] / merged_df['full_marks']
    # Calculate z-scores within each learning indicator
    merged_df['z_score'] = (merged_df
                            .groupby('learning_indicator_id')['normalized_score']
                            .transform(zscore))
    # Students needing remediation (z-score below threshold)
    remediation_df = merged_df[merged_df['z_score'] < z_threshold]
    remediation_students = (remediation_df[['student_id', 'learning_indicator_id', 'learning_indicator_text']]
                            .drop_duplicates())
    # Map each LI to the set of students weak in it, and to its text
    li_student_map = remediation_students.groupby('learning_indicator_id')['student_id'].apply(set).to_dict()
    li_text_map = remediation_df.set_index('learning_indicator_id')['learning_indicator_text'].to_dict()
    li_group_sizes = {li: 0 for li in li_student_map.keys()}
    student_assignments = {}
    # Shuffle (fixed seed) so assignment order is fair but reproducible
    remediation_students = remediation_students.sample(frac=1, random_state=42)
    # Greedily assign each student to their weak LI with the smallest group
    for student_id in remediation_students['student_id'].unique():
        student_lis = remediation_students[
            remediation_students['student_id'] == student_id]['learning_indicator_id'].tolist()
        lis_with_sizes = [(li, li_group_sizes[li]) for li in student_lis]
        selected_li = min(lis_with_sizes, key=lambda x: x[1])[0]
        student_assignments[student_id] = selected_li
        li_group_sizes[selected_li] += 1
    # Build the remediation groups. BUG FIX: when nobody falls below the
    # threshold, pd.DataFrame([]) has no 'student_id' column and the filter
    # below raised KeyError — return an empty, correctly-shaped frame instead.
    if li_group_sizes:
        li_remediation_groups = pd.DataFrame([
            {
                'learning_indicator_id': li,
                'learning_indicator_text': li_text_map.get(li, ''),
                'student_id': ', '.join(sorted(
                    [s for s, assigned_li in student_assignments.items() if assigned_li == li]))
            }
            for li in li_group_sizes.keys()
        ])
        # Remove LIs with no assigned students
        li_remediation_groups = li_remediation_groups[li_remediation_groups['student_id'] != '']
    else:
        li_remediation_groups = pd.DataFrame(
            columns=['learning_indicator_id', 'learning_indicator_text', 'student_id'])
    # Students who don't need remediation
    students_needing_remediation = set(student_assignments.keys())
    all_students = set(merged_df['student_id'].unique())
    students_no_remediation = all_students - students_needing_remediation
    no_remediation_df = pd.DataFrame(sorted(students_no_remediation), columns=['student_id'])
    return li_remediation_groups, no_remediation_df


def process_student_marks_ingestion(df: pd.DataFrame) -> pd.DataFrame:
    """Melt a wide marks sheet into (question_number, student_id, marks_obtained).

    Input columns: 'question_number' plus one column per student.
    Non-numeric or missing marks become 0.
    """
    # Remove columns where the count of non-empty values is less than 1
    df = df.dropna(axis=1, how='all')
    # Remove rows where question_number is empty; copy so the assignment
    # below doesn't write into a view of the caller's frame
    df = df[df['question_number'].apply(lambda x: len(str(x)) > 0)].copy()
    # Ensure all student marks are numbers, fill missing or invalid values with 0
    student_columns = [col for col in df.columns if col != 'question_number']
    df[student_columns] = df[student_columns].apply(pd.to_numeric, errors='coerce').fillna(0)
    # Transform dataframe to long format
    result = df.melt(id_vars=['question_number'], var_name='student_id', value_name='marks_obtained')
    return result


def main():
    """Page layout: uploads, class-level tables/plots, remediation groups,
    and on-demand per-student LI ranking."""
    col_logo, col_name = st.columns([1, 3])
    with col_logo:
        st.image("Screenshot 2024-08-07 at 1.05.24 PM.png")
    with col_name:
        st.title("Learning Indicator Analysis")
        st.subheader("Student and Class Remediation based on Question Data and Student Data. Upload Files")
        st.write("For any issues contact : nitesh.kumar@leadschool.in")

    # Upload the dataframes
    col_A, colB = st.columns(2)
    with col_A:
        question_data_file = st.file_uploader("Upload Question Data CSV", type="csv")
    with colB:
        student_performance_data_file = st.file_uploader("Upload Student Performance Data CSV", type="csv")
    st.write("----------------------")

    if question_data_file and student_performance_data_file:
        question_data = pd.read_csv(question_data_file)
        question_data = clean_dataframe(question_data)
        student_performance_data = pd.read_csv(student_performance_data_file)
        student_performance_data = process_student_marks_ingestion(student_performance_data)
        student_performance_data = clean_dataframe(student_performance_data)

        # Analyze performance and prioritize LIs
        category_counts = analyze_student_performance_by_li(question_data, student_performance_data)
        prioritized_lis = prioritize_lis(category_counts)
        # Merge with original question data to get the learning indicator text
        prioritized_lis = pd.merge(
            prioritized_lis,
            question_data[['learning_indicator_id', 'learning_indicator_text']].drop_duplicates(),
            on='learning_indicator_id', how='left')

        def highlight_priority(row):
            # Row-wise background color keyed on the Priority column
            if row.Priority == 'High':
                return ['background-color: #ff8080'] * len(row)
            elif row.Priority == 'Medium':
                return ['background-color: #eee96c'] * len(row)
            elif row.Priority == 'Low':
                return ['background-color: #6ac140'] * len(row)
            else:
                return [''] * len(row)

        col1, col2 = st.columns(2)
        with col1:
            with st.expander("**1. Learning Indicator Performance and Remediation Priorities** on #Students",
                             expanded=False, icon="🔍"):
                st.write('''
                This table shows student performance across different learning indicators (LIs). Each LI represents a key concept. Performance is divided into **Below Average (<50%)**, **Average (50-70%)**, and **Above Average (70-100%)**.
                The **Remediation Priority** column flags urgent areas marked as **High**, **Medium**, or **Low** priority, guiding your focus on areas needing the most support.

                ### Column Descriptions:
                - **Learning Indicator ID (ID)** – Key topic or concept ID.
                1. **Below Average (<50%)** – Students scoring below 50%.
                2. **Average (50-70%)** – Students scoring between 50-70%.
                3. **Above Average (70-100%)** – Students scoring above 70%.
                4. **Remediation Priority** – High, Medium, or Low priority.
                5. **Rank** – LI ranked by urgency.
                ''')
                st.dataframe(
                    prioritized_lis
                    .rename({"learning_indicator_id": "LI ID", "learning_indicator_text": "LI Text"}, axis=1)
                    .style.apply(highlight_priority, axis=1),
                    hide_index=True)

            overall_li_level = mean_li_level_analysis(student_performance_data, question_data)
            overall_li_level = overall_li_level.sort_values("normalised_score")
            overall_li_level = overall_li_level[["learning_indicator_id", "normalised_score", "learning_indicator_text"]]
            overall_li_level.columns = ["LI ID", "LI % Mastery", "LI Text"]
            with st.expander("**3. LI Mastery Score Overview**", expanded=False, icon="📈"):
                st.write('''
                This section provides the **Class Average Performance (%)** for each learning indicator (LI). The **LI % Mastery** score shows how well students have grasped each concept compared to the maximum possible score.
                - For example, if the **Mastery Score** is 55%, it means students have understood just over half of the topic, guiding you to focus on areas with lower scores.
                ''')
                st.dataframe(overall_li_level, hide_index=True)

        with col2:
            with st.expander("**2. Learning Indicator Performance Graphs**", expanded=False, icon="📊"):
                st.write('''
                ##### Description:
                These graphs visualize how your class performed across different learning indicators.
                - **Dotted Line**: Represents the **Class Average** performance, providing a benchmark for how the entire class scored for learning indicators (LIs Overall).
                - **Black Dot**: Shows the **LI-Specific Average**, highlighting how students performed on that particular topic.
                - **Distribution Curve**: The shape of the curve shows how students’ scores are distributed. The further left the majority of the curve is, the more students are struggling with that concept.

                ##### Key Takeaways:
                - Focus on areas where the **distribution** is heavily towards the left of the graph. This indicates that many students are scoring below average in that particular learning indicator.
                - The **closer the black dot** is to the left side of the graph, the lower the average score for that LI, indicating weaker understanding.

                **Don’t worry about the technical details—just use these visual cues to quickly identify which topics need more review based on how your class is performing relative to the average.**
                ''')
                plt_data = prepare_data_for_ridge_plot(student_performance_data, question_data, prioritized_lis)
                plt_fig = create_ridge_plot(plt_data)
                with st.container(height=950):
                    st.pyplot(plt_fig)

        st.write("---------------------------")
        col3, col4 = st.columns(2)
        li_remediation_groups, no_remediation_df = remediation_groups(student_performance_data, question_data)
        with col3:
            with st.expander("**Student Group Remediation by Learning Indicator**", expanded=False, icon="👥"):
                st.write('''
                This table identifies students needing remediation based on their performance relative to the class. Using a z-score threshold, it groups students by **Learning Indicator (LI)** for focused intervention.

                #### Key Steps:
                1. **Z-Score Calculation:** Measures how each student performed compared to the class average.
                2. **Grouping:** Students falling below the z-score threshold are grouped by LI for targeted remediation.

                #### Output:
                - **LI-Based Remediation Groups**: Lists of students requiring remediation for each learning objective.
                ''')
                li_remediation_groups = li_remediation_groups.rename(
                    {"learning_indicator_id": "LI ID",
                     "learning_indicator_text": "LI Text",
                     "student_id": "Student ID"}, axis=1)
                st.dataframe(li_remediation_groups, hide_index=True)
        with col4:
            with st.expander("**Students Not Needing Remediation**", icon="✅", expanded=False):
                st.write('''
                This table lists students who are **not part of the group remediation** because their performance meets or exceeds the expected threshold. These students have shown adequate understanding in the assessed learning indicators and do not require additional intervention.
                ''')
                no_remediation_df = no_remediation_df.rename({"student_id": "Student ID"}, axis=1)
                st.dataframe(no_remediation_df, hide_index=True)

        # Button to generate student-level ranking
        st.write("--------")
        st.write('''
        ### Student Level Learning Indicator Ranking
        The table displayed provides an insightful view of student-level learning indicators (LIs) ranked by priority for remediation. Each row represents a student, identified by their unique ID, and the columns detail specific learning indicators (LIs) that highlight areas requiring attention, spanning from `P1_li` as priority 1 and so on as `P2_li`,`P3_li`,`P4_li` etc.
        **When the remediation button is clicked**, it will display a prioritized list of learning indicators (LIs) for each student. These LIs are sorted by importance, guiding educators on which concepts or areas to focus on during the remediation process. This tailored approach ensures that the most critical gaps in a student's understanding are addressed first, enabling targeted interventions that are both efficient and effective in improving learning outcomes.
        ''')
        if st.button("**Generate Student Level Ranking**", type="primary", use_container_width=True):
            student_ranking = student_level_analysis(student_performance_data, question_data, prioritized_lis)
            st.write("Student Level Learning Indicator Ranking")
            st.dataframe(student_ranking, hide_index=True)


if __name__ == "__main__":
    main()