"""Streamlit dashboard for learning-indicator (LI) analysis.

Ingests a question-metadata CSV and a student-marks CSV, then renders:
class-level LI remediation priorities, LI mastery scores, ridge plots of
score distributions, balanced remediation groups, and per-student LI
rankings.
"""

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import zscore
import streamlit.components.v1 as components

st.set_page_config(layout="wide")


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Drop unnamed columns and rows that are entirely NaN."""
    # Remove columns with no name (blank / whitespace-only headers)
    df = df.loc[:, df.columns.str.strip() != '']
    # Drop rows where all values are NaN
    df = df.dropna(how='all')
    return df


def categorize_marks(normalized_marks: float) -> str:
    """Bucket a 0-1 normalized score into one of three performance bands."""
    if normalized_marks >= 0.7:
        return '70-100%'
    elif normalized_marks >= 0.5:
        return '50-70%'
    else:
        return '<50%'


def analyze_student_performance_by_li(question_data: pd.DataFrame,
                                      student_performance_data: pd.DataFrame) -> pd.DataFrame:
    """Count, per learning indicator, how many students fall in each band.

    Returns a frame with columns ``learning_indicator_id``, ``'<50%'``,
    ``'50-70%'``, ``'70-100%'``, sorted so the most struggling LIs come first.
    """
    # Merge the dataframes on question number
    merged_data = pd.merge(student_performance_data, question_data, on='question_number')
    # Clamp marks to full_marks and treat missing marks as 0
    merged_data['marks_obtained'] = np.minimum(
        merged_data['marks_obtained'].fillna(0), merged_data['full_marks'])
    merged_data["maximum_marks"] = merged_data["full_marks"]
    # Aggregate per (student, LI) and normalize
    merged_data = (merged_data
                   .groupby(["student_id", "learning_indicator_id"])[["marks_obtained", "maximum_marks"]]
                   .sum()
                   .reset_index())
    merged_data['normalized_marks'] = merged_data['marks_obtained'] / merged_data['maximum_marks']
    # Categorize the normalized marks
    merged_data['category'] = merged_data['normalized_marks'].apply(categorize_marks)
    # Count students per (LI, category)
    merged_data = merged_data.groupby(['learning_indicator_id', 'category']).size().unstack(fill_value=0)
    # BUG FIX: the original assigned ['<50%', '50-70%', '70-100%'] directly to
    # merged_data.columns, but unstack() emits columns in label-sorted order
    # ('50-70%', '70-100%', '<50%'), so every count column was mislabeled —
    # and the assignment crashed whenever a band had no students. reindex()
    # selects columns by name and fills absent bands with 0.
    merged_data = merged_data.reindex(columns=['<50%', '50-70%', '70-100%'], fill_value=0)
    merged_data = (merged_data
                   .sort_values(['<50%', '50-70%', '70-100%'], ascending=[False, False, False])
                   .reset_index())
    return merged_data


def prioritize_lis(category_counts: pd.DataFrame) -> pd.DataFrame:
    """Attach a Rank and a High/Medium/Low remediation Priority to each LI.

    Assumes ``category_counts`` is already sorted most-urgent-first (as
    produced by :func:`analyze_student_performance_by_li`): the top 30% of
    LIs are High priority, the next 30% Medium, the rest Low.
    """
    # Add a rank column based on the order of rows
    category_counts['Rank'] = category_counts.index + 1
    total_lis = len(category_counts)
    # Cutoff points for high and medium priority
    high_priority_cutoff = int(total_lis * 0.3)
    medium_priority_cutoff = int(total_lis * 0.6)
    # Classify the LIs based on their rank
    category_counts['Priority'] = 'Low'
    category_counts.loc[category_counts['Rank'] <= high_priority_cutoff, 'Priority'] = 'High'
    category_counts.loc[(category_counts['Rank'] > high_priority_cutoff)
                        & (category_counts['Rank'] <= medium_priority_cutoff), 'Priority'] = 'Medium'
    return category_counts


def mean_li_level_analysis(student_data: pd.DataFrame,
                           question_data: pd.DataFrame) -> pd.DataFrame:
    """Compute the class-wide mastery percentage for each learning indicator."""
    merged_data = pd.merge(student_data, question_data, on='question_number', how="inner")
    merged_data['marks_obtained'] = np.minimum(
        merged_data['marks_obtained'].fillna(0), merged_data['full_marks'])
    merged_data["maximum_marks"] = merged_data["full_marks"]
    # Total obtained vs. total possible marks per LI across the whole class
    merged_data = (merged_data
                   .groupby(['learning_indicator_id'])[['marks_obtained', 'maximum_marks']]
                   .sum()
                   .reset_index())
    merged_data["normalised_score"] = (merged_data["marks_obtained"] / merged_data["maximum_marks"]) * 100
    merged_data["normalised_score"] = merged_data["normalised_score"].round(1)
    # Re-attach the human-readable LI text
    merged_data = pd.merge(
        merged_data,
        question_data[["learning_indicator_id", "learning_indicator_text"]].drop_duplicates(),
        on="learning_indicator_id")
    return merged_data


def student_level_analysis(student_data: pd.DataFrame,
                           question_data: pd.DataFrame,
                           prioritized_lis: pd.DataFrame) -> pd.DataFrame:
    """Produce one row per student with their LIs laid out as P1_li, P2_li, ...

    LIs where a student scored >= 0.95 are considered mastered and excluded.
    Ties in a student's normalized marks are broken by the class-level LI Rank.
    """
    # Merge the student data with question data
    merged_data = pd.merge(student_data, question_data, on='question_number', how="inner")
    merged_data['marks_obtained'] = np.minimum(
        merged_data['marks_obtained'].fillna(0), merged_data['full_marks'])
    merged_data["maximum_marks"] = merged_data["full_marks"]
    # Normalize the marks obtained for each learning indicator by each student
    merged_data = (merged_data
                   .groupby(['student_id', 'learning_indicator_id'])[['marks_obtained', 'maximum_marks']]
                   .sum()
                   .reset_index())
    merged_data['normalized_marks'] = merged_data['marks_obtained'] / merged_data['maximum_marks']
    # Drop LIs the student has effectively mastered
    merged_data = merged_data[merged_data["normalized_marks"] < 0.95]
    # Merge with prioritized_lis to get the class-level Rank
    merged_data = pd.merge(merged_data, prioritized_lis[['learning_indicator_id', 'Rank']],
                           on='learning_indicator_id', how='left')
    # NOTE(review): ascending=False puts the student's HIGHEST-scoring LI at
    # rank 1 — confirm this is the intended remediation ordering.
    merged_data['student_rank'] = (merged_data
                                   .groupby('student_id')['normalized_marks']
                                   .rank(method='dense', ascending=False))
    merged_data = merged_data.sort_values(by=['student_id', 'student_rank', 'Rank'])
    # Ensure unique ranks by numbering rows within each student
    merged_data['unique_rank'] = merged_data.groupby('student_id').cumcount() + 1
    # Pivot to one column per priority slot
    student_ranking = (merged_data
                       .pivot(index='student_id', columns='unique_rank', values='learning_indicator_id')
                       .reset_index())
    student_ranking.columns = ['student_id'] + [f'P{i + 1}_li' for i in range(student_ranking.shape[1] - 1)]
    # Replace LI ids with their readable text
    li_text_mapping = (question_data
                       .drop_duplicates(subset='learning_indicator_id')
                       .set_index('learning_indicator_id')['learning_indicator_text'])
    for col in student_ranking.columns[1:]:
        student_ranking[col] = student_ranking[col].map(li_text_mapping)
    return student_ranking


def prepare_data_for_ridge_plot(student_data: pd.DataFrame,
                                question_data: pd.DataFrame,
                                ranking_order=None) -> pd.DataFrame:
    """Build per-(student, LI) normalized scores for the ridge plot.

    ``ranking_order`` may be the prioritized-LI frame; if given (and longer
    than one row) the output is sorted by its Rank so the most urgent LIs
    plot first. (Original used a mutable default ``[]`` — replaced with the
    None sentinel; behavior is unchanged.)
    """
    # Merge the DataFrames
    merged_data = pd.merge(student_data, question_data, on='question_number', how='inner')
    # Apply minimum marks validation and fill NaN with 0
    merged_data['marks_obtained'] = np.minimum(
        merged_data['marks_obtained'].fillna(0), merged_data['full_marks'])
    merged_data["maximum_marks"] = merged_data["full_marks"]
    # Normalize the marks obtained for each learning indicator by each student
    normalized_data = (merged_data
                       .groupby(['student_id', 'learning_indicator_id'])[['marks_obtained', 'maximum_marks']]
                       .sum()
                       .reset_index())
    normalized_data['normalized_marks'] = normalized_data['marks_obtained'] / normalized_data['maximum_marks']
    # Add learning_indicator_text to normalized_data
    plot_data = pd.merge(
        normalized_data,
        question_data[['learning_indicator_id', 'learning_indicator_text']].drop_duplicates(),
        on='learning_indicator_id')
    if ranking_order is not None and len(ranking_order) > 1:
        plot_data = pd.merge(plot_data, ranking_order[["learning_indicator_id", "Rank"]], how="left")
        # LIs absent from the ranking sort last
        plot_data["Rank"] = plot_data["Rank"].fillna(10000)
        plot_data = plot_data.sort_values("Rank")
    return plot_data


def calculate_logical_quantiles(data, num_quantiles: int = 5) -> list:
    """Calculate logical quantiles for a given data set.

    Parameters:
        data (array-like): The input data for which to calculate quantiles.
        num_quantiles (int): The number of quantiles to calculate. Default is 5.

    Returns:
        list: A list of quantile values.
    """
    # Ensure there are enough unique values to calculate the quantiles
    if len(np.unique(data)) < num_quantiles:
        # If not enough unique values, use unique values as quantiles
        quantiles = np.unique(data)
    else:
        # Calculate evenly spaced quantiles
        quantiles = np.percentile(data, np.linspace(0, 100, num_quantiles))
    return quantiles.tolist()


def create_ridge_plot(plot_data: pd.DataFrame):
    """Render one KDE per learning indicator, stacked vertically.

    Each panel shows the score distribution, colored quantile bands, the
    LI-specific mean (black dot) and the class-wide mean (dashed line).
    Returns the matplotlib Figure.
    """
    unique_learning_indicators = plot_data['learning_indicator_text'].unique()
    n_indicators = len(unique_learning_indicators)
    bandwidth = 0.5  # Adjust bandwidth for smoother graphs

    darkgreen = '#9BC184'
    midgreen = '#C2D6A4'
    lightgreen = '#E7E5CB'
    colors = [lightgreen, midgreen, darkgreen, midgreen, lightgreen]

    fig, axs = plt.subplots(nrows=n_indicators, ncols=1,
                            figsize=(10, n_indicators * 1.5), sharex=True)
    # BUG FIX: with a single indicator plt.subplots returns a bare Axes,
    # which has no .flatten(); atleast_1d handles both cases.
    axs = np.atleast_1d(axs)

    # Class-wide mean is loop-invariant — compute once
    global_mean = plot_data['normalized_marks'].mean()
    palette = sns.color_palette('coolwarm', n_colors=n_indicators)

    for i, indicator in enumerate(unique_learning_indicators):
        # Subset the data for each learning indicator
        subset = plot_data[plot_data['learning_indicator_text'] == indicator]
        # Plot the distribution of normalized marks (fill= replaces the
        # deprecated shade= keyword; same rendering)
        sns.kdeplot(
            subset['normalized_marks'],
            fill=True,
            bw_adjust=bandwidth,
            ax=axs[i],
            color=palette[i]
        )
        quantiles = calculate_logical_quantiles(subset["normalized_marks"].tolist())
        # Fill the band between each pair of adjacent quantiles
        for j in range(len(quantiles) - 1):
            axs[i].fill_between(
                [quantiles[j],      # lower bound
                 quantiles[j + 1]],  # upper bound
                0.1,
                0.3,
                color=colors[j]
            )
        # LI-specific mean: total obtained / total possible
        mean = subset['marks_obtained'].sum() / subset['maximum_marks'].sum()
        axs[i].scatter([mean], [0.3], color='black', s=15)
        axs[i].axvline(global_mean, color='#525252', linestyle='--')
        axs[i].set_xlim(0, 1)
        # Add the learning indicator text as the title
        axs[i].set_title(indicator, loc='left', fontsize=12, fontweight='bold')
        # Remove y-axis label
        axs[i].set_ylabel('')
        # Add a horizontal line for the baseline
        axs[i].axhline(0, color='black', linewidth=1.3, linestyle='-')

    # Set common labels
    plt.xlabel('Normalized Marks', fontsize=12, fontweight='bold')
    plt.tight_layout()
    return fig


def remediation_groups(student_df: pd.DataFrame,
                       question_df: pd.DataFrame,
                       z_threshold: float = -1.25):
    """Assign below-threshold students to balanced per-LI remediation groups.

    A student is flagged when their z-scored normalized mark on an LI falls
    below ``z_threshold``. Each flagged student is assigned to exactly one of
    their weak LIs, always picking the currently smallest group so group
    sizes stay balanced. Returns ``(li_remediation_groups, no_remediation_df)``.
    """
    # Work on a copy: the original mutated the caller's frame via the
    # astype assignment below.
    student_df = student_df.copy()
    student_df["student_id"] = student_df["student_id"].astype(str)
    # Merge student performance with question data to get full marks
    merged_df = pd.merge(student_df, question_df, on='question_number')
    # Apply minimum marks validation and fill NaN with 0
    merged_df['marks_obtained'] = np.minimum(
        merged_df['marks_obtained'].fillna(0), merged_df['full_marks'])
    # Calculate normalized scores
    merged_df['normalized_score'] = merged_df['marks_obtained'] / merged_df['full_marks']
    # Calculate z-scores within each learning indicator
    merged_df['z_score'] = (merged_df
                            .groupby('learning_indicator_id')['normalized_score']
                            .transform(zscore))
    # Students needing remediation (z-score below threshold)
    remediation_df = merged_df[merged_df['z_score'] < z_threshold]
    remediation_students = (remediation_df[['student_id', 'learning_indicator_id', 'learning_indicator_text']]
                            .drop_duplicates())
    # Map each LI to the set of students weak in it, and to its text
    li_student_map = remediation_students.groupby('learning_indicator_id')['student_id'].apply(set).to_dict()
    li_text_map = remediation_df.set_index('learning_indicator_id')['learning_indicator_text'].to_dict()
    li_group_sizes = {li: 0 for li in li_student_map.keys()}
    student_assignments = {}
    # Shuffle (fixed seed) so assignment order is fair but reproducible
    remediation_students = remediation_students.sample(frac=1, random_state=42)
    # Greedily assign each student to their weak LI with the smallest group
    for student_id in remediation_students['student_id'].unique():
        student_lis = remediation_students[
            remediation_students['student_id'] == student_id]['learning_indicator_id'].tolist()
        lis_with_sizes = [(li, li_group_sizes[li]) for li in student_lis]
        selected_li = min(lis_with_sizes, key=lambda x: x[1])[0]
        student_assignments[student_id] = selected_li
        li_group_sizes[selected_li] += 1
    # Build the remediation groups. BUG FIX: when nobody falls below the
    # threshold, pd.DataFrame([]) has no 'student_id' column and the filter
    # below raised KeyError — return an empty, correctly-shaped frame instead.
    if li_group_sizes:
        li_remediation_groups = pd.DataFrame([
            {
                'learning_indicator_id': li,
                'learning_indicator_text': li_text_map.get(li, ''),
                'student_id': ', '.join(sorted(
                    [s for s, assigned_li in student_assignments.items() if assigned_li == li]))
            }
            for li in li_group_sizes.keys()
        ])
        # Remove LIs with no assigned students
        li_remediation_groups = li_remediation_groups[li_remediation_groups['student_id'] != '']
    else:
        li_remediation_groups = pd.DataFrame(
            columns=['learning_indicator_id', 'learning_indicator_text', 'student_id'])
    # Students who don't need remediation
    students_needing_remediation = set(student_assignments.keys())
    all_students = set(merged_df['student_id'].unique())
    students_no_remediation = all_students - students_needing_remediation
    no_remediation_df = pd.DataFrame(sorted(students_no_remediation), columns=['student_id'])
    return li_remediation_groups, no_remediation_df


def process_student_marks_ingestion(df: pd.DataFrame) -> pd.DataFrame:
    """Melt a wide marks sheet into (question_number, student_id, marks_obtained).

    Input columns: 'question_number' plus one column per student.
    Non-numeric or missing marks become 0.
    """
    # Remove columns where the count of non-empty values is less than 1
    df = df.dropna(axis=1, how='all')
    # Remove rows where question_number is empty; copy so the assignment
    # below doesn't write into a view of the caller's frame
    df = df[df['question_number'].apply(lambda x: len(str(x)) > 0)].copy()
    # Ensure all student marks are numbers, fill missing or invalid values with 0
    student_columns = [col for col in df.columns if col != 'question_number']
    df[student_columns] = df[student_columns].apply(pd.to_numeric, errors='coerce').fillna(0)
    # Transform dataframe to long format
    result = df.melt(id_vars=['question_number'], var_name='student_id', value_name='marks_obtained')
    return result


def main():
    """Page layout: uploads, class-level tables/plots, remediation groups,
    and on-demand per-student LI ranking."""
    col_logo, col_name = st.columns([1, 3])
    with col_logo:
        st.image("Screenshot 2024-08-07 at 1.05.24 PM.png")
    with col_name:
        st.title("Learning Indicator Analysis")
        st.subheader("Student and Class Remediation based on Question Data and Student Data. Upload Files")
        st.write("For any issues contact : nitesh.kumar@leadschool.in")

    # Upload the dataframes
    col_A, colB = st.columns(2)
    with col_A:
        question_data_file = st.file_uploader("Upload Question Data CSV", type="csv")
    with colB:
        student_performance_data_file = st.file_uploader("Upload Student Performance Data CSV", type="csv")
    st.write("----------------------")

    if question_data_file and student_performance_data_file:
        question_data = pd.read_csv(question_data_file)
        question_data = clean_dataframe(question_data)
        student_performance_data = pd.read_csv(student_performance_data_file)
        student_performance_data = process_student_marks_ingestion(student_performance_data)
        student_performance_data = clean_dataframe(student_performance_data)

        # Analyze performance and prioritize LIs
        category_counts = analyze_student_performance_by_li(question_data, student_performance_data)
        prioritized_lis = prioritize_lis(category_counts)
        # Merge with original question data to get the learning indicator text
        prioritized_lis = pd.merge(
            prioritized_lis,
            question_data[['learning_indicator_id', 'learning_indicator_text']].drop_duplicates(),
            on='learning_indicator_id', how='left')

        def highlight_priority(row):
            # Row-wise background color keyed on the Priority column
            if row.Priority == 'High':
                return ['background-color: #ff8080'] * len(row)
            elif row.Priority == 'Medium':
                return ['background-color: #eee96c'] * len(row)
            elif row.Priority == 'Low':
                return ['background-color: #6ac140'] * len(row)
            else:
                return [''] * len(row)

        col1, col2 = st.columns(2)
        with col1:
            with st.expander("**1. Learning Indicator Performance and Remediation Priorities** on #Students",
                             expanded=False, icon="🔍"):
                st.write('''
                This table shows student performance across different learning indicators (LIs). Each LI represents a key concept. Performance is divided into **Below Average (<50%)**, **Average (50-70%)**, and **Above Average (70-100%)**.
                The **Remediation Priority** column flags urgent areas marked as **High**, **Medium**, or **Low** priority, guiding your focus on areas needing the most support.

                ### Column Descriptions:
                - **Learning Indicator ID (ID)** – Key topic or concept ID.
                1. **Below Average (<50%)** – Students scoring below 50%.
                2. **Average (50-70%)** – Students scoring between 50-70%.
                3. **Above Average (70-100%)** – Students scoring above 70%.
                4. **Remediation Priority** – High, Medium, or Low priority.
                5. **Rank** – LI ranked by urgency.
                ''')
                st.dataframe(
                    prioritized_lis
                    .rename({"learning_indicator_id": "LI ID", "learning_indicator_text": "LI Text"}, axis=1)
                    .style.apply(highlight_priority, axis=1),
                    hide_index=True)

            overall_li_level = mean_li_level_analysis(student_performance_data, question_data)
            overall_li_level = overall_li_level.sort_values("normalised_score")
            overall_li_level = overall_li_level[["learning_indicator_id", "normalised_score", "learning_indicator_text"]]
            overall_li_level.columns = ["LI ID", "LI % Mastery", "LI Text"]
            with st.expander("**3. LI Mastery Score Overview**", expanded=False, icon="📈"):
                st.write('''
                This section provides the **Class Average Performance (%)** for each learning indicator (LI). The **LI % Mastery** score shows how well students have grasped each concept compared to the maximum possible score.
                - For example, if the **Mastery Score** is 55%, it means students have understood just over half of the topic, guiding you to focus on areas with lower scores.
                ''')
                st.dataframe(overall_li_level, hide_index=True)

        with col2:
            with st.expander("**2. Learning Indicator Performance Graphs**", expanded=False, icon="📊"):
                st.write('''
                ##### Description:
                These graphs visualize how your class performed across different learning indicators.
                - **Dotted Line**: Represents the **Class Average** performance, providing a benchmark for how the entire class scored for learning indicators (LIs Overall).
                - **Black Dot**: Shows the **LI-Specific Average**, highlighting how students performed on that particular topic.
                - **Distribution Curve**: The shape of the curve shows how students’ scores are distributed. The further left the majority of the curve is, the more students are struggling with that concept.

                ##### Key Takeaways:
                - Focus on areas where the **distribution** is heavily towards the left of the graph. This indicates that many students are scoring below average in that particular learning indicator.
                - The **closer the black dot** is to the left side of the graph, the lower the average score for that LI, indicating weaker understanding.

                **Don’t worry about the technical details—just use these visual cues to quickly identify which topics need more review based on how your class is performing relative to the average.**
                ''')
                plt_data = prepare_data_for_ridge_plot(student_performance_data, question_data, prioritized_lis)
                plt_fig = create_ridge_plot(plt_data)
                with st.container(height=950):
                    st.pyplot(plt_fig)

        st.write("---------------------------")
        col3, col4 = st.columns(2)
        li_remediation_groups, no_remediation_df = remediation_groups(student_performance_data, question_data)
        with col3:
            with st.expander("**Student Group Remediation by Learning Indicator**", expanded=False, icon="👥"):
                st.write('''
                This table identifies students needing remediation based on their performance relative to the class. Using a z-score threshold, it groups students by **Learning Indicator (LI)** for focused intervention.

                #### Key Steps:
                1. **Z-Score Calculation:** Measures how each student performed compared to the class average.
                2. **Grouping:** Students falling below the z-score threshold are grouped by LI for targeted remediation.

                #### Output:
                - **LI-Based Remediation Groups**: Lists of students requiring remediation for each learning objective.
                ''')
                li_remediation_groups = li_remediation_groups.rename(
                    {"learning_indicator_id": "LI ID",
                     "learning_indicator_text": "LI Text",
                     "student_id": "Student ID"}, axis=1)
                st.dataframe(li_remediation_groups, hide_index=True)
        with col4:
            with st.expander("**Students Not Needing Remediation**", icon="✅", expanded=False):
                st.write('''
                This table lists students who are **not part of the group remediation** because their performance meets or exceeds the expected threshold. These students have shown adequate understanding in the assessed learning indicators and do not require additional intervention.
                ''')
                no_remediation_df = no_remediation_df.rename({"student_id": "Student ID"}, axis=1)
                st.dataframe(no_remediation_df, hide_index=True)

        # Button to generate student-level ranking
        st.write("--------")
        st.write('''
        ### Student Level Learning Indicator Ranking
        The table displayed provides an insightful view of student-level learning indicators (LIs) ranked by priority for remediation. Each row represents a student, identified by their unique ID, and the columns detail specific learning indicators (LIs) that highlight areas requiring attention, spanning from `P1_li` as priority 1 and so on as `P2_li`,`P3_li`,`P4_li` etc.
        **When the remediation button is clicked**, it will display a prioritized list of learning indicators (LIs) for each student. These LIs are sorted by importance, guiding educators on which concepts or areas to focus on during the remediation process. This tailored approach ensures that the most critical gaps in a student's understanding are addressed first, enabling targeted interventions that are both efficient and effective in improving learning outcomes.
        ''')
        if st.button("**Generate Student Level Ranking**", type="primary", use_container_width=True):
            student_ranking = student_level_analysis(student_performance_data, question_data, prioritized_lis)
            st.write("Student Level Learning Indicator Ranking")
            st.dataframe(student_ranking, hide_index=True)


if __name__ == "__main__":
    main()