# -*- coding: utf-8 -*-
"""PPAS Model.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1COA86IG7byZ4AtM_kfAj3NY0q0PZ7pLb
# **Predictive Performance Analysis for Students**
This notebook leverages a state-of-the-art machine learning pipeline to predict student performance and assess risk levels. Follow these steps:
Predict Grade: Enter student data to get the predicted grade and risk level.
Run Scenario Simulation: Simulate interventions by increasing a selected feature to see the impact on the predicted grade.
Step 0: Data Generation
import pandas as pd
import numpy as np
# Set random seed for reproducibility
np.random.seed(42)
# Generate synthetic dataset
n_students = 1000
data = {
'Student ID': [f'S{i:03d}' for i in range(1, n_students + 1)],
'Student Name': [f'Student {i}' for i in range(1, n_students + 1)],
'Total Attendance (%)': np.random.uniform(50, 100, n_students),
'Marks in Previous Exams (%)': np.random.uniform(40, 100, n_students),
'Assignment Submission Rate (%)': np.random.uniform(50, 100, n_students),
'Engagement Metrics (%)': np.random.uniform(50, 100, n_students),
'Historical GPA': np.random.uniform(2.0, 4.0, n_students)
}
# Create DataFrame
df = pd.DataFrame(data)
# Generate Final Grade as a function of features with noise
df['Final Grade (%)'] = (
0.2 * df['Total Attendance (%)'] +
0.3 * df['Marks in Previous Exams (%)'] +
0.2 * df['Assignment Submission Rate (%)'] +
0.2 * df['Engagement Metrics (%)'] +
0.1 * (df['Historical GPA'] * 25) +
np.random.uniform(-5, 5, n_students)
)
# Clip Final Grade to 0–100
df['Final Grade (%)'] = df['Final Grade (%)'].clip(0, 100)
# Save to Excel
df.to_excel('student_data.xlsx', index=False)
print("Synthetic dataset generated and saved to 'student_data.xlsx'.")
"""## Step 1: Data Pre-Processing

1. Load the dataset
2. Pre-process the dataset
3. Visualize the dataset
4. Scale the features
5. Split into train, validation, and test sets
"""
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Load dataset
logger.info("Loading dataset from Excel file for preprocessing...")
df = pd.read_excel('student_data.xlsx')
# Step 1.1: Check for missing values
logger.info("Performing missing value analysis...")
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)
# Step 1.2: Visualize feature distributions
logger.info("Generating feature distribution visualizations...")
plt.figure(figsize=(15, 10))
for i, col in enumerate(['Total Attendance (%)', 'Marks in Previous Exams (%)',
'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
'Historical GPA', 'Final Grade (%)'], 1):
plt.subplot(3, 2, i)
sns.histplot(df[col], kde=True, color='skyblue')
plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()
# Step 1.3: Check for outliers using box plots
logger.info("Analyzing outliers with box plots...")
plt.figure(figsize=(15, 5))
for i, col in enumerate(['Total Attendance (%)', 'Marks in Previous Exams (%)',
'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
'Historical GPA'], 1):
plt.subplot(1, 5, i)
sns.boxplot(y=df[col], color='lightgreen')
plt.title(f'Box Plot of {col}')
plt.tight_layout()
plt.show()
# Step 1.4: Correlation analysis
logger.info("Computing correlation matrix for feature analysis...")
plt.figure(figsize=(8, 6))
corr_matrix = df[['Total Attendance (%)', 'Marks in Previous Exams (%)',
'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
'Historical GPA', 'Final Grade (%)']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Features')
plt.show()
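# Given the generative weights above, 'Marks in Previous Exams (%)' carries the
# largest weight (0.3) over the widest range, so it should show the strongest
# correlation with 'Final Grade (%)' in the heatmap.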
# Step 1.5: Feature scaling
logger.info("Applying MinMaxScaler for feature normalization...")
features = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
'Historical GPA']
X = df[features]
y = df['Final Grade (%)']
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features)
# Step 1.6: Split data
logger.info("Splitting dataset into training, validation, and test sets...")
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)
logger.info("Preprocessing completed successfully.")
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
"""## Step 2: Developing the Model Pipeline"""
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Custom class for PPAS model pipeline
class PPASModelPipeline:
def __init__(self, model_type='linear'):
        """Initialize the PPAS Model Pipeline with the specified model type."""
logger.info("Initializing PPAS Model Pipeline...")
self.model_type = model_type
if model_type == 'linear':
self.model = LinearRegression()
else:
raise ValueError("Unsupported model type. Use 'linear' for now.")
def fit(self, X, y):
        """Fit the model on the training data."""
        logger.info("Training model...")
for _ in tqdm(range(1), desc="Optimizing Model Parameters"):
self.model.fit(X, y)
logger.info("Model training completed.")
return self
def predict(self, X):
        """Generate predictions using the trained model."""
logger.info("Generating predictions...")
return self.model.predict(X)
# Instantiate and train the model
logger.info("Deploying PPAS Model Pipeline for training...")
ppas_pipeline = PPASModelPipeline(model_type='linear')
ppas_pipeline.fit(X_train, y_train)
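
# Optional sanity check (an illustrative addition): inspect the learned coefficients.
# They live in MinMax-scaled feature space, so each should approximate its generative
# weight times the feature's raw range (e.g. attendance: 0.2 * 50 = 10; the GPA term
# works out to 0.1 * 25 * 2 = 5), rather than the raw weights themselves.
for name, coef in zip(features, ppas_pipeline.model.coef_):
    print(f"{name}: {coef:.2f}")
print(f"Intercept: {ppas_pipeline.model.intercept_:.2f}")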
"""## Step 3: Evaluating the Model
1. Custom Accuracy (within ±5%)
2. RMSE
3. R² Score
4. MAE
5. Visualizations
"""
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Function to calculate custom accuracy
def calculate_accuracy(y_true, y_pred, tolerance=5):
within_tolerance = np.abs(y_true - y_pred) <= tolerance
accuracy = np.mean(within_tolerance) * 100
return accuracy
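
# Worked example with hypothetical numbers: errors of 3, 6, and 1 points mean two of
# three predictions fall within the ±5 tolerance, so accuracy = 2/3 ≈ 66.67%.
demo_accuracy = calculate_accuracy(np.array([70, 80, 90]), np.array([73, 86, 91]))
print(f"Demo custom accuracy: {demo_accuracy:.2f}%")  # -> 66.67%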
# Evaluate on validation and test sets
logger.info("Evaluating model performance on validation set...")
y_val_pred = ppas_pipeline.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
val_r2 = r2_score(y_val, y_val_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
val_accuracy = calculate_accuracy(y_val, y_val_pred, tolerance=5)
logger.info("Evaluating model performance on test set...")
y_test_pred = ppas_pipeline.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_accuracy = calculate_accuracy(y_test, y_test_pred, tolerance=5)
# Print metrics
print("Validation Metrics:")
print(f"Custom Accuracy (within ±5%): {val_accuracy:.2f}%")
print(f"RMSE: {val_rmse:.2f}")
print(f"R² Score: {val_r2:.2f}")
print(f"MAE: {val_mae:.2f}")
print("\nTest Metrics:")
print(f"Custom Accuracy (within ±5%): {test_accuracy:.2f}%")
print(f"RMSE: {test_rmse:.2f}")
print(f"R² Score: {test_r2:.2f}")
print(f"MAE: {test_mae:.2f}")
# Visualization 1: Predicted vs Actual
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_test_pred, alpha=0.5, color='purple')
plt.plot([0, 100], [0, 100], 'r--')
plt.xlabel('Actual Final Grade (%)')
plt.ylabel('Predicted Final Grade (%)')
plt.title('Predicted vs Actual Final Grades (Test Set)')
plt.show()
# Visualization 2: Residual Plot
residuals = y_test - y_test_pred
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test_pred, y=residuals, color='orange')
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted Final Grade (%)')
plt.ylabel('Residuals')
plt.title('Residual Plot (Test Set)')
plt.show()
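# Because the injected noise is uniform in [-5, 5], a well-fit model should show
# residuals scattered roughly evenly in that band with no trend across the x-axis.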
# Visualization 3: Prediction Error Distribution
errors = np.abs(y_test - y_test_pred)
plt.figure(figsize=(8, 6))
sns.histplot(errors, kde=True, color='teal')
plt.xlabel('Absolute Prediction Error (%)')
plt.title('Distribution of Prediction Errors (Test Set)')
plt.show()
"""## Step 4: Scenario Simulations
You can adjust following Scenarios for Scenario Simulations:
1. Attendance (%)
2. Marks in Previous Exams (%)
3. Assignment Submission Rate (%)
4. Engagement Metrics (%)
5. Historical GPA
"""
import logging
from tqdm import tqdm
import pandas as pd
import warnings
# Suppress sklearn warnings about feature names
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def simulate_intervention(student_data, feature, increase_by):
logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")
student_data = student_data[['Total Attendance (%)', 'Marks in Previous Exams (%)',
'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
'Historical GPA']].copy()
student_data_scaled = scaler.transform(student_data)
original_pred = ppas_pipeline.predict(student_data_scaled)[0]
for _ in tqdm(range(1), desc="Applying Intervention"):
        student_data_modified = student_data.copy()
        # Update the feature value using loc to avoid chained assignment
        student_data_modified.loc[0, feature] += increase_by
        # Cap the feature at its natural maximum (4.0 for GPA, 100 for percentages)
        cap = 4.0 if feature == 'Historical GPA' else 100
        student_data_modified.loc[0, feature] = min(student_data_modified.loc[0, feature], cap)
        student_data_modified_scaled = scaler.transform(student_data_modified)
        new_pred = ppas_pipeline.predict(student_data_modified_scaled)[0]
return original_pred, new_pred
# Example student data
student_data = pd.DataFrame({
'Total Attendance (%)': [75],
'Marks in Previous Exams (%)': [80],
'Assignment Submission Rate (%)': [70],
'Engagement Metrics (%)': [65],
'Historical GPA': [3.0]
})
# Simulate increasing attendance by 10%
orig_pred, new_pred = simulate_intervention(student_data, 'Total Attendance (%)', 10)
print("\nScenario Simulation (Increase Attendance by 10%):")
print(f"Original Predicted Grade: {orig_pred:.2f}%")
print(f"New Predicted Grade: {new_pred:.2f}%")
"""## Step 5: Evaluating Risk Levels
1. Low
2. Medium
3. High
"""
import logging
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
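# Risk bands: below 60 -> High Risk; 60-75 inclusive -> Medium Risk; above 75 -> Low Risk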
def get_risk_level(grade):
if grade < 60:
return "High Risk"
elif grade <= 75:
return "Medium Risk"
else:
return "Low Risk"
def simulate_intervention(student_data, feature, increase_by):
logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")
student_data = student_data[['Total Attendance (%)', 'Marks in Previous Exams (%)',
'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
'Historical GPA']].copy()
student_data_scaled = scaler.transform(student_data)
original_pred = ppas_pipeline.predict(student_data_scaled)[0]
    student_data_modified = student_data.copy()
    # Update the feature value using loc to avoid chained assignment
    student_data_modified.loc[0, feature] += increase_by
    # Cap the feature at its natural maximum (4.0 for GPA, 100 for percentages)
    cap = 4.0 if feature == 'Historical GPA' else 100
    student_data_modified.loc[0, feature] = min(student_data_modified.loc[0, feature], cap)
student_data_modified_scaled = scaler.transform(student_data_modified)
new_pred = ppas_pipeline.predict(student_data_modified_scaled)[0]
return original_pred, new_pred
# Example student data
student_data = pd.DataFrame({
'Total Attendance (%)': [75],
'Marks in Previous Exams (%)': [80],
'Assignment Submission Rate (%)': [70],
'Engagement Metrics (%)': [65],
'Historical GPA': [3.0]
})
# Simulate increasing attendance by 10% to get new_pred
orig_pred, new_pred = simulate_intervention(student_data, 'Total Attendance (%)', 10)
print("\nScenario Simulation (Increase Attendance by 10%):")
print(f"Original Predicted Grade: {orig_pred:.2f}%")
print(f"New Predicted Grade: {new_pred:.2f}%")
# Determine risk level using new_pred
risk_level = get_risk_level(new_pred)
print(f"Risk Level: {risk_level}")
"""## Step 6: Gradio Interface"""
import gradio as gr
import logging
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def get_risk_level(grade):
if grade < 60:
return "High Risk"
elif grade <= 75:
return "Medium Risk"
else:
return "Low Risk"
def simulate_intervention(student_data, feature, increase_by):
logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")
student_data = student_data[['Total Attendance (%)', 'Marks in Previous Exams (%)',
'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
'Historical GPA']].copy()
student_data_scaled = scaler.transform(student_data)
original_pred = ppas_pipeline.predict(student_data_scaled)[0]
    student_data_modified = student_data.copy()
    # Update the feature value using loc to avoid chained assignment
    student_data_modified.loc[0, feature] += increase_by
    # Cap the feature at its natural maximum (4.0 for GPA, 100 for percentages)
    cap = 4.0 if feature == 'Historical GPA' else 100
    student_data_modified.loc[0, feature] = min(student_data_modified.loc[0, feature], cap)
student_data_modified_scaled = scaler.transform(student_data_modified)
new_pred = ppas_pipeline.predict(student_data_modified_scaled)[0]
return original_pred, new_pred
# Global variable to store the latest student data (to be used in simulation)
latest_student_data = None
# Prediction function for Gradio
def predict_grade(attendance, marks, assignment, engagement, gpa):
logger.info("Processing prediction request via Gradio interface...")
# Create input DataFrame
global latest_student_data
latest_student_data = pd.DataFrame({
'Total Attendance (%)': [attendance],
'Marks in Previous Exams (%)': [marks],
'Assignment Submission Rate (%)': [assignment],
'Engagement Metrics (%)': [engagement],
'Historical GPA': [gpa]
})
# Original prediction
input_scaled = scaler.transform(latest_student_data)
pred_grade = ppas_pipeline.predict(input_scaled)[0]
risk = get_risk_level(pred_grade)
return f"Predicted Grade: {pred_grade:.2f}%\nRisk Level: {risk}"
# Scenario simulation function for Gradio
def run_simulation(intervention_feature, increase_by):
logger.info("Processing scenario simulation request via Gradio interface...")
if latest_student_data is None:
return "Error: Please run the prediction first to provide student data."
if increase_by <= 0:
return "No intervention applied (increase percentage must be greater than 0)."
# Run the simulation
orig_pred, new_pred = simulate_intervention(latest_student_data, intervention_feature, increase_by)
orig_risk = get_risk_level(orig_pred)
new_risk = get_risk_level(new_pred)
return (
f"Scenario Simulation (Increase {intervention_feature} by {increase_by}%):\n"
f"Original Predicted Grade: {orig_pred:.2f}% (Risk Level: {orig_risk})\n"
f"New Predicted Grade: {new_pred:.2f}% (Risk Level: {new_risk})"
)
with gr.Blocks(theme="huggingface") as interface:
gr.Markdown(
"""
# Predictive Performance Analytics System (PPAS) - Advanced Prediction Interface
        This interface leverages a machine learning pipeline to predict student performance and assess risk levels. Follow these steps:
1. **Predict Grade**: Enter student data to get the predicted grade and risk level.
2. **Run Scenario Simulation**: Simulate interventions by increasing a selected feature to see the impact on the predicted grade.
"""
)
# Prediction Section
with gr.Row():
with gr.Column():
gr.Markdown("### Step 1: Predict Grade")
attendance = gr.Slider(0, 100, value=75, label="Total Attendance (%)", step=1)
marks = gr.Slider(0, 100, value=80, label="Marks in Previous Exams (%)", step=1)
assignment = gr.Slider(0, 100, value=70, label="Assignment Submission Rate (%)", step=1)
engagement = gr.Slider(0, 100, value=65, label="Engagement Metrics (%)", step=1)
gpa = gr.Slider(0, 4, value=3.0, label="Historical GPA", step=0.1)
predict_button = gr.Button("Predict Grade")
with gr.Column():
prediction_output = gr.Textbox(label="Prediction Result")
# Scenario Simulation Section
with gr.Row():
with gr.Column():
gr.Markdown("### Step 2: Run Scenario Simulation")
intervention_feature = gr.Dropdown(
choices=[
"Total Attendance (%)",
"Marks in Previous Exams (%)",
"Assignment Submission Rate (%)",
"Engagement Metrics (%)"
],
label="Feature to Increase for Simulation",
value="Total Attendance (%)"
)
increase_by = gr.Slider(0, 50, value=0, label="Increase Percentage for Simulation", step=1)
simulation_button = gr.Button("Run Simulation")
with gr.Column():
simulation_output = gr.Textbox(label="Simulation Result")
# Connect buttons to functions
predict_button.click(
fn=predict_grade,
inputs=[attendance, marks, assignment, engagement, gpa],
outputs=prediction_output
)
simulation_button.click(
fn=run_simulation,
inputs=[intervention_feature, increase_by],
outputs=simulation_output
)
"""## Step 7: Saving the Model and Scaler"""

import joblib

# Serialize the model and scaler before launching the interface: interface.launch()
# blocks, so any code placed after it would not run until the app shuts down.
logger.info("Serializing model and scaler for deployment...")
joblib.dump(ppas_pipeline.model, 'linear_regression_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
logger.info("Model and scaler saved as 'linear_regression_model.pkl' and 'scaler.pkl'.")

interface.launch()