# app.py — Text Readability Classifier (Streamlit app)
# Last update: commit db89ec1 by pavlyhalim
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import plotly.graph_objects as go
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import time
from datetime import datetime
class OptimizedStackedClassifier(BaseEstimator, ClassifierMixin):
    """Stacked ensemble: base-model class probabilities feed a meta-model.

    Inference pipeline: RobustScaler -> column selection -> per-base-model
    ``predict_proba`` outputs stacked side by side (meta-features) ->
    meta-model prediction. All fitted components (scaler, encoder, base
    models, meta-model, selected feature names) are expected to be populated
    on a trained instance loaded via joblib; this class only implements
    inference, not fitting.
    """

    def __init__(self):
        self.scaler = RobustScaler()
        self.label_encoder = LabelEncoder()
        self.feature_selector = None
        self.base_models = None          # list of (name, fitted model) pairs
        self.meta_model = None
        self.selected_features = None    # column names kept after selection
        self.start_time = time.time()

    def _meta_features(self, X):
        """Scale X, keep the selected columns, and stack base-model probabilities.

        Returns an array of shape (n_samples, n_base_models * n_classes).
        Extracted to remove the duplicated pipeline that previously lived in
        both ``predict`` and ``predict_proba``; ``np.hstack`` also replaces
        the hard-coded 6-column slicing, so the class count is taken from
        each model's ``predict_proba`` output instead of being fixed.
        """
        X_scaled = pd.DataFrame(
            self.scaler.transform(X),
            columns=X.columns
        )
        X_selected = X_scaled[self.selected_features]
        return np.hstack(
            [model.predict_proba(X_selected) for _, model in self.base_models]
        )

    def predict(self, X):
        """Predict class labels for X, mapped back to the original label space."""
        encoded = self.meta_model.predict(self._meta_features(X))
        return self.label_encoder.inverse_transform(encoded)

    def predict_proba(self, X):
        """Return the meta-model's class probabilities for X."""
        return self.meta_model.predict_proba(self._meta_features(X))
def load_model(model_path):
    """Deserialize and return the saved model at *model_path*.

    On any failure, reports the problem via a Streamlit error box and
    returns None so the caller can abort gracefully.
    """
    try:
        model = joblib.load(model_path)
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None
    return model
def create_features(input_data):
    """Build a single-row DataFrame of base plus derived model features.

    *input_data* must supply the nine base statistics listed below; the
    derived ratios and scores are computed from them. Column order matches
    what the trained model expects.
    """
    base_keys = (
        'chars_original', 'chars_tokenized', 'num_words', 'num_tokens',
        'unique_tokens', 'type_token_ratio', 'fertility', 'token_std',
        'avg_token_len',
    )
    f = {key: input_data[key] for key in base_keys}

    # Small epsilon guards every ratio against division by zero.
    eps = 1e-10
    f['chars_per_word'] = f['chars_original'] / (f['num_words'] + eps)
    f['chars_per_token'] = f['chars_tokenized'] / (f['num_tokens'] + eps)
    f['tokens_per_word'] = f['num_tokens'] / (f['num_words'] + eps)
    f['token_complexity'] = f['token_std'] * f['avg_token_len']
    f['lexical_density'] = f['unique_tokens'] / (f['num_words'] + eps)
    f['log_chars'] = np.log1p(f['chars_original'])
    f['complexity_score'] = (
        f['token_complexity'] * f['lexical_density'] * f['type_token_ratio']
    )
    return pd.DataFrame([f])
def plot_probabilities(probabilities):
    """Return a Plotly bar chart of the per-level prediction probabilities."""
    level_labels = [f'Level {i+1}' for i in range(len(probabilities))]
    bars = go.Bar(
        x=level_labels,
        y=probabilities,
        text=np.round(probabilities, 3),
        textposition='auto',
    )
    fig = go.Figure(data=[bars])
    # Pin the y-axis to [0, 1] so probabilities always read at true scale.
    fig.update_layout(
        title='Probability Distribution Across Readability Levels',
        xaxis_title='Readability Level',
        yaxis_title='Probability',
        yaxis_range=[0, 1],
        height=400,
    )
    return fig
def plot_feature_values(features_df):
    """Return a Plotly bar chart of every feature value in the first row."""
    values = features_df.values[0]
    bars = go.Bar(
        x=features_df.columns,
        y=values,
        text=np.round(values, 2),
        textposition='auto',
    )
    fig = go.Figure(data=[bars])
    # Angled tick labels keep the long feature names readable.
    fig.update_layout(
        title='Feature Values',
        xaxis_title='Features',
        yaxis_title='Value',
        xaxis_tickangle=-45,
        height=500,
    )
    return fig
def main():
    """Streamlit entry point: collect text statistics, run the model, show results.

    Flow: page setup -> load the trained stacked classifier from disk ->
    left column gathers the nine base feature inputs and runs the analysis;
    right column holds static documentation about levels, features, and
    reported model performance.
    """
    st.set_page_config(page_title="Text Readability Classifier", layout="wide")
    st.title("Text Readability Classifier")
    st.write("This app predicts the readability level based on text characteristics.")
    # Load the model; abort early if it is missing or unreadable.
    model_path = "model.joblib"
    model = load_model(model_path)
    if model is None:
        st.error("Could not load the model. Please check if the model file exists.")
        return
    # Create two columns for layout (inputs/results : info sidebar = 2 : 1)
    col1, col2 = st.columns([2, 1])
    with col1:
        # Input form for text characteristics
        st.subheader("Enter Text Characteristics")
        # Basic features input — these nine keys must match what
        # create_features() expects.
        input_data = {}
        input_data['chars_original'] = st.number_input('Number of Characters (Original)', value=0)
        input_data['chars_tokenized'] = st.number_input('Number of Characters (Tokenized)', value=0)
        input_data['num_words'] = st.number_input('Number of Words', value=0)
        input_data['num_tokens'] = st.number_input('Number of Tokens', value=0)
        input_data['unique_tokens'] = st.number_input('Number of Unique Tokens', value=0)
        input_data['type_token_ratio'] = st.number_input('Type-Token Ratio', value=0.0, min_value=0.0, max_value=1.0)
        input_data['fertility'] = st.number_input('Fertility', value=0.0)
        input_data['token_std'] = st.number_input('Token Standard Deviation', value=0.0)
        input_data['avg_token_len'] = st.number_input('Average Token Length', value=0.0)
        analyze_button = st.button("Analyze", type="primary")
        if analyze_button:
            with st.spinner("Analyzing..."):
                try:
                    # Create features dataframe with all required features
                    features_df = create_features(input_data)
                    # Make prediction (label plus full probability vector)
                    prediction = model.predict(features_df)[0]
                    probabilities = model.predict_proba(features_df)[0]
                    # Display results
                    st.subheader("Analysis Results")
                    # Create metrics row: predicted level and its confidence
                    metrics_cols = st.columns(2)
                    with metrics_cols[0]:
                        st.metric("Readability Level", f"Level {prediction}")
                    with metrics_cols[1]:
                        # Confidence = probability of the most likely level
                        highest_prob = max(probabilities)
                        st.metric("Confidence", f"{highest_prob:.2%}")
                    # Show probability distribution
                    st.plotly_chart(plot_probabilities(probabilities),
                                    use_container_width=True)
                    # Show all feature values including derived features
                    st.subheader("All Features (Including Derived)")
                    st.plotly_chart(plot_feature_values(features_df),
                                    use_container_width=True)
                except Exception as e:
                    # Surface any analysis failure in the UI instead of crashing.
                    st.error(f"Error during analysis: {str(e)}")
    with col2:
        # Information sidebar
        with st.container():
            st.subheader("About Readability Levels")
            st.write("""
            The model predicts readability on a scale from 1 to 6:
            - **Level 1**: Very Easy
            - **Level 2**: Easy
            - **Level 3**: Moderately Easy
            - **Level 4**: Moderate
            - **Level 5**: Moderately Difficult
            - **Level 6**: Difficult
            """)
            st.subheader("Feature Explanations")
            st.write("""
            **Basic Features:**
            - Character counts (original and tokenized)
            - Word and token counts
            - Type-token ratio (vocabulary diversity)
            - Token length statistics
            **Derived Features:**
            - Characters per word/token
            - Token complexity
            - Lexical density
            - Overall complexity score
            """)
            st.subheader("Model Performance")
            st.write("""
            This model achieves:
            - **Accuracy**: 73.86%
            - **Macro Avg F1**: 0.75
            - **Weighted Avg F1**: 0.74
            *Note: Results should be used as guidance rather than absolute measures.*
            """)


if __name__ == "__main__":
    main()