|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import joblib |
|
import plotly.graph_objects as go |
|
from sklearn.base import BaseEstimator, ClassifierMixin |
|
from sklearn.preprocessing import RobustScaler, LabelEncoder |
|
from sklearn.feature_selection import SelectFromModel |
|
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier |
|
import xgboost as xgb |
|
from sklearn.linear_model import LogisticRegression |
|
import time |
|
from datetime import datetime |
|
|
|
class OptimizedStackedClassifier(BaseEstimator, ClassifierMixin):
    """Stacked ensemble for readability classification (inference path only).

    Raw features are scaled with a RobustScaler, reduced to the training-time
    selected columns, and fed to each fitted base model; the concatenated
    per-class probabilities become the meta-features consumed by a final
    meta-model. All fitted attributes are expected to be populated by a
    training routine elsewhere or restored via joblib.
    """

    def __init__(self):
        self.scaler = RobustScaler()          # robust to outliers in raw features
        self.label_encoder = LabelEncoder()   # label <-> integer code mapping
        self.feature_selector = None          # set during training
        self.base_models = None               # list of (name, fitted_model) pairs
        self.meta_model = None                # final estimator over meta-features
        self.selected_features = None         # column names kept after selection
        self.start_time = time.time()         # construction timestamp

    def _make_meta_features(self, X):
        """Scale X, keep the selected columns, and stack each base model's
        class probabilities side by side into the meta-feature matrix.

        NOTE: the previous implementation hard-coded 6 probability columns
        per base model; ``np.hstack`` derives the width from each model's
        ``predict_proba`` output instead, which is identical for 6 classes
        and correct for any other class count.
        """
        X_scaled = pd.DataFrame(
            self.scaler.transform(X),
            columns=X.columns
        )
        X_selected = X_scaled[self.selected_features]
        return np.hstack([
            model.predict_proba(X_selected)
            for _, model in self.base_models
        ])

    def predict(self, X):
        """Predict original class labels for the rows of DataFrame X."""
        codes = self.meta_model.predict(self._make_meta_features(X))
        return self.label_encoder.inverse_transform(codes)

    def predict_proba(self, X):
        """Return the meta-model's class probabilities for the rows of X."""
        return self.meta_model.predict_proba(self._make_meta_features(X))
|
|
|
def load_model(model_path):
    """Deserialize and return the model stored at *model_path*.

    On any failure (missing file, version mismatch, corrupt pickle) an
    error is surfaced in the Streamlit UI and None is returned so the
    caller can abort gracefully.
    """
    try:
        model = joblib.load(model_path)
    except Exception as exc:
        st.error(f"Error loading model: {str(exc)}")
        return None
    return model
|
|
|
def create_features(input_data):
    """Build a single-row DataFrame with the model's exact feature set.

    The nine raw measurements are copied from *input_data* in the order the
    model expects, then the seven derived features are appended so the
    resulting column order matches training.
    """
    base_keys = (
        'chars_original', 'chars_tokenized', 'num_words', 'num_tokens',
        'unique_tokens', 'type_token_ratio', 'fertility', 'token_std',
        'avg_token_len',
    )
    f = {key: input_data[key] for key in base_keys}

    # Tiny epsilon guards the ratio features against zero counts.
    eps = 1e-10
    f['chars_per_word'] = f['chars_original'] / (f['num_words'] + eps)
    f['chars_per_token'] = f['chars_tokenized'] / (f['num_tokens'] + eps)
    f['tokens_per_word'] = f['num_tokens'] / (f['num_words'] + eps)
    f['token_complexity'] = f['token_std'] * f['avg_token_len']
    f['lexical_density'] = f['unique_tokens'] / (f['num_words'] + eps)
    f['log_chars'] = np.log1p(f['chars_original'])
    f['complexity_score'] = (
        f['token_complexity'] * f['lexical_density'] * f['type_token_ratio']
    )

    return pd.DataFrame([f])
|
|
|
def plot_probabilities(probabilities):
    """Return a Plotly bar chart of the per-level prediction probabilities."""
    level_labels = [f'Level {idx+1}' for idx in range(len(probabilities))]
    bar = go.Bar(
        x=level_labels,
        y=probabilities,
        text=np.round(probabilities, 3),
        textposition='auto',
    )
    fig = go.Figure(data=[bar])
    fig.update_layout(
        title='Probability Distribution Across Readability Levels',
        xaxis_title='Readability Level',
        yaxis_title='Probability',
        yaxis_range=[0, 1],
        height=400,
    )
    return fig
|
|
|
def plot_feature_values(features_df):
    """Return a Plotly bar chart of the (single-row) feature values."""
    row = features_df.values[0]
    bar = go.Bar(
        x=features_df.columns,
        y=row,
        text=np.round(row, 2),
        textposition='auto',
    )
    fig = go.Figure(data=[bar])
    fig.update_layout(
        title='Feature Values',
        xaxis_title='Features',
        yaxis_title='Value',
        xaxis_tickangle=-45,
        height=500,
    )
    return fig
|
|
|
def _collect_inputs():
    """Render the feature input widgets and return their values keyed by feature name."""
    input_data = {}
    input_data['chars_original'] = st.number_input('Number of Characters (Original)', value=0)
    input_data['chars_tokenized'] = st.number_input('Number of Characters (Tokenized)', value=0)
    input_data['num_words'] = st.number_input('Number of Words', value=0)
    input_data['num_tokens'] = st.number_input('Number of Tokens', value=0)
    input_data['unique_tokens'] = st.number_input('Number of Unique Tokens', value=0)
    input_data['type_token_ratio'] = st.number_input('Type-Token Ratio', value=0.0, min_value=0.0, max_value=1.0)
    input_data['fertility'] = st.number_input('Fertility', value=0.0)
    input_data['token_std'] = st.number_input('Token Standard Deviation', value=0.0)
    input_data['avg_token_len'] = st.number_input('Average Token Length', value=0.0)
    return input_data


def _render_results(prediction, probabilities, features_df):
    """Show the predicted level, confidence, and the two result charts."""
    st.subheader("Analysis Results")

    metrics_cols = st.columns(2)
    with metrics_cols[0]:
        st.metric("Readability Level", f"Level {prediction}")
    with metrics_cols[1]:
        highest_prob = max(probabilities)
        st.metric("Confidence", f"{highest_prob:.2%}")

    st.plotly_chart(plot_probabilities(probabilities),
                    use_container_width=True)

    st.subheader("All Features (Including Derived)")
    st.plotly_chart(plot_feature_values(features_df),
                    use_container_width=True)


def _render_analysis_panel(model):
    """Render the input form and, when requested, run and display the analysis."""
    st.subheader("Enter Text Characteristics")

    input_data = _collect_inputs()

    analyze_button = st.button("Analyze", type="primary")

    if analyze_button:
        with st.spinner("Analyzing..."):
            try:
                features_df = create_features(input_data)
                prediction = model.predict(features_df)[0]
                probabilities = model.predict_proba(features_df)[0]
                _render_results(prediction, probabilities, features_df)
            except Exception as e:
                st.error(f"Error during analysis: {str(e)}")


def _render_info_panel():
    """Render the static sidebar-style documentation column."""
    with st.container():
        st.subheader("About Readability Levels")
        st.write("""
        The model predicts readability on a scale from 1 to 6:
        - **Level 1**: Very Easy
        - **Level 2**: Easy
        - **Level 3**: Moderately Easy
        - **Level 4**: Moderate
        - **Level 5**: Moderately Difficult
        - **Level 6**: Difficult
        """)

        st.subheader("Feature Explanations")
        st.write("""
        **Basic Features:**
        - Character counts (original and tokenized)
        - Word and token counts
        - Type-token ratio (vocabulary diversity)
        - Token length statistics

        **Derived Features:**
        - Characters per word/token
        - Token complexity
        - Lexical density
        - Overall complexity score
        """)

        st.subheader("Model Performance")
        st.write("""
        This model achieves:
        - **Accuracy**: 73.86%
        - **Macro Avg F1**: 0.75
        - **Weighted Avg F1**: 0.74

        *Note: Results should be used as guidance rather than absolute measures.*
        """)


def main():
    """Streamlit entry point: load the model, collect inputs, show results.

    Decomposed into private helpers (_collect_inputs, _render_results,
    _render_analysis_panel, _render_info_panel) so each UI section can be
    read and modified in isolation; the page layout is unchanged.
    """
    st.set_page_config(page_title="Text Readability Classifier", layout="wide")

    st.title("Text Readability Classifier")
    st.write("This app predicts the readability level based on text characteristics.")

    # Model is loaded fresh each rerun; load_model reports its own errors.
    model_path = "model.joblib"
    model = load_model(model_path)

    if model is None:
        st.error("Could not load the model. Please check if the model file exists.")
        return

    # Wide analysis column on the left, static documentation on the right.
    col1, col2 = st.columns([2, 1])

    with col1:
        _render_analysis_panel(model)

    with col2:
        _render_info_panel()


if __name__ == "__main__":
    main()