Alamgirapi committed
Commit 0a50c6f · verified · 1 Parent(s): 8d810b6

Update app.py

Files changed (1):
  1. app.py +366 -490
app.py CHANGED
@@ -5,146 +5,200 @@ import numpy as np
 import seaborn as sns
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.svm import LinearSVC, SVC
 from sklearn.naive_bayes import MultinomialNB, GaussianNB
+from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 import os
 import pickle
 import re
 import string
 from collections import Counter
-import plotly.express as px
-import plotly.graph_objects as go

-# Configure Streamlit page
-st.set_page_config(
-    page_title="Text Classification App",
-    page_icon="📝",
-    layout="wide"
-)
+# Set page config
+st.set_page_config(page_title="Text Classification App", page_icon="📊", layout="wide")
+
+# Custom CSS for better styling
+st.markdown("""
+<style>
+    .main-header {
+        font-size: 2.5rem;
+        color: #1f77b4;
+        text-align: center;
+        margin-bottom: 2rem;
+    }
+    .section-header {
+        font-size: 1.8rem;
+        color: #ff7f0e;
+        border-bottom: 2px solid #ff7f0e;
+        padding-bottom: 0.5rem;
+    }
+</style>
+""", unsafe_allow_html=True)

-# Text preprocessing class
-class TextCleaner:
-    def __init__(self):
-        self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
-
-    def clean_text(self, text):
-        """Clean and preprocess text"""
-        if pd.isna(text):
-            return ""
-
-        text = str(text).lower()
-        text = re.sub(r'http\S+', '', text)  # Remove URLs
-        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
-        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
-        text = text.strip()
-
-        # Remove stop words (optional)
-        words = text.split()
-        words = [word for word in words if word not in self.stop_words]
-
-        return ' '.join(words)
+# Utility functions
+def clean_text(text):
+    """Clean text data"""
+    if pd.isna(text):
+        return ""
+
+    text = str(text).lower()
+    text = re.sub(r'[^a-zA-Z\s]', '', text)
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text

-# Data analysis functions
-def get_data_insights(df, text_col, target_col):
-    """Get basic insights from the dataset"""
-    insights = {
-        'shape': df.shape,
-        'missing_values': df.isnull().sum().to_dict(),
-        'class_distribution': df[target_col].value_counts().to_dict(),
-        'text_length_stats': {
-            'mean': df[text_col].str.len().mean(),
-            'median': df[text_col].str.len().median(),
-            'min': df[text_col].str.len().min(),
-            'max': df[text_col].str.len().max()
-        }
-    }
-    return insights
+def save_artifacts(obj, folder_name, file_name):
+    """Save artifacts like encoders and vectorizers"""
+    try:
+        os.makedirs(folder_name, exist_ok=True)
+        with open(os.path.join(folder_name, file_name), 'wb') as f:
+            pickle.dump(obj, f)
+        return True
+    except Exception as e:
+        st.error(f"Error saving {file_name}: {str(e)}")
+        return False
+
+def load_artifacts(folder_name, file_name):
+    """Load saved artifacts"""
+    try:
+        with open(os.path.join(folder_name, file_name), 'rb') as f:
+            return pickle.load(f)
+    except FileNotFoundError:
+        st.error(f"File {file_name} not found in {folder_name} folder")
+        return None
+    except Exception as e:
+        st.error(f"Error loading {file_name}: {str(e)}")
+        return None
+
+def analyze_data(df, text_col, target_col):
+    """Perform data analysis"""
+    analysis = {}
+
+    # Basic info
+    analysis['shape'] = df.shape
+    analysis['columns'] = df.columns.tolist()
+    analysis['missing_values'] = df.isnull().sum().to_dict()
+
+    # Text analysis
+    df['text_length'] = df[text_col].astype(str).apply(len)
+    analysis['avg_text_length'] = df['text_length'].mean()
+    analysis['text_length_stats'] = df['text_length'].describe().to_dict()
+
+    # Target analysis
+    analysis['class_distribution'] = df[target_col].value_counts().to_dict()
+    analysis['num_classes'] = df[target_col].nunique()
+
+    return analysis
+
+def create_visualizations(df, text_col, target_col):
+    """Create visualizations"""
+    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+
+    # Class distribution
+    class_counts = df[target_col].value_counts()
+    axes[0, 0].bar(class_counts.index, class_counts.values)
+    axes[0, 0].set_title('Class Distribution')
+    axes[0, 0].set_xlabel('Classes')
+    axes[0, 0].set_ylabel('Count')
+    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
+
+    # Text length distribution
+    axes[0, 1].hist(df['text_length'], bins=30, alpha=0.7)
+    axes[0, 1].set_title('Text Length Distribution')
+    axes[0, 1].set_xlabel('Text Length')
+    axes[0, 1].set_ylabel('Frequency')
+
+    # Box plot of text length by class
+    df.boxplot(column='text_length', by=target_col, ax=axes[1, 0])
+    axes[1, 0].set_title('Text Length by Class')
+    axes[1, 0].set_xlabel('Class')
+    axes[1, 0].set_ylabel('Text Length')
+
+    # Correlation plot (if applicable)
+    if df[target_col].dtype in ['int64', 'float64'] or len(df[target_col].unique()) < 10:
+        correlation = df[['text_length', target_col]].corr()
+        sns.heatmap(correlation, annot=True, ax=axes[1, 1], cmap='coolwarm')
+        axes[1, 1].set_title('Correlation Matrix')
+    else:
+        axes[1, 1].text(0.5, 0.5, 'Correlation not applicable\nfor categorical target',
+                        ha='center', va='center', transform=axes[1, 1].transAxes)
+        axes[1, 1].set_title('Correlation Analysis')
+
+    plt.tight_layout()
+    return fig

-# Model training functions
 def train_model(model_name, X_train, X_test, y_train, y_test):
-    """Train and evaluate a model"""
-    models = {
-        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
-        'Decision Tree': DecisionTreeClassifier(random_state=42),
-        'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
-        'Linear SVC': LinearSVC(random_state=42, max_iter=1000),
-        'SVC': SVC(random_state=42, probability=True),
-        'Multinomial Naive Bayes': MultinomialNB(),
-        'Gaussian Naive Bayes': GaussianNB()
+    """Train selected model"""
+    models_dict = {
+        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
+        "Decision Tree": DecisionTreeClassifier(random_state=42),
+        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
+        "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
+        "SVC": SVC(random_state=42, probability=True),
+        "Multinomial Naive Bayes": MultinomialNB(),
+        "Gaussian Naive Bayes": GaussianNB()
     }

-    model = models[model_name]
-
-    # For Gaussian NB, convert sparse matrix to dense
-    if model_name == 'Gaussian Naive Bayes':
-        X_train = X_train.toarray()
-        X_test = X_test.toarray()
+    if model_name not in models_dict:
+        return None, None, None
+
+    model = models_dict[model_name]
+
+    # Special handling for Gaussian NB (needs dense array)
+    if model_name == "Gaussian Naive Bayes":
+        X_train_model = X_train.toarray()
+        X_test_model = X_test.toarray()
+    else:
+        X_train_model = X_train
+        X_test_model = X_test

     # Train model
-    model.fit(X_train, y_train)
+    model.fit(X_train_model, y_train)

     # Make predictions
-    y_pred = model.predict(X_test)
+    y_pred = model.predict(X_test_model)

     # Calculate metrics
     accuracy = accuracy_score(y_test, y_pred)
+    report = classification_report(y_test, y_pred, output_dict=True)

     # Save model
     os.makedirs("models", exist_ok=True)
-    model_filename = f"{model_name.replace(' ', '_').lower()}.pkl"
-    with open(os.path.join("models", model_filename), 'wb') as f:
-        pickle.dump(model, f)
+    model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
+    save_artifacts(model, "models", model_filename)

-    return model, accuracy, y_pred, model_filename
+    return model, accuracy, report

-# Utility functions
-def save_artifacts(obj, folder_name, file_name):
-    """Save artifacts like encoders and vectorizers"""
-    os.makedirs(folder_name, exist_ok=True)
-    with open(os.path.join(folder_name, file_name), 'wb') as f:
-        pickle.dump(obj, f)
-
-def load_artifacts(folder_name, file_name):
-    """Load saved artifacts"""
-    try:
-        with open(os.path.join(folder_name, file_name), 'rb') as f:
-            return pickle.load(f)
-    except FileNotFoundError:
-        st.error(f"File {file_name} not found in {folder_name} folder")
-        return None
-
-def predict_text(model_filename, text, vectorizer_type="tfidf"):
+def predict_text(model_name, text, vectorizer_type="tfidf"):
     """Make prediction on new text"""
     try:
         # Load model
-        with open(os.path.join('models', model_filename), 'rb') as f:
-            model = pickle.load(f)
+        model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
+        model = load_artifacts("models", model_filename)
+        if model is None:
+            return None, None

         # Load vectorizer
-        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
-        vectorizer = load_artifacts("artifacts", vectorizer_file)
+        vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
+        vectorizer = load_artifacts("artifacts", vectorizer_filename)
         if vectorizer is None:
             return None, None

         # Load label encoder
-        encoder = load_artifacts("artifacts", "encoder.pkl")
+        encoder = load_artifacts("artifacts", "label_encoder.pkl")
         if encoder is None:
             return None, None

         # Clean and vectorize text
-        text_cleaner = TextCleaner()
-        clean_text = text_cleaner.clean_text(text)
+        clean_text_input = clean_text(text)
+        text_vector = vectorizer.transform([clean_text_input])

-        # Transform text
-        text_vector = vectorizer.transform([clean_text])
-
-        # For Gaussian NB, convert to dense
-        if 'gaussian' in model_filename:
+        # Special handling for Gaussian NB
+        if "gaussian" in model_name.lower():
             text_vector = text_vector.toarray()

         # Make prediction
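
Both versions of train_model() above densify the feature matrix before fitting Gaussian Naive Bayes. The constraint behind that branch, as a minimal editor's sketch (illustrative, not part of the commit): scikit-learn's GaussianNB rejects SciPy sparse input, while MultinomialNB accepts it directly.

# Sketch: why the Gaussian NB branch calls .toarray()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB

texts = ["cheap offer now", "meeting at noon", "win a cheap prize", "lunch tomorrow"]
labels = [1, 0, 1, 0]

X = TfidfVectorizer().fit_transform(texts)  # CSR sparse matrix
MultinomialNB().fit(X, labels)              # sparse input is accepted
GaussianNB().fit(X.toarray(), labels)       # dense ndarray required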
@@ -154,9 +208,12 @@ def predict_text(model_filename, text, vectorizer_type="tfidf"):
         # Get prediction probabilities if available
         if hasattr(model, 'predict_proba'):
             try:
-                prediction_proba = model.predict_proba(text_vector)[0]
-            except:
-                pass
+                if "gaussian" in model_name.lower():
+                    prediction_proba = model.predict_proba(text_vector)[0]
+                else:
+                    prediction_proba = model.predict_proba(text_vector)[0]
+            except Exception as e:
+                st.warning(f"Could not get prediction probabilities: {str(e)}")

         # Decode prediction
         predicted_label = encoder.inverse_transform(prediction)[0]
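
The hasattr(model, 'predict_proba') guard in this hunk matters because not every model the app offers exposes probabilities: LinearSVC never does, and SVC only because train_model() constructs it with probability=True. A quick check on dummy data (editor's sketch, not part of the commit):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC

X = np.array([[i / 10.0] for i in range(10)])
y = [0] * 5 + [1] * 5

for clf in (LinearSVC(), SVC(probability=True), LogisticRegression()):
    clf.fit(X, y)
    print(type(clf).__name__, hasattr(clf, 'predict_proba'))
# LinearSVC False
# SVC True
# LogisticRegression True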
@@ -167,479 +224,298 @@ def predict_text(model_filename, text, vectorizer_type="tfidf"):
         st.error(f"Error during prediction: {str(e)}")
         return None, None

-# Streamlit App
-st.title('📝 No Code Text Classification App')
-st.markdown('---')
-st.write('Analyze your text data and train machine learning models without coding!')
+# Main App
+st.markdown('<h1 class="main-header">📊 No Code Text Classification App</h1>', unsafe_allow_html=True)
+st.markdown("### Analyze your text data and train machine learning models without coding!")
+
+# Initialize session state
+if 'vectorizer_type' not in st.session_state:
+    st.session_state.vectorizer_type = "tfidf"
+if 'trained_models' not in st.session_state:
+    st.session_state.trained_models = []

 # Sidebar
-st.sidebar.title("Navigation")
-section = st.sidebar.radio("Choose Section", ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"])
-
-# Upload Data
-st.sidebar.markdown("---")
-st.sidebar.subheader("📁 Upload Your Dataset")
-train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
-test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
-
-# Global variables to store data and settings
-if 'vectorizer_type' not in st.session_state:
-    st.session_state.vectorizer_type = "tfidf"
-
-if train_data is not None:
+st.sidebar.markdown("## 📁 Upload Your Dataset")
+
+# File upload with better error handling
+try:
+    uploaded_file = st.sidebar.file_uploader(
+        "Choose a CSV file",
+        type="csv",
+        help="Upload your training dataset (CSV format)"
+    )
+
+    # Encoding selection
+    encoding = st.sidebar.selectbox(
+        "Select file encoding",
+        ["utf-8", "latin1", "iso-8859-1", "cp1252"],
+        help="Try different encodings if you get reading errors"
+    )
+
+except Exception as e:
+    st.sidebar.error(f"File upload error: {str(e)}")
+    uploaded_file = None
+
+# Navigation
+section = st.sidebar.radio(
+    "Choose Section",
+    ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"],
+    help="Navigate through different sections of the app"
+)
+
+# Main content based on section
+if uploaded_file is not None:
     try:
-        # Try different encodings
-        encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
-        train_df = None
-
-        for encoding in encodings:
-            try:
-                train_df = pd.read_csv(train_data, encoding=encoding)
-                break
-            except UnicodeDecodeError:
-                continue
-
-        if train_df is None:
-            st.error("Unable to read the CSV file. Please check the file encoding.")
-        else:
-            if test_data is not None:
-                for encoding in encodings:
-                    try:
-                        test_df = pd.read_csv(test_data, encoding=encoding)
-                        break
-                    except UnicodeDecodeError:
-                        continue
-            else:
-                test_df = None
-
-            # Show data preview
-            with st.sidebar.expander("📋 Data Preview", expanded=True):
-                st.write("Shape:", train_df.shape)
-                st.write(train_df.head(2))
-
-            columns = train_df.columns.tolist()
-            text_data = st.sidebar.selectbox("📝 Choose the text column:", columns)
-            target = st.sidebar.selectbox("🎯 Choose the target column:", columns)
-
-            # Process data
-            if text_data and target:
-                # Clean text
-                text_cleaner = TextCleaner()
-                train_df['clean_text'] = train_df[text_data].apply(text_cleaner.clean_text)
-                train_df['text_length'] = train_df[text_data].str.len()
-
-                # Handle label encoding
-                label_encoder = LabelEncoder()
-                train_df['target_encoded'] = label_encoder.fit_transform(train_df[target])
-
-                # Save label encoder
-                save_artifacts(label_encoder, "artifacts", "encoder.pkl")
-
+        # Load data with selected encoding
+        df = pd.read_csv(uploaded_file, encoding=encoding)
+
+        st.sidebar.success(f"✅ Data loaded successfully! Shape: {df.shape}")
+
+        # Column selection
+        columns = df.columns.tolist()
+        text_column = st.sidebar.selectbox("📝 Select text column:", columns)
+        target_column = st.sidebar.selectbox("🎯 Select target column:", columns)
+
+        # Data preprocessing
+        df['clean_text'] = df[text_column].apply(clean_text)
+        df['text_length'] = df[text_column].astype(str).apply(len)
+
+        # Process target column
+        label_encoder = LabelEncoder()
+        df['encoded_target'] = label_encoder.fit_transform(df[target_column])
+        save_artifacts(label_encoder, "artifacts", "label_encoder.pkl")
+
     except Exception as e:
-        st.error(f"Error loading data: {str(e)}")
-        train_df = None
+        st.error(f"Error loading data: {str(e)}")
+        st.info("💡 Try selecting a different encoding from the sidebar.")
+        df = None

-# Data Analysis Section
+# Section: Data Analysis
 if section == "📊 Data Analysis":
-    if train_data is not None and 'train_df' in locals() and train_df is not None:
-        st.header("📊 Data Analysis")
-
-        # Get insights
-        insights = get_data_insights(train_df, text_data, target)
-
-        # Display insights in columns
-        col1, col2, col3, col4 = st.columns(4)
+    if uploaded_file is not None and df is not None:
+        st.markdown('<h2 class="section-header">Data Analysis</h2>', unsafe_allow_html=True)
+
+        # Data overview
+        col1, col2, col3 = st.columns(3)

         with col1:
-            st.metric("Total Samples", insights['shape'][0])
-
+            st.metric("📋 Total Records", df.shape[0])
         with col2:
-            st.metric("Features", insights['shape'][1])
-
+            st.metric("📊 Features", df.shape[1])
         with col3:
-            st.metric("Classes", len(insights['class_distribution']))
-
-        with col4:
-            st.metric("Avg Text Length", f"{insights['text_length_stats']['mean']:.1f}")
-
-        st.markdown("---")
-
-        # Data quality section
+            st.metric("🏷️ Classes", df[target_column].nunique())
+
+        # Data preview
+        st.subheader("📖 Data Preview")
+        st.dataframe(df[[text_column, target_column, 'text_length']].head(10))
+
+        # Analysis results
+        analysis = analyze_data(df, text_column, target_column)

         col1, col2 = st.columns(2)

         with col1:
-            st.subheader("📋 Dataset Overview")
-            st.write("**Shape:**", insights['shape'])
-            st.write("**Missing Values:**")
-            missing_df = pd.DataFrame.from_dict(insights['missing_values'], orient='index', columns=['Count'])
-            st.dataframe(missing_df[missing_df['Count'] > 0])
-
-            st.write("**Sample Data:**")
-            st.dataframe(train_df[[text_data, target, 'text_length']].head())
+            st.subheader("📈 Text Statistics")
+            st.write(f"**Average text length:** {analysis['avg_text_length']:.2f}")
+            st.write("**Text length distribution:**")
+            st.write(pd.DataFrame([analysis['text_length_stats']]).T)

         with col2:
-            st.subheader("📊 Class Distribution")
-            class_dist = pd.DataFrame.from_dict(insights['class_distribution'], orient='index', columns=['Count'])
+            st.subheader("🏷️ Class Distribution")
+            class_dist = pd.DataFrame(list(analysis['class_distribution'].items()),
+                                      columns=['Class', 'Count'])
             st.dataframe(class_dist)
-
-            # Plot class distribution
-            fig = px.bar(
-                x=class_dist.index,
-                y=class_dist['Count'],
-                title="Class Distribution",
-                labels={'x': 'Class', 'y': 'Count'}
-            )
-            st.plotly_chart(fig, use_container_width=True)
-
-        st.markdown("---")
-
-        # Text analysis section
-        st.subheader("📝 Text Analysis")
-
-        col1, col2 = st.columns(2)
-
-        with col1:
-            # Text length distribution
-            fig = px.histogram(
-                train_df,
-                x='text_length',
-                title="Text Length Distribution",
-                nbins=30
-            )
-            st.plotly_chart(fig, use_container_width=True)
-
-        with col2:
-            # Text length by class
-            fig = px.box(
-                train_df,
-                x=target,
-                y='text_length',
-                title="Text Length by Class"
-            )
-            st.plotly_chart(fig, use_container_width=True)
-
-        # Word frequency analysis
-        st.subheader("🔤 Most Common Words")
-        all_text = ' '.join(train_df['clean_text'].astype(str))
-        word_freq = Counter(all_text.split())
-        top_words = word_freq.most_common(20)
-
-        if top_words:
-            words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
-            fig = px.bar(
-                words_df,
-                x='Frequency',
-                y='Word',
-                orientation='h',
-                title="Top 20 Most Common Words"
-            )
-            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
-            st.plotly_chart(fig, use_container_width=True)
+
+        # Visualizations
+        st.subheader("📊 Visualizations")
+        try:
+            fig = create_visualizations(df, text_column, target_column)
+            st.pyplot(fig)
+        except Exception as e:
+            st.error(f"Error creating visualizations: {str(e)}")

     else:
-        st.warning("📁 Please upload training data to perform analysis")
+        st.warning("📁 Please upload a dataset to analyze.")

-# Train Model Section
+# Section: Train Model
 elif section == "🤖 Train Model":
-    if train_data is not None and 'train_df' in locals() and train_df is not None:
-        st.header("🤖 Train Machine Learning Model")
-
-        col1, col2 = st.columns(2)
-
-        with col1:
-            st.subheader("⚙️ Model Configuration")
-            model_name = st.selectbox("Choose Model", [
-                "Logistic Regression", "Decision Tree",
-                "Random Forest", "Linear SVC", "SVC",
-                "Multinomial Naive Bayes", "Gaussian Naive Bayes"
-            ])
-
-        with col2:
-            st.subheader("📊 Vectorization Method")
-            vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count Vectorizer"])
-
-        # Model parameters
-        st.subheader("🔧 Parameters")
+    if uploaded_file is not None and df is not None:
+        st.markdown('<h2 class="section-header">Model Training</h2>', unsafe_allow_html=True)

         col1, col2 = st.columns(2)

         with col1:
-            max_features = st.slider("Max Features", 1000, 20000, 10000, step=1000)
-            test_size = st.slider("Test Size", 0.1, 0.4, 0.2, step=0.05)
-
-        with col2:
-            random_state = st.number_input("Random State", 0, 1000, 42)
-            min_df = st.slider("Min Document Frequency", 1, 10, 1)
-
-        # Initialize vectorizer
-        if vectorizer_choice == "TF-IDF":
-            vectorizer = TfidfVectorizer(
-                max_features=max_features,
-                min_df=min_df,
-                stop_words='english'
+            st.subheader("🤖 Select Model")
+            model_name = st.selectbox(
+                "Choose algorithm:",
+                ["Logistic Regression", "Decision Tree", "Random Forest",
+                 "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
             )
-            st.session_state.vectorizer_type = "tfidf"
-        else:
-            vectorizer = CountVectorizer(
-                max_features=max_features,
-                min_df=min_df,
-                stop_words='english'
-            )
-            st.session_state.vectorizer_type = "count"
-
-        # Show data info
-        st.subheader("📋 Training Data Info")
-        col1, col2, col3 = st.columns(3)
-
-        with col1:
-            st.metric("Total Samples", len(train_df))

         with col2:
-            st.metric("Unique Classes", train_df[target].nunique())
-
-        with col3:
-            st.metric("Avg Text Length", f"{train_df['text_length'].mean():.1f}")
+            st.subheader("🔤 Select Vectorizer")
+            vectorizer_choice = st.selectbox(
+                "Choose text vectorizer:",
+                ["TF-IDF Vectorizer", "Count Vectorizer"]
+            )
+
+        # Vectorizer parameters
+        max_features = st.slider("Max features", 1000, 50000, 10000)
+        test_size = st.slider("Test size", 0.1, 0.5, 0.2)

         if st.button("🚀 Start Training", type="primary"):
-            with st.spinner("Training model... This may take a few minutes."):
+            with st.spinner("🔄 Training model..."):
                 try:
-                    # Vectorize text data
-                    X = vectorizer.fit_transform(train_df['clean_text'])
-                    y = train_df['target_encoded']
+                    # Initialize vectorizer
+                    if vectorizer_choice == "TF-IDF Vectorizer":
+                        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
+                        st.session_state.vectorizer_type = "tfidf"
+                    else:
+                        vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
+                        st.session_state.vectorizer_type = "count"
+
+                    # Vectorize text
+                    X = vectorizer.fit_transform(df['clean_text'])
+                    y = df['encoded_target']

                     # Split data
                     X_train, X_test, y_train, y_test = train_test_split(
-                        X, y,
-                        test_size=test_size,
-                        random_state=random_state,
-                        stratify=y
+                        X, y, test_size=test_size, random_state=42, stratify=y
                     )

-                    st.success(f"✅ Data split - Train: {X_train.shape}, Test: {X_test.shape}")
-
                     # Save vectorizer
                     vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
                     save_artifacts(vectorizer, "artifacts", vectorizer_filename)

                     # Train model
-                    model, accuracy, y_pred, model_filename = train_model(
-                        model_name, X_train, X_test, y_train, y_test
-                    )
-
-                    st.success("🎉 Model training completed!")
-
-                    # Display results
-                    col1, col2 = st.columns(2)
-
-                    with col1:
-                        st.metric("🎯 Test Accuracy", f"{accuracy:.4f}")
-
-                        # Classification report
-                        st.subheader("📊 Classification Report")
-                        report = classification_report(
-                            y_test, y_pred,
-                            target_names=label_encoder.classes_,
-                            output_dict=True
-                        )
-                        report_df = pd.DataFrame(report).transpose()
-                        st.dataframe(report_df.round(4))
-
-                    with col2:
-                        # Confusion matrix
-                        st.subheader("🔄 Confusion Matrix")
-                        cm = confusion_matrix(y_test, y_pred)
-                        fig = px.imshow(
-                            cm,
-                            text_auto=True,
-                            aspect="auto",
-                            title="Confusion Matrix",
-                            labels=dict(x="Predicted", y="Actual"),
-                            x=label_encoder.classes_,
-                            y=label_encoder.classes_
-                        )
-                        st.plotly_chart(fig, use_container_width=True)
-
-                    st.info(f"✅ Model saved as: {model_filename}")
-                    st.info("🔮 You can now use the 'Predictions' section to classify new text!")
-
+                    model, accuracy, report = train_model(model_name, X_train, X_test, y_train, y_test)
+
+                    if model is not None:
+                        st.success(f"✅ Model trained successfully!")
+                        st.session_state.trained_models.append(model_name)
+
+                        # Display results
+                        col1, col2 = st.columns(2)
+
+                        with col1:
+                            st.metric("🎯 Accuracy", f"{accuracy:.4f}")
+
+                        with col2:
+                            st.metric("🏷️ Classes", len(report) - 3)  # Exclude avg metrics
+
+                        # Detailed metrics
+                        st.subheader("📊 Detailed Metrics")
+                        metrics_df = pd.DataFrame(report).transpose()
+                        st.dataframe(metrics_df.round(4))

                 except Exception as e:
-                    st.error(f"❌ Error during training: {str(e)}")
-
+                    st.error(f"❌ Training failed: {str(e)}")
     else:
-        st.warning("📁 Please upload training data to train a model")
+        st.warning("📁 Please upload a dataset to train a model.")

-# Predictions Section
+# Section: Predictions
 elif section == "🔮 Predictions":
-    st.header("🔮 Text Classification Predictions")
+    st.markdown('<h2 class="section-header">Make Predictions</h2>', unsafe_allow_html=True)

-    # Check if models exist
+    # Check for trained models
     if os.path.exists("models") and os.listdir("models"):
-        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
+        available_models = [f.replace('_model.pkl', '').replace('_', ' ').title()
+                            for f in os.listdir("models") if f.endswith('.pkl')]

         if available_models:
             # Single prediction
-            st.subheader("📝 Single Text Classification")
+            st.subheader("🔮 Single Text Prediction")

-            col1, col2 = st.columns([2, 1])
+            col1, col2 = st.columns([3, 1])

             with col1:
-                text_input = st.text_area("Enter text to classify:", height=150)
+                text_input = st.text_area(
+                    "Enter text to classify:",
+                    height=100,
+                    placeholder="Type or paste your text here..."
+                )

             with col2:
-                selected_model = st.selectbox("Choose model:", available_models)
-                predict_button = st.button("🔮 Predict", type="primary")
-
-            if predict_button and text_input.strip():
-                with st.spinner("Making prediction..."):
-                    predicted_label, prediction_proba = predict_text(
-                        selected_model,
-                        text_input,
-                        st.session_state.get('vectorizer_type', 'tfidf')
-                    )
-
-                if predicted_label is not None:
-                    st.success("✅ Prediction completed!")
-
-                    # Display results
-                    col1, col2 = st.columns(2)
-
-                    with col1:
-                        st.markdown("### 🎯 Results")
-                        st.markdown(f"**Input Text:** {text_input[:200]}{'...' if len(text_input) > 200 else ''}")
-                        st.markdown(f"**Predicted Class:** `{predicted_label}`")
-
-                    with col2:
-                        # Display probabilities if available
-                        if prediction_proba is not None:
-                            st.markdown("### 📊 Class Probabilities")
-
-                            encoder = load_artifacts("artifacts", "encoder.pkl")
-                            if encoder is not None:
-                                prob_df = pd.DataFrame({
-                                    'Class': encoder.classes_,
-                                    'Probability': prediction_proba
-                                }).sort_values('Probability', ascending=False)
-
-                                fig = px.bar(
-                                    prob_df,
-                                    x='Probability',
-                                    y='Class',
-                                    orientation='h',
-                                    title="Prediction Confidence"
-                                )
-                                fig.update_layout(yaxis={'categoryorder': 'total ascending'})
-                                st.plotly_chart(fig, use_container_width=True)
-
-            elif predict_button:
-                st.warning("⚠️ Please enter some text to classify")
+                selected_model = st.selectbox("Select model:", available_models)
+
+            if st.button("🔍 Predict", type="primary"):
+                if text_input.strip():
+                    with st.spinner("🔄 Making prediction..."):
+                        predicted_label, prediction_proba = predict_text(
+                            selected_model, text_input, st.session_state.get('vectorizer_type', 'tfidf')
+                        )
+
+                    if predicted_label is not None:
+                        st.success("✅ Prediction completed!")
+
+                        # Results
+                        st.markdown("### 📋 Results")
+                        st.info(f"**Predicted Class:** {predicted_label}")
+
+                        # Probabilities
+                        if prediction_proba is not None:
+                            encoder = load_artifacts("artifacts", "label_encoder.pkl")
+                            if encoder is not None:
+                                classes = encoder.classes_
+                                prob_df = pd.DataFrame({
+                                    'Class': classes,
+                                    'Probability': prediction_proba
+                                }).sort_values('Probability', ascending=False)
+
+                                st.markdown("### 📊 Class Probabilities")
+                                st.bar_chart(prob_df.set_index('Class'))
+                else:
+                    st.warning("⚠️ Please enter some text to classify.")

             # Batch predictions
             st.markdown("---")
-            st.subheader("📊 Batch Predictions")
+            st.subheader("📦 Batch Predictions")

-            uploaded_file = st.file_uploader("Upload CSV file with texts to classify", type=['csv'])
+            batch_file = st.file_uploader("Upload CSV for batch prediction", type=['csv'])

-            if uploaded_file is not None:
+            if batch_file is not None:
                 try:
-                    # Try different encodings for batch file
-                    encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
-                    batch_df = None
-
-                    for encoding in encodings:
-                        try:
-                            batch_df = pd.read_csv(uploaded_file, encoding=encoding)
-                            break
-                        except UnicodeDecodeError:
-                            continue
-
-                    if batch_df is not None:
-                        st.write("📋 Uploaded data preview:")
-                        st.dataframe(batch_df.head())
-
-                        col1, col2 = st.columns(2)
-
-                        with col1:
-                            text_column = st.selectbox("Select text column:", batch_df.columns.tolist())
-
-                        with col2:
-                            batch_model = st.selectbox("Choose model:", available_models, key="batch_model")
-
-                        if st.button("🚀 Run Batch Predictions", type="primary"):
-                            with st.spinner("Processing batch predictions..."):
-                                predictions = []
-                                confidences = []
-
-                                progress_bar = st.progress(0)
-                                total_texts = len(batch_df)
-
-                                for i, text in enumerate(batch_df[text_column]):
-                                    pred, proba = predict_text(
-                                        batch_model,
-                                        str(text),
-                                        st.session_state.get('vectorizer_type', 'tfidf')
-                                    )
-                                    predictions.append(pred if pred is not None else "Error")
-
-                                    # Get confidence (max probability)
-                                    if proba is not None:
-                                        confidences.append(max(proba))
-                                    else:
-                                        confidences.append(0.0)
-
-                                    progress_bar.progress((i + 1) / total_texts)
-
-                                batch_df['Predicted_Class'] = predictions
-                                batch_df['Confidence'] = confidences
-
-                                st.success("✅ Batch predictions completed!")
-
-                                # Show results
-                                st.subheader("📊 Results")
-                                result_df = batch_df[[text_column, 'Predicted_Class', 'Confidence']]
-                                st.dataframe(result_df)
-
-                                # Summary statistics
-                                st.subheader("📈 Summary")
-                                col1, col2, col3 = st.columns(3)
-
-                                with col1:
-                                    st.metric("Total Predictions", len(predictions))
-
-                                with col2:
-                                    successful_preds = sum(1 for p in predictions if p != "Error")
-                                    st.metric("Successful", successful_preds)
-
-                                with col3:
-                                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
-                                    st.metric("Avg Confidence", f"{avg_confidence:.3f}")
-
-                                # Class distribution of predictions
-                                pred_counts = pd.Series(predictions).value_counts()
-                                if len(pred_counts) > 0:
-                                    fig = px.pie(
-                                        values=pred_counts.values,
-                                        names=pred_counts.index,
-                                        title="Distribution of Predictions"
-                                    )
-                                    st.plotly_chart(fig, use_container_width=True)
-
-                                # Download results
-                                csv = batch_df.to_csv(index=False)
-                                st.download_button(
-                                    label="📥 Download Results as CSV",
-                                    data=csv,
-                                    file_name="batch_predictions.csv",
-                                    mime="text/csv"
+                    batch_df = pd.read_csv(batch_file, encoding=encoding)
+                    st.write("📖 Preview:")
+                    st.dataframe(batch_df.head())
+
+                    batch_text_col = st.selectbox("Select text column:", batch_df.columns.tolist())
+                    batch_model = st.selectbox("Select model for batch:", available_models, key="batch_model")
+
+                    if st.button("🚀 Run Batch Predictions"):
+                        with st.spinner("🔄 Processing batch predictions..."):
+                            predictions = []
+                            progress_bar = st.progress(0)
+
+                            for i, text in enumerate(batch_df[batch_text_col]):
+                                pred, _ = predict_text(
+                                    batch_model, str(text),
+                                    st.session_state.get('vectorizer_type', 'tfidf')
                                 )
-                    else:
-                        st.error("❌ Unable to read the CSV file. Please check the file encoding.")
-
+                                predictions.append(pred if pred is not None else "Error")
+                                progress_bar.progress((i + 1) / len(batch_df))
+
+                            batch_df['Predicted_Class'] = predictions
+
+                            st.success("✅ Batch predictions completed!")
+                            st.dataframe(batch_df[[batch_text_col, 'Predicted_Class']])
+
+                            # Download option
+                            csv = batch_df.to_csv(index=False)
+                            st.download_button(
+                                "📥 Download Results",
+                                csv,
+                                "batch_predictions.csv",
+                                "text/csv"
+                            )
+
                 except Exception as e:
-                    st.error(f"❌ Error in batch prediction: {str(e)}")
+                    st.error(f"❌ Batch prediction error: {str(e)}")
         else:
-            st.warning("⚠️ No trained models found. Please train a model first.")
+            st.warning("⚠️ No trained models found.")
     else:
-        st.warning("⚠️ No models directory found. Please go to 'Train Model' section to train a model first.")
+        st.warning("⚠️ No models available. Please train a model first.")

 # Footer
 st.markdown("---")
-st.markdown("🚀 Built with Streamlit | 📊 No-Code Text Classification")
+st.markdown("*Built with Streamlit | Text Classification Made Easy*")
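
One behavioural change in the final hunk: the old app silently retried four encodings (utf-8, latin1, cp1252, iso-8859-1) when reading the uploaded CSV, whereas the new app has the user pick a single encoding in the sidebar. The retired fallback pattern, as an editor's sketch (the helper name is illustrative); note that because latin1 maps every possible byte, the loop could never actually reach the entries after it.

import pandas as pd

def read_csv_with_fallback(path, encodings=('utf-8', 'latin1', 'cp1252', 'iso-8859-1')):
    """Try each encoding in turn; return None if all fail."""
    for encoding in encodings:
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError:
            continue
    return None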
 
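The new training UI reports len(report) - 3 as the class count. That works because classification_report(..., output_dict=True) returns one entry per class plus exactly three summary keys: 'accuracy', 'macro avg', and 'weighted avg'. An editor's sketch (not part of the commit):

from sklearn.metrics import classification_report

y_true = [0, 0, 1, 1, 2]
y_pred = [0, 1, 1, 1, 2]

report = classification_report(y_true, y_pred, output_dict=True)
print(sorted(report.keys()))
# ['0', '1', '2', 'accuracy', 'macro avg', 'weighted avg']
print(len(report) - 3)  # 3 classes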
 
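The prediction path also depends on the LabelEncoder pickled to artifacts/label_encoder.pkl during data loading: fit_transform() maps class names to integers for training, and inverse_transform() maps model output back to names, which is what predict_text() does before displaying the result. A round-trip sketch (editor's illustration):

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded = encoder.fit_transform(['spam', 'ham', 'spam', 'ham'])
print(list(encoded))                      # [1, 0, 1, 0]
print(list(encoder.classes_))             # ['ham', 'spam'] (sorted)
print(encoder.inverse_transform([0, 1]))  # ['ham' 'spam']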