Spaces:

Alamgirapi
/

NoCodeTextClassifier

Sleeping

App Files Files Community

Alamgirapi committed on Aug 6, 2025

Commit

8d810b6

verified ·

1 Parent(s): cb7e73c

Update app.py

Browse files

Files changed (1) hide show

app.py +645 -336

app.py CHANGED Viewed

@@ -1,336 +1,645 @@
-import streamlit as st
-import pandas as pd
-import matplotlib.pyplot as plt
-import numpy as np
-from NoCodeTextClassifier.EDA import Informations, Visualizations
-from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
-from NoCodeTextClassifier.models import Models
-import os
-import pickle
-from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
-# Utility functions
-def save_artifacts(obj, folder_name, file_name):
-    """Save artifacts like encoders and vectorizers"""
-    os.makedirs(folder_name, exist_ok=True)
-    with open(os.path.join(folder_name, file_name), 'wb') as f:
-        pickle.dump(obj, f)
-def load_artifacts(folder_name, file_name):
-    """Load saved artifacts"""
-    try:
-        with open(os.path.join(folder_name, file_name), 'rb') as f:
-            return pickle.load(f)
-    except FileNotFoundError:
-        st.error(f"File {file_name} not found in {folder_name} folder")
-        return None
-def load_model(model_name):
-    """Load trained model"""
-    try:
-        with open(os.path.join('models', model_name), 'rb') as f:
-            return pickle.load(f)
-    except FileNotFoundError:
-        st.error(f"Model {model_name} not found. Please train a model first.")
-        return None
-def predict_text(model_name, text, vectorizer_type="tfidf"):
-    """Make prediction on new text"""
-    try:
-        # Load model
-        model = load_model(model_name)
-        if model is None:
-            return None, None
-        # Load vectorizer
-        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
-        vectorizer = load_artifacts("artifacts", vectorizer_file)
-        if vectorizer is None:
-            return None, None
-        # Load label encoder
-        encoder = load_artifacts("artifacts", "encoder.pkl")
-        if encoder is None:
-            return None, None
-        # Clean and vectorize text
-        text_cleaner = TextCleaner()
-        clean_text = text_cleaner.clean_text(text)
-        # Transform text using the same vectorizer used during training
-        text_vector = vectorizer.transform([clean_text])
-        # Make prediction
-        prediction = model.predict(text_vector)
-        prediction_proba = None
-        # Get prediction probabilities if available
-        if hasattr(model, 'predict_proba'):
-            try:
-                prediction_proba = model.predict_proba(text_vector)[0]
-            except:
-                pass
-        # Decode prediction
-        predicted_label = encoder.inverse_transform(prediction)[0]
-        return predicted_label, prediction_proba
-    except Exception as e:
-        st.error(f"Error during prediction: {str(e)}")
-        return None, None
-# Streamlit App
-st.title('No Code Text Classification App')
-st.write('Understand the behavior of your text data and train a model to classify the text data')
-# Sidebar
-section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
-# Upload Data
-st.sidebar.subheader("Upload Your Dataset")
-train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
-test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
-# Global variables to store data and settings
-if 'vectorizer_type' not in st.session_state:
-    st.session_state.vectorizer_type = "tfidf"
-if train_data is not None:
-    try:
-        train_df = pd.read_csv(train_data, encoding='latin1')
-        if test_data is not None:
-            test_df = pd.read_csv(test_data, encoding='latin1')
-        else:
-            test_df = None
-        st.write("Training Data Preview:")
-        st.write(train_df.head(3))
-        columns = train_df.columns.tolist()
-        text_data = st.sidebar.selectbox("Choose the text column:", columns)
-        target = st.sidebar.selectbox("Choose the target column:", columns)
-        # Process data
-        info = Informations(train_df, text_data, target)
-        train_df['clean_text'] = info.clean_text()
-        train_df['text_length'] = info.text_length()
-        # Handle label encoding manually if the class doesn't store encoder
-        from sklearn.preprocessing import LabelEncoder
-        label_encoder = LabelEncoder()
-        train_df['target'] = label_encoder.fit_transform(train_df[target])
-        # Save label encoder for later use
-        os.makedirs("artifacts", exist_ok=True)
-        save_artifacts(label_encoder, "artifacts", "encoder.pkl")
-    except Exception as e:
-        st.error(f"Error loading data: {str(e)}")
-        train_df = None
-        info = None
-# Data Analysis Section
-if section == "Data Analysis":
-    if train_data is not None and train_df is not None:
-        try:
-            st.subheader("Get Insights from the Data")
-            st.write("Data Shape:", info.shape())
-            st.write("Class Imbalance:", info.class_imbalanced())
-            st.write("Missing Values:", info.missing_values())
-            st.write("Processed Data Preview:")
-            st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
-            st.markdown("**Text Length Analysis**")
-            st.write(info.analysis_text_length('text_length'))
-            # Calculate correlation manually since we handled encoding separately
-            correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
-            st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
-            st.subheader("Visualizations")
-            vis = Visualizations(train_df, text_data, target)
-            vis.class_distribution()
-            vis.text_length_distribution()
-        except Exception as e:
-            st.error(f"Error in data analysis: {str(e)}")
-    else:
-        st.warning("Please upload training data to get insights")
-# Train Model Section
-elif section == "Train Model":
-    if train_data is not None and train_df is not None:
-        try:
-            st.subheader("Train a Model")
-            # Create two columns for model selection
-            col1, col2 = st.columns(2)
-            with col1:
-                model = st.radio("Choose the Model", [
-                    "Logistic Regression", "Decision Tree",
-                    "Random Forest", "Linear SVC", "SVC",
-                    "Multinomial Naive Bayes", "Gaussian Naive Bayes"
-                ])
-            with col2:
-                vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
-            # Initialize vectorizer
-            if vectorizer_choice == "Tfidf Vectorizer":
-                vectorizer = TfidfVectorizer(max_features=10000)
-                st.session_state.vectorizer_type = "tfidf"
-            else:
-                vectorizer = CountVectorizer(max_features=10000)
-                st.session_state.vectorizer_type = "count"
-            st.write("Training Data Preview:")
-            st.write(train_df[['clean_text', 'target']].head(3))
-            # Vectorize text data
-            X = vectorizer.fit_transform(train_df['clean_text'])
-            y = train_df['target']
-            # Split data
-            X_train, X_test, y_train, y_test = process.split_data(X, y)
-            st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
-            # Save vectorizer for later use
-            vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
-            save_artifacts(vectorizer, "artifacts", vectorizer_filename)
-            if st.button("Start Training"):
-                with st.spinner("Training model..."):
-                    models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
-                    # Train selected model
-                    if model == "Logistic Regression":
-                        models.LogisticRegression()
-                    elif model == "Decision Tree":
-                        models.DecisionTree()
-                    elif model == "Linear SVC":
-                        models.LinearSVC()
-                    elif model == "SVC":
-                        models.SVC()
-                    elif model == "Multinomial Naive Bayes":
-                        models.MultinomialNB()
-                    elif model == "Random Forest":
-                        models.RandomForestClassifier()
-                    elif model == "Gaussian Naive Bayes":
-                        models.GaussianNB()
-                st.success("Model training completed!")
-                st.info("You can now use the 'Predictions' section to classify new text.")
-        except Exception as e:
-            st.error(f"Error in model training: {str(e)}")
-    else:
-        st.warning("Please upload training data to train a model")
-# Predictions Section
-elif section == "Predictions":
-    st.subheader("Perform Predictions on New Text")
-    # Check if models exist
-    if os.path.exists("models") and os.listdir("models"):
-        # Text input for prediction
-        text_input = st.text_area("Enter the text to classify:", height=100)
-        # Model selection
-        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
-        if available_models:
-            selected_model = st.selectbox("Choose the trained model:", available_models)
-            # Prediction button
-            if st.button("Predict", key="single_predict"):
-                if text_input.strip():
-                    with st.spinner("Making prediction..."):
-                        predicted_label, prediction_proba = predict_text(
-                            selected_model,
-                            text_input,
-                            st.session_state.get('vectorizer_type', 'tfidf')
-                        )
-                        if predicted_label is not None:
-                            st.success("Prediction completed!")
-                            # Display results
-                            st.markdown("### Prediction Results")
-                            st.markdown(f"**Input Text:** {text_input}")
-                            st.markdown(f"**Predicted Class:** {predicted_label}")
-                            # Display probabilities if available
-                            if prediction_proba is not None:
-                                st.markdown("**Class Probabilities:**")
-                                # Load encoder to get class names
-                                encoder = load_artifacts("artifacts", "encoder.pkl")
-                                if encoder is not None:
-                                    classes = encoder.classes_
-                                    prob_df = pd.DataFrame({
-                                        'Class': classes,
-                                        'Probability': prediction_proba
-                                    }).sort_values('Probability', ascending=False)
-                                    st.bar_chart(prob_df.set_index('Class'))
-                                    st.dataframe(prob_df)
-                else:
-                    st.warning("Please enter some text to classify")
-        else:
-            st.warning("No trained models found. Please train a model first.")
-    else:
-        st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")
-    # Option to classify multiple texts
-    st.markdown("---")
-    st.subheader("Batch Predictions")
-    uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
-    if uploaded_file is not None:
-        try:
-            batch_df = pd.read_csv(uploaded_file, encoding='latin1')
-            st.write("Uploaded data preview:")
-            st.write(batch_df.head())
-            # Select text column
-            text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
-            if os.path.exists("models") and os.listdir("models"):
-                available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
-                batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
-                if st.button("Run Batch Predictions", key="batch_predict"):
-                    with st.spinner("Processing batch predictions..."):
-                        predictions = []
-                        for text in batch_df[text_column]:
-                            pred, _ = predict_text(
-                                batch_model,
-                                str(text),
-                                st.session_state.get('vectorizer_type', 'tfidf')
-                            )
-                            predictions.append(pred if pred is not None else "Error")
-                        batch_df['Predicted_Class'] = predictions
-                        st.success("Batch predictions completed!")
-                        st.write("Results:")
-                        st.write(batch_df[[text_column, 'Predicted_Class']])
-                        # Download results
-                        csv = batch_df.to_csv(index=False)
-                        st.download_button(
-                            label="Download predictions as CSV",
-                            data=csv,
-                            file_name="batch_predictions.csv",
-                            mime="text/csv"
-                        )
-        except Exception as e:
-            st.error(f"Error in batch prediction: {str(e)}")

+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import LinearSVC, SVC
+from sklearn.naive_bayes import MultinomialNB, GaussianNB
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+import os
+import pickle
+import re
+import string
+from collections import Counter
+import plotly.express as px
+import plotly.graph_objects as go
+# Configure Streamlit page
+# Sets the browser-tab title and icon and switches to the wide layout.
+# This is the first st.* call in the script.
+# NOTE(review): Streamlit has historically required set_page_config to run
+# before any other Streamlit command - confirm for the pinned version.
+st.set_page_config(
+    page_title="Text Classification App",
+    page_icon="📝",
+    layout="wide"
+)
+# Text preprocessing class
+class TextCleaner:
+    """Normalize raw text before it is vectorized."""
+    def __init__(self):
+        # Short hand-picked list of English function words to discard.
+        self.stop_words = {
+            'the', 'a', 'an', 'and', 'or', 'but', 'in',
+            'on', 'at', 'to', 'for', 'of', 'with', 'by',
+        }
+    def clean_text(self, text):
+        """Return a cleaned copy of ``text``; missing values become ''.
+
+        The input is stringified and lower-cased, then URLs and every
+        character that is not an ASCII letter or whitespace are removed,
+        whitespace runs are collapsed, and stop words are dropped.
+        """
+        if pd.isna(text):
+            return ""
+        normalized = str(text).lower()
+        # Ordered (pattern, replacement) passes: URLs, non-letters, whitespace.
+        substitutions = (
+            (r'http\S+', ''),
+            (r'[^a-zA-Z\s]', ''),
+            (r'\s+', ' '),
+        )
+        for pattern, replacement in substitutions:
+            normalized = re.sub(pattern, replacement, normalized)
+        kept = [token for token in normalized.strip().split() if token not in self.stop_words]
+        return ' '.join(kept)
+# Data analysis functions
+def get_data_insights(df, text_col, target_col):
+    """Summarize a labelled text dataset.
+
+    Args:
+        df: DataFrame holding the dataset.
+        text_col: Name of the column containing the raw text. It is read
+            through the pandas ``.str`` accessor, so it is expected to hold
+            strings (missing values are allowed).
+        target_col: Name of the column containing the class labels.
+
+    Returns:
+        dict with the keys ``'shape'`` (``df.shape``), ``'missing_values'``
+        (null count per column), ``'class_distribution'`` (row count per
+        label) and ``'text_length_stats'`` (mean/median/min/max of the
+        per-row character count of ``text_col``).
+    """
+    # Compute the per-row character counts once and reuse them for all four
+    # statistics instead of re-scanning the column for each one.
+    lengths = df[text_col].str.len()
+    return {
+        'shape': df.shape,
+        'missing_values': df.isnull().sum().to_dict(),
+        'class_distribution': df[target_col].value_counts().to_dict(),
+        'text_length_stats': {
+            'mean': lengths.mean(),
+            'median': lengths.median(),
+            'min': lengths.min(),
+            'max': lengths.max(),
+        },
+    }
+# Model training functions
+def train_model(model_name, X_train, X_test, y_train, y_test):
+    """Fit the selected classifier, score it on the hold-out split and pickle it.
+
+    Args:
+        model_name: One of the display names used as keys below
+            (e.g. ``'Logistic Regression'``). An unknown name raises KeyError.
+        X_train, X_test: Feature matrices for fitting and evaluation.
+        y_train, y_test: Matching label vectors.
+
+    Returns:
+        Tuple ``(model, accuracy, y_pred, model_filename)`` where ``accuracy``
+        is the accuracy on ``X_test``, ``y_pred`` the predictions for
+        ``X_test`` and ``model_filename`` the file written under ``models/``.
+    """
+    # Factories instead of instances: only the requested estimator is built,
+    # rather than constructing all seven on every call.
+    model_factories = {
+        'Logistic Regression': lambda: LogisticRegression(random_state=42, max_iter=1000),
+        'Decision Tree': lambda: DecisionTreeClassifier(random_state=42),
+        'Random Forest': lambda: RandomForestClassifier(random_state=42, n_estimators=100),
+        'Linear SVC': lambda: LinearSVC(random_state=42, max_iter=1000),
+        'SVC': lambda: SVC(random_state=42, probability=True),
+        'Multinomial Naive Bayes': MultinomialNB,
+        'Gaussian Naive Bayes': GaussianNB,
+    }
+    model = model_factories[model_name]()
+    # For Gaussian NB, convert sparse matrices to dense. Inputs that are
+    # already dense (no .toarray method) are passed through untouched.
+    if model_name == 'Gaussian Naive Bayes':
+        if hasattr(X_train, 'toarray'):
+            X_train = X_train.toarray()
+        if hasattr(X_test, 'toarray'):
+            X_test = X_test.toarray()
+    # Train model
+    model.fit(X_train, y_train)
+    # Make predictions
+    y_pred = model.predict(X_test)
+    # Calculate metrics
+    accuracy = accuracy_score(y_test, y_pred)
+    # Save model; the file name is the display name lower-cased with
+    # spaces replaced by underscores.
+    os.makedirs("models", exist_ok=True)
+    model_filename = f"{model_name.replace(' ', '_').lower()}.pkl"
+    with open(os.path.join("models", model_filename), 'wb') as f:
+        pickle.dump(model, f)
+    return model, accuracy, y_pred, model_filename
+# Utility functions
+def save_artifacts(obj, folder_name, file_name):
+    """Pickle ``obj`` to ``folder_name/file_name``, creating the folder if needed."""
+    os.makedirs(folder_name, exist_ok=True)
+    destination = os.path.join(folder_name, file_name)
+    with open(destination, 'wb') as handle:
+        pickle.dump(obj, handle)
+def load_artifacts(folder_name, file_name):
+    """Unpickle ``folder_name/file_name``.
+
+    Returns the stored object, or None (after showing a Streamlit error)
+    when the file does not exist.
+    """
+    artifact_path = os.path.join(folder_name, file_name)
+    try:
+        with open(artifact_path, 'rb') as handle:
+            loaded = pickle.load(handle)
+    except FileNotFoundError:
+        st.error(f"File {file_name} not found in {folder_name} folder")
+        return None
+    return loaded
+def predict_text(model_filename, text, vectorizer_type="tfidf"):
+    """Classify a single piece of text with a previously saved model.
+
+    Args:
+        model_filename: File name of the pickled model inside ``models/``.
+        text: Raw text to classify; it is cleaned with ``TextCleaner``.
+        vectorizer_type: Prefix of the vectorizer artifact to load
+            (``artifacts/<vectorizer_type>_vectorizer.pkl``).
+
+    Returns:
+        Tuple ``(predicted_label, prediction_proba)``. ``prediction_proba`` is
+        the probability row for the text, or None when the model cannot
+        provide one. ``(None, None)`` is returned when an artifact is missing
+        or any step fails; failures are reported through ``st.error``.
+    """
+    try:
+        # Load model
+        # NOTE: unpickling executes code embedded in the file, so models/
+        # must only ever contain files written by this app.
+        with open(os.path.join('models', model_filename), 'rb') as f:
+            model = pickle.load(f)
+        # Load vectorizer
+        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
+        vectorizer = load_artifacts("artifacts", vectorizer_file)
+        if vectorizer is None:
+            return None, None
+        # Load label encoder
+        encoder = load_artifacts("artifacts", "encoder.pkl")
+        if encoder is None:
+            return None, None
+        # Clean and vectorize text
+        text_cleaner = TextCleaner()
+        clean_text = text_cleaner.clean_text(text)
+        # Transform text
+        text_vector = vectorizer.transform([clean_text])
+        # For Gaussian NB, convert to dense. The estimator type is checked as
+        # well as the file name so a renamed model file still works.
+        if isinstance(model, GaussianNB) or 'gaussian' in model_filename:
+            text_vector = text_vector.toarray()
+        # Make prediction
+        prediction = model.predict(text_vector)
+        prediction_proba = None
+        # Get prediction probabilities if available
+        if hasattr(model, 'predict_proba'):
+            try:
+                prediction_proba = model.predict_proba(text_vector)[0]
+            except Exception:
+                # Probabilities are optional, so the label is still returned.
+                # Only Exception is caught: a bare except would also swallow
+                # BaseException-derived control-flow signals.
+                prediction_proba = None
+        # Decode prediction
+        predicted_label = encoder.inverse_transform(prediction)[0]
+        return predicted_label, prediction_proba
+    except Exception as e:
+        st.error(f"Error during prediction: {str(e)}")
+        return None, None
+# Streamlit App
+# Page heading shown above whichever section is selected.
+st.title('📝 No Code Text Classification App')
+st.markdown('---')
+st.write('Analyze your text data and train machine learning models without coding!')
+# Sidebar
+st.sidebar.title("Navigation")
+# `section` holds the selected label verbatim (emoji included); the
+# if/elif chain further down compares against these exact strings.
+section = st.sidebar.radio("Choose Section", ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"])
+# Upload Data
+st.sidebar.markdown("---")
+st.sidebar.subheader("📁 Upload Your Dataset")
+# Both uploaders accept CSV only; each value is None until a file is chosen.
+train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
+test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
+# Global variables to store data and settings
+if 'vectorizer_type' not in st.session_state:
+    st.session_state.vectorizer_type = "tfidf"
+def _read_uploaded_csv(uploaded_file, encodings):
+    """Parse an uploaded CSV, trying each encoding in turn; None if all fail."""
+    for encoding in encodings:
+        # A failed decode leaves the upload buffer part-way consumed, so
+        # every attempt must restart from the first byte.
+        uploaded_file.seek(0)
+        try:
+            return pd.read_csv(uploaded_file, encoding=encoding)
+        except UnicodeDecodeError:
+            continue
+    return None
+# Bound up front so the sections below can test them even when nothing
+# was uploaded or every decoding attempt failed.
+train_df = None
+test_df = None
+if train_data is not None:
+    try:
+        # Try different encodings
+        encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
+        train_df = _read_uploaded_csv(train_data, encodings)
+        if train_df is None:
+            st.error("Unable to read the CSV file. Please check the file encoding.")
+        else:
+            if test_data is not None:
+                test_df = _read_uploaded_csv(test_data, encodings)
+            # Show data preview
+            with st.sidebar.expander("📋 Data Preview", expanded=True):
+                st.write("Shape:", train_df.shape)
+                st.write(train_df.head(2))
+            columns = train_df.columns.tolist()
+            text_data = st.sidebar.selectbox("📝 Choose the text column:", columns)
+            target = st.sidebar.selectbox("🎯 Choose the target column:", columns)
+            # Process data
+            if text_data and target:
+                # Clean text
+                text_cleaner = TextCleaner()
+                train_df['clean_text'] = train_df[text_data].apply(text_cleaner.clean_text)
+                train_df['text_length'] = train_df[text_data].str.len()
+                # Handle label encoding
+                label_encoder = LabelEncoder()
+                train_df['target_encoded'] = label_encoder.fit_transform(train_df[target])
+                # Save label encoder so the Predictions section can decode labels
+                save_artifacts(label_encoder, "artifacts", "encoder.pkl")
+    except Exception as e:
+        st.error(f"Error loading data: {str(e)}")
+        train_df = None
+# Data Analysis Section
+# Renders summary metrics, class balance, text-length plots and the most
+# frequent words for the uploaded training data.
+if section == "📊 Data Analysis":
+    # Defensive check: proceed only when a file was uploaded and it was
+    # successfully parsed into train_df.
+    if train_data is not None and 'train_df' in locals() and train_df is not None:
+        st.header("📊 Data Analysis")
+        # Get insights
+        insights = get_data_insights(train_df, text_data, target)
+        # Display insights in columns
+        col1, col2, col3, col4 = st.columns(4)
+        with col1:
+            st.metric("Total Samples", insights['shape'][0])
+        with col2:
+            # Column count of train_df as it is now, i.e. including the
+            # derived clean_text / text_length / target_encoded columns.
+            st.metric("Features", insights['shape'][1])
+        with col3:
+            st.metric("Classes", len(insights['class_distribution']))
+        with col4:
+            st.metric("Avg Text Length", f"{insights['text_length_stats']['mean']:.1f}")
+        st.markdown("---")
+        # Data quality section
+        col1, col2 = st.columns(2)
+        with col1:
+            st.subheader("📋 Dataset Overview")
+            st.write("**Shape:**", insights['shape'])
+            st.write("**Missing Values:**")
+            missing_df = pd.DataFrame.from_dict(insights['missing_values'], orient='index', columns=['Count'])
+            # Only columns that actually contain nulls are listed.
+            st.dataframe(missing_df[missing_df['Count'] > 0])
+            st.write("**Sample Data:**")
+            st.dataframe(train_df[[text_data, target, 'text_length']].head())
+        with col2:
+            st.subheader("📊 Class Distribution")
+            class_dist = pd.DataFrame.from_dict(insights['class_distribution'], orient='index', columns=['Count'])
+            st.dataframe(class_dist)
+            # Plot class distribution
+            fig = px.bar(
+                x=class_dist.index,
+                y=class_dist['Count'],
+                title="Class Distribution",
+                labels={'x': 'Class', 'y': 'Count'}
+            )
+            st.plotly_chart(fig, use_container_width=True)
+        st.markdown("---")
+        # Text analysis section
+        st.subheader("📝 Text Analysis")
+        col1, col2 = st.columns(2)
+        with col1:
+            # Text length distribution
+            fig = px.histogram(
+                train_df,
+                x='text_length',
+                title="Text Length Distribution",
+                nbins=30
+            )
+            st.plotly_chart(fig, use_container_width=True)
+        with col2:
+            # Text length by class
+            fig = px.box(
+                train_df,
+                x=target,
+                y='text_length',
+                title="Text Length by Class"
+            )
+            st.plotly_chart(fig, use_container_width=True)
+        # Word frequency analysis
+        # Counts whitespace-separated tokens of the cleaned text, so the
+        # stop words removed by TextCleaner do not appear here.
+        st.subheader("🔤 Most Common Words")
+        all_text = ' '.join(train_df['clean_text'].astype(str))
+        word_freq = Counter(all_text.split())
+        top_words = word_freq.most_common(20)
+        if top_words:
+            words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
+            fig = px.bar(
+                words_df,
+                x='Frequency',
+                y='Word',
+                orientation='h',
+                title="Top 20 Most Common Words"
+            )
+            # Order the bars by frequency instead of input order.
+            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
+            st.plotly_chart(fig, use_container_width=True)
+    else:
+        st.warning("📁 Please upload training data to perform analysis")
+# Train Model Section
+# Lets the user pick a classifier, a vectorizer and their parameters, then
+# trains on the uploaded data and shows accuracy, a per-class report and a
+# confusion matrix.
+elif section == "🤖 Train Model":
+    if train_data is not None and 'train_df' in locals() and train_df is not None:
+        st.header("🤖 Train Machine Learning Model")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.subheader("⚙️ Model Configuration")
+            model_name = st.selectbox("Choose Model", [
+                "Logistic Regression", "Decision Tree",
+                "Random Forest", "Linear SVC", "SVC",
+                "Multinomial Naive Bayes", "Gaussian Naive Bayes"
+            ])
+        with col2:
+            st.subheader("📊 Vectorization Method")
+            vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count Vectorizer"])
+        # Model parameters
+        st.subheader("🔧 Parameters")
+        col1, col2 = st.columns(2)
+        with col1:
+            max_features = st.slider("Max Features", 1000, 20000, 10000, step=1000)
+            test_size = st.slider("Test Size", 0.1, 0.4, 0.2, step=0.05)
+        with col2:
+            random_state = st.number_input("Random State", 0, 1000, 42)
+            min_df = st.slider("Min Document Frequency", 1, 10, 1)
+        # Initialize vectorizer. The choice is kept in a local variable and
+        # written to the session only after a successful training run, so
+        # merely changing the drop-down does not make the Predictions section
+        # look for a vectorizer that was never fitted.
+        if vectorizer_choice == "TF-IDF":
+            vectorizer = TfidfVectorizer(
+                max_features=max_features,
+                min_df=min_df,
+                stop_words='english'
+            )
+            vectorizer_type = "tfidf"
+        else:
+            vectorizer = CountVectorizer(
+                max_features=max_features,
+                min_df=min_df,
+                stop_words='english'
+            )
+            vectorizer_type = "count"
+        # Show data info
+        st.subheader("📋 Training Data Info")
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Total Samples", len(train_df))
+        with col2:
+            st.metric("Unique Classes", train_df[target].nunique())
+        with col3:
+            st.metric("Avg Text Length", f"{train_df['text_length'].mean():.1f}")
+        if st.button("🚀 Start Training", type="primary"):
+            with st.spinner("Training model... This may take a few minutes."):
+                try:
+                    # Vectorize text data
+                    X = vectorizer.fit_transform(train_df['clean_text'])
+                    y = train_df['target_encoded']
+                    # Split data
+                    X_train, X_test, y_train, y_test = train_test_split(
+                        X, y,
+                        test_size=test_size,
+                        random_state=random_state,
+                        stratify=y
+                    )
+                    st.success(f"✅ Data split - Train: {X_train.shape}, Test: {X_test.shape}")
+                    # Train model
+                    model, accuracy, y_pred, model_filename = train_model(
+                        model_name, X_train, X_test, y_train, y_test
+                    )
+                    # Save vectorizer only once training succeeded, so a failed
+                    # run cannot replace the vectorizer of earlier models.
+                    vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
+                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)
+                    st.session_state.vectorizer_type = vectorizer_type
+                    st.success("🎉 Model training completed!")
+                    # Pin the full label set: a class missing from the test
+                    # split would otherwise make the report reject target_names
+                    # and shrink the confusion matrix below the axis labels.
+                    class_ids = list(range(len(label_encoder.classes_)))
+                    class_names = [str(name) for name in label_encoder.classes_]
+                    # Display results
+                    col1, col2 = st.columns(2)
+                    with col1:
+                        st.metric("🎯 Test Accuracy", f"{accuracy:.4f}")
+                        # Classification report
+                        st.subheader("📊 Classification Report")
+                        report = classification_report(
+                            y_test, y_pred,
+                            labels=class_ids,
+                            target_names=class_names,
+                            output_dict=True
+                        )
+                        report_df = pd.DataFrame(report).transpose()
+                        st.dataframe(report_df.round(4))
+                    with col2:
+                        # Confusion matrix
+                        st.subheader("🔄 Confusion Matrix")
+                        cm = confusion_matrix(y_test, y_pred, labels=class_ids)
+                        fig = px.imshow(
+                            cm,
+                            text_auto=True,
+                            aspect="auto",
+                            title="Confusion Matrix",
+                            labels=dict(x="Predicted", y="Actual"),
+                            x=class_names,
+                            y=class_names
+                        )
+                        st.plotly_chart(fig, use_container_width=True)
+                    st.info(f"✅ Model saved as: {model_filename}")
+                    st.info("🔮 You can now use the 'Predictions' section to classify new text!")
+                except Exception as e:
+                    st.error(f"❌ Error during training: {str(e)}")
+    else:
+        st.warning("📁 Please upload training data to train a model")
+# Predictions Section
+elif section == "🔮 Predictions":
+    st.header("🔮 Text Classification Predictions")
+    # Check if models exist
+    if os.path.exists("models") and os.listdir("models"):
+        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
+        if available_models:
+            # Single prediction
+            st.subheader("📝 Single Text Classification")
+            col1, col2 = st.columns([2, 1])
+            with col1:
+                text_input = st.text_area("Enter text to classify:", height=150)
+            with col2:
+                selected_model = st.selectbox("Choose model:", available_models)
+                predict_button = st.button("🔮 Predict", type="primary")
+            if predict_button and text_input.strip():
+                with st.spinner("Making prediction..."):
+                    predicted_label, prediction_proba = predict_text(
+                        selected_model,
+                        text_input,
+                        st.session_state.get('vectorizer_type', 'tfidf')
+                    )
+                    if predicted_label is not None:
+                        st.success("✅ Prediction completed!")
+                        # Display results
+                        col1, col2 = st.columns(2)
+                        with col1:
+                            st.markdown("### 🎯 Results")
+                            st.markdown(f"**Input Text:** {text_input[:200]}{'...' if len(text_input) > 200 else ''}")
+                            st.markdown(f"**Predicted Class:** `{predicted_label}`")
+                        with col2:
+                            # Display probabilities if available
+                            if prediction_proba is not None:
+                                st.markdown("### 📊 Class Probabilities")
+                                encoder = load_artifacts("artifacts", "encoder.pkl")
+                                if encoder is not None:
+                                    prob_df = pd.DataFrame({
+                                        'Class': encoder.classes_,
+                                        'Probability': prediction_proba
+                                    }).sort_values('Probability', ascending=False)
+                                    fig = px.bar(
+                                        prob_df,
+                                        x='Probability',
+                                        y='Class',
+                                        orientation='h',
+                                        title="Prediction Confidence"
+                                    )
+                                    fig.update_layout(yaxis={'categoryorder': 'total ascending'})
+                                    st.plotly_chart(fig, use_container_width=True)
+            elif predict_button:
+                st.warning("⚠️ Please enter some text to classify")
+            # Batch predictions
+            st.markdown("---")
+            st.subheader("📊 Batch Predictions")
+            uploaded_file = st.file_uploader("Upload CSV file with texts to classify", type=['csv'])
+            if uploaded_file is not None:
+                try:
+                    # Try different encodings for batch file
+                    encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
+                    batch_df = None
+                    for encoding in encodings:
+                        try:
+                            batch_df = pd.read_csv(uploaded_file, encoding=encoding)
+                            break
+                        except UnicodeDecodeError:
+                            continue
+                    if batch_df is not None:
+                        st.write("📋 Uploaded data preview:")
+                        st.dataframe(batch_df.head())
+                        col1, col2 = st.columns(2)
+                        with col1:
+                            text_column = st.selectbox("Select text column:", batch_df.columns.tolist())
+                        with col2:
+                            batch_model = st.selectbox("Choose model:", available_models, key="batch_model")
+                        if st.button("🚀 Run Batch Predictions", type="primary"):
+                            with st.spinner("Processing batch predictions..."):
+                                predictions = []
+                                confidences = []
+                                progress_bar = st.progress(0)
+                                total_texts = len(batch_df)
+                                for i, text in enumerate(batch_df[text_column]):
+                                    pred, proba = predict_text(
+                                        batch_model,
+                                        str(text),
+                                        st.session_state.get('vectorizer_type', 'tfidf')
+                                    )
+                                    predictions.append(pred if pred is not None else "Error")
+                                    # Get confidence (max probability)
+                                    if proba is not None:
+                                        confidences.append(max(proba))
+                                    else:
+                                        confidences.append(0.0)
+                                    progress_bar.progress((i + 1) / total_texts)
+                                batch_df['Predicted_Class'] = predictions
+                                batch_df['Confidence'] = confidences
+                                st.success("✅ Batch predictions completed!")
+                                # Show results
+                                st.subheader("📊 Results")
+                                result_df = batch_df[[text_column, 'Predicted_Class', 'Confidence']]
+                                st.dataframe(result_df)
+                                # Summary statistics
+                                st.subheader("📈 Summary")
+                                col1, col2, col3 = st.columns(3)
+                                with col1:
+                                    st.metric("Total Predictions", len(predictions))
+                                with col2:
+                                    successful_preds = sum(1 for p in predictions if p != "Error")
+                                    st.metric("Successful", successful_preds)
+                                with col3:
+                                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
+                                    st.metric("Avg Confidence", f"{avg_confidence:.3f}")
+                                # Class distribution of predictions
+                                pred_counts = pd.Series(predictions).value_counts()
+                                if len(pred_counts) > 0:
+                                    fig = px.pie(
+                                        values=pred_counts.values,
+                                        names=pred_counts.index,
+                                        title="Distribution of Predictions"
+                                    )
+                                    st.plotly_chart(fig, use_container_width=True)
+                                # Download results
+                                csv = batch_df.to_csv(index=False)
+                                st.download_button(
+                                    label="📥 Download Results as CSV",
+                                    data=csv,
+                                    file_name="batch_predictions.csv",
+                                    mime="text/csv"
+                                )
+                    else:
+                        st.error("❌ Unable to read the CSV file. Please check the file encoding.")
+                except Exception as e:
+                    st.error(f"❌ Error in batch prediction: {str(e)}")
+        else:
+            st.warning("⚠️ No trained models found. Please train a model first.")
+    else:
+        st.warning("⚠️ No models directory found. Please go to 'Train Model' section to train a model first.")
+# Footer
+st.markdown("---")
+st.markdown("🚀 Built with Streamlit | 📊 No-Code Text Classification")