Spaces:

Alamgirapi
/

NoCodeTextClassifier

Sleeping

App Files Files Community

Alamgirapi committed on Aug 6, 2025

Commit

9773b59

verified ·

1 Parent(s): 0a50c6f

Update app.py

Browse files

Files changed (1) hide show

app.py +249 -434

app.py CHANGED Viewed

@@ -2,65 +2,20 @@ import streamlit as st
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
-import seaborn as sns
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from sklearn.model_selection import train_test_split
-from sklearn.linear_model import LogisticRegression
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.svm import LinearSVC, SVC
-from sklearn.naive_bayes import MultinomialNB, GaussianNB
-from sklearn.preprocessing import LabelEncoder
-from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 import os
 import pickle
-import re
-import string
-from collections import Counter
-# Set page config
-st.set_page_config(page_title="Text Classification App", page_icon="📊", layout="wide")
-# Custom CSS for better styling
-st.markdown("""
-<style>
-    .main-header {
-        font-size: 2.5rem;
-        color: #1f77b4;
-        text-align: center;
-        margin-bottom: 2rem;
-    }
-    .section-header {
-        font-size: 1.8rem;
-        color: #ff7f0e;
-        border-bottom: 2px solid #ff7f0e;
-        padding-bottom: 0.5rem;
-    }
-</style>
-""", unsafe_allow_html=True)
 # Utility functions
-def clean_text(text):
-    """Clean text data"""
-    if pd.isna(text):
-        return ""
-    text = str(text).lower()
-    text = re.sub(r'[^a-zA-Z\s]', '', text)
-    text = re.sub(r'\s+', ' ', text)
-    text = text.strip()
-    return text
 def save_artifacts(obj, folder_name, file_name):
     """Save artifacts like encoders and vectorizers"""
-    try:
-        os.makedirs(folder_name, exist_ok=True)
-        with open(os.path.join(folder_name, file_name), 'wb') as f:
-            pickle.dump(obj, f)
-        return True
-    except Exception as e:
-        st.error(f"Error saving {file_name}: {str(e)}")
-        return False
 def load_artifacts(folder_name, file_name):
     """Load saved artifacts"""
@@ -70,136 +25,41 @@ def load_artifacts(folder_name, file_name):
     except FileNotFoundError:
         st.error(f"File {file_name} not found in {folder_name} folder")
         return None
-    except Exception as e:
-        st.error(f"Error loading {file_name}: {str(e)}")
-        return None
-def analyze_data(df, text_col, target_col):
-    """Perform data analysis"""
-    analysis = {}
-    # Basic info
-    analysis['shape'] = df.shape
-    analysis['columns'] = df.columns.tolist()
-    analysis['missing_values'] = df.isnull().sum().to_dict()
-    # Text analysis
-    df['text_length'] = df[text_col].astype(str).apply(len)
-    analysis['avg_text_length'] = df['text_length'].mean()
-    analysis['text_length_stats'] = df['text_length'].describe().to_dict()
-    # Target analysis
-    analysis['class_distribution'] = df[target_col].value_counts().to_dict()
-    analysis['num_classes'] = df[target_col].nunique()
-    return analysis
-def create_visualizations(df, text_col, target_col):
-    """Create visualizations"""
-    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
-    # Class distribution
-    class_counts = df[target_col].value_counts()
-    axes[0, 0].bar(class_counts.index, class_counts.values)
-    axes[0, 0].set_title('Class Distribution')
-    axes[0, 0].set_xlabel('Classes')
-    axes[0, 0].set_ylabel('Count')
-    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
-    # Text length distribution
-    axes[0, 1].hist(df['text_length'], bins=30, alpha=0.7)
-    axes[0, 1].set_title('Text Length Distribution')
-    axes[0, 1].set_xlabel('Text Length')
-    axes[0, 1].set_ylabel('Frequency')
-    # Box plot of text length by class
-    df.boxplot(column='text_length', by=target_col, ax=axes[1, 0])
-    axes[1, 0].set_title('Text Length by Class')
-    axes[1, 0].set_xlabel('Class')
-    axes[1, 0].set_ylabel('Text Length')
-    # Correlation plot (if applicable)
-    if df[target_col].dtype in ['int64', 'float64'] or len(df[target_col].unique()) < 10:
-        correlation = df[['text_length', target_col]].corr()
-        sns.heatmap(correlation, annot=True, ax=axes[1, 1], cmap='coolwarm')
-        axes[1, 1].set_title('Correlation Matrix')
-    else:
-        axes[1, 1].text(0.5, 0.5, 'Correlation not applicable\nfor categorical target',
-                       ha='center', va='center', transform=axes[1, 1].transAxes)
-        axes[1, 1].set_title('Correlation Analysis')
-    plt.tight_layout()
-    return fig
-def train_model(model_name, X_train, X_test, y_train, y_test):
-    """Train selected model"""
-    models_dict = {
-        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
-        "Decision Tree": DecisionTreeClassifier(random_state=42),
-        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
-        "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
-        "SVC": SVC(random_state=42, probability=True),
-        "Multinomial Naive Bayes": MultinomialNB(),
-        "Gaussian Naive Bayes": GaussianNB()
-    }
-    if model_name not in models_dict:
-        return None, None, None
-    model = models_dict[model_name]
-    # Special handling for Gaussian NB (needs dense array)
-    if model_name == "Gaussian Naive Bayes":
-        X_train_model = X_train.toarray()
-        X_test_model = X_test.toarray()
-    else:
-        X_train_model = X_train
-        X_test_model = X_test
-    # Train model
-    model.fit(X_train_model, y_train)
-    # Make predictions
-    y_pred = model.predict(X_test_model)
-    # Calculate metrics
-    accuracy = accuracy_score(y_test, y_pred)
-    report = classification_report(y_test, y_pred, output_dict=True)
-    # Save model
-    os.makedirs("models", exist_ok=True)
-    model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
-    save_artifacts(model, "models", model_filename)
-    return model, accuracy, report
 def predict_text(model_name, text, vectorizer_type="tfidf"):
     """Make prediction on new text"""
     try:
         # Load model
-        model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
-        model = load_artifacts("models", model_filename)
         if model is None:
             return None, None
         # Load vectorizer
-        vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
-        vectorizer = load_artifacts("artifacts", vectorizer_filename)
         if vectorizer is None:
             return None, None
         # Load label encoder
-        encoder = load_artifacts("artifacts", "label_encoder.pkl")
         if encoder is None:
             return None, None
         # Clean and vectorize text
-        clean_text_input = clean_text(text)
-        text_vector = vectorizer.transform([clean_text_input])
-        # Special handling for Gaussian NB
-        if "gaussian" in model_name.lower():
-            text_vector = text_vector.toarray()
         # Make prediction
         prediction = model.predict(text_vector)
@@ -208,12 +68,9 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
         # Get prediction probabilities if available
         if hasattr(model, 'predict_proba'):
             try:
-                if "gaussian" in model_name.lower():
-                    prediction_proba = model.predict_proba(text_vector)[0]
-                else:
-                    prediction_proba = model.predict_proba(text_vector)[0]
-            except Exception as e:
-                st.warning(f"Could not get prediction probabilities: {str(e)}")
         # Decode prediction
         predicted_label = encoder.inverse_transform(prediction)[0]
@@ -224,298 +81,256 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
         st.error(f"Error during prediction: {str(e)}")
         return None, None
-# Main App
-st.markdown('<h1 class="main-header">📊 No Code Text Classification App</h1>', unsafe_allow_html=True)
-st.markdown("### Analyze your text data and train machine learning models without coding!")
-# Initialize session state
-if 'vectorizer_type' not in st.session_state:
-    st.session_state.vectorizer_type = "tfidf"
-if 'trained_models' not in st.session_state:
-    st.session_state.trained_models = []
 # Sidebar
-st.sidebar.markdown("## 📁 Upload Your Dataset")
-# File upload with better error handling
-try:
-    uploaded_file = st.sidebar.file_uploader(
-        "Choose a CSV file",
-        type="csv",
-        help="Upload your training dataset (CSV format)"
-    )
-    # Encoding selection
-    encoding = st.sidebar.selectbox(
-        "Select file encoding",
-        ["utf-8", "latin1", "iso-8859-1", "cp1252"],
-        help="Try different encodings if you get reading errors"
-    )
-except Exception as e:
-    st.sidebar.error(f"File upload error: {str(e)}")
-    uploaded_file = None
-# Navigation
-section = st.sidebar.radio(
-    "Choose Section",
-    ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"],
-    help="Navigate through different sections of the app"
-)
-# Main content based on section
-if uploaded_file is not None:
     try:
-        # Load data with selected encoding
-        df = pd.read_csv(uploaded_file, encoding=encoding)
-        st.sidebar.success(f"✅ Data loaded successfully! Shape: {df.shape}")
-        # Column selection
-        columns = df.columns.tolist()
-        text_column = st.sidebar.selectbox("📝 Select text column:", columns)
-        target_column = st.sidebar.selectbox("🎯 Select target column:", columns)
-        # Data preprocessing
-        df['clean_text'] = df[text_column].apply(clean_text)
-        df['text_length'] = df[text_column].astype(str).apply(len)
-        # Process target column
         label_encoder = LabelEncoder()
-        df['encoded_target'] = label_encoder.fit_transform(df[target_column])
-        save_artifacts(label_encoder, "artifacts", "label_encoder.pkl")
     except Exception as e:
-        st.error(f"❌ Error loading data: {str(e)}")
-        st.info("💡 Try selecting a different encoding from the sidebar.")
-        df = None
-# Section: Data Analysis
-if section == "📊 Data Analysis":
-    if uploaded_file is not None and df is not None:
-        st.markdown('<h2 class="section-header">Data Analysis</h2>', unsafe_allow_html=True)
-        # Data overview
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.metric("📋 Total Records", df.shape[0])
-        with col2:
-            st.metric("📊 Features", df.shape[1])
-        with col3:
-            st.metric("🏷️ Classes", df[target_column].nunique())
-        # Data preview
-        st.subheader("📖 Data Preview")
-        st.dataframe(df[[text_column, target_column, 'text_length']].head(10))
-        # Analysis results
-        analysis = analyze_data(df, text_column, target_column)
-        col1, col2 = st.columns(2)
-        with col1:
-            st.subheader("📈 Text Statistics")
-            st.write(f"**Average text length:** {analysis['avg_text_length']:.2f}")
-            st.write("**Text length distribution:**")
-            st.write(pd.DataFrame([analysis['text_length_stats']]).T)
-        with col2:
-            st.subheader("🏷️ Class Distribution")
-            class_dist = pd.DataFrame(list(analysis['class_distribution'].items()),
-                                    columns=['Class', 'Count'])
-            st.dataframe(class_dist)
-        # Visualizations
-        st.subheader("📊 Visualizations")
         try:
-            fig = create_visualizations(df, text_column, target_column)
-            st.pyplot(fig)
         except Exception as e:
-            st.error(f"Error creating visualizations: {str(e)}")
     else:
-        st.warning("📁 Please upload a dataset to analyze.")
-# Section: Train Model
-elif section == "🤖 Train Model":
-    if uploaded_file is not None and df is not None:
-        st.markdown('<h2 class="section-header">Model Training</h2>', unsafe_allow_html=True)
-        col1, col2 = st.columns(2)
-        with col1:
-            st.subheader("🤖 Select Model")
-            model_name = st.selectbox(
-                "Choose algorithm:",
-                ["Logistic Regression", "Decision Tree", "Random Forest",
-                 "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
-            )
-        with col2:
-            st.subheader("🔤 Select Vectorizer")
-            vectorizer_choice = st.selectbox(
-                "Choose text vectorizer:",
-                ["TF-IDF Vectorizer", "Count Vectorizer"]
-            )
-        # Vectorizer parameters
-        max_features = st.slider("Max features", 1000, 50000, 10000)
-        test_size = st.slider("Test size", 0.1, 0.5, 0.2)
-        if st.button("🚀 Start Training", type="primary"):
-            with st.spinner("🔄 Training model..."):
-                try:
-                    # Initialize vectorizer
-                    if vectorizer_choice == "TF-IDF Vectorizer":
-                        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
-                        st.session_state.vectorizer_type = "tfidf"
-                    else:
-                        vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
-                        st.session_state.vectorizer_type = "count"
-                    # Vectorize text
-                    X = vectorizer.fit_transform(df['clean_text'])
-                    y = df['encoded_target']
-                    # Split data
-                    X_train, X_test, y_train, y_test = train_test_split(
-                        X, y, test_size=test_size, random_state=42, stratify=y
-                    )
-                    # Save vectorizer
-                    vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
-                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)
-                    # Train model
-                    model, accuracy, report = train_model(model_name, X_train, X_test, y_train, y_test)
-                    if model is not None:
-                        st.success(f"✅ Model trained successfully!")
-                        st.session_state.trained_models.append(model_name)
-                        # Display results
-                        col1, col2 = st.columns(2)
-                        with col1:
-                            st.metric("🎯 Accuracy", f"{accuracy:.4f}")
-                        with col2:
-                            st.metric("🏷️ Classes", len(report) - 3)  # Exclude avg metrics
-                        # Detailed metrics
-                        st.subheader("📊 Detailed Metrics")
-                        metrics_df = pd.DataFrame(report).transpose()
-                        st.dataframe(metrics_df.round(4))
-                except Exception as e:
-                    st.error(f"❌ Training failed: {str(e)}")
     else:
-        st.warning("📁 Please upload a dataset to train a model.")
-# Section: Predictions
-elif section == "🔮 Predictions":
-    st.markdown('<h2 class="section-header">Make Predictions</h2>', unsafe_allow_html=True)
-    # Check for trained models
     if os.path.exists("models") and os.listdir("models"):
-        available_models = [f.replace('_model.pkl', '').replace('_', ' ').title()
-                           for f in os.listdir("models") if f.endswith('.pkl')]
         if available_models:
-            # Single prediction
-            st.subheader("🔮 Single Text Prediction")
-            col1, col2 = st.columns([3, 1])
-            with col1:
-                text_input = st.text_area(
-                    "Enter text to classify:",
-                    height=100,
-                    placeholder="Type or paste your text here..."
-                )
-            with col2:
-                selected_model = st.selectbox("Select model:", available_models)
-                if st.button("🔍 Predict", type="primary"):
-                    if text_input.strip():
-                        with st.spinner("🔄 Making prediction..."):
-                            predicted_label, prediction_proba = predict_text(
-                                selected_model, text_input, st.session_state.get('vectorizer_type', 'tfidf')
-                            )
-                            if predicted_label is not None:
-                                st.success("✅ Prediction completed!")
-                                # Results
-                                st.markdown("### 📋 Results")
-                                st.info(f"**Predicted Class:** {predicted_label}")
-                                # Probabilities
-                                if prediction_proba is not None:
-                                    encoder = load_artifacts("artifacts", "label_encoder.pkl")
-                                    if encoder is not None:
-                                        classes = encoder.classes_
-                                        prob_df = pd.DataFrame({
-                                            'Class': classes,
-                                            'Probability': prediction_proba
-                                        }).sort_values('Probability', ascending=False)
-                                        st.markdown("### 📊 Class Probabilities")
-                                        st.bar_chart(prob_df.set_index('Class'))
-                    else:
-                        st.warning("⚠️ Please enter some text to classify.")
-            # Batch predictions
-            st.markdown("---")
-            st.subheader("📦 Batch Predictions")
-            batch_file = st.file_uploader("Upload CSV for batch prediction", type=['csv'])
-            if batch_file is not None:
-                try:
-                    batch_df = pd.read_csv(batch_file, encoding=encoding)
-                    st.write("📖 Preview:")
-                    st.dataframe(batch_df.head())
-                    batch_text_col = st.selectbox("Select text column:", batch_df.columns.tolist())
-                    batch_model = st.selectbox("Select model for batch:", available_models, key="batch_model")
-                    if st.button("🚀 Run Batch Predictions"):
-                        with st.spinner("🔄 Processing batch predictions..."):
-                            predictions = []
-                            progress_bar = st.progress(0)
-                            for i, text in enumerate(batch_df[batch_text_col]):
-                                pred, _ = predict_text(
-                                    batch_model, str(text),
-                                    st.session_state.get('vectorizer_type', 'tfidf')
-                                )
-                                predictions.append(pred if pred is not None else "Error")
-                                progress_bar.progress((i + 1) / len(batch_df))
-                            batch_df['Predicted_Class'] = predictions
-                            st.success("✅ Batch predictions completed!")
-                            st.dataframe(batch_df[[batch_text_col, 'Predicted_Class']])
-                            # Download option
-                            csv = batch_df.to_csv(index=False)
-                            st.download_button(
-                                "📥 Download Results",
-                                csv,
-                                "batch_predictions.csv",
-                                "text/csv"
                             )
-                except Exception as e:
-                    st.error(f"❌ Batch prediction error: {str(e)}")
-        else:
-            st.warning("⚠️ No trained models found.")
-    else:
-        st.warning("⚠️ No models available. Please train a model first.")
-# Footer
-st.markdown("---")
-st.markdown("*Built with Streamlit • Text Classification Made Easy*")

 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
+from NoCodeTextClassifier.EDA import Informations, Visualizations
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
+from NoCodeTextClassifier.models import Models
 import os
 import pickle
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 # Utility functions
 def save_artifacts(obj, folder_name, file_name):
     """Save artifacts like encoders and vectorizers"""
+    # Create the destination folder on demand, then pickle obj into it.
+    os.makedirs(folder_name, exist_ok=True)
+    artifact_path = os.path.join(folder_name, file_name)
+    with open(artifact_path, 'wb') as artifact_file:
+        pickle.dump(obj, artifact_file)
 def load_artifacts(folder_name, file_name):
     """Load saved artifacts"""
     except FileNotFoundError:
         st.error(f"File {file_name} not found in {folder_name} folder")
         return None
+def load_model(model_name):
+    """Unpickle a trained model stored in the ``models`` directory.
+
+    Returns the unpickled object, or None (after reporting the problem
+    through ``st.error``) when ``models/<model_name>`` does not exist.
+    """
+    model_path = os.path.join('models', model_name)
+    try:
+        with open(model_path, 'rb') as model_file:
+            loaded_model = pickle.load(model_file)
+    except FileNotFoundError:
+        st.error(f"Model {model_name} not found. Please train a model first.")
+        return None
+    return loaded_model
 def predict_text(model_name, text, vectorizer_type="tfidf"):
     """Make prediction on new text"""
     try:
         # Load model
+        model = load_model(model_name)
         if model is None:
             return None, None
         # Load vectorizer
+        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
+        vectorizer = load_artifacts("artifacts", vectorizer_file)
         if vectorizer is None:
             return None, None
         # Load label encoder
+        encoder = load_artifacts("artifacts", "encoder.pkl")
         if encoder is None:
             return None, None
         # Clean and vectorize text
+        text_cleaner = TextCleaner()
+        clean_text = text_cleaner.clean_text(text)
+        # Transform text using the same vectorizer used during training
+        text_vector = vectorizer.transform([clean_text])
         # Make prediction
         prediction = model.predict(text_vector)
         # Get prediction probabilities if available
         if hasattr(model, 'predict_proba'):
             try:
+                prediction_proba = model.predict_proba(text_vector)[0]
+            except:
+                pass
         # Decode prediction
         predicted_label = encoder.inverse_transform(prediction)[0]
         st.error(f"Error during prediction: {str(e)}")
         return None, None
+# Streamlit App
+# Page header, sidebar navigation and dataset upload. Everything below runs at
+# module top level, i.e. each time this script is executed.
+st.title('No Code Text Classification App')
+st.write('Understand the behavior of your text data and train a model to classify the text data')
 # Sidebar
+section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
+# Upload Data
+st.sidebar.subheader("Upload Your Dataset")
+train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
+test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
+# Global variables to store data and settings
+# Default the vectorizer type once; the Train Model section overwrites it and
+# the Predictions section passes it to predict_text, which uses it to build
+# the "<type>_vectorizer.pkl" file name.
+if 'vectorizer_type' not in st.session_state:
+    st.session_state.vectorizer_type = "tfidf"
+# train_df, info, text_data and target are only bound inside this branch; the
+# section code further down tests train_data before touching them.
+if train_data is not None:
     try:
+        # Both uploads are decoded as latin1.
+        train_df = pd.read_csv(train_data, encoding='latin1')
+        # NOTE(review): test_df is read but not referenced again in the
+        # visible part of this script - confirm whether it is still needed.
+        if test_data is not None:
+            test_df = pd.read_csv(test_data, encoding='latin1')
+        else:
+            test_df = None
+        st.write("Training Data Preview:")
+        st.write(train_df.head(3))
+        # The same column list feeds both selectors, so nothing prevents the
+        # user from picking one column as both text and target.
+        columns = train_df.columns.tolist()
+        text_data = st.sidebar.selectbox("Choose the text column:", columns)
+        target = st.sidebar.selectbox("Choose the target column:", columns)
+        # Process data
+        # Adds two derived columns to train_df using the project's
+        # Informations helper (implementation not visible here).
+        info = Informations(train_df, text_data, target)
+        train_df['clean_text'] = info.clean_text()
+        train_df['text_length'] = info.text_length()
+        # Handle label encoding manually if the class doesn't store encoder
+        from sklearn.preprocessing import LabelEncoder
         label_encoder = LabelEncoder()
+        # 'target' holds the integer codes; the fitted encoder is what maps
+        # predictions back to the original labels in predict_text.
+        train_df['target'] = label_encoder.fit_transform(train_df[target])
+        # Save label encoder for later use
+        # NOTE(review): save_artifacts already creates the folder, so this
+        # makedirs call looks redundant - confirm before removing.
+        os.makedirs("artifacts", exist_ok=True)
+        save_artifacts(label_encoder, "artifacts", "encoder.pkl")
     except Exception as e:
+        # Any failure above is reported in the page and train_df / info are
+        # reset to None so the section code can detect that loading failed.
+        st.error(f"Error loading data: {str(e)}")
+        train_df = None
+        info = None
+# Data Analysis Section
+# Shows summary statistics and plots for the uploaded training data.
+if section == "Data Analysis":
+    # train_data is tested first: train_df is only bound after an upload, so
+    # the short-circuit avoids a NameError when nothing has been uploaded.
+    if train_data is not None and train_df is not None:
         try:
+            st.subheader("Get Insights from the Data")
+            # The figures below come from the project's Informations helper;
+            # its implementation is not visible in this file.
+            st.write("Data Shape:", info.shape())
+            st.write("Class Imbalance:", info.class_imbalanced())
+            st.write("Missing Values:", info.missing_values())
+            st.write("Processed Data Preview:")
+            st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
+            st.markdown("**Text Length Analysis**")
+            st.write(info.analysis_text_length('text_length'))
+            # Calculate correlation manually since we handled encoding separately
+            # .corr() on two columns gives a 2x2 matrix; [0, 1] is the
+            # off-diagonal entry, i.e. text_length against target.
+            # NOTE(review): 'target' holds LabelEncoder codes, so with more
+            # than two classes the value depends on the code ordering.
+            correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
+            st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
+            st.subheader("Visualizations")
+            # Presumably these two calls draw their plots into the page;
+            # Visualizations lives in NoCodeTextClassifier.EDA - confirm there.
+            vis = Visualizations(train_df, text_data, target)
+            vis.class_distribution()
+            vis.text_length_distribution()
         except Exception as e:
+            st.error(f"Error in data analysis: {str(e)}")
     else:
+        st.warning("Please upload training data to get insights")
+# Train Model Section
+# Lets the user pick a classifier and a vectorizer, then fits both when the
+# "Start Training" button is pressed.
+elif section == "Train Model":
+    # train_data is tested first because train_df is only bound after an upload.
+    if train_data is not None and train_df is not None:
+        try:
+            st.subheader("Train a Model")
+            # Create two columns for model selection
+            col1, col2 = st.columns(2)
+            with col1:
+                model = st.radio("Choose the Model", [
+                    "Logistic Regression", "Decision Tree",
+                    "Random Forest", "Linear SVC", "SVC",
+                    "Multinomial Naive Bayes", "Gaussian Naive Bayes"
+                ])
+            with col2:
+                vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
+            st.write("Training Data Preview:")
+            st.write(train_df[['clean_text', 'target']].head(3))
+            # Fitting, splitting and saving happen only on the click. Doing
+            # them on every script run would overwrite the saved vectorizer
+            # (and the vectorizer type remembered for Predictions) whenever
+            # this page is merely opened or a radio button is toggled, leaving
+            # them out of sync with the model that was actually trained.
+            if st.button("Start Training"):
+                with st.spinner("Training model..."):
+                    # Initialize vectorizer
+                    if vectorizer_choice == "Tfidf Vectorizer":
+                        vectorizer = TfidfVectorizer(max_features=10000)
+                        vectorizer_type = "tfidf"
+                    else:
+                        vectorizer = CountVectorizer(max_features=10000)
+                        vectorizer_type = "count"
+                    # Vectorize text data
+                    X = vectorizer.fit_transform(train_df['clean_text'])
+                    y = train_df['target']
+                    # Split data
+                    X_train, X_test, y_train, y_test = process.split_data(X, y)
+                    st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
+                    models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
+                    # Map each UI label to the name of the Models method that
+                    # trains it; only the chosen method is looked up.
+                    trainer_names = {
+                        "Logistic Regression": "LogisticRegression",
+                        "Decision Tree": "DecisionTree",
+                        "Random Forest": "RandomForestClassifier",
+                        "Linear SVC": "LinearSVC",
+                        "SVC": "SVC",
+                        "Multinomial Naive Bayes": "MultinomialNB",
+                        "Gaussian Naive Bayes": "GaussianNB",
+                    }
+                    getattr(models, trainer_names[model])()
+                    # Save vectorizer for later use, and remember its type,
+                    # only once training has finished without raising.
+                    vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
+                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)
+                    st.session_state.vectorizer_type = vectorizer_type
+                st.success("Model training completed!")
+                st.info("You can now use the 'Predictions' section to classify new text.")
+        except Exception as e:
+            st.error(f"Error in model training: {str(e)}")
     else:
+        st.warning("Please upload training data to train a model")
+# Predictions Section
+# Single-text prediction followed by CSV batch prediction. Both rely on
+# pickled models found in the "models" directory.
+elif section == "Predictions":
+    st.subheader("Perform Predictions on New Text")
+    # Check if models exist
     if os.path.exists("models") and os.listdir("models"):
+        # Text input for prediction
+        text_input = st.text_area("Enter the text to classify:", height=100)
+        # Model selection
+        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
         if available_models:
+            selected_model = st.selectbox("Choose the trained model:", available_models)
+            # Prediction button
+            if st.button("Predict", key="single_predict"):
+                if text_input.strip():
+                    with st.spinner("Making prediction..."):
+                        predicted_label, prediction_proba = predict_text(
+                            selected_model,
+                            text_input,
+                            st.session_state.get('vectorizer_type', 'tfidf')
+                        )
+                        if predicted_label is not None:
+                            st.success("Prediction completed!")
+                            # Display results
+                            st.markdown("### Prediction Results")
+                            st.markdown(f"**Input Text:** {text_input}")
+                            st.markdown(f"**Predicted Class:** {predicted_label}")
+                            # Display probabilities if available
+                            if prediction_proba is not None:
+                                st.markdown("**Class Probabilities:**")
+                                # Load encoder to get class names
+                                encoder = load_artifacts("artifacts", "encoder.pkl")
+                                if encoder is not None:
+                                    classes = encoder.classes_
+                                    prob_df = pd.DataFrame({
+                                        'Class': classes,
+                                        'Probability': prediction_proba
+                                    }).sort_values('Probability', ascending=False)
+                                    st.bar_chart(prob_df.set_index('Class'))
+                                    st.dataframe(prob_df)
+                else:
+                    st.warning("Please enter some text to classify")
+        else:
+            st.warning("No trained models found. Please train a model first.")
+    else:
+        st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")
+    # Option to classify multiple texts
+    st.markdown("---")
+    st.subheader("Batch Predictions")
+    uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
+    if uploaded_file is not None:
+        try:
+            batch_df = pd.read_csv(uploaded_file, encoding='latin1')
+            st.write("Uploaded data preview:")
+            st.write(batch_df.head())
+            # Select text column
+            text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
+            # Only pickled models can be loaded. An empty list means there is
+            # nothing to predict with, so say so instead of rendering a
+            # selector whose value would be None and failing on every row.
+            batch_models = []
+            if os.path.isdir("models"):
+                batch_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
+            if batch_models:
+                batch_model = st.selectbox("Choose model for batch prediction:", batch_models, key="batch_model")
+                if st.button("Run Batch Predictions", key="batch_predict"):
+                    with st.spinner("Processing batch predictions..."):
+                        predictions = []
+                        # Values are passed through str() so non-string
+                        # cells (numbers, NaN) can still be classified.
+                        for text in batch_df[text_column]:
+                            pred, _ = predict_text(
+                                batch_model,
+                                str(text),
+                                st.session_state.get('vectorizer_type', 'tfidf')
                             )
+                            # predict_text returns None on failure; keep the
+                            # row and mark it rather than dropping it.
+                            predictions.append(pred if pred is not None else "Error")
+                        batch_df['Predicted_Class'] = predictions
+                        st.success("Batch predictions completed!")
+                        st.write("Results:")
+                        st.write(batch_df[[text_column, 'Predicted_Class']])
+                        # Download results
+                        csv = batch_df.to_csv(index=False)
+                        st.download_button(
+                            label="Download predictions as CSV",
+                            data=csv,
+                            file_name="batch_predictions.csv",
+                            mime="text/csv"
+                        )
+            else:
+                st.warning("No trained models found. Please train a model first.")
+        except Exception as e:
+            st.error(f"Error in batch prediction: {str(e)}")