Spaces:

Alamgirapi
/

NoCodeTextClassifier

Sleeping

App Files Files Community

Alamgirapi commited on Aug 6, 2025

Commit

c9acfc2

verified ·

1 Parent(s): 7a3dbca

Upload app.py

Browse files

Files changed (1) hide show

app.py +336 -0

app.py ADDED Viewed

	@@ -0,0 +1,336 @@

+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from NoCodeTextClassifier.EDA import Informations, Visualizations
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
+from NoCodeTextClassifier.models import Models
+import os
+import pickle
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+# Utility functions
+def save_artifacts(obj, folder_name, file_name):
+    """Save artifacts like encoders and vectorizers"""
+    os.makedirs(folder_name, exist_ok=True)
+    with open(os.path.join(folder_name, file_name), 'wb') as f:
+        pickle.dump(obj, f)
+def load_artifacts(folder_name, file_name):
+    """Load saved artifacts"""
+    try:
+        with open(os.path.join(folder_name, file_name), 'rb') as f:
+            return pickle.load(f)
+    except FileNotFoundError:
+        st.error(f"File {file_name} not found in {folder_name} folder")
+        return None
+def load_model(model_name):
+    """Load trained model"""
+    try:
+        with open(os.path.join('models', model_name), 'rb') as f:
+            return pickle.load(f)
+    except FileNotFoundError:
+        st.error(f"Model {model_name} not found. Please train a model first.")
+        return None
+def predict_text(model_name, text, vectorizer_type="tfidf"):
+    """Make prediction on new text"""
+    try:
+        # Load model
+        model = load_model(model_name)
+        if model is None:
+            return None, None
+        # Load vectorizer
+        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
+        vectorizer = load_artifacts("artifacts", vectorizer_file)
+        if vectorizer is None:
+            return None, None
+        # Load label encoder
+        encoder = load_artifacts("artifacts", "encoder.pkl")
+        if encoder is None:
+            return None, None
+        # Clean and vectorize text
+        text_cleaner = TextCleaner()
+        clean_text = text_cleaner.clean_text(text)
+        # Transform text using the same vectorizer used during training
+        text_vector = vectorizer.transform([clean_text])
+        # Make prediction
+        prediction = model.predict(text_vector)
+        prediction_proba = None
+        # Get prediction probabilities if available
+        if hasattr(model, 'predict_proba'):
+            try:
+                prediction_proba = model.predict_proba(text_vector)[0]
+            except:
+                pass
+        # Decode prediction
+        predicted_label = encoder.inverse_transform(prediction)[0]
+        return predicted_label, prediction_proba
+    except Exception as e:
+        st.error(f"Error during prediction: {str(e)}")
+        return None, None
+# Streamlit App
+st.title('No Code Text Classification App')
+st.write('Understand the behavior of your text data and train a model to classify the text data')
+# Sidebar
+section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
+# Upload Data
+st.sidebar.subheader("Upload Your Dataset")
+train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
+test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
+# Global variables to store data and settings
+if 'vectorizer_type' not in st.session_state:
+    st.session_state.vectorizer_type = "tfidf"
+if train_data is not None:
+    try:
+        train_df = pd.read_csv(train_data, encoding='latin1')
+        if test_data is not None:
+            test_df = pd.read_csv(test_data, encoding='latin1')
+        else:
+            test_df = None
+        st.write("Training Data Preview:")
+        st.write(train_df.head(3))
+        columns = train_df.columns.tolist()
+        text_data = st.sidebar.selectbox("Choose the text column:", columns)
+        target = st.sidebar.selectbox("Choose the target column:", columns)
+        # Process data
+        info = Informations(train_df, text_data, target)
+        train_df['clean_text'] = info.clean_text()
+        train_df['text_length'] = info.text_length()
+        # Handle label encoding manually if the class doesn't store encoder
+        from sklearn.preprocessing import LabelEncoder
+        label_encoder = LabelEncoder()
+        train_df['target'] = label_encoder.fit_transform(train_df[target])
+        # Save label encoder for later use
+        os.makedirs("artifacts", exist_ok=True)
+        save_artifacts(label_encoder, "artifacts", "encoder.pkl")
+    except Exception as e:
+        st.error(f"Error loading data: {str(e)}")
+        train_df = None
+        info = None
+# Data Analysis Section
+if section == "Data Analysis":
+    if train_data is not None and train_df is not None:
+        try:
+            st.subheader("Get Insights from the Data")
+            st.write("Data Shape:", info.shape())
+            st.write("Class Imbalance:", info.class_imbalanced())
+            st.write("Missing Values:", info.missing_values())
+            st.write("Processed Data Preview:")
+            st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
+            st.markdown("**Text Length Analysis**")
+            st.write(info.analysis_text_length('text_length'))
+            # Calculate correlation manually since we handled encoding separately
+            correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
+            st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
+            st.subheader("Visualizations")
+            vis = Visualizations(train_df, text_data, target)
+            vis.class_distribution()
+            vis.text_length_distribution()
+        except Exception as e:
+            st.error(f"Error in data analysis: {str(e)}")
+    else:
+        st.warning("Please upload training data to get insights")
+# Train Model Section
+elif section == "Train Model":
+    if train_data is not None and train_df is not None:
+        try:
+            st.subheader("Train a Model")
+            # Create two columns for model selection
+            col1, col2 = st.columns(2)
+            with col1:
+                model = st.radio("Choose the Model", [
+                    "Logistic Regression", "Decision Tree",
+                    "Random Forest", "Linear SVC", "SVC",
+                    "Multinomial Naive Bayes", "Gaussian Naive Bayes"
+                ])
+            with col2:
+                vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
+            # Initialize vectorizer
+            if vectorizer_choice == "Tfidf Vectorizer":
+                vectorizer = TfidfVectorizer(max_features=10000)
+                st.session_state.vectorizer_type = "tfidf"
+            else:
+                vectorizer = CountVectorizer(max_features=10000)
+                st.session_state.vectorizer_type = "count"
+            st.write("Training Data Preview:")
+            st.write(train_df[['clean_text', 'target']].head(3))
+            # Vectorize text data
+            X = vectorizer.fit_transform(train_df['clean_text'])
+            y = train_df['target']
+            # Split data
+            X_train, X_test, y_train, y_test = process.split_data(X, y)
+            st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
+            # Save vectorizer for later use
+            vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
+            save_artifacts(vectorizer, "artifacts", vectorizer_filename)
+            if st.button("Start Training"):
+                with st.spinner("Training model..."):
+                    models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
+                    # Train selected model
+                    if model == "Logistic Regression":
+                        models.LogisticRegression()
+                    elif model == "Decision Tree":
+                        models.DecisionTree()
+                    elif model == "Linear SVC":
+                        models.LinearSVC()
+                    elif model == "SVC":
+                        models.SVC()
+                    elif model == "Multinomial Naive Bayes":
+                        models.MultinomialNB()
+                    elif model == "Random Forest":
+                        models.RandomForestClassifier()
+                    elif model == "Gaussian Naive Bayes":
+                        models.GaussianNB()
+                st.success("Model training completed!")
+                st.info("You can now use the 'Predictions' section to classify new text.")
+        except Exception as e:
+            st.error(f"Error in model training: {str(e)}")
+    else:
+        st.warning("Please upload training data to train a model")
+# Predictions Section
+elif section == "Predictions":
+    st.subheader("Perform Predictions on New Text")
+    # Check if models exist
+    if os.path.exists("models") and os.listdir("models"):
+        # Text input for prediction
+        text_input = st.text_area("Enter the text to classify:", height=100)
+        # Model selection
+        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
+        if available_models:
+            selected_model = st.selectbox("Choose the trained model:", available_models)
+            # Prediction button
+            if st.button("Predict", key="single_predict"):
+                if text_input.strip():
+                    with st.spinner("Making prediction..."):
+                        predicted_label, prediction_proba = predict_text(
+                            selected_model,
+                            text_input,
+                            st.session_state.get('vectorizer_type', 'tfidf')
+                        )
+                        if predicted_label is not None:
+                            st.success("Prediction completed!")
+                            # Display results
+                            st.markdown("### Prediction Results")
+                            st.markdown(f"**Input Text:** {text_input}")
+                            st.markdown(f"**Predicted Class:** {predicted_label}")
+                            # Display probabilities if available
+                            if prediction_proba is not None:
+                                st.markdown("**Class Probabilities:**")
+                                # Load encoder to get class names
+                                encoder = load_artifacts("artifacts", "encoder.pkl")
+                                if encoder is not None:
+                                    classes = encoder.classes_
+                                    prob_df = pd.DataFrame({
+                                        'Class': classes,
+                                        'Probability': prediction_proba
+                                    }).sort_values('Probability', ascending=False)
+                                    st.bar_chart(prob_df.set_index('Class'))
+                                    st.dataframe(prob_df)
+                else:
+                    st.warning("Please enter some text to classify")
+        else:
+            st.warning("No trained models found. Please train a model first.")
+    else:
+        st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")
+    # Option to classify multiple texts
+    st.markdown("---")
+    st.subheader("Batch Predictions")
+    uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
+    if uploaded_file is not None:
+        try:
+            batch_df = pd.read_csv(uploaded_file, encoding='latin1')
+            st.write("Uploaded data preview:")
+            st.write(batch_df.head())
+            # Select text column
+            text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
+            if os.path.exists("models") and os.listdir("models"):
+                available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
+                batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
+                if st.button("Run Batch Predictions", key="batch_predict"):
+                    with st.spinner("Processing batch predictions..."):
+                        predictions = []
+                        for text in batch_df[text_column]:
+                            pred, _ = predict_text(
+                                batch_model,
+                                str(text),
+                                st.session_state.get('vectorizer_type', 'tfidf')
+                            )
+                            predictions.append(pred if pred is not None else "Error")
+                        batch_df['Predicted_Class'] = predictions
+                        st.success("Batch predictions completed!")
+                        st.write("Results:")
+                        st.write(batch_df[[text_column, 'Predicted_Class']])
+                        # Download results
+                        csv = batch_df.to_csv(index=False)
+                        st.download_button(
+                            label="Download predictions as CSV",
+                            data=csv,
+                            file_name="batch_predictions.csv",
+                            mime="text/csv"
+                        )
+        except Exception as e:
+            st.error(f"Error in batch prediction: {str(e)}")