Alamgirapi committed on
Commit
a5bc77a
·
verified ·
1 Parent(s): a55dc06

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +431 -295
app.py CHANGED
@@ -2,335 +2,471 @@ import streamlit as st
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import numpy as np
5
- from NoCodeTextClassifier.EDA import Informations, Visualizations
6
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
7
- from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
8
- from NoCodeTextClassifier.models import Models
 
 
 
 
 
 
 
 
 
9
  import os
10
  import pickle
11
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 
12
 
13
- # Utility functions
14
- def save_artifacts(obj, folder_name, file_name):
15
- """Save artifacts like encoders and vectorizers"""
16
- os.makedirs(folder_name, exist_ok=True)
17
- with open(os.path.join(folder_name, file_name), 'wb') as f:
18
- pickle.dump(obj, f)
19
 
20
- def load_artifacts(folder_name, file_name):
21
- """Load saved artifacts"""
22
- try:
23
- with open(os.path.join(folder_name, file_name), 'rb') as f:
24
- return pickle.load(f)
25
- except FileNotFoundError:
26
- st.error(f"File {file_name} not found in {folder_name} folder")
27
- return None
28
 
29
- def load_model(model_name):
30
- """Load trained model"""
31
- try:
32
- with open(os.path.join('models', model_name), 'rb') as f:
33
- return pickle.load(f)
34
- except FileNotFoundError:
35
- st.error(f"Model {model_name} not found. Please train a model first.")
36
- return None
37
 
38
- def predict_text(model_name, text, vectorizer_type="tfidf"):
39
- """Make prediction on new text"""
40
- try:
41
- # Load model
42
- model = load_model(model_name)
43
- if model is None:
44
- return None, None
45
-
46
- # Load vectorizer
47
- vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
48
- vectorizer = load_artifacts("artifacts", vectorizer_file)
49
- if vectorizer is None:
50
- return None, None
51
-
52
- # Load label encoder
53
- encoder = load_artifacts("artifacts", "encoder.pkl")
54
- if encoder is None:
55
- return None, None
 
 
 
 
 
 
 
 
 
 
56
 
57
- # Clean and vectorize text
58
- text_cleaner = TextCleaner()
59
- clean_text = text_cleaner.clean_text(text)
60
 
61
- # Transform text using the same vectorizer used during training
62
- text_vector = vectorizer.transform([clean_text])
63
 
64
- # Make prediction
65
- prediction = model.predict(text_vector)
66
- prediction_proba = None
67
 
68
- # Get prediction probabilities if available
69
- if hasattr(model, 'predict_proba'):
70
- try:
71
- prediction_proba = model.predict_proba(text_vector)[0]
72
- except:
73
- pass
74
 
75
- # Decode prediction
76
- predicted_label = encoder.inverse_transform(prediction)[0]
77
 
78
- return predicted_label, prediction_proba
 
 
79
 
80
- except Exception as e:
81
- st.error(f"Error during prediction: {str(e)}")
82
- return None, None
83
-
84
- # Streamlit App
85
- st.title('No Code Text Classification App')
86
- st.write('Understand the behavior of your text data and train a model to classify the text data')
87
-
88
- # Sidebar
89
- section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
90
 
91
- # Upload Data
92
- st.sidebar.subheader("Upload Your Dataset")
93
- train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
94
- test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
95
-
96
- # Global variables to store data and settings
97
- if 'vectorizer_type' not in st.session_state:
98
- st.session_state.vectorizer_type = "tfidf"
99
 
100
- if train_data is not None:
 
101
  try:
102
- train_df = pd.read_csv(train_data)
103
-
104
- if test_data is not None:
105
- test_df = pd.read_csv(test_data)
106
- else:
107
- test_df = None
108
-
109
- st.write("Training Data Preview:")
110
- st.write(train_df.head(3))
111
-
112
- columns = train_df.columns.tolist()
113
- text_data = st.sidebar.selectbox("Choose the text column:", columns)
114
- target = st.sidebar.selectbox("Choose the target column:", columns)
115
-
116
- # Process data
117
- info = Informations(train_df, text_data, target)
118
- train_df['clean_text'] = info.clean_text()
119
- train_df['text_length'] = info.text_length()
120
-
121
- # Handle label encoding manually if the class doesn't store encoder
122
- from sklearn.preprocessing import LabelEncoder
123
- label_encoder = LabelEncoder()
124
- train_df['target'] = label_encoder.fit_transform(train_df[target])
125
-
126
- # Save label encoder for later use
127
- os.makedirs("artifacts", exist_ok=True)
128
- save_artifacts(label_encoder, "artifacts", "encoder.pkl")
129
-
130
- except Exception as e:
131
- st.error(f"Error loading data: {str(e)}")
132
- train_df = None
133
- info = None
134
-
135
- # Data Analysis Section
136
- if section == "Data Analysis":
137
- if train_data is not None and train_df is not None:
138
  try:
139
- st.subheader("Get Insights from the Data")
140
-
141
- st.write("Data Shape:", info.shape())
142
- st.write("Class Imbalance:", info.class_imbalanced())
143
- st.write("Missing Values:", info.missing_values())
144
-
145
- st.write("Processed Data Preview:")
146
- st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
147
-
148
- st.markdown("**Text Length Analysis**")
149
- st.write(info.analysis_text_length('text_length'))
150
-
151
- # Calculate correlation manually since we handled encoding separately
152
- correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
153
- st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
154
 
155
- st.subheader("Visualizations")
156
- vis = Visualizations(train_df, text_data, target)
157
- vis.class_distribution()
158
- vis.text_length_distribution()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
- except Exception as e:
161
- st.error(f"Error in data analysis: {str(e)}")
162
- else:
163
- st.warning("Please upload training data to get insights")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- # Train Model Section
166
- elif section == "Train Model":
167
- if train_data is not None and train_df is not None:
168
- try:
169
- st.subheader("Train a Model")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
- # Create two columns for model selection
172
- col1, col2 = st.columns(2)
 
173
 
174
- with col1:
175
- model = st.radio("Choose the Model", [
176
- "Logistic Regression", "Decision Tree",
177
- "Random Forest", "Linear SVC", "SVC",
178
- "Multinomial Naive Bayes", "Gaussian Naive Bayes"
179
- ])
180
-
181
- with col2:
182
- vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
183
 
184
- # Initialize vectorizer
185
- if vectorizer_choice == "Tfidf Vectorizer":
186
- vectorizer = TfidfVectorizer(max_features=10000)
187
- st.session_state.vectorizer_type = "tfidf"
188
- else:
189
- vectorizer = CountVectorizer(max_features=10000)
190
- st.session_state.vectorizer_type = "count"
191
 
192
- st.write("Training Data Preview:")
193
- st.write(train_df[['clean_text', 'target']].head(3))
 
 
 
194
 
195
- # Vectorize text data
196
- X = vectorizer.fit_transform(train_df['clean_text'])
197
- y = train_df['target']
198
 
199
- # Split data
200
- X_train, X_test, y_train, y_test = process.split_data(X, y)
201
- st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
202
 
203
- # Save vectorizer for later use
204
- vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
205
- save_artifacts(vectorizer, "artifacts", vectorizer_filename)
 
206
 
207
- if st.button("Start Training"):
208
- with st.spinner("Training model..."):
209
- models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
 
 
210
 
211
- # Train selected model
212
- if model == "Logistic Regression":
213
- models.LogisticRegression()
214
- elif model == "Decision Tree":
215
- models.DecisionTree()
216
- elif model == "Linear SVC":
217
- models.LinearSVC()
218
- elif model == "SVC":
219
- models.SVC()
220
- elif model == "Multinomial Naive Bayes":
221
- models.MultinomialNB()
222
- elif model == "Random Forest":
223
- models.RandomForestClassifier()
224
- elif model == "Gaussian Naive Bayes":
225
- models.GaussianNB()
226
 
227
- st.success("Model training completed!")
228
- st.info("You can now use the 'Predictions' section to classify new text.")
229
-
230
- except Exception as e:
231
- st.error(f"Error in model training: {str(e)}")
232
- else:
233
- st.warning("Please upload training data to train a model")
234
-
235
- # Predictions Section
236
- elif section == "Predictions":
237
- st.subheader("Perform Predictions on New Text")
238
-
239
- # Check if models exist
240
- if os.path.exists("models") and os.listdir("models"):
241
- # Text input for prediction
242
- text_input = st.text_area("Enter the text to classify:", height=100)
243
-
244
- # Model selection
245
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
246
-
247
- if available_models:
248
- selected_model = st.selectbox("Choose the trained model:", available_models)
249
-
250
- # Prediction button
251
- if st.button("Predict", key="single_predict"):
252
- if text_input.strip():
253
- with st.spinner("Making prediction..."):
254
- predicted_label, prediction_proba = predict_text(
255
- selected_model,
256
- text_input,
257
- st.session_state.get('vectorizer_type', 'tfidf')
258
- )
259
 
260
- if predicted_label is not None:
261
- st.success("Prediction completed!")
262
-
263
- # Display results
264
- st.markdown("### Prediction Results")
265
- st.markdown(f"**Input Text:** {text_input}")
266
- st.markdown(f"**Predicted Class:** {predicted_label}")
267
-
268
- # Display probabilities if available
269
- if prediction_proba is not None:
270
- st.markdown("**Class Probabilities:**")
271
-
272
- # Load encoder to get class names
273
- encoder = load_artifacts("artifacts", "encoder.pkl")
274
- if encoder is not None:
275
- classes = encoder.classes_
276
- prob_df = pd.DataFrame({
277
- 'Class': classes,
278
- 'Probability': prediction_proba
279
- }).sort_values('Probability', ascending=False)
280
-
281
- st.bar_chart(prob_df.set_index('Class'))
282
- st.dataframe(prob_df)
283
- else:
284
- st.warning("Please enter some text to classify")
285
- else:
286
- st.warning("No trained models found. Please train a model first.")
287
- else:
288
- st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")
289
-
290
- # Option to classify multiple texts
291
- st.markdown("---")
292
- st.subheader("Batch Predictions")
293
-
294
- uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
295
-
296
- if uploaded_file is not None:
297
- try:
298
- batch_df = pd.read_csv(uploaded_file, encoding='latin1')
299
- st.write("Uploaded data preview:")
300
- st.write(batch_df.head())
301
-
302
- # Select text column
303
- text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
304
-
305
- if os.path.exists("models") and os.listdir("models"):
306
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
307
- batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
308
 
309
- if st.button("Run Batch Predictions", key="batch_predict"):
310
- with st.spinner("Processing batch predictions..."):
311
- predictions = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
- for text in batch_df[text_column]:
314
- pred, _ = predict_text(
315
- batch_model,
316
- str(text),
317
- st.session_state.get('vectorizer_type', 'tfidf')
318
- )
319
- predictions.append(pred if pred is not None else "Error")
 
 
 
 
320
 
321
- batch_df['Predicted_Class'] = predictions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
- st.success("Batch predictions completed!")
324
- st.write("Results:")
325
- st.write(batch_df[[text_column, 'Predicted_Class']])
326
 
327
- # Download results
328
- csv = batch_df.to_csv(index=False)
329
- st.download_button(
330
- label="Download predictions as CSV",
331
- data=csv,
332
- file_name="batch_predictions.csv",
333
- mime="text/csv"
334
- )
335
- except Exception as e:
336
- st.error(f"Error in batch prediction: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import numpy as np
 
5
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.tree import DecisionTreeClassifier
9
+ from sklearn.ensemble import RandomForestClassifier
10
+ from sklearn.svm import LinearSVC, SVC
11
+ from sklearn.naive_bayes import MultinomialNB, GaussianNB
12
+ from sklearn.preprocessing import LabelEncoder
13
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
14
+ import re
15
+ import string
16
+ import nltk
17
  import os
18
  import pickle
19
+ import io
20
+ import base64
21
 
22
# Make sure the NLTK corpora needed for text cleaning are available locally,
# downloading (quietly) only the ones that cannot be found.
for _resource in ('stopwords', 'wordnet'):
    try:
        nltk.data.find(f'corpora/{_resource}')
    except LookupError:
        nltk.download(_resource, quiet=True)
 
 
 
 
32
 
33
+ from nltk.corpus import stopwords
34
+ from nltk.stem import WordNetLemmatizer
 
 
 
 
 
 
35
 
36
# Page-level configuration. This is kept ahead of every other Streamlit call
# in the script.
st.set_page_config(
    page_title="No Code Text Classification",
    page_icon="📝",
    layout="wide",
)

# Session-state slots shared between the tabs: the fitted model, vectorizer
# and label encoder are written by the training/preprocessing steps and read
# back by the Predictions tab. Existing values are never overwritten here.
_SESSION_DEFAULTS = {
    'trained_model': None,
    'vectorizer': None,
    'label_encoder': None,
    'vectorizer_type': 'tfidf',
    'train_df': None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
54
+
55
+ # Text cleaning class
56
class TextCleaner:
    """Normalise raw text into a space-separated string of lemmatised tokens.

    The cleaning pipeline lowercases the text, strips URLs, @mentions,
    #hashtags and ASCII punctuation, drops English stopwords and lemmatises
    whatever remains.
    """

    # Compiled once for the whole class: clean_text() runs once per row, so
    # rebuilding these on every call is wasted work.
    # NOTE: 'http\S+' already matches every 'https...' token, so a separate
    # https alternative is unnecessary.
    _URL_RE = re.compile(r'http\S+|www\S+')
    _MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return the cleaned form of ``text``.

        Args:
            text: Any scalar; non-strings are converted with ``str()``.
                Missing values (``None``/NaN) yield an empty string.

        Returns:
            The surviving tokens, lemmatised and joined by single spaces.
        """
        if pd.isna(text):
            return ""

        lowered = str(text).lower()
        without_urls = self._URL_RE.sub('', lowered)
        without_tags = self._MENTION_HASHTAG_RE.sub('', without_urls)
        without_punct = without_tags.translate(self._PUNCT_TABLE)

        # str.split() with no argument already collapses any run of
        # whitespace and trims both ends, so no extra regex pass is needed.
        tokens = [
            self.lemmatizer.lemmatize(word)
            for word in without_punct.split()
            if word not in self.stop_words
        ]
        return ' '.join(tokens)
 
 
 
 
 
 
 
 
 
85
 
86
+ # Utility functions
87
def create_download_link(val, filename):
    """Build an HTML anchor that downloads ``val`` as ``filename``.

    The payload is embedded in the link as a base64 ``data:`` URI.

    Args:
        val: The file content. Bytes-like objects are used as-is; ``str``
            content is encoded as UTF-8 first.
        filename: Name offered to the browser for the saved file. It is
            HTML-escaped so quotes or angle brackets in the name cannot
            break out of the attribute.

    Returns:
        The ``<a ...>`` element as a string.
    """
    import html  # local import: not among the file's module-level imports

    payload = val.encode('utf-8') if isinstance(val, str) else val
    encoded = base64.b64encode(payload).decode('ascii')
    safe_name = html.escape(str(filename), quote=True)
    return (
        f'<a href="data:application/octet-stream;base64,{encoded}" '
        f'download="{safe_name}">Download {safe_name}</a>'
    )
 
 
 
91
 
92
def safe_file_read(uploaded_file):
    """Read an uploaded CSV into a DataFrame, trying several text encodings.

    Encodings are attempted in the order UTF-8, cp1252, latin1. cp1252 is
    tried before latin1 because latin1 maps every possible byte and therefore
    never fails to decode; placing it first would make any later fallback
    unreachable.

    Args:
        uploaded_file: A seekable binary file-like object holding CSV data.

    Returns:
        The parsed ``DataFrame``, or ``None`` (after reporting the problem
        with ``st.error``) if no encoding could decode the file.

    Raises:
        Whatever ``pd.read_csv`` raises for problems unrelated to text
        encoding (empty file, malformed CSV, ...). Retrying with another
        encoding cannot fix those, so they are left to the caller.
    """
    last_error = None
    for attempt, encoding in enumerate(('utf-8', 'cp1252', 'latin1')):
        if attempt:
            # A failed attempt may have consumed part of the stream.
            uploaded_file.seek(0)
        try:
            return pd.read_csv(uploaded_file, encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc

    st.error(f"Error reading file: {str(last_error)}")
    return None
 
 
 
 
110
 
111
+ # Data Analysis Functions
112
def get_data_insights(df, text_col, target_col):
    """Collect summary statistics about a labelled text dataset.

    The input frame is only read; no columns are added to it.

    Args:
        df: The dataset.
        text_col: Name of the column holding the text.
        target_col: Name of the column holding the class labels.

    Returns:
        A dict with the keys ``shape``, ``missing_values`` (per-column null
        counts), ``class_distribution`` (label -> count), ``avg_text_length``,
        ``min_text_length`` and ``max_text_length``. Lengths are measured in
        characters.
    """
    text = df[text_col]
    # Missing entries count as length 0; without the mask, astype(str) would
    # turn them into the literal "nan" and report a length of 3.
    text_lengths = text.astype(str).str.len().where(text.notna(), 0)

    return {
        'shape': df.shape,
        'missing_values': df.isnull().sum().to_dict(),
        'class_distribution': df[target_col].value_counts().to_dict(),
        'avg_text_length': text_lengths.mean(),
        'min_text_length': text_lengths.min(),
        'max_text_length': text_lengths.max(),
    }
130
 
131
def create_visualizations(df, text_col, target_col):
    """Render a class-distribution bar chart and a text-length histogram.

    Both plots are drawn side by side on one figure that is handed to
    ``st.pyplot``. The input frame is only read; no columns are added to it.

    Args:
        df: The dataset.
        text_col: Name of the column holding the text.
        target_col: Name of the column holding the class labels.
    """
    fig, (ax_classes, ax_lengths) = plt.subplots(1, 2, figsize=(15, 5))
    try:
        # Class distribution bar plot. Labels are converted to strings so
        # numeric class ids are drawn as categories, not axis positions.
        class_counts = df[target_col].value_counts()
        ax_classes.bar(class_counts.index.astype(str), class_counts.values)
        ax_classes.set_title('Class Distribution')
        ax_classes.set_xlabel('Classes')
        ax_classes.set_ylabel('Count')
        ax_classes.tick_params(axis='x', rotation=45)

        # Text length distribution (characters); missing text counts as 0
        # instead of the 3-character string "nan".
        text = df[text_col]
        text_lengths = text.astype(str).str.len().where(text.notna(), 0)
        ax_lengths.hist(text_lengths, bins=30, alpha=0.7)
        ax_lengths.set_title('Text Length Distribution')
        ax_lengths.set_xlabel('Text Length')
        ax_lengths.set_ylabel('Frequency')

        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates until it is closed; without
        # this, each call would leave another figure behind in memory.
        plt.close(fig)
154
 
155
+ # Model Training Functions
156
def train_model(X_train, X_test, y_train, y_test, model_name):
    """Fit the requested classifier and score it on the held-out split.

    Args:
        X_train: Training feature matrix (sparse or dense).
        X_test: Test feature matrix (sparse or dense).
        y_train: Training labels.
        y_test: Test labels.
        model_name: One of 'Logistic Regression', 'Decision Tree',
            'Random Forest', 'Linear SVC', 'SVC', 'Multinomial Naive Bayes'
            or 'Gaussian Naive Bayes'.

    Returns:
        A ``(model, accuracy, y_pred)`` tuple: the fitted estimator, its
        accuracy on the test split and its predictions for ``X_test``.

    Raises:
        KeyError: If ``model_name`` is not one of the supported names.
    """
    # Factories rather than instances: only the selected estimator is built.
    factories = {
        'Logistic Regression': lambda: LogisticRegression(random_state=42, max_iter=1000),
        'Decision Tree': lambda: DecisionTreeClassifier(random_state=42),
        'Random Forest': lambda: RandomForestClassifier(random_state=42, n_estimators=100),
        'Linear SVC': lambda: LinearSVC(random_state=42, max_iter=1000),
        'SVC': lambda: SVC(random_state=42, probability=True),
        'Multinomial Naive Bayes': lambda: MultinomialNB(),
        'Gaussian Naive Bayes': lambda: GaussianNB(),
    }

    try:
        build_model = factories[model_name]
    except KeyError:
        raise KeyError(
            f"Unknown model {model_name!r}; expected one of {sorted(factories)}"
        ) from None
    model = build_model()

    # The Gaussian NB branch needs dense input. Each matrix is checked on its
    # own so a dense X_test paired with a sparse X_train (or vice versa) works.
    if model_name == 'Gaussian Naive Bayes':
        if hasattr(X_train, 'toarray'):
            X_train = X_train.toarray()
        if hasattr(X_test, 'toarray'):
            X_test = X_test.toarray()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return model, accuracy, y_pred
187
 
188
# Main App
# Flow: upload a CSV in the sidebar -> pick text/target columns -> explore,
# train and predict in three tabs. The fitted model, vectorizer and label
# encoder live in st.session_state so the Predictions tab can reuse them.
st.title('🔤 No Code Text Classification App')
st.markdown('Upload your data, analyze it, train models, and make predictions without writing any code!')

# Sidebar
st.sidebar.header("📁 Data Upload")

train_data = st.sidebar.file_uploader(
    "Upload training data (CSV)",
    type=["csv"],
    help="Upload a CSV file with text and labels"
)

if train_data is not None:
    try:
        with st.spinner("Loading data..."):
            train_df = safe_file_read(train_data)

        if train_df is not None:
            st.session_state.train_df = train_df

            st.sidebar.success(f"✅ Data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")

            # Column selection
            columns = train_df.columns.tolist()
            text_col = st.sidebar.selectbox("📝 Select text column:", columns, key="text_col")
            target_col = st.sidebar.selectbox("🎯 Select target column:", columns, key="target_col")

            if text_col and target_col and text_col != target_col:
                # Clean the text and encode the labels
                with st.spinner("Preprocessing data..."):
                    text_cleaner = TextCleaner()
                    train_df['clean_text'] = train_df[text_col].apply(text_cleaner.clean_text)

                    label_encoder = LabelEncoder()
                    train_df['encoded_target'] = label_encoder.fit_transform(train_df[target_col])
                    st.session_state.label_encoder = label_encoder

                tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "🤖 Train Model", "🔍 Predictions"])

                # Data Analysis Tab
                with tab1:
                    st.header("📊 Data Analysis")

                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("📈 Dataset Overview")
                        insights = get_data_insights(train_df, text_col, target_col)

                        st.metric("Total Samples", insights['shape'][0])
                        st.metric("Number of Features", insights['shape'][1])
                        st.metric("Average Text Length", f"{insights['avg_text_length']:.1f}")

                        st.subheader("🎯 Class Distribution")
                        class_dist_df = pd.DataFrame(
                            list(insights['class_distribution'].items()),
                            columns=['Class', 'Count'],
                        )
                        st.dataframe(class_dist_df, use_container_width=True)

                    with col2:
                        st.subheader("📋 Data Preview")
                        st.dataframe(train_df[[text_col, target_col]].head(), use_container_width=True)

                        st.subheader("🧹 Cleaned Text Preview")
                        st.dataframe(train_df[['clean_text', target_col]].head(), use_container_width=True)

                    st.subheader("📊 Visualizations")
                    create_visualizations(train_df, text_col, target_col)

                # Train Model Tab
                with tab2:
                    st.header("🤖 Train Model")

                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("🔧 Model Selection")
                        model_name = st.selectbox(
                            "Choose a model:",
                            ["Logistic Regression", "Decision Tree", "Random Forest",
                             "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
                        )

                    with col2:
                        st.subheader("📊 Vectorizer Selection")
                        vectorizer_type = st.selectbox(
                            "Choose vectorizer:",
                            ["TF-IDF Vectorizer", "Count Vectorizer"]
                        )

                    # Training parameters
                    st.subheader("⚙️ Training Parameters")
                    col3, _unused_col = st.columns(2)
                    with col3:
                        test_size = st.slider("Test size", 0.1, 0.5, 0.2, 0.05)
                        max_features = st.number_input("Max features", 1000, 20000, 10000, 1000)

                    if st.button("🚀 Train Model", type="primary"):
                        try:
                            with st.spinner("Training model... This may take a few minutes."):
                                if vectorizer_type == "TF-IDF Vectorizer":
                                    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                                    st.session_state.vectorizer_type = 'tfidf'
                                else:
                                    vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                                    st.session_state.vectorizer_type = 'count'

                                # Vectorize text
                                X = vectorizer.fit_transform(train_df['clean_text'])
                                y = train_df['encoded_target']

                                # Split data
                                X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=test_size, random_state=42, stratify=y
                                )

                                # Train model
                                model, accuracy, y_pred = train_model(X_train, X_test, y_train, y_test, model_name)

                                # Keep the fitted objects for the Predictions tab
                                st.session_state.trained_model = model
                                st.session_state.vectorizer = vectorizer

                            st.success("🎉 Model training completed!")

                            col5, col6 = st.columns(2)
                            with col5:
                                st.metric("🎯 Accuracy", f"{accuracy:.4f}")
                                # The splits are the vectorizer's sparse output;
                                # scipy sparse matrices raise TypeError on len(),
                                # so the row count is read from .shape instead.
                                st.metric("🏋️ Training Samples", X_train.shape[0])
                                st.metric("🧪 Test Samples", X_test.shape[0])

                            with col6:
                                st.subheader("📊 Classification Report")
                                class_names = [str(name) for name in label_encoder.classes_]
                                report = classification_report(
                                    y_test,
                                    y_pred,
                                    # Pin the label set so it always lines up
                                    # with target_names, and pass names as
                                    # strings even when the labels are numeric.
                                    labels=list(range(len(class_names))),
                                    target_names=class_names,
                                    output_dict=True,
                                )
                                report_df = pd.DataFrame(report).transpose()
                                st.dataframe(report_df.round(3), use_container_width=True)

                        except Exception as e:
                            st.error(f"❌ Error during training: {str(e)}")

                # Predictions Tab
                with tab3:
                    st.header("🔍 Make Predictions")

                    if st.session_state.trained_model is not None:
                        trained_model = st.session_state.trained_model
                        # The Gaussian NB model is fed dense arrays (as in training).
                        needs_dense = isinstance(trained_model, GaussianNB)

                        # Single prediction
                        st.subheader("📝 Single Text Prediction")
                        user_input = st.text_area("Enter text to classify:", height=100)

                        if st.button("🔮 Predict", type="primary"):
                            if user_input.strip():
                                try:
                                    with st.spinner("Making prediction..."):
                                        text_cleaner = TextCleaner()
                                        clean_input = text_cleaner.clean_text(user_input)
                                        input_vector = st.session_state.vectorizer.transform([clean_input])
                                        if needs_dense:
                                            input_vector = input_vector.toarray()

                                        prediction = trained_model.predict(input_vector)[0]
                                        predicted_label = st.session_state.label_encoder.inverse_transform([prediction])[0]

                                        # Probabilities are best-effort: not every
                                        # estimator offers predict_proba.
                                        proba = None
                                        if hasattr(trained_model, 'predict_proba'):
                                            try:
                                                proba = trained_model.predict_proba(input_vector)[0]
                                            except Exception:
                                                proba = None

                                        st.success("🎉 Prediction completed!")
                                        if proba is not None:
                                            st.write(f"**Input:** {user_input}")
                                        st.write(f"**Predicted Class:** {predicted_label}")

                                        if proba is not None:
                                            st.subheader("📊 Class Probabilities")
                                            prob_df = pd.DataFrame({
                                                'Class': st.session_state.label_encoder.classes_,
                                                'Probability': proba
                                            }).sort_values('Probability', ascending=False)

                                            st.bar_chart(prob_df.set_index('Class'))
                                            st.dataframe(prob_df.round(4), use_container_width=True)

                                except Exception as e:
                                    st.error(f"❌ Error during prediction: {str(e)}")
                            else:
                                st.warning("⚠️ Please enter some text to classify")

                        # Batch predictions
                        st.subheader("📊 Batch Predictions")
                        batch_file = st.file_uploader("Upload CSV for batch predictions", type=["csv"])

                        if batch_file is not None:
                            try:
                                batch_df = safe_file_read(batch_file)
                                if batch_df is not None:
                                    st.write("**Preview:**")
                                    st.dataframe(batch_df.head(), use_container_width=True)

                                    batch_text_col = st.selectbox("Select text column for prediction:",
                                                                  batch_df.columns.tolist())

                                    if st.button("🚀 Run Batch Predictions"):
                                        with st.spinner("Processing batch predictions..."):
                                            text_cleaner = TextCleaner()
                                            cleaned_texts = [
                                                text_cleaner.clean_text(str(text))
                                                for text in batch_df[batch_text_col]
                                            ]

                                            # One transform/predict over the whole
                                            # column instead of one call per row.
                                            batch_vectors = st.session_state.vectorizer.transform(cleaned_texts)
                                            if needs_dense:
                                                batch_vectors = batch_vectors.toarray()

                                            encoded_predictions = trained_model.predict(batch_vectors)
                                            batch_df['Predicted_Class'] = (
                                                st.session_state.label_encoder.inverse_transform(encoded_predictions)
                                            )

                                            st.success("🎉 Batch predictions completed!")
                                            st.dataframe(batch_df, use_container_width=True)

                                            # Download results
                                            csv_data = batch_df.to_csv(index=False)
                                            st.download_button(
                                                label="📥 Download Results",
                                                data=csv_data,
                                                file_name="batch_predictions.csv",
                                                mime="text/csv"
                                            )
                            except Exception as e:
                                st.error(f"❌ Error processing batch file: {str(e)}")
                    else:
                        st.warning("⚠️ No trained model found. Please train a model first in the 'Train Model' tab.")
            else:
                st.warning("⚠️ Please select different columns for text and target.")

    except Exception as e:
        st.error(f"❌ Error loading file: {str(e)}")
        st.info("💡 Try these solutions:")
        st.write("- Check if the file is a valid CSV")
        st.write("- Ensure the file is not corrupted")
        st.write("- Try saving the file with UTF-8 encoding")

else:
    st.info("👆 Please upload a CSV file to get started")

    # Show example data format
    st.subheader("📋 Expected Data Format")
    example_df = pd.DataFrame({
        'text': [
            "This product is amazing! I love it.",
            "Terrible quality, waste of money.",
            "Good value for the price.",
            "Not what I expected, disappointed."
        ],
        'sentiment': ['positive', 'negative', 'positive', 'negative']
    })
    st.dataframe(example_df, use_container_width=True)

# Footer
st.markdown("---")
st.markdown("Built with ❤️ using Streamlit | No Code Text Classification App")