Alamgirapi committed on
Commit
6b934fc
·
verified ·
1 Parent(s): 5ba4816

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +306 -412
app.py CHANGED
@@ -2,106 +2,110 @@ import streamlit as st
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import numpy as np
 
5
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
6
- from sklearn.preprocessing import LabelEncoder
7
- from sklearn.model_selection import train_test_split
8
- from sklearn.linear_model import LogisticRegression
9
- from sklearn.tree import DecisionTreeClassifier
10
- from sklearn.ensemble import RandomForestClassifier
11
- from sklearn.svm import LinearSVC, SVC
12
- from sklearn.naive_bayes import MultinomialNB, GaussianNB
13
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
14
  import os
15
  import pickle
16
- import tempfile
17
- import re
18
- import string
19
- from collections import Counter
20
-
21
- # Text Cleaning Class (replacing the custom module)
22
class TextCleaner:
    """Normalise raw text before it is vectorised."""

    # Same pattern the cleaner has always used; compiled once for reuse.
    _NON_LETTERS = re.compile(r'[^a-zA-Z\s]')

    def clean_text(self, text):
        """Return ``text`` lower-cased, letters only, with whitespace collapsed.

        Args:
            text: Value to clean. Non-string values are converted with
                ``str()`` first.

        Returns:
            The cleaned string, or ``""`` when ``text`` is a missing scalar
            (``None``, NaN, ...).
        """
        # pd.isna() returns an array for list-like input, which cannot be
        # used in a boolean test; only treat scalars as "missing".
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""

        lowered = str(text).lower()

        # Drop everything that is not an ASCII letter or whitespace.
        letters_only = self._NON_LETTERS.sub('', lowered)

        # split()/join collapses whitespace runs and trims both ends.
        return ' '.join(letters_only.split())
38
 
39
- # Information Analysis Class (replacing the custom module)
40
class TextInformations:
    """Summary helpers for a text-classification DataFrame."""

    def __init__(self, df, text_col, target_col):
        """Remember the DataFrame and the names of its text and target columns."""
        self.df = df
        self.text_col = text_col
        self.target_col = target_col

    def shape(self):
        """Return ``(rows, columns)`` of the DataFrame."""
        return self.df.shape

    def missing_values(self):
        """Return a dict mapping each column name to its count of null cells."""
        return self.df.isnull().sum().to_dict()

    def class_imbalanced(self):
        """Return a dict mapping each target value to its number of rows."""
        return self.df[self.target_col].value_counts().to_dict()

    def clean_text(self):
        """Return the text column with ``TextCleaner.clean_text`` applied per row."""
        cleaner = TextCleaner()
        return self.df[self.text_col].apply(cleaner.clean_text)

    def text_length(self):
        """Return the character length of every entry in the text column.

        Missing entries count as length 0 and non-string entries are measured
        on their ``str()`` form, so the result has no NaN and does not depend
        on the column holding strings (the ``.str`` accessor is only
        available for string-like columns).
        """
        def _length(value):
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return 0
            return len(str(value))

        return self.df[self.text_col].map(_length)
61
 
62
  # Utility functions
63
def save_to_session(obj, key):
    """Store ``obj`` under ``key`` in Streamlit's session state.

    Session state is used here in place of writing files to disk.
    """
    session = st.session_state
    session[key] = obj
 
 
 
 
 
 
 
66
 
67
def load_from_session(key):
    """Return the session-state value stored under ``key``, or ``None`` if absent."""
    session = st.session_state
    return session.get(key)
 
 
 
 
 
 
 
 
70
 
71
# Maps each model name offered in the UI to a zero-argument factory.
# Factories are lambdas so the estimator is only constructed when requested.
_MODEL_FACTORIES = {
    "Logistic Regression": lambda: LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": lambda: DecisionTreeClassifier(random_state=42),
    "Random Forest": lambda: RandomForestClassifier(random_state=42, n_estimators=100),
    "Linear SVC": lambda: LinearSVC(random_state=42, max_iter=1000),
    "SVC": lambda: SVC(random_state=42, probability=True),
    "Multinomial Naive Bayes": lambda: MultinomialNB(),
    "Gaussian Naive Bayes": lambda: GaussianNB(),
}


def train_model(model_name, X_train, X_test, y_train, y_test):
    """Fit the selected classifier and score it on the held-out split.

    Args:
        model_name: One of the keys of ``_MODEL_FACTORIES``.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        ``(model, accuracy)``: the fitted estimator and its
        ``accuracy_score`` on ``X_test``/``y_test``.

    Raises:
        ValueError: If ``model_name`` is not a known model. Previously this
            surfaced as an UnboundLocalError on ``model``.
    """
    try:
        build_model = _MODEL_FACTORIES[model_name]
    except KeyError:
        known = ", ".join(_MODEL_FACTORIES)
        raise ValueError(
            f"Unknown model '{model_name}'. Expected one of: {known}"
        ) from None
    model = build_model()

    # GaussianNB requires dense input, but the text vectorizers produce
    # scipy sparse matrices; densify only for that estimator.
    if model_name == "Gaussian Naive Bayes":
        if hasattr(X_train, "toarray"):
            X_train = X_train.toarray()
        if hasattr(X_test, "toarray"):
            X_test = X_test.toarray()

    # Train model
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return model, accuracy
 
 
 
 
 
 
 
 
96
 
97
- def predict_text(text, model, vectorizer, encoder):
98
  """Make prediction on new text"""
99
  try:
100
- # Clean text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  text_cleaner = TextCleaner()
102
  clean_text = text_cleaner.clean_text(text)
103
 
104
- # Transform text using the vectorizer
105
  text_vector = vectorizer.transform([clean_text])
106
 
107
  # Make prediction
@@ -124,425 +128,315 @@ def predict_text(text, model, vectorizer, encoder):
124
  st.error(f"Error during prediction: {str(e)}")
125
  return None, None
126
 
127
- # Streamlit App Configuration
128
- st.set_page_config(
129
- page_title="Text Classification App",
130
- page_icon="๐Ÿ“",
131
- layout="wide"
132
- )
133
-
134
  st.title('๐Ÿ“ No Code Text Classification App')
135
- st.markdown('Analyze your text data and train machine learning models for text classification')
136
-
137
- # Initialize session state
138
- if 'model_trained' not in st.session_state:
139
- st.session_state.model_trained = False
140
- if 'training_data_processed' not in st.session_state:
141
- st.session_state.training_data_processed = False
142
 
143
  # Sidebar
144
  st.sidebar.title("Navigation")
145
- section = st.sidebar.radio(
146
- "Choose Section",
147
- ["๐Ÿ“Š Data Analysis", "๐Ÿค– Train Model", "๐Ÿ”ฎ Predictions"],
148
- index=0
149
- )
150
 
151
- # Upload Data Section
152
- st.sidebar.markdown("---")
153
  st.sidebar.subheader("๐Ÿ“ Upload Your Dataset")
 
 
154
 
155
- # File uploader with better error handling
156
- try:
157
- train_data = st.sidebar.file_uploader(
158
- "Upload training data (CSV)",
159
- type=["csv"],
160
- help="Upload a CSV file with text and labels for training"
161
- )
162
-
163
- test_data = st.sidebar.file_uploader(
164
- "Upload test data (CSV, optional)",
165
- type=["csv"],
166
- help="Optional: Upload a separate test dataset"
167
- )
168
- except Exception as e:
169
- st.sidebar.error(f"File upload error: {str(e)}")
170
- st.sidebar.info("Try refreshing the page or using a different browser")
171
 
172
  # Process uploaded data
173
  if train_data is not None:
174
  try:
175
- # Add encoding options to handle different CSV formats
176
- encoding_option = st.sidebar.selectbox(
177
- "CSV Encoding",
178
- ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
179
- help="Try different encodings if you get errors"
180
- )
181
 
182
- train_df = pd.read_csv(train_data, encoding=encoding_option)
183
-
184
- if test_data is not None:
185
- test_df = pd.read_csv(test_data, encoding=encoding_option)
186
- else:
187
- test_df = None
188
 
189
- st.sidebar.success(f"โœ… Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
190
-
191
- # Column selection
192
- columns = train_df.columns.tolist()
193
- text_data = st.sidebar.selectbox("๐Ÿ“ Choose the text column:", columns)
194
- target = st.sidebar.selectbox("๐ŸŽฏ Choose the target column:", columns)
195
-
196
- # Store processed data in session state
197
- st.session_state.train_df = train_df
198
- st.session_state.test_df = test_df
199
- st.session_state.text_col = text_data
200
- st.session_state.target_col = target
201
- st.session_state.training_data_processed = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  except Exception as e:
204
- st.sidebar.error(f"โŒ Error loading data: {str(e)}")
205
- st.sidebar.info("Please check your CSV file format and encoding")
 
 
 
 
 
206
 
207
  # Data Analysis Section
208
- if section == "๐Ÿ“Š Data Analysis":
209
- st.header("๐Ÿ“Š Data Analysis")
210
-
211
- if st.session_state.get('training_data_processed', False):
212
  try:
213
- train_df = st.session_state.train_df
214
- text_col = st.session_state.text_col
215
- target_col = st.session_state.target_col
216
 
217
- # Create info object
218
- info = TextInformations(train_df, text_col, target_col)
219
-
220
- # Data preprocessing
221
- train_df['clean_text'] = info.clean_text()
222
- train_df['text_length'] = info.text_length()
223
-
224
- # Display basic information
225
  col1, col2, col3 = st.columns(3)
226
-
227
  with col1:
228
- st.metric("Dataset Shape", f"{info.shape()[0]} ร— {info.shape()[1]}")
229
-
230
  with col2:
231
- missing_vals = sum(info.missing_values().values())
232
- st.metric("Missing Values", missing_vals)
233
-
234
  with col3:
235
- unique_classes = len(info.class_imbalanced())
236
- st.metric("Unique Classes", unique_classes)
237
-
238
- # Data preview
239
- st.subheader("๐Ÿ“‹ Data Preview")
240
- st.dataframe(train_df[[text_col, target_col, 'clean_text', 'text_length']].head(10))
241
-
242
- # Class distribution
243
- st.subheader("๐Ÿ“Š Class Distribution")
244
- class_counts = info.class_imbalanced()
245
-
246
- col1, col2 = st.columns(2)
247
-
248
- with col1:
249
- fig, ax = plt.subplots(figsize=(8, 6))
250
- classes = list(class_counts.keys())
251
- counts = list(class_counts.values())
252
- ax.bar(classes, counts, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8'])
253
- ax.set_title('Class Distribution')
254
- ax.set_xlabel('Classes')
255
- ax.set_ylabel('Count')
256
- plt.xticks(rotation=45)
257
- st.pyplot(fig)
258
 
259
- with col2:
260
- st.write("**Class Distribution:**")
261
- for class_name, count in class_counts.items():
262
- percentage = (count / len(train_df)) * 100
263
- st.write(f"- {class_name}: {count} ({percentage:.1f}%)")
264
-
265
- # Text length analysis
266
- st.subheader("๐Ÿ“ Text Length Analysis")
267
-
268
- col1, col2 = st.columns(2)
269
-
270
- with col1:
271
- fig, ax = plt.subplots(figsize=(8, 6))
272
- ax.hist(train_df['text_length'], bins=50, alpha=0.7, color='#4ECDC4')
273
- ax.set_title('Text Length Distribution')
274
- ax.set_xlabel('Text Length (characters)')
275
- ax.set_ylabel('Frequency')
276
- st.pyplot(fig)
277
 
278
- with col2:
279
- st.write("**Text Length Statistics:**")
280
- length_stats = train_df['text_length'].describe()
281
- for stat, value in length_stats.items():
282
- st.write(f"- {stat.title()}: {value:.1f}")
283
 
284
- # Update session state
285
- st.session_state.processed_train_df = train_df
 
 
 
286
 
 
 
 
 
 
 
 
 
 
 
 
287
  except Exception as e:
288
- st.error(f"โŒ Error in data analysis: {str(e)}")
289
  else:
290
- st.info("๐Ÿ”„ Please upload training data to perform analysis")
291
 
292
  # Train Model Section
293
- elif section == "๐Ÿค– Train Model":
294
- st.header("๐Ÿค– Train Model")
295
-
296
- if st.session_state.get('training_data_processed', False):
297
  try:
298
- if 'processed_train_df' in st.session_state:
299
- train_df = st.session_state.processed_train_df
300
- else:
301
- # Process data if not already processed
302
- train_df = st.session_state.train_df
303
- text_col = st.session_state.text_col
304
- target_col = st.session_state.target_col
305
-
306
- info = TextInformations(train_df, text_col, target_col)
307
- train_df['clean_text'] = info.clean_text()
308
- train_df['text_length'] = info.text_length()
309
-
310
- # Model and vectorizer selection
311
  col1, col2 = st.columns(2)
312
-
313
  with col1:
314
- st.subheader("๐ŸŽฏ Model Selection")
315
- model_name = st.selectbox("Choose the Model", [
316
  "Logistic Regression", "Decision Tree",
317
  "Random Forest", "Linear SVC", "SVC",
318
  "Multinomial Naive Bayes", "Gaussian Naive Bayes"
319
  ])
320
 
321
  with col2:
322
- st.subheader("๐Ÿ“Š Vectorizer Selection")
323
- vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count"])
 
 
 
 
 
 
 
 
 
 
 
324
 
325
- # Training parameters
326
- st.subheader("โš™๏ธ Training Parameters")
327
- col1, col2 = st.columns(2)
 
328
 
329
- with col1:
330
- max_features = st.slider("Max Features", 1000, 20000, 10000, 1000)
331
- test_size = st.slider("Test Size", 0.1, 0.5, 0.2, 0.05)
332
 
333
- with col2:
334
- random_state = st.number_input("Random State", 0, 100, 42)
 
335
 
336
- # Training button
337
  if st.button("๐Ÿš€ Start Training", type="primary"):
338
- with st.spinner("Training model... Please wait"):
339
  try:
340
- # Prepare data
341
- X_text = train_df['clean_text'].fillna('')
342
- y = train_df[st.session_state.target_col]
343
-
344
- # Label encoding
345
- label_encoder = LabelEncoder()
346
- y_encoded = label_encoder.fit_transform(y)
347
-
348
- # Vectorization
349
- if vectorizer_choice == "TF-IDF":
350
- vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
351
- else:
352
- vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
353
-
354
- X_vectorized = vectorizer.fit_transform(X_text)
355
-
356
- # Train-test split
357
- X_train, X_test, y_train, y_test = train_test_split(
358
- X_vectorized, y_encoded,
359
- test_size=test_size,
360
- random_state=random_state,
361
- stratify=y_encoded
362
- )
363
 
364
- # Train model
365
- model, accuracy = train_model(model_name, X_train, X_test, y_train, y_test)
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
- # Save to session state
368
- save_to_session(model, 'trained_model')
369
- save_to_session(vectorizer, 'vectorizer')
370
- save_to_session(label_encoder, 'label_encoder')
371
- save_to_session(model_name, 'model_name')
372
- save_to_session(vectorizer_choice, 'vectorizer_type')
373
-
374
- st.session_state.model_trained = True
375
-
376
- # Display results
377
- st.success(f"โœ… Model training completed!")
378
-
379
- col1, col2 = st.columns(2)
380
- with col1:
381
- st.metric("Model Accuracy", f"{accuracy:.4f}")
382
- with col2:
383
- st.metric("Training Samples", len(X_train))
384
-
385
- st.info("๐ŸŽ‰ You can now use the 'Predictions' section to classify new text!")
386
 
387
  except Exception as e:
388
- st.error(f"โŒ Error during training: {str(e)}")
389
-
390
  except Exception as e:
391
- st.error(f"โŒ Error in model training setup: {str(e)}")
392
  else:
393
- st.info("๐Ÿ”„ Please upload and analyze training data first")
394
 
395
  # Predictions Section
396
- elif section == "๐Ÿ”ฎ Predictions":
397
- st.header("๐Ÿ”ฎ Make Predictions")
398
 
399
- if st.session_state.get('model_trained', False):
400
-
401
- # Single text prediction
402
- st.subheader("๐Ÿ“ Single Text Prediction")
403
 
404
- text_input = st.text_area(
405
- "Enter text to classify:",
406
- height=120,
407
- placeholder="Type or paste your text here..."
408
- )
409
 
410
- col1, col2 = st.columns([1, 3])
411
- with col1:
412
- if st.button("๐Ÿ”ฎ Predict", type="primary"):
 
 
413
  if text_input.strip():
414
- try:
415
- model = load_from_session('trained_model')
416
- vectorizer = load_from_session('vectorizer')
417
- encoder = load_from_session('label_encoder')
418
-
419
  predicted_label, prediction_proba = predict_text(
420
- text_input, model, vectorizer, encoder
 
 
421
  )
422
 
423
  if predicted_label is not None:
424
  st.success("โœ… Prediction completed!")
425
 
426
  # Display results
427
- st.markdown("### ๐Ÿ“Š Results")
428
- st.markdown(f"**Predicted Class:** `{predicted_label}`")
 
 
 
 
 
429
 
430
  # Display probabilities if available
431
  if prediction_proba is not None:
432
  st.markdown("**Class Probabilities:**")
433
 
434
- classes = encoder.classes_
435
- prob_data = pd.DataFrame({
436
- 'Class': classes,
437
- 'Probability': prediction_proba
438
- }).sort_values('Probability', ascending=False)
439
-
440
- # Show as bar chart
441
- st.bar_chart(prob_data.set_index('Class'))
442
-
443
- # Show as table
444
- st.dataframe(prob_data, use_container_width=True)
445
-
446
- except Exception as e:
447
- st.error(f"โŒ Prediction error: {str(e)}")
448
  else:
449
  st.warning("โš ๏ธ Please enter some text to classify")
 
 
 
 
450
 
451
- # Batch predictions
452
- st.markdown("---")
453
- st.subheader("๐Ÿ“ Batch Predictions")
454
-
455
- uploaded_batch = st.file_uploader(
456
- "Upload CSV file for batch predictions",
457
- type=['csv'],
458
- help="Upload a CSV file with text data to classify multiple texts at once"
459
- )
460
-
461
- if uploaded_batch is not None:
462
- try:
463
- # Load batch data
464
- encoding_option = st.selectbox(
465
- "Batch CSV Encoding",
466
- ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
467
- key="batch_encoding"
468
- )
469
-
470
- batch_df = pd.read_csv(uploaded_batch, encoding=encoding_option)
471
- st.write("๐Ÿ“‹ **Batch Data Preview:**")
472
- st.dataframe(batch_df.head())
473
 
474
  # Select text column
475
- text_column = st.selectbox(
476
- "Select the text column:",
477
- batch_df.columns.tolist()
478
- )
479
 
480
- if st.button("๐Ÿš€ Run Batch Predictions", type="primary"):
481
- with st.spinner("Processing batch predictions..."):
482
- try:
483
- model = load_from_session('trained_model')
484
- vectorizer = load_from_session('vectorizer')
485
- encoder = load_from_session('label_encoder')
486
-
487
  predictions = []
488
- confidences = []
489
-
490
  progress_bar = st.progress(0)
491
- total_rows = len(batch_df)
492
 
493
  for idx, text in enumerate(batch_df[text_column]):
494
- pred, pred_proba = predict_text(
495
- str(text), model, vectorizer, encoder
 
 
496
  )
497
  predictions.append(pred if pred is not None else "Error")
498
-
499
- # Get confidence (max probability)
500
- if pred_proba is not None:
501
- confidences.append(max(pred_proba))
502
- else:
503
- confidences.append(0.0)
504
-
505
- progress_bar.progress((idx + 1) / total_rows)
506
 
507
  batch_df['Predicted_Class'] = predictions
508
- batch_df['Confidence'] = confidences
509
 
510
  st.success("โœ… Batch predictions completed!")
511
-
512
- # Show results
513
- st.write("๐Ÿ“Š **Prediction Results:**")
514
- st.dataframe(batch_df[[text_column, 'Predicted_Class', 'Confidence']])
515
 
516
  # Download results
517
  csv = batch_df.to_csv(index=False)
518
  st.download_button(
519
- label="๐Ÿ“ฅ Download Results as CSV",
520
  data=csv,
521
  file_name="batch_predictions.csv",
522
  mime="text/csv"
523
  )
524
 
525
- except Exception as e:
526
- st.error(f"โŒ Batch prediction error: {str(e)}")
527
-
528
- except Exception as e:
529
- st.error(f"โŒ Error loading batch file: {str(e)}")
530
-
531
- else:
532
- st.info("๐Ÿ”„ Please train a model first before making predictions")
533
-
534
- # Show model info if available
535
- if st.session_state.get('training_data_processed', False):
536
- st.write("๐Ÿ’ก **Tip:** Go to the 'Train Model' section to train a model first!")
537
-
538
- # Footer
539
- st.markdown("---")
540
- st.markdown(
541
- """
542
- <div style='text-align: center; color: #666; padding: 20px;'>
543
- <p>๐Ÿ“ No Code Text Classification App</p>
544
- <p>Built with Streamlit โ€ข Upload CSV โ†’ Analyze โ†’ Train โ†’ Predict</p>
545
- </div>
546
- """,
547
- unsafe_allow_html=True
548
- )
 
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import numpy as np
5
+ from NoCodeTextClassifier.EDA import Informations, Visualizations
6
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
7
+ from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
8
+ from NoCodeTextClassifier.models import Models
 
 
 
 
 
 
9
  import os
10
  import pickle
11
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
12
+ import io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ # Set page config
15
+ st.set_page_config(page_title="Text Classification App", page_icon="๐Ÿ“", layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Utility functions
18
def save_artifacts(obj, folder_name, file_name):
    """Pickle ``obj`` to ``folder_name/file_name``, creating the folder if needed.

    The object is written to a temporary sibling file first and then moved
    into place with ``os.replace``, so a failure part-way through pickling
    does not leave a truncated artifact under the final name.

    Args:
        obj: Picklable object to store (the app saves encoders and vectorizers).
        folder_name: Directory that receives the file; created when missing.
        file_name: Name of the pickle file inside ``folder_name``.

    Returns:
        True when the file was written. False when any step failed, in which
        case the error is reported through ``st.error``.
    """
    target_path = os.path.join(folder_name, file_name)
    temp_path = f"{target_path}.tmp"
    try:
        os.makedirs(folder_name, exist_ok=True)
        with open(temp_path, 'wb') as f:
            pickle.dump(obj, f)
        os.replace(temp_path, target_path)
        return True
    except Exception as e:
        # Best-effort cleanup of the partial temp file; it may not exist if
        # the failure happened before it was created.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        st.error(f"Error saving {file_name}: {str(e)}")
        return False
28
 
29
def load_artifacts(folder_name, file_name):
    """Unpickle and return the artifact stored at ``folder_name/file_name``.

    Returns:
        The loaded object, or ``None`` when the file is missing or cannot be
        read/unpickled. Either failure is reported through ``st.error``.
    """
    artifact = None
    try:
        artifact_path = os.path.join(folder_name, file_name)
        with open(artifact_path, 'rb') as handle:
            artifact = pickle.load(handle)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
    except Exception as exc:
        st.error(f"Error loading {file_name}: {str(exc)}")
    return artifact
40
 
41
def load_model(model_name, models_dir='models'):
    """Unpickle and return a trained model from the models directory.

    Args:
        model_name: File name of the pickled model inside ``models_dir``.
        models_dir: Directory holding the pickled models. Defaults to
            ``'models'``, the location this function has always read from.

    Returns:
        The unpickled model, or ``None`` when the file is missing or cannot
        be loaded. Either failure is reported through ``st.error``.
    """
    try:
        # NOTE(security): pickle.load executes code embedded in the file, so
        # only files this app wrote itself should ever be placed in models_dir.
        with open(os.path.join(models_dir, model_name), 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        st.error(f"Model {model_name} not found. Please train a model first.")
        return None
    except Exception as e:
        st.error(f"Error loading model {model_name}: {str(e)}")
        return None
52
+
53
def safe_read_csv(uploaded_file, encoding_options=('utf-8', 'latin1', 'iso-8859-1', 'cp1252')):
    """Read an uploaded CSV into a DataFrame, trying several text encodings.

    The file content is read once, then decoded with each encoding in order
    and parsed with ``pd.read_csv``. The first encoding that both decodes and
    parses wins. If none succeeds, the file object is handed to
    ``pd.read_csv`` directly as a last resort.

    Args:
        uploaded_file: File-like object supporting ``seek(0)`` and ``read()``.
        encoding_options: Encodings to try, in order. The default is a tuple
            so it cannot be mutated between calls.

    Returns:
        The parsed DataFrame, or ``None`` when every attempt failed. Progress
        and failures are reported through ``st.success`` / ``st.warning`` /
        ``st.error``.
    """
    # Read the payload once instead of re-reading it for every encoding.
    try:
        uploaded_file.seek(0)
        raw = uploaded_file.read()
    except Exception as e:
        st.warning(f"Failed to read uploaded file: {str(e)}")
        raw = None

    if raw is not None:
        for encoding in encoding_options:
            try:
                # Content that is already text is parsed as-is.
                text = raw.decode(encoding) if isinstance(raw, bytes) else raw
                df = pd.read_csv(io.StringIO(text))
            except UnicodeDecodeError:
                # Wrong encoding for these bytes; try the next one silently.
                continue
            except Exception as e:
                st.warning(f"Failed to read with {encoding} encoding: {str(e)}")
                continue
            st.success(f"File loaded successfully with {encoding} encoding")
            return df

    # If all encodings fail, try pandas default
    try:
        uploaded_file.seek(0)
        df = pd.read_csv(uploaded_file)
        st.success("File loaded with default encoding")
        return df
    except Exception as e:
        st.error(f"All encoding attempts failed. Error: {str(e)}")
        return None
84
 
85
+ def predict_text(model_name, text, vectorizer_type="tfidf"):
86
  """Make prediction on new text"""
87
  try:
88
+ # Load model
89
+ model = load_model(model_name)
90
+ if model is None:
91
+ return None, None
92
+
93
+ # Load vectorizer
94
+ vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
95
+ vectorizer = load_artifacts("artifacts", vectorizer_file)
96
+ if vectorizer is None:
97
+ return None, None
98
+
99
+ # Load label encoder
100
+ encoder = load_artifacts("artifacts", "encoder.pkl")
101
+ if encoder is None:
102
+ return None, None
103
+
104
+ # Clean and vectorize text
105
  text_cleaner = TextCleaner()
106
  clean_text = text_cleaner.clean_text(text)
107
 
108
+ # Transform text using the same vectorizer used during training
109
  text_vector = vectorizer.transform([clean_text])
110
 
111
  # Make prediction
 
128
  st.error(f"Error during prediction: {str(e)}")
129
  return None, None
130
 
131
+ # Streamlit App
 
 
 
 
 
 
132
  st.title('๐Ÿ“ No Code Text Classification App')
133
+ st.write('Understand the behavior of your text data and train a model to classify the text data')
 
 
 
 
 
 
134
 
135
  # Sidebar
136
  st.sidebar.title("Navigation")
137
+ section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
 
 
 
 
138
 
139
+ # Upload Data
 
140
  st.sidebar.subheader("๐Ÿ“ Upload Your Dataset")
141
+ train_data = st.sidebar.file_uploader("Upload training data", type=["csv"], key="train_upload")
142
+ test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"], key="test_upload")
143
 
144
+ # Global variables to store data and settings
145
+ if 'vectorizer_type' not in st.session_state:
146
+ st.session_state.vectorizer_type = "tfidf"
147
+ if 'train_df' not in st.session_state:
148
+ st.session_state.train_df = None
149
+ if 'info' not in st.session_state:
150
+ st.session_state.info = None
 
 
 
 
 
 
 
 
 
151
 
152
  # Process uploaded data
153
  if train_data is not None:
154
  try:
155
+ # Use safe CSV reading function
156
+ train_df = safe_read_csv(train_data)
 
 
 
 
157
 
158
+ if train_df is not None:
159
+ st.session_state.train_df = train_df
 
 
 
 
160
 
161
+ if test_data is not None:
162
+ test_df = safe_read_csv(test_data)
163
+ st.session_state.test_df = test_df
164
+ else:
165
+ st.session_state.test_df = None
166
+
167
+ st.sidebar.success("โœ… Data loaded successfully!")
168
+ st.write("Training Data Preview:")
169
+ st.write(train_df.head(3))
170
+
171
+ columns = train_df.columns.tolist()
172
+ text_data = st.sidebar.selectbox("Choose the text column:", columns, key="text_col")
173
+ target = st.sidebar.selectbox("Choose the target column:", columns, key="target_col")
174
+
175
+ if text_data and target:
176
+ try:
177
+ # Process data
178
+ info = Informations(train_df, text_data, target)
179
+ train_df['clean_text'] = info.clean_text()
180
+ train_df['text_length'] = info.text_length()
181
+
182
+ # Handle label encoding manually
183
+ from sklearn.preprocessing import LabelEncoder
184
+ label_encoder = LabelEncoder()
185
+ train_df['target'] = label_encoder.fit_transform(train_df[target])
186
+
187
+ # Save label encoder for later use
188
+ if save_artifacts(label_encoder, "artifacts", "encoder.pkl"):
189
+ st.sidebar.success("โœ… Data processed successfully!")
190
+
191
+ st.session_state.train_df = train_df
192
+ st.session_state.info = info
193
+
194
+ except Exception as e:
195
+ st.error(f"Error processing data: {str(e)}")
196
+ st.session_state.train_df = None
197
+ st.session_state.info = None
198
 
199
  except Exception as e:
200
+ st.error(f"Error loading data: {str(e)}")
201
+ st.session_state.train_df = None
202
+ st.session_state.info = None
203
+
204
+ # Get data from session state
205
+ train_df = st.session_state.get('train_df')
206
+ info = st.session_state.get('info')
207
 
208
  # Data Analysis Section
209
+ if section == "Data Analysis":
210
+ if train_data is not None and train_df is not None:
 
 
211
  try:
212
+ st.subheader("๐Ÿ“Š Get Insights from the Data")
 
 
213
 
 
 
 
 
 
 
 
 
214
  col1, col2, col3 = st.columns(3)
 
215
  with col1:
216
+ st.metric("Data Shape", f"{info.shape()[0]} rows ร— {info.shape()[1]} cols")
 
217
  with col2:
218
+ st.metric("Classes", len(train_df['target'].unique()))
 
 
219
  with col3:
220
+ st.metric("Missing Values", info.missing_values())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
+ st.write("**Class Distribution:**", info.class_imbalanced())
223
+
224
+ st.write("**Processed Data Preview:**")
225
+ st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
+ st.markdown("**Text Length Analysis**")
228
+ st.write(info.analysis_text_length('text_length'))
 
 
 
229
 
230
+ # Calculate correlation manually
231
+ correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
232
+ st.write(f"**Correlation between Text Length and Target:** {correlation:.4f}")
233
+
234
+ st.subheader("๐Ÿ“ˆ Visualizations")
235
 
236
+ try:
237
+ columns = train_df.columns.tolist()
238
+ text_col = next((col for col in columns if 'text' in col.lower() or col in ['message', 'content', 'review']), columns[0])
239
+ target_col = next((col for col in columns if col in ['label', 'target', 'class', 'category']), columns[-1])
240
+
241
+ vis = Visualizations(train_df, text_col, target_col)
242
+ vis.class_distribution()
243
+ vis.text_length_distribution()
244
+ except Exception as e:
245
+ st.error(f"Error generating visualizations: {str(e)}")
246
+
247
  except Exception as e:
248
+ st.error(f"Error in data analysis: {str(e)}")
249
  else:
250
+ st.warning("โš ๏ธ Please upload training data to get insights")
251
 
252
  # Train Model Section
253
+ elif section == "Train Model":
254
+ if train_data is not None and train_df is not None:
 
 
255
  try:
256
+ st.subheader("๐Ÿค– Train a Model")
257
+
258
+ # Create two columns for model selection
 
 
 
 
 
 
 
 
 
 
259
  col1, col2 = st.columns(2)
260
+
261
  with col1:
262
+ st.markdown("**Select Model:**")
263
+ model = st.radio("Choose the Model", [
264
  "Logistic Regression", "Decision Tree",
265
  "Random Forest", "Linear SVC", "SVC",
266
  "Multinomial Naive Bayes", "Gaussian Naive Bayes"
267
  ])
268
 
269
  with col2:
270
+ st.markdown("**Select Vectorizer:**")
271
+ vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
272
+
273
+ # Initialize vectorizer
274
+ if vectorizer_choice == "Tfidf Vectorizer":
275
+ vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
276
+ st.session_state.vectorizer_type = "tfidf"
277
+ else:
278
+ vectorizer = CountVectorizer(max_features=10000, stop_words='english')
279
+ st.session_state.vectorizer_type = "count"
280
+
281
+ st.write("**Training Data Preview:**")
282
+ st.write(train_df[['clean_text', 'target']].head(3))
283
 
284
+ # Vectorize text data
285
+ with st.spinner("Vectorizing text data..."):
286
+ X = vectorizer.fit_transform(train_df['clean_text'])
287
+ y = train_df['target']
288
 
289
+ # Split data
290
+ X_train, X_test, y_train, y_test = process.split_data(X, y)
291
+ st.write(f"**Data split** - Train: {X_train.shape}, Test: {X_test.shape}")
292
 
293
+ # Save vectorizer for later use
294
+ vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
295
+ save_artifacts(vectorizer, "artifacts", vectorizer_filename)
296
 
 
297
  if st.button("๐Ÿš€ Start Training", type="primary"):
298
+ with st.spinner("Training model..."):
299
  try:
300
+ models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
+ # Train selected model
303
+ if model == "Logistic Regression":
304
+ models.LogisticRegression()
305
+ elif model == "Decision Tree":
306
+ models.DecisionTree()
307
+ elif model == "Linear SVC":
308
+ models.LinearSVC()
309
+ elif model == "SVC":
310
+ models.SVC()
311
+ elif model == "Multinomial Naive Bayes":
312
+ models.MultinomialNB()
313
+ elif model == "Random Forest":
314
+ models.RandomForestClassifier()
315
+ elif model == "Gaussian Naive Bayes":
316
+ models.GaussianNB()
317
 
318
+ st.success("๐ŸŽ‰ Model training completed!")
319
+ st.info("You can now use the 'Predictions' section to classify new text.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
  except Exception as e:
322
+ st.error(f"Error during model training: {str(e)}")
323
+
324
  except Exception as e:
325
+ st.error(f"Error in model training: {str(e)}")
326
  else:
327
+ st.warning("โš ๏ธ Please upload training data to train a model")
328
 
329
  # Predictions Section
330
+ elif section == "Predictions":
331
+ st.subheader("๐Ÿ”ฎ Perform Predictions on New Text")
332
 
333
+ # Check if models exist
334
+ if os.path.exists("models") and os.listdir("models"):
335
+ # Text input for prediction
336
+ text_input = st.text_area("Enter the text to classify:", height=100, placeholder="Type your text here...")
337
 
338
+ # Model selection
339
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
 
 
 
340
 
341
+ if available_models:
342
+ selected_model = st.selectbox("Choose the trained model:", available_models)
343
+
344
+ # Prediction button
345
+ if st.button("๐ŸŽฏ Predict", key="single_predict", type="primary"):
346
  if text_input.strip():
347
+ with st.spinner("Making prediction..."):
 
 
 
 
348
  predicted_label, prediction_proba = predict_text(
349
+ selected_model,
350
+ text_input,
351
+ st.session_state.get('vectorizer_type', 'tfidf')
352
  )
353
 
354
  if predicted_label is not None:
355
  st.success("โœ… Prediction completed!")
356
 
357
  # Display results
358
+ st.markdown("### ๐Ÿ“Š Prediction Results")
359
+
360
+ col1, col2 = st.columns([2, 1])
361
+ with col1:
362
+ st.markdown(f"**Input Text:** {text_input}")
363
+ with col2:
364
+ st.markdown(f"**Predicted Class:** `{predicted_label}`")
365
 
366
  # Display probabilities if available
367
  if prediction_proba is not None:
368
  st.markdown("**Class Probabilities:**")
369
 
370
+ # Load encoder to get class names
371
+ encoder = load_artifacts("artifacts", "encoder.pkl")
372
+ if encoder is not None:
373
+ classes = encoder.classes_
374
+ prob_df = pd.DataFrame({
375
+ 'Class': classes,
376
+ 'Probability': prediction_proba
377
+ }).sort_values('Probability', ascending=False)
378
+
379
+ col1, col2 = st.columns(2)
380
+ with col1:
381
+ st.bar_chart(prob_df.set_index('Class'))
382
+ with col2:
383
+ st.dataframe(prob_df, use_container_width=True)
384
  else:
385
  st.warning("โš ๏ธ Please enter some text to classify")
386
+ else:
387
+ st.warning("โš ๏ธ No trained models found. Please train a model first.")
388
+ else:
389
+ st.warning("โš ๏ธ No trained models found. Please go to 'Train Model' section to train a model first.")
390
 
391
+ # Option to classify multiple texts
392
+ st.markdown("---")
393
+ st.subheader("๐Ÿ“Š Batch Predictions")
394
+
395
+ uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'], key="batch_upload")
396
+
397
+ if uploaded_file is not None:
398
+ try:
399
+ batch_df = safe_read_csv(uploaded_file)
400
+
401
+ if batch_df is not None:
402
+ st.write("**Uploaded data preview:**")
403
+ st.write(batch_df.head())
 
 
 
 
 
 
 
 
 
404
 
405
  # Select text column
406
+ text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
 
 
 
407
 
408
+ if os.path.exists("models") and os.listdir("models"):
409
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
410
+ batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
411
+
412
+ if st.button("๐Ÿš€ Run Batch Predictions", key="batch_predict", type="primary"):
413
+ with st.spinner("Processing batch predictions..."):
 
414
  predictions = []
 
 
415
  progress_bar = st.progress(0)
 
416
 
417
  for idx, text in enumerate(batch_df[text_column]):
418
+ pred, _ = predict_text(
419
+ batch_model,
420
+ str(text),
421
+ st.session_state.get('vectorizer_type', 'tfidf')
422
  )
423
  predictions.append(pred if pred is not None else "Error")
424
+ progress_bar.progress((idx + 1) / len(batch_df))
 
 
 
 
 
 
 
425
 
426
  batch_df['Predicted_Class'] = predictions
 
427
 
428
  st.success("โœ… Batch predictions completed!")
429
+ st.write("**Results:**")
430
+ st.write(batch_df[[text_column, 'Predicted_Class']])
 
 
431
 
432
  # Download results
433
  csv = batch_df.to_csv(index=False)
434
  st.download_button(
435
+ label="๐Ÿ“ฅ Download predictions as CSV",
436
  data=csv,
437
  file_name="batch_predictions.csv",
438
  mime="text/csv"
439
  )
440
 
441
+ except Exception as e:
442
+ st.error(f"Error in batch prediction: {str(e)}")