Alamgirapi committed on
Commit
1ce2ad9
·
verified ·
1 Parent(s): 23e4994

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +415 -464
app.py CHANGED
@@ -8,63 +8,65 @@ from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorizati
8
  from NoCodeTextClassifier.models import Models
9
  import os
10
  import pickle
11
- import io
12
- import base64
13
  from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
14
- from sklearn.preprocessing import LabelEncoder
15
 
16
- # Configure page
17
- st.set_page_config(page_title="Text Classifier", page_icon="๐Ÿ“", layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- # Utility functions
20
- def safe_read_csv(uploaded_file, encoding_options=['utf-8', 'latin1', 'iso-8859-1', 'cp1252']):
21
- """Safely read CSV with multiple encoding attempts"""
22
- if uploaded_file is None:
23
- return None
24
-
25
- # Reset file pointer
26
- uploaded_file.seek(0)
27
 
28
- for encoding in encoding_options:
29
- try:
30
- # Read the file content as bytes
31
- bytes_data = uploaded_file.read()
32
-
33
- # Convert bytes to string with the current encoding
34
- string_data = bytes_data.decode(encoding)
35
-
36
- # Use StringIO to create a file-like object
37
- df = pd.read_csv(io.StringIO(string_data))
38
- st.success(f"File loaded successfully with {encoding} encoding")
39
- return df
 
 
40
 
41
- except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
42
- st.warning(f"Failed to read with {encoding} encoding: {str(e)}")
43
- continue
44
- except Exception as e:
45
- st.error(f"Unexpected error with {encoding} encoding: {str(e)}")
46
- continue
47
-
48
- st.error("Failed to read the file with any supported encoding")
49
- return None
50
 
51
- def create_sample_data():
52
- """Create sample data for testing"""
53
- sample_data = {
54
- 'text': [
55
- "I love this product, it's amazing!",
56
- "This is the worst thing I've ever bought",
57
- "Great quality and fast delivery",
58
- "Terrible customer service, very disappointed",
59
- "Excellent value for money",
60
- "Poor quality, broke after one day",
61
- "Highly recommend this to everyone",
62
- "Waste of money, don't buy this"
63
- ],
64
- 'sentiment': ['positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative']
65
- }
66
- return pd.DataFrame(sample_data)
67
 
 
68
  def save_artifacts(obj, folder_name, file_name):
69
  """Save artifacts like encoders and vectorizers"""
70
  try:
@@ -82,7 +84,7 @@ def load_artifacts(folder_name, file_name):
82
  with open(os.path.join(folder_name, file_name), 'rb') as f:
83
  return pickle.load(f)
84
  except FileNotFoundError:
85
- st.error(f"File {file_name} not found in {folder_name} folder")
86
  return None
87
  except Exception as e:
88
  st.error(f"Error loading {file_name}: {str(e)}")
@@ -100,6 +102,29 @@ def load_model(model_name):
100
  st.error(f"Error loading model: {str(e)}")
101
  return None
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  def predict_text(model_name, text, vectorizer_type="tfidf"):
104
  """Make prediction on new text"""
105
  try:
@@ -146,293 +171,210 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
146
  st.error(f"Error during prediction: {str(e)}")
147
  return None, None
148
 
149
- def download_sample_csv():
150
- """Generate sample CSV for download"""
151
- sample_df = create_sample_data()
152
- csv = sample_df.to_csv(index=False)
153
- b64 = base64.b64encode(csv.encode()).decode()
154
- href = f'<a href="data:file/csv;base64,{b64}" download="sample_data.csv">Download Sample CSV</a>'
155
- return href
156
-
157
- # Main App
158
- st.title('๐Ÿ“ No Code Text Classification App')
159
- st.markdown('---')
160
- st.write('Understand the behavior of your text data and train a model to classify the text data')
161
-
162
- # Initialize session state
163
- if 'vectorizer_type' not in st.session_state:
164
- st.session_state.vectorizer_type = "tfidf"
165
- if 'train_df' not in st.session_state:
166
- st.session_state.train_df = None
167
-
168
- # Sidebar
169
- st.sidebar.title("Navigation")
170
- section = st.sidebar.radio("Choose Section", ["๐Ÿ“Š Data Analysis", "๐Ÿ”ง Train Model", "๐ŸŽฏ Predictions"])
171
-
172
- # Data Upload Section
173
- st.sidebar.markdown("---")
174
- st.sidebar.subheader("๐Ÿ“ Data Upload")
175
-
176
- # Option to use sample data
177
- if st.sidebar.button("Use Sample Data"):
178
- st.session_state.train_df = create_sample_data()
179
- st.sidebar.success("Sample data loaded!")
180
-
181
- # Sample data download
182
- st.sidebar.markdown("**Download Sample Data:**")
183
- st.sidebar.markdown(download_sample_csv(), unsafe_allow_html=True)
184
-
185
- st.sidebar.markdown("**Or upload your own data:**")
186
-
187
- # File upload with better error handling
188
- train_data = st.sidebar.file_uploader(
189
- "Upload training data",
190
- type=["csv"],
191
- help="Upload a CSV file with text and target columns"
192
- )
193
-
194
- test_data = st.sidebar.file_uploader(
195
- "Upload test data (optional)",
196
- type=["csv"],
197
- help="Optional: Upload separate test data"
198
- )
199
-
200
- # Alternative text input method
201
- st.sidebar.markdown("**Or paste CSV data:**")
202
- if st.sidebar.checkbox("Enter data manually"):
203
- csv_text = st.sidebar.text_area(
204
- "Paste CSV data here:",
205
- height=100,
206
- placeholder="text,sentiment\n\"Great product!\",positive\n\"Poor quality\",negative"
207
  )
208
 
209
- if csv_text and st.sidebar.button("Load from text"):
210
- try:
211
- train_df = pd.read_csv(io.StringIO(csv_text))
212
- st.session_state.train_df = train_df
213
- st.sidebar.success("Data loaded from text!")
214
- except Exception as e:
215
- st.sidebar.error(f"Error parsing CSV text: {str(e)}")
216
-
217
- # Load data
218
- train_df = None
219
- test_df = None
220
-
221
- # Try to load from uploaded file first
222
- if train_data is not None:
223
- train_df = safe_read_csv(train_data)
224
- if train_df is not None:
225
- st.session_state.train_df = train_df
226
-
227
- # Use session state data if available
228
- if st.session_state.train_df is not None:
229
- train_df = st.session_state.train_df
230
-
231
- if test_data is not None:
232
- test_df = safe_read_csv(test_data)
233
 
234
- # Process data if available
235
- if train_df is not None:
236
- try:
237
- st.sidebar.success("โœ… Training data loaded successfully!")
238
-
239
- # Show data info in sidebar
240
- st.sidebar.write(f"**Rows:** {len(train_df)}")
241
- st.sidebar.write(f"**Columns:** {len(train_df.columns)}")
242
-
243
- with st.expander("๐Ÿ“‹ Data Preview", expanded=False):
244
- st.write("**Training Data Preview:**")
245
- st.dataframe(train_df.head())
246
-
247
- columns = train_df.columns.tolist()
248
-
249
- # Column selection with validation
250
- if len(columns) >= 2:
251
- text_data = st.sidebar.selectbox("Choose the text column:", columns, index=0)
252
- # Default to second column for target, or first if same as text
253
- target_default = 1 if len(columns) > 1 and columns[1] != text_data else 0
254
- target = st.sidebar.selectbox("Choose the target column:", columns, index=target_default)
255
-
256
- if text_data == target:
257
- st.sidebar.error("Text and target columns must be different!")
258
- st.stop()
259
- else:
260
- st.sidebar.error("Data must have at least 2 columns (text and target)")
261
- st.stop()
262
 
263
- # Process data
264
- try:
265
- info = Informations(train_df, text_data, target)
266
- train_df['clean_text'] = info.clean_text()
267
- train_df['text_length'] = info.text_length()
268
-
269
- # Handle label encoding
270
- label_encoder = LabelEncoder()
271
- train_df['target'] = label_encoder.fit_transform(train_df[target])
272
-
273
- # Save label encoder
274
- save_artifacts(label_encoder, "artifacts", "encoder.pkl")
275
-
276
- except Exception as e:
277
- st.error(f"Error processing data: {str(e)}")
278
- st.stop()
279
-
280
- except Exception as e:
281
- st.error(f"Error loading data: {str(e)}")
282
- train_df = None
283
 
284
- # Main Content Based on Section
285
- if section == "๐Ÿ“Š Data Analysis":
286
- if train_df is not None:
287
- try:
288
- st.header("๐Ÿ“Š Data Analysis & Insights")
289
-
290
- # Create columns for metrics
291
- col1, col2, col3, col4 = st.columns(4)
292
-
293
- with col1:
294
- st.metric("Total Samples", info.shape()[0])
295
- with col2:
296
- st.metric("Features", info.shape()[1])
297
- with col3:
298
- st.metric("Classes", len(train_df[target].unique()))
299
- with col4:
300
- missing_pct = (info.missing_values().sum() / len(train_df)) * 100
301
- st.metric("Missing Data %", f"{missing_pct:.1f}%")
302
-
303
- st.markdown("---")
304
-
305
- # Class distribution
306
- col1, col2 = st.columns(2)
307
 
308
- with col1:
309
- st.subheader("Class Distribution")
310
- class_dist = train_df[target].value_counts()
311
- st.bar_chart(class_dist)
 
 
 
 
312
 
313
- # Check for imbalance
314
- imbalance_ratio = class_dist.max() / class_dist.min()
315
- if imbalance_ratio > 2:
316
- st.warning(f"โš ๏ธ Class imbalance detected (ratio: {imbalance_ratio:.1f}:1)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  else:
318
- st.success("โœ… Classes are relatively balanced")
319
-
320
- with col2:
321
- st.subheader("Text Length Distribution")
322
- fig, ax = plt.subplots(figsize=(8, 6))
323
- ax.hist(train_df['text_length'], bins=30, alpha=0.7, color='skyblue')
324
- ax.set_xlabel('Text Length (characters)')
325
- ax.set_ylabel('Frequency')
326
- ax.set_title('Distribution of Text Lengths')
327
- st.pyplot(fig)
328
-
329
- # Detailed analysis
330
- with st.expander("๐Ÿ“ˆ Detailed Analysis", expanded=False):
331
- st.write("**Class Imbalance Analysis:**")
332
- st.write(info.class_imbalanced())
333
-
334
- st.write("**Missing Values:**")
335
- st.write(info.missing_values())
336
-
337
- st.write("**Text Length Statistics:**")
338
- st.write(info.analysis_text_length('text_length'))
339
 
340
- # Correlation
341
- correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
342
- st.write(f"**Correlation between Text Length and Target:** {correlation:.4f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
- if abs(correlation) > 0.3:
345
- st.info(f"๐Ÿ“Š Moderate correlation detected ({correlation:.3f})")
346
- elif abs(correlation) > 0.1:
347
- st.info(f"๐Ÿ“Š Weak correlation detected ({correlation:.3f})")
348
- else:
349
- st.info("๐Ÿ“Š No significant correlation between text length and target")
350
-
351
- except Exception as e:
352
- st.error(f"Error in data analysis: {str(e)}")
353
- else:
354
- st.warning("๐Ÿ“ค Please upload training data or use sample data to get insights")
355
-
356
- # Show instructions
357
- st.info("""
358
- **To get started:**
359
- 1. Click "Use Sample Data" in the sidebar, OR
360
- 2. Upload your own CSV file with text and target columns, OR
361
- 3. Use the manual text input option in the sidebar
362
- """)
363
 
364
- # Train Model Section
365
- elif section == "๐Ÿ”ง Train Model":
366
- if train_df is not None:
367
- try:
368
- st.header("๐Ÿ”ง Train Classification Model")
369
-
370
- # Model and vectorizer selection
371
- col1, col2 = st.columns(2)
 
 
 
 
372
 
373
- with col1:
374
- st.subheader("Choose Model")
375
- model = st.selectbox("Select Algorithm:", [
376
- "Logistic Regression", "Decision Tree",
377
- "Random Forest", "Linear SVC", "SVC",
378
- "Multinomial Naive Bayes", "Gaussian Naive Bayes"
379
- ], help="Different algorithms have different strengths")
380
-
381
- with col2:
382
- st.subheader("Choose Vectorizer")
383
- vectorizer_choice = st.selectbox("Select Vectorization Method:",
384
- ["Tfidf Vectorizer", "Count Vectorizer"],
385
- help="TF-IDF is usually better for text classification")
386
 
387
- # Initialize vectorizer
388
- if vectorizer_choice == "Tfidf Vectorizer":
389
- vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
390
- st.session_state.vectorizer_type = "tfidf"
391
- else:
392
- vectorizer = CountVectorizer(max_features=10000, stop_words='english')
393
- st.session_state.vectorizer_type = "count"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
- # Show processed data preview
396
- with st.expander("๐Ÿ” Processed Data Preview", expanded=False):
397
- preview_df = train_df[['clean_text', 'target']].head(10)
398
- st.dataframe(preview_df)
399
-
400
- st.markdown("---")
401
-
402
- # Training section
403
- if st.button("๐Ÿš€ Start Training", type="primary"):
404
- with st.spinner("Training model... This may take a few moments."):
405
- try:
406
- # Progress bar
407
- progress_bar = st.progress(0)
408
- status_text = st.empty()
409
-
410
- status_text.text("Vectorizing text data...")
 
 
 
 
 
 
411
  progress_bar.progress(20)
412
 
413
- # Vectorize text data
414
- X = vectorizer.fit_transform(train_df['clean_text'])
415
- y = train_df['target']
416
-
417
- status_text.text("Splitting data...")
418
- progress_bar.progress(40)
419
-
420
- # Split data
421
- X_train, X_test, y_train, y_test = process.split_data(X, y)
422
 
423
- status_text.text("Saving vectorizer...")
424
  progress_bar.progress(50)
425
 
426
- # Save vectorizer
427
- vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
428
- save_artifacts(vectorizer, "artifacts", vectorizer_filename)
429
-
430
- status_text.text(f"Training {model}...")
431
- progress_bar.progress(70)
432
-
433
- # Train model
434
- models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
435
-
436
  if model == "Logistic Regression":
437
  models.LogisticRegression()
438
  elif model == "Decision Tree":
@@ -450,162 +392,171 @@ elif section == "๐Ÿ”ง Train Model":
450
 
451
  progress_bar.progress(100)
452
  status_text.text("Training completed!")
453
-
454
- st.success("๐ŸŽ‰ Model training completed successfully!")
455
- st.balloons()
456
-
457
- # Show training info
458
- st.info(f"""
459
- **Training Summary:**
460
- - Model: {model}
461
- - Vectorizer: {vectorizer_choice}
462
- - Training samples: {X_train.shape[0]}
463
- - Test samples: {X_test.shape[0]}
464
- - Features: {X_train.shape[1]}
465
- """)
466
-
467
- except Exception as e:
468
- st.error(f"Training failed: {str(e)}")
469
 
470
- except Exception as e:
471
- st.error(f"Error in model training setup: {str(e)}")
472
- else:
473
- st.warning("๐Ÿ“ค Please upload training data to train a model")
 
474
 
475
- # Predictions Section
476
- elif section == "๐ŸŽฏ Predictions":
477
- st.header("๐ŸŽฏ Make Predictions")
478
-
479
- # Check if models exist
480
- if os.path.exists("models") and os.listdir("models"):
481
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
482
 
483
- if available_models:
484
- # Single prediction
485
- st.subheader("Single Text Prediction")
486
 
487
- col1, col2 = st.columns([3, 1])
488
-
489
- with col1:
490
- text_input = st.text_area(
491
- "Enter text to classify:",
492
- height=100,
493
- placeholder="Type or paste your text here..."
494
- )
495
-
496
- with col2:
497
- selected_model = st.selectbox("Choose model:", available_models)
498
- predict_btn = st.button("๐ŸŽฏ Predict", type="primary")
499
-
500
- if predict_btn and text_input.strip():
501
- with st.spinner("Making prediction..."):
502
- predicted_label, prediction_proba = predict_text(
503
- selected_model,
504
- text_input,
505
- st.session_state.get('vectorizer_type', 'tfidf')
506
- )
507
 
508
- if predicted_label is not None:
509
- st.success("Prediction completed!")
510
-
511
- # Results in columns
512
- col1, col2 = st.columns(2)
513
-
514
- with col1:
515
- st.markdown("### ๐Ÿ“ Input Text")
516
- st.text_area("", value=text_input, height=100, disabled=True)
517
-
518
- with col2:
519
- st.markdown("### ๐ŸŽฏ Prediction Result")
520
- st.markdown(f"**Predicted Class:** `{predicted_label}`")
521
-
522
- # Show probabilities if available
523
- if prediction_proba is not None:
524
- encoder = load_artifacts("artifacts", "encoder.pkl")
525
- if encoder is not None:
526
- classes = encoder.classes_
527
- prob_df = pd.DataFrame({
528
- 'Class': classes,
529
- 'Probability': prediction_proba
530
- }).sort_values('Probability', ascending=False)
531
 
532
- st.markdown("**Confidence Scores:**")
 
533
 
534
- # Show as progress bars
535
- for _, row in prob_df.iterrows():
536
- st.write(f"{row['Class']}: {row['Probability']:.3f}")
537
- st.progress(row['Probability'])
538
-
539
- elif predict_btn and not text_input.strip():
540
- st.warning("Please enter some text to classify")
541
-
542
- st.markdown("---")
543
-
544
- # Batch prediction
545
- st.subheader("Batch Predictions")
546
-
547
- uploaded_file = st.file_uploader(
548
- "Upload CSV file for batch predictions",
549
- type=['csv'],
550
- help="Upload a CSV with a text column to classify multiple texts at once"
551
- )
 
 
 
 
 
 
 
552
 
553
- if uploaded_file is not None:
554
- batch_df = safe_read_csv(uploaded_file)
555
 
556
- if batch_df is not None:
557
- col1, col2 = st.columns(2)
558
-
559
- with col1:
560
- text_column = st.selectbox("Select text column:", batch_df.columns.tolist())
561
- with col2:
562
- batch_model = st.selectbox("Choose model:", available_models, key="batch_model")
563
-
564
- st.write("**Data Preview:**")
565
- st.dataframe(batch_df.head())
566
-
567
- if st.button("๐Ÿš€ Run Batch Predictions"):
568
- with st.spinner("Processing batch predictions..."):
569
- predictions = []
570
-
571
- # Progress tracking
572
- progress_bar = st.progress(0)
573
- total_texts = len(batch_df)
574
-
575
- for i, text in enumerate(batch_df[text_column]):
576
- pred, _ = predict_text(
577
- batch_model,
578
- str(text),
579
- st.session_state.get('vectorizer_type', 'tfidf')
580
- )
581
- predictions.append(pred if pred is not None else "Error")
582
- progress_bar.progress((i + 1) / total_texts)
583
-
584
- batch_df['Predicted_Class'] = predictions
585
-
586
- st.success("โœ… Batch predictions completed!")
587
 
588
- # Results
589
- st.write("**Results:**")
590
- st.dataframe(batch_df[[text_column, 'Predicted_Class']])
591
 
592
- # Download button
593
- csv = batch_df.to_csv(index=False)
594
- st.download_button(
595
- label="โฌ‡๏ธ Download Results",
596
- data=csv,
597
- file_name="batch_predictions.csv",
598
- mime="text/csv"
599
- )
600
 
601
- # Show prediction distribution
602
- pred_dist = batch_df['Predicted_Class'].value_counts()
603
- st.bar_chart(pred_dist)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
604
  else:
605
- st.warning("No trained models found.")
606
- else:
607
- st.warning("๐Ÿ”ง No models available. Please train a model first in the 'Train Model' section.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
 
609
- # Footer
610
- st.markdown("---")
611
- st.markdown("*Built with Streamlit โ€ข No-Code Text Classification*")
 
8
  from NoCodeTextClassifier.models import Models
9
  import os
10
  import pickle
11
+ import hashlib
12
+ import hmac
13
  from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 
14
 
15
# Authentication Configuration
# SECURITY NOTE(review): these are plaintext credentials committed to source
# control. That is tolerable only for a throwaway demo; otherwise load them
# from st.secrets / environment variables and store salted hashes instead.
USERS = {
    "admin": "admin123",
    "user1": "password123",
    "demo": "demo123"
}


def check_password():
    """Gate the app behind a username/password login form.

    Session-state keys used:
        "username", "password": values of the two login text inputs.
        "password_correct": True/False after a login attempt; absent before
            the first attempt.
        "authenticated_user": username recorded on a successful login.

    Returns:
        bool: True if this session is already authenticated. Otherwise the
        login form is rendered and False is returned. A successful login
        triggers ``st.rerun()``, so the next run returns True.
    """
    def password_entered():
        """Validate the submitted credentials and record the outcome."""
        username = st.session_state["username"]
        password = st.session_state["password"]

        expected = USERS.get(username)
        # hmac.compare_digest() raises TypeError for str arguments that
        # contain non-ASCII characters, so compare the UTF-8 bytes instead.
        if expected is not None and hmac.compare_digest(
            expected.encode("utf-8"), password.encode("utf-8")
        ):
            st.session_state["password_correct"] = True
            st.session_state["authenticated_user"] = username
            del st.session_state["password"]  # Don't store passwords
        else:
            st.session_state["password_correct"] = False

    # Return True if password is validated
    if st.session_state.get("password_correct", False):
        return True

    # Show login form
    st.markdown("## 🔐 Login Required")
    st.markdown("Please enter your credentials to access the Text Classification App")

    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.text_input("Username", key="username", placeholder="Enter username")
        st.text_input("Password", type="password", key="password", placeholder="Enter password")

        if st.button("Login", use_container_width=True):
            password_entered()
            if st.session_state["password_correct"]:
                # Restart the script so the caller sees the authenticated
                # state now, instead of showing the login form until the
                # user's next interaction.
                st.rerun()

    # Show demo credentials
    # NOTE(review): whether this expander sits inside the centre column could
    # not be confirmed from the flattened source - verify the layout.
    # SECURITY NOTE(review): this publishes the admin password to every
    # unauthenticated visitor; remove the admin entry outside of demos.
    with st.expander("Demo Credentials"):
        st.info("""
        **Demo Account:**
        - Username: `demo`
        - Password: `demo123`

        **Admin Account:**
        - Username: `admin`
        - Password: `admin123`
        """)

    # Only complain after a failed attempt: the key is absent on first load,
    # so .get() yields None rather than False.
    if st.session_state.get("password_correct") is False:
        st.error("😞 Username or password incorrect")

    return False
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ # Utility functions
70
  def save_artifacts(obj, folder_name, file_name):
71
  """Save artifacts like encoders and vectorizers"""
72
  try:
 
84
  with open(os.path.join(folder_name, file_name), 'rb') as f:
85
  return pickle.load(f)
86
  except FileNotFoundError:
87
+ st.warning(f"File {file_name} not found in {folder_name} folder")
88
  return None
89
  except Exception as e:
90
  st.error(f"Error loading {file_name}: {str(e)}")
 
102
  st.error(f"Error loading model: {str(e)}")
103
  return None
104
 
105
def safe_file_upload(uploaded_file, encoding='utf-8'):
    """Read an uploaded CSV into a DataFrame, falling back across encodings.

    Args:
        uploaded_file: Seekable file-like object (e.g. the value returned by
            ``st.file_uploader``), or None.
        encoding: Encoding to try first.

    Returns:
        The parsed ``pandas.DataFrame``, or None when no file was given or
        the file could not be read. Success and failure are also reported to
        the user through ``st.success`` / ``st.error``.
    """
    if uploaded_file is None:
        return None

    # Caller's choice first, then fallbacks. dict.fromkeys() removes
    # duplicates while keeping order. latin1 (the same codec as iso-8859-1)
    # maps every possible byte and therefore never fails to decode, so it
    # must come last or the stricter cp1252 attempt would be unreachable.
    encodings_to_try = list(dict.fromkeys([encoding, 'cp1252', 'latin1']))

    for enc in encodings_to_try:
        try:
            # Reset file pointer: a previous failed attempt may have consumed it
            uploaded_file.seek(0)
            df = pd.read_csv(uploaded_file, encoding=enc)
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown encoding - the next candidate may still work.
            continue
        except Exception as e:
            # Empty file, malformed CSV, I/O failure...: not an encoding
            # problem, so retrying would only repeat the same error.
            st.error(f"Error reading file with {enc}: {str(e)}")
            return None
        st.success(f"File loaded successfully with {enc} encoding")
        return df

    st.error("Could not read file with any common encoding. Please check your file format.")
    return None
127
+
128
  def predict_text(model_name, text, vectorizer_type="tfidf"):
129
  """Make prediction on new text"""
130
  try:
 
171
  st.error(f"Error during prediction: {str(e)}")
172
  return None, None
173
 
174
+ # Main App Logic
175
+ def main_app():
176
+ # Header with user info
177
+ col1, col2 = st.columns([3, 1])
178
+ with col1:
179
+ st.title('๐Ÿค– No Code Text Classification App')
180
+ st.write('Understand the behavior of your text data and train a model to classify the text data')
181
+ with col2:
182
+ st.markdown(f"**๐Ÿ‘ค User:** {st.session_state.get('authenticated_user', 'Unknown')}")
183
+ if st.button("Logout", type="secondary"):
184
+ for key in list(st.session_state.keys()):
185
+ del st.session_state[key]
186
+ st.rerun()
187
+
188
+ # Sidebar
189
+ section = st.sidebar.radio("Choose Section", ["๐Ÿ“Š Data Analysis", "๐Ÿš€ Train Model", "๐Ÿ”ฎ Predictions"])
190
+
191
+ # Upload Data with improved error handling
192
+ st.sidebar.subheader("๐Ÿ“ Upload Your Dataset")
193
+
194
+ # File encoding selection
195
+ encoding_choice = st.sidebar.selectbox(
196
+ "File Encoding",
197
+ ["utf-8", "latin1", "cp1252", "iso-8859-1"],
198
+ help="If file upload fails, try different encodings"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  )
200
 
201
+ train_data = st.sidebar.file_uploader(
202
+ "Upload training data",
203
+ type=["csv"],
204
+ help="Upload a CSV file with your training data"
205
+ )
206
+
207
+ test_data = st.sidebar.file_uploader(
208
+ "Upload test data (optional)",
209
+ type=["csv"],
210
+ help="Optional: Upload separate test data"
211
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
+ # Global variables to store data and settings
214
+ if 'vectorizer_type' not in st.session_state:
215
+ st.session_state.vectorizer_type = "tfidf"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
+ train_df = None
218
+ test_df = None
219
+ info = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
+ if train_data is not None:
222
+ with st.spinner("Loading training data..."):
223
+ train_df = safe_file_upload(train_data, encoding_choice)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
+ if train_df is not None:
226
+ try:
227
+ if test_data is not None:
228
+ test_df = safe_file_upload(test_data, encoding_choice)
229
+
230
+ st.sidebar.success(f"โœ… Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
231
+ st.write("๐Ÿ“‹ Training Data Preview:")
232
+ st.dataframe(train_df.head(3), use_container_width=True)
233
 
234
+ columns = train_df.columns.tolist()
235
+ text_data = st.sidebar.selectbox("๐Ÿ“ Choose the text column:", columns)
236
+ target = st.sidebar.selectbox("๐ŸŽฏ Choose the target column:", columns)
237
+
238
+ # Process data
239
+ if text_data and target and text_data != target:
240
+ with st.spinner("Processing data..."):
241
+ info = Informations(train_df, text_data, target)
242
+ train_df['clean_text'] = info.clean_text()
243
+ train_df['text_length'] = info.text_length()
244
+
245
+ # Handle label encoding manually if the class doesn't store encoder
246
+ from sklearn.preprocessing import LabelEncoder
247
+ label_encoder = LabelEncoder()
248
+ train_df['target'] = label_encoder.fit_transform(train_df[target])
249
+
250
+ # Save label encoder for later use
251
+ if save_artifacts(label_encoder, "artifacts", "encoder.pkl"):
252
+ st.sidebar.success("โœ… Data processed successfully")
253
  else:
254
+ st.sidebar.warning("Please select different columns for text and target")
255
+
256
+ except Exception as e:
257
+ st.error(f"โŒ Error processing data: {str(e)}")
258
+ train_df = None
259
+ info = None
260
+
261
+ # Data Analysis Section
262
+ if section == "๐Ÿ“Š Data Analysis":
263
+ st.header("๐Ÿ“Š Data Analysis & Insights")
264
+
265
+ if train_data is not None and train_df is not None and info is not None:
266
+ try:
267
+ # Create tabs for better organization
268
+ tab1, tab2, tab3 = st.tabs(["๐Ÿ“ˆ Basic Stats", "๐Ÿ“ Text Analysis", "๐Ÿ“Š Visualizations"])
 
 
 
 
 
 
269
 
270
+ with tab1:
271
+ col1, col2, col3 = st.columns(3)
272
+
273
+ with col1:
274
+ st.metric("๐Ÿ“Š Data Shape", f"{info.shape()[0]} x {info.shape()[1]}")
275
+
276
+ with col2:
277
+ imbalance_info = info.class_imbalanced()
278
+ st.metric("โš–๏ธ Class Balance", "Balanced" if not imbalance_info else "Imbalanced")
279
+
280
+ with col3:
281
+ missing_info = info.missing_values()
282
+ total_missing = sum(missing_info.values()) if isinstance(missing_info, dict) else 0
283
+ st.metric("โŒ Missing Values", str(total_missing))
284
+
285
+ st.subheader("๐Ÿ“‹ Processed Data Preview")
286
+ st.dataframe(train_df[['clean_text', 'text_length', 'target']].head(), use_container_width=True)
287
 
288
+ with tab2:
289
+ st.subheader("๐Ÿ“ Text Length Analysis")
290
+ text_analysis = info.analysis_text_length('text_length')
291
+
292
+ # Display stats in a nice format
293
+ stats_col1, stats_col2 = st.columns(2)
294
+ with stats_col1:
295
+ st.json(text_analysis)
296
+
297
+ with stats_col2:
298
+ correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
299
+ st.metric("๐Ÿ”— Text Length-Target Correlation", f"{correlation:.4f}")
 
 
 
 
 
 
 
300
 
301
+ with tab3:
302
+ st.subheader("๐Ÿ“Š Data Visualizations")
303
+ vis = Visualizations(train_df, text_data, target)
304
+
305
+ col1, col2 = st.columns(2)
306
+ with col1:
307
+ st.write("**Class Distribution**")
308
+ vis.class_distribution()
309
+
310
+ with col2:
311
+ st.write("**Text Length Distribution**")
312
+ vis.text_length_distribution()
313
 
314
+ except Exception as e:
315
+ st.error(f"โŒ Error in data analysis: {str(e)}")
316
+ else:
317
+ st.info("๐Ÿ‘† Please upload training data in the sidebar to get insights")
 
 
 
 
 
 
 
 
 
318
 
319
+ # Train Model Section
320
+ elif section == "๐Ÿš€ Train Model":
321
+ st.header("๐Ÿš€ Train Classification Model")
322
+
323
+ if train_data is not None and train_df is not None:
324
+ try:
325
+ # Create two columns for model selection
326
+ col1, col2 = st.columns(2)
327
+
328
+ with col1:
329
+ st.subheader("๐Ÿค– Choose Model")
330
+ model = st.radio("Select Algorithm:", [
331
+ "Logistic Regression", "Decision Tree",
332
+ "Random Forest", "Linear SVC", "SVC",
333
+ "Multinomial Naive Bayes", "Gaussian Naive Bayes"
334
+ ])
335
+
336
+ with col2:
337
+ st.subheader("๐Ÿ”ค Choose Vectorizer")
338
+ vectorizer_choice = st.radio("Select Vectorizer:", ["Tfidf Vectorizer", "Count Vectorizer"])
339
+
340
+ # Initialize vectorizer
341
+ if vectorizer_choice == "Tfidf Vectorizer":
342
+ vectorizer = TfidfVectorizer(max_features=10000)
343
+ st.session_state.vectorizer_type = "tfidf"
344
+ else:
345
+ vectorizer = CountVectorizer(max_features=10000)
346
+ st.session_state.vectorizer_type = "count"
347
 
348
+ st.subheader("๐Ÿ“‹ Training Data Preview")
349
+ st.dataframe(train_df[['clean_text', 'target']].head(3), use_container_width=True)
350
+
351
+ # Vectorize text data
352
+ with st.spinner("Preparing data..."):
353
+ X = vectorizer.fit_transform(train_df['clean_text'])
354
+ y = train_df['target']
355
+
356
+ # Split data
357
+ X_train, X_test, y_train, y_test = process.split_data(X, y)
358
+ st.success(f"โœ… Data prepared - Train: {X_train.shape}, Test: {X_test.shape}")
359
+
360
+ # Save vectorizer for later use
361
+ vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
362
+ save_artifacts(vectorizer, "artifacts", vectorizer_filename)
363
+
364
+ if st.button("๐Ÿš€ Start Training", type="primary", use_container_width=True):
365
+ progress_bar = st.progress(0)
366
+ status_text = st.empty()
367
+
368
+ with st.spinner(f"Training {model} model..."):
369
+ status_text.text("Initializing model...")
370
  progress_bar.progress(20)
371
 
372
+ models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
 
 
 
 
 
 
 
 
373
 
374
+ status_text.text("Training in progress...")
375
  progress_bar.progress(50)
376
 
377
+ # Train selected model
 
 
 
 
 
 
 
 
 
378
  if model == "Logistic Regression":
379
  models.LogisticRegression()
380
  elif model == "Decision Tree":
 
392
 
393
  progress_bar.progress(100)
394
  status_text.text("Training completed!")
395
+
396
+ st.success("๐ŸŽ‰ Model training completed successfully!")
397
+ st.balloons()
398
+ st.info("๐Ÿ’ก You can now use the 'Predictions' section to classify new text.")
 
 
 
 
 
 
 
 
 
 
 
 
399
 
400
+ except Exception as e:
401
+ st.error(f"โŒ Error in model training: {str(e)}")
402
+ st.exception(e)
403
+ else:
404
+ st.info("๐Ÿ‘† Please upload training data in the sidebar to train a model")
405
 
406
+ # Predictions Section
407
+ elif section == "๐Ÿ”ฎ Predictions":
408
+ st.header("๐Ÿ”ฎ Text Classification Predictions")
 
 
 
 
409
 
410
+ # Check if models exist
411
+ if os.path.exists("models") and os.listdir("models"):
412
+ tab1, tab2 = st.tabs(["๐ŸŽฏ Single Prediction", "๐Ÿ“Š Batch Predictions"])
413
 
414
+ with tab1:
415
+ st.subheader("๐ŸŽฏ Classify Single Text")
416
+
417
+ # Text input for prediction
418
+ text_input = st.text_area("Enter the text to classify:", height=100, placeholder="Type or paste your text here...")
419
+
420
+ # Model selection
421
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
422
+
423
+ if available_models:
424
+ selected_model = st.selectbox("๐Ÿค– Choose the trained model:", available_models)
 
 
 
 
 
 
 
 
 
425
 
426
+ # Prediction button
427
+ if st.button("๐Ÿ”ฎ Predict", key="single_predict", type="primary"):
428
+ if text_input.strip():
429
+ with st.spinner("Making prediction..."):
430
+ predicted_label, prediction_proba = predict_text(
431
+ selected_model,
432
+ text_input,
433
+ st.session_state.get('vectorizer_type', 'tfidf')
434
+ )
435
+
436
+ if predicted_label is not None:
437
+ st.success("๐ŸŽ‰ Prediction completed!")
 
 
 
 
 
 
 
 
 
 
 
438
 
439
+ # Display results
440
+ st.markdown("### ๐Ÿ“‹ Prediction Results")
441
 
442
+ # Create result container
443
+ result_container = st.container()
444
+ with result_container:
445
+ st.markdown(f"**๐Ÿ“ Input Text:** {text_input}")
446
+ st.markdown(f"**๐Ÿท๏ธ Predicted Class:** `{predicted_label}`")
447
+
448
+ # Display probabilities if available
449
+ if prediction_proba is not None:
450
+ st.markdown("**๐Ÿ“Š Class Probabilities:**")
451
+
452
+ # Load encoder to get class names
453
+ encoder = load_artifacts("artifacts", "encoder.pkl")
454
+ if encoder is not None:
455
+ classes = encoder.classes_
456
+ prob_df = pd.DataFrame({
457
+ 'Class': classes,
458
+ 'Probability': prediction_proba
459
+ }).sort_values('Probability', ascending=False)
460
+
461
+ st.bar_chart(prob_df.set_index('Class'))
462
+ st.dataframe(prob_df, use_container_width=True)
463
+ else:
464
+ st.warning("โš ๏ธ Please enter some text to classify")
465
+ else:
466
+ st.warning("โš ๏ธ No trained models found. Please train a model first.")
467
 
468
+ with tab2:
469
+ st.subheader("๐Ÿ“Š Batch Classification")
470
 
471
+ uploaded_file = st.file_uploader(
472
+ "Upload a CSV file with text to classify",
473
+ type=['csv'],
474
+ help="Upload a CSV file containing text data for batch classification"
475
+ )
476
+
477
+ if uploaded_file is not None:
478
+ try:
479
+ batch_df = safe_file_upload(uploaded_file)
480
+ if batch_df is not None:
481
+ st.write("๐Ÿ“‹ Uploaded data preview:")
482
+ st.dataframe(batch_df.head(), use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
 
484
+ # Select text column
485
+ text_column = st.selectbox("๐Ÿ“ Select the text column:", batch_df.columns.tolist())
 
486
 
487
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
488
+ batch_model = st.selectbox("๐Ÿค– Choose model for batch prediction:", available_models, key="batch_model")
 
 
 
 
 
 
489
 
490
+ if st.button("๐Ÿš€ Run Batch Predictions", key="batch_predict", type="primary"):
491
+ progress_bar = st.progress(0)
492
+ status_text = st.empty()
493
+
494
+ with st.spinner("Processing batch predictions..."):
495
+ predictions = []
496
+ total_texts = len(batch_df)
497
+
498
+ for i, text in enumerate(batch_df[text_column]):
499
+ status_text.text(f"Processing {i+1}/{total_texts} texts...")
500
+ progress_bar.progress((i+1)/total_texts)
501
+
502
+ pred, _ = predict_text(
503
+ batch_model,
504
+ str(text),
505
+ st.session_state.get('vectorizer_type', 'tfidf')
506
+ )
507
+ predictions.append(pred if pred is not None else "Error")
508
+
509
+ batch_df['Predicted_Class'] = predictions
510
+
511
+ st.success("๐ŸŽ‰ Batch predictions completed!")
512
+ st.write("๐Ÿ“Š Results:")
513
+ st.dataframe(batch_df[[text_column, 'Predicted_Class']], use_container_width=True)
514
+
515
+ # Download results
516
+ csv = batch_df.to_csv(index=False)
517
+ st.download_button(
518
+ label="๐Ÿ“ฅ Download predictions as CSV",
519
+ data=csv,
520
+ file_name="batch_predictions.csv",
521
+ mime="text/csv",
522
+ type="primary"
523
+ )
524
+ except Exception as e:
525
+ st.error(f"โŒ Error in batch prediction: {str(e)}")
526
  else:
527
+ st.info("โš ๏ธ No trained models found. Please go to 'Train Model' section to train a model first.")
528
+
529
+ # Main execution
530
def main():
    """Configure the Streamlit page and start the app behind the password gate.

    Sets the page metadata (title, icon, wide layout, expanded sidebar),
    injects a small block of custom CSS, and then calls ``main_app()`` only
    when ``check_password()`` returns a truthy value.
    """
    # Page config is the first Streamlit call made by this function.
    st.set_page_config(
        page_title="Text Classification App",
        # Robot-face emoji (U+1F916); the previous literal was a mis-decoded
        # byte sequence of this same character.
        page_icon="🤖",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Custom CSS for better styling
    st.markdown("""
    <style>
    .main {
        padding-top: 1rem;
    }
    .stAlert {
        margin-top: 1rem;
    }
    .metric-container {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 0.5rem 0;
    }
    </style>
    """, unsafe_allow_html=True)

    # Check authentication: the application body is rendered only after
    # check_password() reports success.
    if check_password():
        main_app()


# Script entry point: run the app only when this file is executed directly.
if __name__ == "__main__":
    main()