Alamgirapi committed
Commit 0a50c6f · verified · 1 Parent(s): 8d810b6

Update app.py

Files changed (1):
  1. app.py +366 -490
app.py CHANGED
@@ -5,146 +5,200 @@ import numpy as np
 import seaborn as sns
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.svm import LinearSVC, SVC
 from sklearn.naive_bayes import MultinomialNB, GaussianNB
+from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 import os
 import pickle
 import re
 import string
 from collections import Counter
-import plotly.express as px
-import plotly.graph_objects as go

-# Configure Streamlit page
-st.set_page_config(
-    page_title="Text Classification App",
-    page_icon="📝",
-    layout="wide"
-)
+# Set page config
+st.set_page_config(page_title="Text Classification App", page_icon="📊", layout="wide")
+
+# Custom CSS for better styling
+st.markdown("""
+<style>
+    .main-header {
+        font-size: 2.5rem;
+        color: #1f77b4;
+        text-align: center;
+        margin-bottom: 2rem;
+    }
+    .section-header {
+        font-size: 1.8rem;
+        color: #ff7f0e;
+        border-bottom: 2px solid #ff7f0e;
+        padding-bottom: 0.5rem;
+    }
+</style>
+""", unsafe_allow_html=True)

-# Text preprocessing class
-class TextCleaner:
-    def __init__(self):
-        self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
-
-    def clean_text(self, text):
-        """Clean and preprocess text"""
-        if pd.isna(text):
-            return ""
-
-        text = str(text).lower()
-        text = re.sub(r'http\S+', '', text)  # Remove URLs
-        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
-        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
-        text = text.strip()
-
-        # Remove stop words (optional)
-        words = text.split()
-        words = [word for word in words if word not in self.stop_words]
-
-        return ' '.join(words)
+# Utility functions
+def clean_text(text):
+    """Clean text data"""
+    if pd.isna(text):
+        return ""
+
+    text = str(text).lower()
+    text = re.sub(r'[^a-zA-Z\s]', '', text)
+    text = re.sub(r'\s+', ' ', text)
+    text = text.strip()
+    return text

-# Data analysis functions
-def get_data_insights(df, text_col, target_col):
-    """Get basic insights from the dataset"""
-    insights = {
-        'shape': df.shape,
-        'missing_values': df.isnull().sum().to_dict(),
-        'class_distribution': df[target_col].value_counts().to_dict(),
-        'text_length_stats': {
-            'mean': df[text_col].str.len().mean(),
-            'median': df[text_col].str.len().median(),
-            'min': df[text_col].str.len().min(),
-            'max': df[text_col].str.len().max()
-        }
-    }
-    return insights
+def save_artifacts(obj, folder_name, file_name):
+    """Save artifacts like encoders and vectorizers"""
+    try:
+        os.makedirs(folder_name, exist_ok=True)
+        with open(os.path.join(folder_name, file_name), 'wb') as f:
+            pickle.dump(obj, f)
+        return True
+    except Exception as e:
+        st.error(f"Error saving {file_name}: {str(e)}")
+        return False
+
+def load_artifacts(folder_name, file_name):
+    """Load saved artifacts"""
+    try:
+        with open(os.path.join(folder_name, file_name), 'rb') as f:
+            return pickle.load(f)
+    except FileNotFoundError:
+        st.error(f"File {file_name} not found in {folder_name} folder")
+        return None
+    except Exception as e:
+        st.error(f"Error loading {file_name}: {str(e)}")
+        return None
+
+def analyze_data(df, text_col, target_col):
+    """Perform data analysis"""
+    analysis = {}
+
+    # Basic info
+    analysis['shape'] = df.shape
+    analysis['columns'] = df.columns.tolist()
+    analysis['missing_values'] = df.isnull().sum().to_dict()
+
+    # Text analysis
+    df['text_length'] = df[text_col].astype(str).apply(len)
+    analysis['avg_text_length'] = df['text_length'].mean()
+    analysis['text_length_stats'] = df['text_length'].describe().to_dict()
+
+    # Target analysis
+    analysis['class_distribution'] = df[target_col].value_counts().to_dict()
+    analysis['num_classes'] = df[target_col].nunique()
+
+    return analysis
+
+def create_visualizations(df, text_col, target_col):
+    """Create visualizations"""
+    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+
+    # Class distribution
+    class_counts = df[target_col].value_counts()
+    axes[0, 0].bar(class_counts.index, class_counts.values)
+    axes[0, 0].set_title('Class Distribution')
+    axes[0, 0].set_xlabel('Classes')
+    axes[0, 0].set_ylabel('Count')
+    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
+
+    # Text length distribution
+    axes[0, 1].hist(df['text_length'], bins=30, alpha=0.7)
+    axes[0, 1].set_title('Text Length Distribution')
+    axes[0, 1].set_xlabel('Text Length')
+    axes[0, 1].set_ylabel('Frequency')
+
+    # Box plot of text length by class
+    df.boxplot(column='text_length', by=target_col, ax=axes[1, 0])
+    axes[1, 0].set_title('Text Length by Class')
+    axes[1, 0].set_xlabel('Class')
+    axes[1, 0].set_ylabel('Text Length')
+
+    # Correlation plot (if applicable)
+    if df[target_col].dtype in ['int64', 'float64'] or len(df[target_col].unique()) < 10:
+        correlation = df[['text_length', target_col]].corr()
+        sns.heatmap(correlation, annot=True, ax=axes[1, 1], cmap='coolwarm')
+        axes[1, 1].set_title('Correlation Matrix')
+    else:
+        axes[1, 1].text(0.5, 0.5, 'Correlation not applicable\nfor categorical target',
+                        ha='center', va='center', transform=axes[1, 1].transAxes)
+        axes[1, 1].set_title('Correlation Analysis')
+
+    plt.tight_layout()
+    return fig

-# Model training functions
 def train_model(model_name, X_train, X_test, y_train, y_test):
-    """Train and evaluate a model"""
-    models = {
-        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
-        'Decision Tree': DecisionTreeClassifier(random_state=42),
-        'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
-        'Linear SVC': LinearSVC(random_state=42, max_iter=1000),
-        'SVC': SVC(random_state=42, probability=True),
-        'Multinomial Naive Bayes': MultinomialNB(),
-        'Gaussian Naive Bayes': GaussianNB()
+    """Train selected model"""
+    models_dict = {
+        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
+        "Decision Tree": DecisionTreeClassifier(random_state=42),
+        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
+        "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
+        "SVC": SVC(random_state=42, probability=True),
+        "Multinomial Naive Bayes": MultinomialNB(),
+        "Gaussian Naive Bayes": GaussianNB()
     }

-    model = models[model_name]
-
-    # For Gaussian NB, convert sparse matrix to dense
-    if model_name == 'Gaussian Naive Bayes':
-        X_train = X_train.toarray()
-        X_test = X_test.toarray()
+    if model_name not in models_dict:
+        return None, None, None
+
+    model = models_dict[model_name]
+
+    # Special handling for Gaussian NB (needs dense array)
+    if model_name == "Gaussian Naive Bayes":
+        X_train_model = X_train.toarray()
+        X_test_model = X_test.toarray()
+    else:
+        X_train_model = X_train
+        X_test_model = X_test

     # Train model
-    model.fit(X_train, y_train)
+    model.fit(X_train_model, y_train)

     # Make predictions
-    y_pred = model.predict(X_test)
+    y_pred = model.predict(X_test_model)

     # Calculate metrics
     accuracy = accuracy_score(y_test, y_pred)
+    report = classification_report(y_test, y_pred, output_dict=True)

     # Save model
     os.makedirs("models", exist_ok=True)
-    model_filename = f"{model_name.replace(' ', '_').lower()}.pkl"
-    with open(os.path.join("models", model_filename), 'wb') as f:
-        pickle.dump(model, f)
+    model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
+    save_artifacts(model, "models", model_filename)

-    return model, accuracy, y_pred, model_filename
+    return model, accuracy, report

-# Utility functions
-def save_artifacts(obj, folder_name, file_name):
-    """Save artifacts like encoders and vectorizers"""
-    os.makedirs(folder_name, exist_ok=True)
-    with open(os.path.join(folder_name, file_name), 'wb') as f:
-        pickle.dump(obj, f)
-
-def load_artifacts(folder_name, file_name):
-    """Load saved artifacts"""
-    try:
-        with open(os.path.join(folder_name, file_name), 'rb') as f:
-            return pickle.load(f)
-    except FileNotFoundError:
-        st.error(f"File {file_name} not found in {folder_name} folder")
-        return None
-
-def predict_text(model_filename, text, vectorizer_type="tfidf"):
+def predict_text(model_name, text, vectorizer_type="tfidf"):
     """Make prediction on new text"""
     try:
         # Load model
-        with open(os.path.join('models', model_filename), 'rb') as f:
-            model = pickle.load(f)
+        model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
+        model = load_artifacts("models", model_filename)
+        if model is None:
+            return None, None

         # Load vectorizer
-        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
-        vectorizer = load_artifacts("artifacts", vectorizer_file)
+        vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
+        vectorizer = load_artifacts("artifacts", vectorizer_filename)
         if vectorizer is None:
             return None, None

         # Load label encoder
-        encoder = load_artifacts("artifacts", "encoder.pkl")
+        encoder = load_artifacts("artifacts", "label_encoder.pkl")
         if encoder is None:
             return None, None

         # Clean and vectorize text
-        text_cleaner = TextCleaner()
-        clean_text = text_cleaner.clean_text(text)
+        clean_text_input = clean_text(text)
+        text_vector = vectorizer.transform([clean_text_input])

-        # Transform text
-        text_vector = vectorizer.transform([clean_text])
-
-        # For Gaussian NB, convert to dense
-        if 'gaussian' in model_filename:
+        # Special handling for Gaussian NB
+        if "gaussian" in model_name.lower():
             text_vector = text_vector.toarray()

         # Make prediction
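
Both versions of train_model() above densify the feature matrix before fitting Gaussian Naive Bayes. The constraint behind that branch, as a minimal editor's sketch (illustrative, not part of the commit): scikit-learn's GaussianNB rejects SciPy sparse input, while MultinomialNB accepts it directly.

# Sketch: why the Gaussian NB branch calls .toarray()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB

texts = ["cheap offer now", "meeting at noon", "win a cheap prize", "lunch tomorrow"]
labels = [1, 0, 1, 0]

X = TfidfVectorizer().fit_transform(texts)  # CSR sparse matrix
MultinomialNB().fit(X, labels)              # sparse input is accepted
GaussianNB().fit(X.toarray(), labels)       # dense ndarray required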
@@ -154,9 +208,12 @@ def predict_text(model_filename, text, vectorizer_type="tfidf"):
         # Get prediction probabilities if available
         if hasattr(model, 'predict_proba'):
             try:
-                prediction_proba = model.predict_proba(text_vector)[0]
-            except:
-                pass
+                if "gaussian" in model_name.lower():
+                    prediction_proba = model.predict_proba(text_vector)[0]
+                else:
+                    prediction_proba = model.predict_proba(text_vector)[0]
+            except Exception as e:
+                st.warning(f"Could not get prediction probabilities: {str(e)}")

         # Decode prediction
         predicted_label = encoder.inverse_transform(prediction)[0]
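
The hasattr(model, 'predict_proba') guard in this hunk matters because not every model the app offers exposes probabilities: LinearSVC never does, and SVC only because train_model() constructs it with probability=True. A quick check on dummy data (editor's sketch, not part of the commit):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC

X = np.array([[i / 10.0] for i in range(10)])
y = [0] * 5 + [1] * 5

for clf in (LinearSVC(), SVC(probability=True), LogisticRegression()):
    clf.fit(X, y)
    print(type(clf).__name__, hasattr(clf, 'predict_proba'))
# LinearSVC False
# SVC True
# LogisticRegression True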
@@ -167,479 +224,298 @@ def predict_text(model_filename, text, vectorizer_type="tfidf"):
         st.error(f"Error during prediction: {str(e)}")
         return None, None

-# Streamlit App
-st.title('📝 No Code Text Classification App')
-st.markdown('---')
-st.write('Analyze your text data and train machine learning models without coding!')
+# Main App
+st.markdown('<h1 class="main-header">📊 No Code Text Classification App</h1>', unsafe_allow_html=True)
+st.markdown("### Analyze your text data and train machine learning models without coding!")
+
+# Initialize session state
+if 'vectorizer_type' not in st.session_state:
+    st.session_state.vectorizer_type = "tfidf"
+if 'trained_models' not in st.session_state:
+    st.session_state.trained_models = []

 # Sidebar
-st.sidebar.title("Navigation")
-section = st.sidebar.radio("Choose Section", ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"])
-
-# Upload Data
-st.sidebar.markdown("---")
-st.sidebar.subheader("📁 Upload Your Dataset")
-train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
-test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
-
-# Global variables to store data and settings
-if 'vectorizer_type' not in st.session_state:
-    st.session_state.vectorizer_type = "tfidf"
-
-if train_data is not None:
+st.sidebar.markdown("## 📁 Upload Your Dataset")
+
+# File upload with better error handling
+try:
+    uploaded_file = st.sidebar.file_uploader(
+        "Choose a CSV file",
+        type="csv",
+        help="Upload your training dataset (CSV format)"
+    )
+
+    # Encoding selection
+    encoding = st.sidebar.selectbox(
+        "Select file encoding",
+        ["utf-8", "latin1", "iso-8859-1", "cp1252"],
+        help="Try different encodings if you get reading errors"
+    )
+
+except Exception as e:
+    st.sidebar.error(f"File upload error: {str(e)}")
+    uploaded_file = None
+
+# Navigation
+section = st.sidebar.radio(
+    "Choose Section",
+    ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"],
+    help="Navigate through different sections of the app"
+)
+
+# Main content based on section
+if uploaded_file is not None:
     try:
-        # Try different encodings
-        encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
-        train_df = None
-
-        for encoding in encodings:
-            try:
-                train_df = pd.read_csv(train_data, encoding=encoding)
-                break
-            except UnicodeDecodeError:
-                continue
-
-        if train_df is None:
-            st.error("Unable to read the CSV file. Please check the file encoding.")
-        else:
-            if test_data is not None:
-                for encoding in encodings:
-                    try:
-                        test_df = pd.read_csv(test_data, encoding=encoding)
-                        break
-                    except UnicodeDecodeError:
-                        continue
-            else:
-                test_df = None
-
-            # Show data preview
-            with st.sidebar.expander("📋 Data Preview", expanded=True):
-                st.write("Shape:", train_df.shape)
-                st.write(train_df.head(2))
-
-            columns = train_df.columns.tolist()
-            text_data = st.sidebar.selectbox("📝 Choose the text column:", columns)
-            target = st.sidebar.selectbox("🎯 Choose the target column:", columns)
-
-            # Process data
-            if text_data and target:
-                # Clean text
-                text_cleaner = TextCleaner()
-                train_df['clean_text'] = train_df[text_data].apply(text_cleaner.clean_text)
-                train_df['text_length'] = train_df[text_data].str.len()
-
-                # Handle label encoding
-                label_encoder = LabelEncoder()
-                train_df['target_encoded'] = label_encoder.fit_transform(train_df[target])
-
-                # Save label encoder
-                save_artifacts(label_encoder, "artifacts", "encoder.pkl")
-
+        # Load data with selected encoding
+        df = pd.read_csv(uploaded_file, encoding=encoding)
+
+        st.sidebar.success(f"✅ Data loaded successfully! Shape: {df.shape}")
+
+        # Column selection
+        columns = df.columns.tolist()
+        text_column = st.sidebar.selectbox("📝 Select text column:", columns)
+        target_column = st.sidebar.selectbox("🎯 Select target column:", columns)
+
+        # Data preprocessing
+        df['clean_text'] = df[text_column].apply(clean_text)
+        df['text_length'] = df[text_column].astype(str).apply(len)
+
+        # Process target column
+        label_encoder = LabelEncoder()
+        df['encoded_target'] = label_encoder.fit_transform(df[target_column])
+        save_artifacts(label_encoder, "artifacts", "label_encoder.pkl")
+
     except Exception as e:
-        st.error(f"Error loading data: {str(e)}")
-        train_df = None
+        st.error(f"Error loading data: {str(e)}")
+        st.info("💡 Try selecting a different encoding from the sidebar.")
+        df = None

-# Data Analysis Section
+# Section: Data Analysis
 if section == "📊 Data Analysis":
-    if train_data is not None and 'train_df' in locals() and train_df is not None:
-        st.header("📊 Data Analysis")
-
-        # Get insights
-        insights = get_data_insights(train_df, text_data, target)
-
-        # Display insights in columns
-        col1, col2, col3, col4 = st.columns(4)
+    if uploaded_file is not None and df is not None:
+        st.markdown('<h2 class="section-header">Data Analysis</h2>', unsafe_allow_html=True)
+
+        # Data overview
+        col1, col2, col3 = st.columns(3)

         with col1:
-            st.metric("Total Samples", insights['shape'][0])
-
+            st.metric("📋 Total Records", df.shape[0])
         with col2:
-            st.metric("Features", insights['shape'][1])
-
+            st.metric("📊 Features", df.shape[1])
         with col3:
-            st.metric("Classes", len(insights['class_distribution']))
-
-        with col4:
-            st.metric("Avg Text Length", f"{insights['text_length_stats']['mean']:.1f}")
-
-        st.markdown("---")
-
-        # Data quality section
+            st.metric("🏷️ Classes", df[target_column].nunique())
+
+        # Data preview
+        st.subheader("📖 Data Preview")
+        st.dataframe(df[[text_column, target_column, 'text_length']].head(10))
+
+        # Analysis results
+        analysis = analyze_data(df, text_column, target_column)

         col1, col2 = st.columns(2)

         with col1:
-            st.subheader("📋 Dataset Overview")
-            st.write("**Shape:**", insights['shape'])
-            st.write("**Missing Values:**")
-            missing_df = pd.DataFrame.from_dict(insights['missing_values'], orient='index', columns=['Count'])
-            st.dataframe(missing_df[missing_df['Count'] > 0])
-
-            st.write("**Sample Data:**")
-            st.dataframe(train_df[[text_data, target, 'text_length']].head())
+            st.subheader("📈 Text Statistics")
+            st.write(f"**Average text length:** {analysis['avg_text_length']:.2f}")
+            st.write("**Text length distribution:**")
+            st.write(pd.DataFrame([analysis['text_length_stats']]).T)

         with col2:
-            st.subheader("📊 Class Distribution")
-            class_dist = pd.DataFrame.from_dict(insights['class_distribution'], orient='index', columns=['Count'])
+            st.subheader("🏷️ Class Distribution")
+            class_dist = pd.DataFrame(list(analysis['class_distribution'].items()),
+                                      columns=['Class', 'Count'])
             st.dataframe(class_dist)
-
-            # Plot class distribution
-            fig = px.bar(
-                x=class_dist.index,
-                y=class_dist['Count'],
-                title="Class Distribution",
-                labels={'x': 'Class', 'y': 'Count'}
-            )
-            st.plotly_chart(fig, use_container_width=True)
-
-        st.markdown("---")
-
-        # Text analysis section
-        st.subheader("📝 Text Analysis")
-
-        col1, col2 = st.columns(2)
-
-        with col1:
-            # Text length distribution
-            fig = px.histogram(
-                train_df,
-                x='text_length',
-                title="Text Length Distribution",
-                nbins=30
-            )
-            st.plotly_chart(fig, use_container_width=True)
-
-        with col2:
-            # Text length by class
-            fig = px.box(
-                train_df,
-                x=target,
-                y='text_length',
-                title="Text Length by Class"
-            )
-            st.plotly_chart(fig, use_container_width=True)
-
-        # Word frequency analysis
-        st.subheader("🔤 Most Common Words")
-        all_text = ' '.join(train_df['clean_text'].astype(str))
-        word_freq = Counter(all_text.split())
-        top_words = word_freq.most_common(20)
-
-        if top_words:
-            words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
-            fig = px.bar(
-                words_df,
-                x='Frequency',
-                y='Word',
-                orientation='h',
-                title="Top 20 Most Common Words"
-            )
-            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
-            st.plotly_chart(fig, use_container_width=True)
+
+        # Visualizations
+        st.subheader("📊 Visualizations")
+        try:
+            fig = create_visualizations(df, text_column, target_column)
+            st.pyplot(fig)
+        except Exception as e:
+            st.error(f"Error creating visualizations: {str(e)}")

     else:
-        st.warning("📁 Please upload training data to perform analysis")
+        st.warning("📁 Please upload a dataset to analyze.")

-# Train Model Section
+# Section: Train Model
 elif section == "🤖 Train Model":
-    if train_data is not None and 'train_df' in locals() and train_df is not None:
-        st.header("🤖 Train Machine Learning Model")
-
-        col1, col2 = st.columns(2)
-
-        with col1:
-            st.subheader("⚙️ Model Configuration")
-            model_name = st.selectbox("Choose Model", [
-                "Logistic Regression", "Decision Tree",
-                "Random Forest", "Linear SVC", "SVC",
-                "Multinomial Naive Bayes", "Gaussian Naive Bayes"
-            ])
-
-        with col2:
-            st.subheader("📊 Vectorization Method")
-            vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count Vectorizer"])
-
-        # Model parameters
-        st.subheader("🔧 Parameters")
+    if uploaded_file is not None and df is not None:
+        st.markdown('<h2 class="section-header">Model Training</h2>', unsafe_allow_html=True)

         col1, col2 = st.columns(2)

         with col1:
-            max_features = st.slider("Max Features", 1000, 20000, 10000, step=1000)
-            test_size = st.slider("Test Size", 0.1, 0.4, 0.2, step=0.05)
-
-        with col2:
-            random_state = st.number_input("Random State", 0, 1000, 42)
-            min_df = st.slider("Min Document Frequency", 1, 10, 1)
-
-        # Initialize vectorizer
-        if vectorizer_choice == "TF-IDF":
-            vectorizer = TfidfVectorizer(
-                max_features=max_features,
-                min_df=min_df,
-                stop_words='english'
+            st.subheader("🤖 Select Model")
+            model_name = st.selectbox(
+                "Choose algorithm:",
+                ["Logistic Regression", "Decision Tree", "Random Forest",
+                 "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
             )
-            st.session_state.vectorizer_type = "tfidf"
-        else:
-            vectorizer = CountVectorizer(
-                max_features=max_features,
-                min_df=min_df,
-                stop_words='english'
-            )
-            st.session_state.vectorizer_type = "count"
-
-        # Show data info
-        st.subheader("📋 Training Data Info")
-        col1, col2, col3 = st.columns(3)
-
-        with col1:
-            st.metric("Total Samples", len(train_df))

         with col2:
-            st.metric("Unique Classes", train_df[target].nunique())
-
-        with col3:
-            st.metric("Avg Text Length", f"{train_df['text_length'].mean():.1f}")
+            st.subheader("🔤 Select Vectorizer")
+            vectorizer_choice = st.selectbox(
+                "Choose text vectorizer:",
+                ["TF-IDF Vectorizer", "Count Vectorizer"]
+            )
+
+        # Vectorizer parameters
+        max_features = st.slider("Max features", 1000, 50000, 10000)
+        test_size = st.slider("Test size", 0.1, 0.5, 0.2)

         if st.button("🚀 Start Training", type="primary"):
-            with st.spinner("Training model... This may take a few minutes."):
+            with st.spinner("🔄 Training model..."):
                 try:
-                    # Vectorize text data
-                    X = vectorizer.fit_transform(train_df['clean_text'])
-                    y = train_df['target_encoded']
+                    # Initialize vectorizer
+                    if vectorizer_choice == "TF-IDF Vectorizer":
+                        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
+                        st.session_state.vectorizer_type = "tfidf"
+                    else:
+                        vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
+                        st.session_state.vectorizer_type = "count"
+
+                    # Vectorize text
+                    X = vectorizer.fit_transform(df['clean_text'])
+                    y = df['encoded_target']

                     # Split data
                     X_train, X_test, y_train, y_test = train_test_split(
-                        X, y,
-                        test_size=test_size,
-                        random_state=random_state,
-                        stratify=y
+                        X, y, test_size=test_size, random_state=42, stratify=y
                     )

-                    st.success(f"✅ Data split - Train: {X_train.shape}, Test: {X_test.shape}")
-
                     # Save vectorizer
                     vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
                     save_artifacts(vectorizer, "artifacts", vectorizer_filename)

                     # Train model
-                    model, accuracy, y_pred, model_filename = train_model(
-                        model_name, X_train, X_test, y_train, y_test
-                    )
-
-                    st.success("🎉 Model training completed!")
-
-                    # Display results
-                    col1, col2 = st.columns(2)
-
-                    with col1:
-                        st.metric("🎯 Test Accuracy", f"{accuracy:.4f}")
-
-                        # Classification report
-                        st.subheader("📊 Classification Report")
-                        report = classification_report(
-                            y_test, y_pred,
-                            target_names=label_encoder.classes_,
-                            output_dict=True
-                        )
-                        report_df = pd.DataFrame(report).transpose()
-                        st.dataframe(report_df.round(4))
-
-                    with col2:
-                        # Confusion matrix
-                        st.subheader("🔄 Confusion Matrix")
-                        cm = confusion_matrix(y_test, y_pred)
-                        fig = px.imshow(
-                            cm,
-                            text_auto=True,
-                            aspect="auto",
-                            title="Confusion Matrix",
-                            labels=dict(x="Predicted", y="Actual"),
-                            x=label_encoder.classes_,
-                            y=label_encoder.classes_
-                        )
-                        st.plotly_chart(fig, use_container_width=True)
-
-                    st.info(f"✅ Model saved as: {model_filename}")
-                    st.info("🔮 You can now use the 'Predictions' section to classify new text!")
-
+                    model, accuracy, report = train_model(model_name, X_train, X_test, y_train, y_test)
+
+                    if model is not None:
+                        st.success(f"✅ Model trained successfully!")
+                        st.session_state.trained_models.append(model_name)
+
+                        # Display results
+                        col1, col2 = st.columns(2)
+
+                        with col1:
+                            st.metric("🎯 Accuracy", f"{accuracy:.4f}")
+
+                        with col2:
+                            st.metric("🏷️ Classes", len(report) - 3)  # Exclude avg metrics
+
+                        # Detailed metrics
+                        st.subheader("📊 Detailed Metrics")
+                        metrics_df = pd.DataFrame(report).transpose()
+                        st.dataframe(metrics_df.round(4))

                 except Exception as e:
-                    st.error(f"❌ Error during training: {str(e)}")
-
+                    st.error(f"❌ Training failed: {str(e)}")
     else:
-        st.warning("📁 Please upload training data to train a model")
+        st.warning("📁 Please upload a dataset to train a model.")

-# Predictions Section
+# Section: Predictions
 elif section == "🔮 Predictions":
-    st.header("🔮 Text Classification Predictions")
+    st.markdown('<h2 class="section-header">Make Predictions</h2>', unsafe_allow_html=True)

-    # Check if models exist
+    # Check for trained models
     if os.path.exists("models") and os.listdir("models"):
-        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
+        available_models = [f.replace('_model.pkl', '').replace('_', ' ').title()
+                            for f in os.listdir("models") if f.endswith('.pkl')]

         if available_models:
             # Single prediction
-            st.subheader("📝 Single Text Classification")
+            st.subheader("🔮 Single Text Prediction")

-            col1, col2 = st.columns([2, 1])
+            col1, col2 = st.columns([3, 1])

             with col1:
-                text_input = st.text_area("Enter text to classify:", height=150)
+                text_input = st.text_area(
+                    "Enter text to classify:",
+                    height=100,
+                    placeholder="Type or paste your text here..."
+                )

             with col2:
-                selected_model = st.selectbox("Choose model:", available_models)
-                predict_button = st.button("🔮 Predict", type="primary")
-
-            if predict_button and text_input.strip():
-                with st.spinner("Making prediction..."):
-                    predicted_label, prediction_proba = predict_text(
-                        selected_model,
-                        text_input,
-                        st.session_state.get('vectorizer_type', 'tfidf')
-                    )
-
-                if predicted_label is not None:
-                    st.success("✅ Prediction completed!")
-
-                    # Display results
-                    col1, col2 = st.columns(2)
-
-                    with col1:
-                        st.markdown("### 🎯 Results")
-                        st.markdown(f"**Input Text:** {text_input[:200]}{'...' if len(text_input) > 200 else ''}")
-                        st.markdown(f"**Predicted Class:** `{predicted_label}`")
-
-                    with col2:
-                        # Display probabilities if available
-                        if prediction_proba is not None:
-                            st.markdown("### 📊 Class Probabilities")
-
-                            encoder = load_artifacts("artifacts", "encoder.pkl")
-                            if encoder is not None:
-                                prob_df = pd.DataFrame({
-                                    'Class': encoder.classes_,
-                                    'Probability': prediction_proba
-                                }).sort_values('Probability', ascending=False)
-
-                                fig = px.bar(
-                                    prob_df,
-                                    x='Probability',
-                                    y='Class',
-                                    orientation='h',
-                                    title="Prediction Confidence"
-                                )
-                                fig.update_layout(yaxis={'categoryorder': 'total ascending'})
-                                st.plotly_chart(fig, use_container_width=True)
-
-            elif predict_button:
-                st.warning("⚠️ Please enter some text to classify")
+                selected_model = st.selectbox("Select model:", available_models)
+
+            if st.button("🔍 Predict", type="primary"):
+                if text_input.strip():
+                    with st.spinner("🔄 Making prediction..."):
+                        predicted_label, prediction_proba = predict_text(
+                            selected_model, text_input, st.session_state.get('vectorizer_type', 'tfidf')
+                        )
+
+                    if predicted_label is not None:
+                        st.success("✅ Prediction completed!")
+
+                        # Results
+                        st.markdown("### 📋 Results")
+                        st.info(f"**Predicted Class:** {predicted_label}")
+
+                        # Probabilities
+                        if prediction_proba is not None:
+                            encoder = load_artifacts("artifacts", "label_encoder.pkl")
+                            if encoder is not None:
+                                classes = encoder.classes_
+                                prob_df = pd.DataFrame({
+                                    'Class': classes,
+                                    'Probability': prediction_proba
+                                }).sort_values('Probability', ascending=False)
+
+                                st.markdown("### 📊 Class Probabilities")
+                                st.bar_chart(prob_df.set_index('Class'))
+                else:
+                    st.warning("⚠️ Please enter some text to classify.")

             # Batch predictions
             st.markdown("---")
-            st.subheader("📊 Batch Predictions")
+            st.subheader("📦 Batch Predictions")

-            uploaded_file = st.file_uploader("Upload CSV file with texts to classify", type=['csv'])
+            batch_file = st.file_uploader("Upload CSV for batch prediction", type=['csv'])

-            if uploaded_file is not None:
+            if batch_file is not None:
                 try:
-                    # Try different encodings for batch file
-                    encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
-                    batch_df = None
-
-                    for encoding in encodings:
-                        try:
-                            batch_df = pd.read_csv(uploaded_file, encoding=encoding)
-                            break
-                        except UnicodeDecodeError:
-                            continue
-
-                    if batch_df is not None:
-                        st.write("📋 Uploaded data preview:")
-                        st.dataframe(batch_df.head())
-
-                        col1, col2 = st.columns(2)
-
-                        with col1:
-                            text_column = st.selectbox("Select text column:", batch_df.columns.tolist())
-
-                        with col2:
-                            batch_model = st.selectbox("Choose model:", available_models, key="batch_model")
-
-                        if st.button("🚀 Run Batch Predictions", type="primary"):
-                            with st.spinner("Processing batch predictions..."):
-                                predictions = []
-                                confidences = []
-
-                                progress_bar = st.progress(0)
-                                total_texts = len(batch_df)
-
-                                for i, text in enumerate(batch_df[text_column]):
-                                    pred, proba = predict_text(
-                                        batch_model,
-                                        str(text),
-                                        st.session_state.get('vectorizer_type', 'tfidf')
-                                    )
-                                    predictions.append(pred if pred is not None else "Error")
-
-                                    # Get confidence (max probability)
-                                    if proba is not None:
-                                        confidences.append(max(proba))
-                                    else:
-                                        confidences.append(0.0)
-
-                                    progress_bar.progress((i + 1) / total_texts)
-
-                                batch_df['Predicted_Class'] = predictions
-                                batch_df['Confidence'] = confidences
-
-                                st.success("✅ Batch predictions completed!")
-
-                                # Show results
-                                st.subheader("📊 Results")
-                                result_df = batch_df[[text_column, 'Predicted_Class', 'Confidence']]
-                                st.dataframe(result_df)
-
-                                # Summary statistics
-                                st.subheader("📈 Summary")
-                                col1, col2, col3 = st.columns(3)
-
-                                with col1:
-                                    st.metric("Total Predictions", len(predictions))
-
-                                with col2:
-                                    successful_preds = sum(1 for p in predictions if p != "Error")
-                                    st.metric("Successful", successful_preds)
-
-                                with col3:
-                                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
-                                    st.metric("Avg Confidence", f"{avg_confidence:.3f}")
-
-                                # Class distribution of predictions
-                                pred_counts = pd.Series(predictions).value_counts()
-                                if len(pred_counts) > 0:
-                                    fig = px.pie(
-                                        values=pred_counts.values,
-                                        names=pred_counts.index,
-                                        title="Distribution of Predictions"
-                                    )
-                                    st.plotly_chart(fig, use_container_width=True)
-
-                                # Download results
-                                csv = batch_df.to_csv(index=False)
-                                st.download_button(
-                                    label="📥 Download Results as CSV",
-                                    data=csv,
-                                    file_name="batch_predictions.csv",
-                                    mime="text/csv"
+                    batch_df = pd.read_csv(batch_file, encoding=encoding)
+                    st.write("📖 Preview:")
+                    st.dataframe(batch_df.head())
+
+                    batch_text_col = st.selectbox("Select text column:", batch_df.columns.tolist())
+                    batch_model = st.selectbox("Select model for batch:", available_models, key="batch_model")
+
+                    if st.button("🚀 Run Batch Predictions"):
+                        with st.spinner("🔄 Processing batch predictions..."):
+                            predictions = []
+                            progress_bar = st.progress(0)
+
+                            for i, text in enumerate(batch_df[batch_text_col]):
+                                pred, _ = predict_text(
+                                    batch_model, str(text),
+                                    st.session_state.get('vectorizer_type', 'tfidf')
                                 )
-                    else:
-                        st.error("❌ Unable to read the CSV file. Please check the file encoding.")
-
+                                predictions.append(pred if pred is not None else "Error")
+                                progress_bar.progress((i + 1) / len(batch_df))
+
+                            batch_df['Predicted_Class'] = predictions
+
+                            st.success("✅ Batch predictions completed!")
+                            st.dataframe(batch_df[[batch_text_col, 'Predicted_Class']])
+
+                            # Download option
+                            csv = batch_df.to_csv(index=False)
+                            st.download_button(
+                                "📥 Download Results",
+                                csv,
+                                "batch_predictions.csv",
+                                "text/csv"
+                            )
+
                 except Exception as e:
-                    st.error(f"❌ Error in batch prediction: {str(e)}")
+                    st.error(f"❌ Batch prediction error: {str(e)}")
         else:
-            st.warning("⚠️ No trained models found. Please train a model first.")
+            st.warning("⚠️ No trained models found.")
     else:
-        st.warning("⚠️ No models directory found. Please go to 'Train Model' section to train a model first.")
+        st.warning("⚠️ No models available. Please train a model first.")

 # Footer
 st.markdown("---")
-st.markdown("🚀 Built with Streamlit | 📊 No-Code Text Classification")
+st.markdown("*Built with Streamlit | Text Classification Made Easy*")
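
One behavioural change in the final hunk: the old app silently retried four encodings (utf-8, latin1, cp1252, iso-8859-1) when reading the uploaded CSV, whereas the new app has the user pick a single encoding in the sidebar. The retired fallback pattern, as an editor's sketch (the helper name is illustrative); note that because latin1 maps every possible byte, the loop could never actually reach the entries after it.

import pandas as pd

def read_csv_with_fallback(path, encodings=('utf-8', 'latin1', 'cp1252', 'iso-8859-1')):
    """Try each encoding in turn; return None if all fail."""
    for encoding in encodings:
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError:
            continue
    return None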
 
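The new training UI reports len(report) - 3 as the class count. That works because classification_report(..., output_dict=True) returns one entry per class plus exactly three summary keys: 'accuracy', 'macro avg', and 'weighted avg'. An editor's sketch (not part of the commit):

from sklearn.metrics import classification_report

y_true = [0, 0, 1, 1, 2]
y_pred = [0, 1, 1, 1, 2]

report = classification_report(y_true, y_pred, output_dict=True)
print(sorted(report.keys()))
# ['0', '1', '2', 'accuracy', 'macro avg', 'weighted avg']
print(len(report) - 3)  # 3 classes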
 
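The prediction path also depends on the LabelEncoder pickled to artifacts/label_encoder.pkl during data loading: fit_transform() maps class names to integers for training, and inverse_transform() maps model output back to names, which is what predict_text() does before displaying the result. A round-trip sketch (editor's illustration):

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded = encoder.fit_transform(['spam', 'ham', 'spam', 'ham'])
print(list(encoded))                      # [1, 0, 1, 0]
print(list(encoder.classes_))             # ['ham', 'spam'] (sorted)
print(encoder.inverse_transform([0, 1]))  # ['ham' 'spam']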