Alamgirapi committed on
Commit 23e4994 · verified · 1 Parent(s): 4dcb991

Update app.py

Files changed (1): app.py (+484 −339)

app.py CHANGED
@@ -2,465 +2,610 @@ import streamlit as st
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
 import os
 import pickle
 import io
-import traceback
-import sys
 import base64
-from datetime import datetime
-
-# Import ML libraries with error handling
-try:
-    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
-    from sklearn.preprocessing import LabelEncoder
-    st.success("✅ Sklearn imported successfully")
-except ImportError as e:
-    st.error(f"❌ Sklearn import error: {e}")
-
-# Import custom modules with error handling
-try:
-    from NoCodeTextClassifier.EDA import Informations, Visualizations
-    from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
-    from NoCodeTextClassifier.models import Models
-    st.success("✅ NoCodeTextClassifier imported successfully")
-except ImportError as e:
-    st.error(f"❌ NoCodeTextClassifier import error: {e}")
-    st.info("Please ensure NoCodeTextClassifier package is installed")
-
-# Set page config
-st.set_page_config(page_title="Fixed Text Classification", page_icon="🔧", layout="wide")
-
-# Debug section
-st.sidebar.header("🔍 Debug Information")
-debug_mode = st.sidebar.checkbox("Enable Debug Mode", value=True)
-
-def debug_log(message, level="INFO"):
-    """Debug logging function"""
-    if debug_mode:
-        timestamp = datetime.now().strftime("%H:%M:%S")
-        st.sidebar.write(f"**{timestamp} [{level}]:** {message}")
-
-# Alternative file upload methods
-def alternative_file_upload():
-    """Alternative file upload methods to bypass 403 error"""
-    st.subheader("🔧 Alternative File Upload Methods")
-
-    # Method 1: Text area paste
-    st.markdown("### Method 1: Copy-Paste CSV Content")
-    st.info("Copy your CSV content and paste it in the text area below")
-
-    csv_content = st.text_area(
-        "Paste your CSV content here:",
-        height=200,
-        placeholder="name,age,city\nJohn,25,New York\nJane,30,London"
-    )
-
-    if csv_content and st.button("Load from Text Area", type="primary"):
-        try:
-            df = pd.read_csv(io.StringIO(csv_content))
-            st.success("✅ CSV loaded from text area!")
-            return df, "text_area"
-        except Exception as e:
-            st.error(f"Error parsing CSV: {e}")
-            return None, None
-
-    # Method 2: Base64 upload (for advanced users)
-    st.markdown("### Method 2: Base64 Upload")
-    with st.expander("For Advanced Users - Base64 Upload"):
-        st.info("Convert your CSV to base64 and paste here")
-        st.code("""
-# Python code to convert CSV to base64:
-import base64
-with open('your_file.csv', 'rb') as f:
-    encoded = base64.b64encode(f.read()).decode()
-print(encoded)
-        """)
-
-        base64_content = st.text_area("Paste base64 encoded CSV:", height=100)
-
-        if base64_content and st.button("Load from Base64"):
-            try:
-                decoded = base64.b64decode(base64_content)
-                df = pd.read_csv(io.BytesIO(decoded))
-                st.success("✅ CSV loaded from base64!")
-                return df, "base64"
-            except Exception as e:
-                st.error(f"Error decoding base64: {e}")
-                return None, None
-
-    # Method 3: Sample data
-    st.markdown("### Method 3: Use Sample Data")
-    if st.button("Load Sample Text Classification Data"):
-        # Create sample data
-        sample_data = {
-            'text': [
-                'I love this product, it works great!',
-                'This is terrible, waste of money',
-                'Good quality and fast delivery',
-                'Not satisfied with the purchase',
-                'Excellent service and support',
-                'Poor quality, arrived damaged',
-                'Amazing product, highly recommend',
-                'Disappointed with the results'
-            ],
-            'label': ['positive', 'negative', 'positive', 'negative',
-                      'positive', 'negative', 'positive', 'negative']
-        }
-        df = pd.DataFrame(sample_data)
-        st.success("✅ Sample data loaded!")
-        return df, "sample"
-
-    return None, None

-def safe_file_uploader_with_fallback():
-    """Try normal upload first, then fallback methods"""
-    st.markdown("### 📁 Upload Your CSV File")
-
-    # Try standard uploader first
-    uploaded_file = st.file_uploader(
-        "Choose a CSV file",
-        type=['csv'],
-        help="If upload fails with 403 error, use alternative methods below"
-    )
-
-    if uploaded_file is not None:
         try:
-            debug_log("📁 File uploaded successfully via standard method")
-            df = pd.read_csv(uploaded_file)
-            st.success("✅ File uploaded successfully!")
-            return df, "standard"
         except Exception as e:
-            st.error(f"Error reading uploaded file: {e}")
-            debug_log(f"❌ Standard upload failed: {e}", "ERROR")
-
-    # If standard upload fails or no file uploaded, show alternatives
-    st.markdown("---")
-    st.markdown("### 🔄 Alternative Upload Methods")
-    st.warning("If you're getting a 403 error, try one of these alternative methods:")
-
-    return alternative_file_upload()

-# Utility functions (same as before but with debug)
 def save_artifacts(obj, folder_name, file_name):
-    """Save artifacts with debugging"""
-    debug_log(f"💾 Saving {file_name} to {folder_name}")
     try:
         os.makedirs(folder_name, exist_ok=True)
-        full_path = os.path.join(folder_name, file_name)
-
-        with open(full_path, 'wb') as f:
             pickle.dump(obj, f)
-
-        debug_log(f"✅ Successfully saved {file_name}")
         return True
-
     except Exception as e:
-        debug_log(f"Error saving {file_name}: {str(e)}", "ERROR")
-        st.error(f"Save error: {str(e)}")
         return False

 def load_artifacts(folder_name, file_name):
-    """Load artifacts with debugging"""
-    debug_log(f"📂 Loading {file_name} from {folder_name}")
     try:
-        full_path = os.path.join(folder_name, file_name)
-
-        if not os.path.exists(full_path):
-            debug_log(f"File not found: {full_path}", "ERROR")
-            return None
-
-        with open(full_path, 'rb') as f:
-            obj = pickle.load(f)
-
-        debug_log(f"✅ Successfully loaded {file_name}")
-        return obj
-
     except Exception as e:
-        debug_log(f"Error loading {file_name}: {str(e)}", "ERROR")
         return None

 def load_model(model_name):
-    """Load model with debugging"""
-    debug_log(f"🤖 Loading model: {model_name}")
-    return load_artifacts("models", model_name)

 def predict_text(model_name, text, vectorizer_type="tfidf"):
-    """Make prediction with debugging"""
-    debug_log(f"🔮 Starting prediction with {model_name}")
-
     try:
-        # Load components
         model = load_model(model_name)
         if model is None:
             return None, None

         vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
         vectorizer = load_artifacts("artifacts", vectorizer_file)
         if vectorizer is None:
             return None, None

         encoder = load_artifacts("artifacts", "encoder.pkl")
         if encoder is None:
             return None, None

-        debug_log("🧹 Cleaning text...")
         text_cleaner = TextCleaner()
         clean_text = text_cleaner.clean_text(text)

-        debug_log("🔢 Vectorizing text...")
         text_vector = vectorizer.transform([clean_text])

-        debug_log("🎯 Making prediction...")
         prediction = model.predict(text_vector)
         prediction_proba = None

         if hasattr(model, 'predict_proba'):
             try:
                 prediction_proba = model.predict_proba(text_vector)[0]
             except:
-                debug_log("No prediction probabilities available", "WARNING")

         predicted_label = encoder.inverse_transform(prediction)[0]
-        debug_log(f"✅ Prediction complete: {predicted_label}")

         return predicted_label, prediction_proba

     except Exception as e:
-        debug_log(f"Prediction error: {str(e)}", "ERROR")
-        st.error(f"Prediction error: {str(e)}")
         return None, None

 # Main App
-st.title('🔧 Fixed Text Classification App')
-st.write('Workaround version to bypass 403 upload errors')
-
-# Show environment info in sidebar if debug mode
-if debug_mode:
-    st.sidebar.subheader("🖥️ Environment Info")
-    st.sidebar.write(f"Python version: {sys.version}")
-    st.sidebar.write(f"Streamlit version: {st.__version__}")
-    st.sidebar.write(f"Current directory: {os.getcwd()}")
-
-# Navigation
-section = st.sidebar.radio("Choose Section", [
-    "Upload Data", "Data Analysis", "Train Model", "Predictions"
-])
-
-# Session state
-if 'train_df' not in st.session_state:
-    st.session_state.train_df = None
-if 'upload_method' not in st.session_state:
-    st.session_state.upload_method = None
 if 'vectorizer_type' not in st.session_state:
     st.session_state.vectorizer_type = "tfidf"

-# Upload Data Section
-if section == "Upload Data":
-    st.subheader("📁 Upload Your Dataset")
-
-    df, method = safe_file_uploader_with_fallback()
-
-    if df is not None:
-        st.session_state.train_df = df
-        st.session_state.upload_method = method
-
-        st.write("### 📊 Data Preview")
-        st.dataframe(df.head())
-
-        st.write("### 📈 Basic Info")
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.metric("Rows", df.shape[0])
-        with col2:
-            st.metric("Columns", df.shape[1])
-        with col3:
-            st.metric("Missing Values", df.isnull().sum().sum())
-
-        st.write("### 🏷️ Select Columns")
-        columns = df.columns.tolist()
-
-        col1, col2 = st.columns(2)
-        with col1:
-            text_column = st.selectbox("Select text column:", columns)
-        with col2:
-            target_column = st.selectbox("Select target/label column:", columns)
-
-        if text_column and target_column:
-            st.session_state.text_column = text_column
-            st.session_state.target_column = target_column
-
-            # Show sample data
-            st.write("### 📝 Sample Data")
-            sample_df = df[[text_column, target_column]].head()
-            st.dataframe(sample_df)
-
-            # Show target distribution
-            st.write("### 🎯 Target Distribution")
-            target_counts = df[target_column].value_counts()
-            st.bar_chart(target_counts)
-
-            st.success("✅ Data ready for processing!")
-
-# Data Analysis Section
-elif section == "Data Analysis":
-    if st.session_state.train_df is not None:
-        df = st.session_state.train_df
-        text_col = st.session_state.get('text_column')
-        target_col = st.session_state.get('target_column')
-
-        if text_col and target_col:
-            st.subheader("📊 Data Analysis")
-
-            try:
-                # Process data using custom classes
-                info = Informations(df, text_col, target_col)
-                df['clean_text'] = info.clean_text()
-                df['text_length'] = info.text_length()
-
-                # Update session state
-                st.session_state.train_df = df
-
-                # Show analysis
-                st.write("**Data Shape:**", info.shape())
-                st.write("**Class Distribution:**", info.class_imbalanced())
-                st.write("**Missing Values:**", info.missing_values())
-
-                # Text length analysis
-                st.write("**Text Length Analysis:**")
                 st.write(info.analysis_text_length('text_length'))
-
-                # Visualizations
-                vis = Visualizations(df, text_col, target_col)
-
-                col1, col2 = st.columns(2)
-                with col1:
-                    st.write("**Class Distribution:**")
-                    vis.class_distribution()
-
-                with col2:
-                    st.write("**Text Length Distribution:**")
-                    vis.text_length_distribution()
-
-            except Exception as e:
-                st.error(f"Error in analysis: {e}")
-                debug_log(f"Analysis error: {e}", "ERROR")
-        else:
-            st.warning("Please select text and target columns in the Upload Data section.")
     else:
-        st.warning("Please upload data first.")
-
-# Train Model Section
-elif section == "Train Model":
-    if st.session_state.train_df is not None:
-        df = st.session_state.train_df
-        text_col = st.session_state.get('text_column')
-        target_col = st.session_state.get('target_column')
-
-        if text_col and target_col and 'clean_text' in df.columns:
-            st.subheader("🤖 Train Model")
-
             col1, col2 = st.columns(2)
-
             with col1:
-                model_choice = st.selectbox("Choose Model:", [
-                    "Logistic Regression", "Decision Tree", "Random Forest",
-                    "Linear SVC", "SVC", "Multinomial Naive Bayes"
-                ])
-
             with col2:
-                vectorizer_choice = st.selectbox("Choose Vectorizer:",
-                    ["Tfidf Vectorizer", "Count Vectorizer"])
-
-            if st.button("🚀 Train Model", type="primary"):
-                with st.spinner("Training model..."):
                     try:
-                        # Prepare data
-                        if vectorizer_choice == "Tfidf Vectorizer":
-                            vectorizer = TfidfVectorizer(max_features=10000)
-                            st.session_state.vectorizer_type = "tfidf"
-                        else:
-                            vectorizer = CountVectorizer(max_features=10000)
-                            st.session_state.vectorizer_type = "count"
-
-                        # Label encoding
-                        label_encoder = LabelEncoder()
-                        y = label_encoder.fit_transform(df[target_col])
-                        X = vectorizer.fit_transform(df['clean_text'])
-
                         # Split data
                         X_train, X_test, y_train, y_test = process.split_data(X, y)
-
-                        # Save artifacts
-                        save_artifacts(vectorizer, "artifacts", f"{st.session_state.vectorizer_type}_vectorizer.pkl")
-                        save_artifacts(label_encoder, "artifacts", "encoder.pkl")
-
                         # Train model
                         models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
-
-                        if model_choice == "Logistic Regression":
                             models.LogisticRegression()
-                        elif model_choice == "Decision Tree":
                             models.DecisionTree()
-                        elif model_choice == "Random Forest":
-                            models.RandomForestClassifier()
-                        elif model_choice == "Linear SVC":
                             models.LinearSVC()
-                        elif model_choice == "SVC":
                             models.SVC()
-                        elif model_choice == "Multinomial Naive Bayes":
                             models.MultinomialNB()
-
-                        st.success("🎉 Model trained successfully!")
-
                     except Exception as e:
-                        st.error(f"Training error: {e}")
-                        debug_log(f"Training error: {e}", "ERROR")
-        else:
-            st.warning("Please complete data analysis first to process the text data.")
     else:
-        st.warning("Please upload data first.")

 # Predictions Section
-elif section == "Predictions":
-    st.subheader("🔮 Make Predictions")
-
-    # Check for models
     if os.path.exists("models") and os.listdir("models"):
         available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]

         if available_models:
-            selected_model = st.selectbox("Choose trained model:", available_models)
-
             # Single prediction
-            st.write("### Single Text Prediction")
-            text_input = st.text_area("Enter text to classify:", height=100)
-
-            if st.button("🎯 Predict") and text_input:
-                prediction, probabilities = predict_text(
-                    selected_model,
-                    text_input,
-                    st.session_state.get('vectorizer_type', 'tfidf')
                 )
-
-                if prediction is not None:
-                    st.success(f"**Prediction:** {prediction}")
-
-                    if probabilities is not None:
-                        encoder = load_artifacts("artifacts", "encoder.pkl")
-                        if encoder is not None:
-                            prob_df = pd.DataFrame({
-                                'Class': encoder.classes_,
-                                'Probability': probabilities
-                            }).sort_values('Probability', ascending=False)
-
-                            st.bar_chart(prob_df.set_index('Class'))
         else:
-            st.info("No trained models found. Train a model first.")
     else:
-        st.info("No models directory found. Train a model first.")

-# Show upload method used in sidebar
-if st.session_state.upload_method:
-    st.sidebar.success(f"Data loaded via: {st.session_state.upload_method}")

 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
+from NoCodeTextClassifier.EDA import Informations, Visualizations
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
+from NoCodeTextClassifier.models import Models
 import os
 import pickle
 import io
 import base64
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.preprocessing import LabelEncoder

+# Configure page
+st.set_page_config(page_title="Text Classifier", page_icon="📝", layout="wide")
+
+# Utility functions
+def safe_read_csv(uploaded_file, encoding_options=['utf-8', 'latin1', 'iso-8859-1', 'cp1252']):
+    """Safely read CSV with multiple encoding attempts"""
+    if uploaded_file is None:
+        return None
+
+    for encoding in encoding_options:
         try:
+            # Reset the file pointer before every attempt; otherwise
+            # read() returns empty bytes on the second pass
+            uploaded_file.seek(0)
+
+            # Read the file content as bytes
+            bytes_data = uploaded_file.read()
+
+            # Convert bytes to string with the current encoding
+            string_data = bytes_data.decode(encoding)
+
+            # Use StringIO to create a file-like object
+            df = pd.read_csv(io.StringIO(string_data))
+            st.success(f"File loaded successfully with {encoding} encoding")
+            return df
+
+        except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
+            st.warning(f"Failed to read with {encoding} encoding: {str(e)}")
+            continue
         except Exception as e:
+            st.error(f"Unexpected error with {encoding} encoding: {str(e)}")
+            continue

+    st.error("Failed to read the file with any supported encoding")
+    return None
+
+def create_sample_data():
+    """Create sample data for testing"""
+    sample_data = {
+        'text': [
+            "I love this product, it's amazing!",
+            "This is the worst thing I've ever bought",
+            "Great quality and fast delivery",
+            "Terrible customer service, very disappointed",
+            "Excellent value for money",
+            "Poor quality, broke after one day",
+            "Highly recommend this to everyone",
+            "Waste of money, don't buy this"
+        ],
+        'sentiment': ['positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative']
+    }
+    return pd.DataFrame(sample_data)

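Note: the encoding-fallback pattern in safe_read_csv can be exercised outside Streamlit. A minimal sketch, assuming a plain file on disk ('reviews.csv' is a hypothetical path, not part of this commit):

    import io
    import pandas as pd

    def read_csv_any_encoding(path, encodings=('utf-8', 'latin1', 'iso-8859-1', 'cp1252')):
        with open(path, 'rb') as f:
            raw = f.read()  # read the bytes once, then decode per attempt
        for enc in encodings:
            try:
                return pd.read_csv(io.StringIO(raw.decode(enc)))
            except (UnicodeDecodeError, pd.errors.ParserError):
                continue
        raise ValueError("no supported encoding worked")

    df = read_csv_any_encoding('reviews.csv')  # hypothetical input file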
 def save_artifacts(obj, folder_name, file_name):
+    """Save artifacts like encoders and vectorizers"""
     try:
         os.makedirs(folder_name, exist_ok=True)
+        with open(os.path.join(folder_name, file_name), 'wb') as f:
             pickle.dump(obj, f)
         return True
     except Exception as e:
+        st.error(f"Error saving {file_name}: {str(e)}")
         return False

 def load_artifacts(folder_name, file_name):
+    """Load saved artifacts"""
     try:
+        with open(os.path.join(folder_name, file_name), 'rb') as f:
+            return pickle.load(f)
+    except FileNotFoundError:
+        st.error(f"File {file_name} not found in {folder_name} folder")
+        return None
     except Exception as e:
+        st.error(f"Error loading {file_name}: {str(e)}")
         return None

 def load_model(model_name):
+    """Load trained model"""
+    try:
+        with open(os.path.join('models', model_name), 'rb') as f:
+            return pickle.load(f)
+    except FileNotFoundError:
+        st.error(f"Model {model_name} not found. Please train a model first.")
+        return None
+    except Exception as e:
+        st.error(f"Error loading model: {str(e)}")
+        return None

 def predict_text(model_name, text, vectorizer_type="tfidf"):
+    """Make prediction on new text"""
     try:
+        # Load model
         model = load_model(model_name)
         if model is None:
             return None, None

+        # Load vectorizer
         vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
         vectorizer = load_artifacts("artifacts", vectorizer_file)
         if vectorizer is None:
             return None, None

+        # Load label encoder
         encoder = load_artifacts("artifacts", "encoder.pkl")
         if encoder is None:
             return None, None

+        # Clean and vectorize text
         text_cleaner = TextCleaner()
         clean_text = text_cleaner.clean_text(text)

+        # Transform text using the same vectorizer used during training
         text_vector = vectorizer.transform([clean_text])

+        # Make prediction
         prediction = model.predict(text_vector)
         prediction_proba = None

+        # Get prediction probabilities if available
         if hasattr(model, 'predict_proba'):
             try:
                 prediction_proba = model.predict_proba(text_vector)[0]
             except:
+                pass

+        # Decode prediction
         predicted_label = encoder.inverse_transform(prediction)[0]

         return predicted_label, prediction_proba

     except Exception as e:
+        st.error(f"Error during prediction: {str(e)}")
         return None, None

+def download_sample_csv():
+    """Generate sample CSV for download"""
+    sample_df = create_sample_data()
+    csv = sample_df.to_csv(index=False)
+    b64 = base64.b64encode(csv.encode()).decode()
+    href = f'<a href="data:file/csv;base64,{b64}" download="sample_data.csv">Download Sample CSV</a>'
+    return href
+
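Note: save_artifacts and load_artifacts are a thin wrapper over pickle. A minimal round-trip under the same folder layout (the 'settings.pkl' name is hypothetical):

    import os
    import pickle

    obj = {'vectorizer': 'tfidf', 'max_features': 10000}
    os.makedirs('artifacts', exist_ok=True)
    path = os.path.join('artifacts', 'settings.pkl')
    with open(path, 'wb') as f:
        pickle.dump(obj, f)   # serialize
    with open(path, 'rb') as f:
        assert pickle.load(f) == obj   # deserialize and verify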
 # Main App
+st.title('📝 No Code Text Classification App')
+st.markdown('---')
+st.write('Understand the behavior of your text data and train a model to classify the text data')
+
+# Initialize session state
 if 'vectorizer_type' not in st.session_state:
     st.session_state.vectorizer_type = "tfidf"
+if 'train_df' not in st.session_state:
+    st.session_state.train_df = None

+# Sidebar
+st.sidebar.title("Navigation")
+section = st.sidebar.radio("Choose Section", ["📊 Data Analysis", "🔧 Train Model", "🎯 Predictions"])
+
+# Data Upload Section
+st.sidebar.markdown("---")
+st.sidebar.subheader("📁 Data Upload")
+
+# Option to use sample data
+if st.sidebar.button("Use Sample Data"):
+    st.session_state.train_df = create_sample_data()
+    st.sidebar.success("Sample data loaded!")
+
+# Sample data download
+st.sidebar.markdown("**Download Sample Data:**")
+st.sidebar.markdown(download_sample_csv(), unsafe_allow_html=True)
+
+st.sidebar.markdown("**Or upload your own data:**")
+
+# File upload with better error handling
+train_data = st.sidebar.file_uploader(
+    "Upload training data",
+    type=["csv"],
+    help="Upload a CSV file with text and target columns"
+)
+
+test_data = st.sidebar.file_uploader(
+    "Upload test data (optional)",
+    type=["csv"],
+    help="Optional: Upload separate test data"
+)
+
+# Alternative text input method
+st.sidebar.markdown("**Or paste CSV data:**")
+if st.sidebar.checkbox("Enter data manually"):
+    csv_text = st.sidebar.text_area(
+        "Paste CSV data here:",
+        height=100,
+        placeholder="text,sentiment\n\"Great product!\",positive\n\"Poor quality\",negative"
+    )

+    if csv_text and st.sidebar.button("Load from text"):
+        try:
+            train_df = pd.read_csv(io.StringIO(csv_text))
+            st.session_state.train_df = train_df
+            st.sidebar.success("Data loaded from text!")
+        except Exception as e:
+            st.sidebar.error(f"Error parsing CSV text: {str(e)}")
+
+# Load data
+train_df = None
+test_df = None
+
+# Try to load from uploaded file first
+if train_data is not None:
+    train_df = safe_read_csv(train_data)
+    if train_df is not None:
+        st.session_state.train_df = train_df
+
+# Use session state data if available
+if st.session_state.train_df is not None:
+    train_df = st.session_state.train_df
+
+if test_data is not None:
+    test_df = safe_read_csv(test_data)
+
+# Process data if available
+if train_df is not None:
+    try:
+        st.sidebar.success("✅ Training data loaded successfully!")

+        # Show data info in sidebar
+        st.sidebar.write(f"**Rows:** {len(train_df)}")
+        st.sidebar.write(f"**Columns:** {len(train_df.columns)}")

+        with st.expander("📋 Data Preview", expanded=False):
+            st.write("**Training Data Preview:**")
+            st.dataframe(train_df.head())

+        columns = train_df.columns.tolist()

+        # Column selection with validation
+        if len(columns) >= 2:
+            text_data = st.sidebar.selectbox("Choose the text column:", columns, index=0)
+            # Default to second column for target, or first if same as text
+            target_default = 1 if len(columns) > 1 and columns[1] != text_data else 0
+            target = st.sidebar.selectbox("Choose the target column:", columns, index=target_default)

+            if text_data == target:
+                st.sidebar.error("Text and target columns must be different!")
+                st.stop()
+        else:
+            st.sidebar.error("Data must have at least 2 columns (text and target)")
+            st.stop()
+
+        # Process data
+        try:
+            info = Informations(train_df, text_data, target)
+            train_df['clean_text'] = info.clean_text()
+            train_df['text_length'] = info.text_length()

+            # Handle label encoding
+            label_encoder = LabelEncoder()
+            train_df['target'] = label_encoder.fit_transform(train_df[target])

+            # Save label encoder
+            save_artifacts(label_encoder, "artifacts", "encoder.pkl")

+        except Exception as e:
+            st.error(f"Error processing data: {str(e)}")
+            st.stop()
+
+    except Exception as e:
+        st.error(f"Error loading data: {str(e)}")
+        train_df = None
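Note: the added 'target' column is just the integer encoding of the labels. LabelEncoder assigns codes in alphabetical order of the classes, and the saved encoder.pkl is what maps predictions back to label names later. A small illustration:

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    print(le.fit_transform(['positive', 'negative', 'positive']))  # [1 0 1]
    print(le.inverse_transform([0, 1]))                            # ['negative' 'positive']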
+# Main Content Based on Section
+if section == "📊 Data Analysis":
+    if train_df is not None:
+        try:
+            st.header("📊 Data Analysis & Insights")
+
+            # Create columns for metrics
+            col1, col2, col3, col4 = st.columns(4)
+
+            with col1:
+                st.metric("Total Samples", info.shape()[0])
+            with col2:
+                st.metric("Features", info.shape()[1])
+            with col3:
+                st.metric("Classes", len(train_df[target].unique()))
+            with col4:
+                missing_pct = (info.missing_values().sum() / len(train_df)) * 100
+                st.metric("Missing Data %", f"{missing_pct:.1f}%")
+
+            st.markdown("---")
+
+            # Class distribution
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.subheader("Class Distribution")
+                class_dist = train_df[target].value_counts()
+                st.bar_chart(class_dist)
+
+                # Check for imbalance
+                imbalance_ratio = class_dist.max() / class_dist.min()
+                if imbalance_ratio > 2:
+                    st.warning(f"⚠️ Class imbalance detected (ratio: {imbalance_ratio:.1f}:1)")
+                else:
+                    st.success("✅ Classes are relatively balanced")
+
+            with col2:
+                st.subheader("Text Length Distribution")
+                fig, ax = plt.subplots(figsize=(8, 6))
+                ax.hist(train_df['text_length'], bins=30, alpha=0.7, color='skyblue')
+                ax.set_xlabel('Text Length (characters)')
+                ax.set_ylabel('Frequency')
+                ax.set_title('Distribution of Text Lengths')
+                st.pyplot(fig)
+
+            # Detailed analysis
+            with st.expander("📈 Detailed Analysis", expanded=False):
+                st.write("**Class Imbalance Analysis:**")
+                st.write(info.class_imbalanced())
+
+                st.write("**Missing Values:**")
+                st.write(info.missing_values())
+
+                st.write("**Text Length Statistics:**")
                 st.write(info.analysis_text_length('text_length'))
+
+                # Correlation
+                correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
+                st.write(f"**Correlation between Text Length and Target:** {correlation:.4f}")
+
+                if abs(correlation) > 0.3:
+                    st.info(f"📊 Moderate correlation detected ({correlation:.3f})")
+                elif abs(correlation) > 0.1:
+                    st.info(f"📊 Weak correlation detected ({correlation:.3f})")
+                else:
+                    st.info("📊 No significant correlation between text length and target")
+
+        except Exception as e:
+            st.error(f"Error in data analysis: {str(e)}")
     else:
+        st.warning("📤 Please upload training data or use sample data to get insights")
+
+        # Show instructions
+        st.info("""
+        **To get started:**
+        1. Click "Use Sample Data" in the sidebar, OR
+        2. Upload your own CSV file with text and target columns, OR
+        3. Use the manual text input option in the sidebar
+        """)
+
364
+ # Train Model Section
365
+ elif section == "🔧 Train Model":
366
+ if train_df is not None:
367
+ try:
368
+ st.header("🔧 Train Classification Model")
369
 
370
+ # Model and vectorizer selection
371
  col1, col2 = st.columns(2)
372
+
373
  with col1:
374
+ st.subheader("Choose Model")
375
+ model = st.selectbox("Select Algorithm:", [
376
+ "Logistic Regression", "Decision Tree",
377
+ "Random Forest", "Linear SVC", "SVC",
378
+ "Multinomial Naive Bayes", "Gaussian Naive Bayes"
379
+ ], help="Different algorithms have different strengths")
380
 
381
  with col2:
382
+ st.subheader("Choose Vectorizer")
383
+ vectorizer_choice = st.selectbox("Select Vectorization Method:",
384
+ ["Tfidf Vectorizer", "Count Vectorizer"],
385
+ help="TF-IDF is usually better for text classification")
386
+
387
+ # Initialize vectorizer
388
+ if vectorizer_choice == "Tfidf Vectorizer":
389
+ vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
390
+ st.session_state.vectorizer_type = "tfidf"
391
+ else:
392
+ vectorizer = CountVectorizer(max_features=10000, stop_words='english')
393
+ st.session_state.vectorizer_type = "count"
394
+
395
+ # Show processed data preview
396
+ with st.expander("🔍 Processed Data Preview", expanded=False):
397
+ preview_df = train_df[['clean_text', 'target']].head(10)
398
+ st.dataframe(preview_df)
399
 
400
+ st.markdown("---")
401
+
402
+ # Training section
403
+ if st.button("🚀 Start Training", type="primary"):
404
+ with st.spinner("Training model... This may take a few moments."):
405
  try:
406
+ # Progress bar
407
+ progress_bar = st.progress(0)
408
+ status_text = st.empty()
409
+
410
+ status_text.text("Vectorizing text data...")
411
+ progress_bar.progress(20)
 
412
 
413
+ # Vectorize text data
414
+ X = vectorizer.fit_transform(train_df['clean_text'])
415
+ y = train_df['target']
416
+
417
+ status_text.text("Splitting data...")
418
+ progress_bar.progress(40)
419
 
420
  # Split data
421
  X_train, X_test, y_train, y_test = process.split_data(X, y)
422
 
423
+ status_text.text("Saving vectorizer...")
424
+ progress_bar.progress(50)
425
+
426
+ # Save vectorizer
427
+ vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
428
+ save_artifacts(vectorizer, "artifacts", vectorizer_filename)
429
+
430
+ status_text.text(f"Training {model}...")
431
+ progress_bar.progress(70)
432
 
433
  # Train model
434
  models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
435
 
436
+ if model == "Logistic Regression":
437
  models.LogisticRegression()
438
+ elif model == "Decision Tree":
439
  models.DecisionTree()
440
+ elif model == "Linear SVC":
 
 
441
  models.LinearSVC()
442
+ elif model == "SVC":
443
  models.SVC()
444
+ elif model == "Multinomial Naive Bayes":
445
  models.MultinomialNB()
446
+ elif model == "Random Forest":
447
+ models.RandomForestClassifier()
448
+ elif model == "Gaussian Naive Bayes":
449
+ models.GaussianNB()
450
+
451
+ progress_bar.progress(100)
452
+ status_text.text("Training completed!")
453
 
454
+ st.success("🎉 Model training completed successfully!")
455
+ st.balloons()
456
+
457
+ # Show training info
458
+ st.info(f"""
459
+ **Training Summary:**
460
+ - Model: {model}
461
+ - Vectorizer: {vectorizer_choice}
462
+ - Training samples: {X_train.shape[0]}
463
+ - Test samples: {X_test.shape[0]}
464
+ - Features: {X_train.shape[1]}
465
+ """)
466
 
467
  except Exception as e:
468
+ st.error(f"Training failed: {str(e)}")
469
+
470
+ except Exception as e:
471
+ st.error(f"Error in model training setup: {str(e)}")
472
  else:
473
+ st.warning("📤 Please upload training data to train a model")
474
 
475
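Note: process.split_data comes from NoCodeTextClassifier and its internals are not shown in this diff; under the usual assumption that it wraps a held-out split, an equivalent sketch would be:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import train_test_split

    texts = ["good product", "bad service", "great value", "poor quality"]
    labels = [1, 0, 1, 0]
    X = TfidfVectorizer().fit_transform(texts)   # sparse document-term matrix
    # test_size and random_state are guesses, not necessarily what split_data uses
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=0)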
 # Predictions Section
+elif section == "🎯 Predictions":
+    st.header("🎯 Make Predictions")
+
+    # Check if models exist
     if os.path.exists("models") and os.listdir("models"):
         available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]

         if available_models:
             # Single prediction
+            st.subheader("Single Text Prediction")
+
+            col1, col2 = st.columns([3, 1])
+
+            with col1:
+                text_input = st.text_area(
+                    "Enter text to classify:",
+                    height=100,
+                    placeholder="Type or paste your text here..."
                 )
+
+            with col2:
+                selected_model = st.selectbox("Choose model:", available_models)
+                predict_btn = st.button("🎯 Predict", type="primary")
+
+            if predict_btn and text_input.strip():
+                with st.spinner("Making prediction..."):
+                    predicted_label, prediction_proba = predict_text(
+                        selected_model,
+                        text_input,
+                        st.session_state.get('vectorizer_type', 'tfidf')
+                    )
+
+                if predicted_label is not None:
+                    st.success("Prediction completed!")
+
+                    # Results in columns
+                    col1, col2 = st.columns(2)
+
+                    with col1:
+                        st.markdown("### 📝 Input Text")
+                        st.text_area("", value=text_input, height=100, disabled=True)
+
+                    with col2:
+                        st.markdown("### 🎯 Prediction Result")
+                        st.markdown(f"**Predicted Class:** `{predicted_label}`")
+
+                    # Show probabilities if available
+                    if prediction_proba is not None:
+                        encoder = load_artifacts("artifacts", "encoder.pkl")
+                        if encoder is not None:
+                            classes = encoder.classes_
+                            prob_df = pd.DataFrame({
+                                'Class': classes,
+                                'Probability': prediction_proba
+                            }).sort_values('Probability', ascending=False)
+
+                            st.markdown("**Confidence Scores:**")
+
+                            # Show as progress bars
+                            for _, row in prob_df.iterrows():
+                                st.write(f"{row['Class']}: {row['Probability']:.3f}")
+                                st.progress(row['Probability'])
+
+            elif predict_btn and not text_input.strip():
+                st.warning("Please enter some text to classify")
+
+            st.markdown("---")
+
+            # Batch prediction
+            st.subheader("Batch Predictions")
+
+            uploaded_file = st.file_uploader(
+                "Upload CSV file for batch predictions",
+                type=['csv'],
+                help="Upload a CSV with a text column to classify multiple texts at once"
+            )
+
+            if uploaded_file is not None:
+                batch_df = safe_read_csv(uploaded_file)
+
+                if batch_df is not None:
+                    col1, col2 = st.columns(2)
+
+                    with col1:
+                        text_column = st.selectbox("Select text column:", batch_df.columns.tolist())
+                    with col2:
+                        batch_model = st.selectbox("Choose model:", available_models, key="batch_model")
+
+                    st.write("**Data Preview:**")
+                    st.dataframe(batch_df.head())
+
+                    if st.button("🚀 Run Batch Predictions"):
+                        with st.spinner("Processing batch predictions..."):
+                            predictions = []
+
+                            # Progress tracking
+                            progress_bar = st.progress(0)
+                            total_texts = len(batch_df)
+
+                            for i, text in enumerate(batch_df[text_column]):
+                                pred, _ = predict_text(
+                                    batch_model,
+                                    str(text),
+                                    st.session_state.get('vectorizer_type', 'tfidf')
+                                )
+                                predictions.append(pred if pred is not None else "Error")
+                                progress_bar.progress((i + 1) / total_texts)
+
+                            batch_df['Predicted_Class'] = predictions
+
+                            st.success("✅ Batch predictions completed!")
+
+                            # Results
+                            st.write("**Results:**")
+                            st.dataframe(batch_df[[text_column, 'Predicted_Class']])
+
+                            # Download button
+                            csv = batch_df.to_csv(index=False)
+                            st.download_button(
+                                label="⬇️ Download Results",
+                                data=csv,
+                                file_name="batch_predictions.csv",
+                                mime="text/csv"
+                            )
+
+                            # Show prediction distribution
+                            pred_dist = batch_df['Predicted_Class'].value_counts()
+                            st.bar_chart(pred_dist)
         else:
+            st.warning("No trained models found.")
     else:
+        st.warning("🔧 No models available. Please train a model first in the 'Train Model' section.")

+# Footer
+st.markdown("---")
+st.markdown("*Built with Streamlit No-Code Text Classification*")
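Note: the batch loop above calls predict_text once per row, which reloads the model, vectorizer, and encoder from disk on every iteration. A sketch of a load-once variant (the model filename, the 'text' column, and 'batch.csv' are hypothetical):

    import pickle
    import pandas as pd

    def load_pickle(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    model = load_pickle('models/logistic_regression.pkl')       # hypothetical filename
    vectorizer = load_pickle('artifacts/tfidf_vectorizer.pkl')  # saved by the Train Model section
    encoder = load_pickle('artifacts/encoder.pkl')

    batch_df = pd.read_csv('batch.csv')                         # hypothetical input
    X = vectorizer.transform(batch_df['text'].astype(str))      # vectorize the whole column at once
    batch_df['Predicted_Class'] = encoder.inverse_transform(model.predict(X))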