Alamgirapi committed on
Commit 9773b59 · verified · 1 Parent(s): 0a50c6f

Update app.py

Files changed (1)
  1. app.py +249 -434

app.py CHANGED
@@ -2,65 +2,20 @@ import streamlit as st
  import pandas as pd
  import matplotlib.pyplot as plt
  import numpy as np
- import seaborn as sns
+ from NoCodeTextClassifier.EDA import Informations, Visualizations
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
- from sklearn.model_selection import train_test_split
- from sklearn.linear_model import LogisticRegression
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.svm import LinearSVC, SVC
- from sklearn.naive_bayes import MultinomialNB, GaussianNB
- from sklearn.preprocessing import LabelEncoder
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+ from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
+ from NoCodeTextClassifier.models import Models
  import os
  import pickle
- import re
- import string
- from collections import Counter
-
- # Set page config
- st.set_page_config(page_title="Text Classification App", page_icon="📊", layout="wide")
-
- # Custom CSS for better styling
- st.markdown("""
- <style>
-     .main-header {
-         font-size: 2.5rem;
-         color: #1f77b4;
-         text-align: center;
-         margin-bottom: 2rem;
-     }
-     .section-header {
-         font-size: 1.8rem;
-         color: #ff7f0e;
-         border-bottom: 2px solid #ff7f0e;
-         padding-bottom: 0.5rem;
-     }
- </style>
- """, unsafe_allow_html=True)
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

  # Utility functions
- def clean_text(text):
-     """Clean text data"""
-     if pd.isna(text):
-         return ""
-
-     text = str(text).lower()
-     text = re.sub(r'[^a-zA-Z\s]', '', text)
-     text = re.sub(r'\s+', ' ', text)
-     text = text.strip()
-     return text
-
  def save_artifacts(obj, folder_name, file_name):
      """Save artifacts like encoders and vectorizers"""
-     try:
-         os.makedirs(folder_name, exist_ok=True)
-         with open(os.path.join(folder_name, file_name), 'wb') as f:
-             pickle.dump(obj, f)
-         return True
-     except Exception as e:
-         st.error(f"Error saving {file_name}: {str(e)}")
-         return False
+     os.makedirs(folder_name, exist_ok=True)
+     with open(os.path.join(folder_name, file_name), 'wb') as f:
+         pickle.dump(obj, f)

  def load_artifacts(folder_name, file_name):
      """Load saved artifacts"""
@@ -70,136 +25,41 @@ def load_artifacts(folder_name, file_name):
      except FileNotFoundError:
          st.error(f"File {file_name} not found in {folder_name} folder")
          return None
-     except Exception as e:
-         st.error(f"Error loading {file_name}: {str(e)}")
-         return None
-
- def analyze_data(df, text_col, target_col):
-     """Perform data analysis"""
-     analysis = {}
-
-     # Basic info
-     analysis['shape'] = df.shape
-     analysis['columns'] = df.columns.tolist()
-     analysis['missing_values'] = df.isnull().sum().to_dict()
-
-     # Text analysis
-     df['text_length'] = df[text_col].astype(str).apply(len)
-     analysis['avg_text_length'] = df['text_length'].mean()
-     analysis['text_length_stats'] = df['text_length'].describe().to_dict()
-
-     # Target analysis
-     analysis['class_distribution'] = df[target_col].value_counts().to_dict()
-     analysis['num_classes'] = df[target_col].nunique()
-
-     return analysis
-
- def create_visualizations(df, text_col, target_col):
-     """Create visualizations"""
-     fig, axes = plt.subplots(2, 2, figsize=(15, 10))
-
-     # Class distribution
-     class_counts = df[target_col].value_counts()
-     axes[0, 0].bar(class_counts.index, class_counts.values)
-     axes[0, 0].set_title('Class Distribution')
-     axes[0, 0].set_xlabel('Classes')
-     axes[0, 0].set_ylabel('Count')
-     plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
-
-     # Text length distribution
-     axes[0, 1].hist(df['text_length'], bins=30, alpha=0.7)
-     axes[0, 1].set_title('Text Length Distribution')
-     axes[0, 1].set_xlabel('Text Length')
-     axes[0, 1].set_ylabel('Frequency')
-
-     # Box plot of text length by class
-     df.boxplot(column='text_length', by=target_col, ax=axes[1, 0])
-     axes[1, 0].set_title('Text Length by Class')
-     axes[1, 0].set_xlabel('Class')
-     axes[1, 0].set_ylabel('Text Length')
-
-     # Correlation plot (if applicable)
-     if df[target_col].dtype in ['int64', 'float64'] or len(df[target_col].unique()) < 10:
-         correlation = df[['text_length', target_col]].corr()
-         sns.heatmap(correlation, annot=True, ax=axes[1, 1], cmap='coolwarm')
-         axes[1, 1].set_title('Correlation Matrix')
-     else:
-         axes[1, 1].text(0.5, 0.5, 'Correlation not applicable\nfor categorical target',
-                         ha='center', va='center', transform=axes[1, 1].transAxes)
-         axes[1, 1].set_title('Correlation Analysis')
-
-     plt.tight_layout()
-     return fig

- def train_model(model_name, X_train, X_test, y_train, y_test):
-     """Train selected model"""
-     models_dict = {
-         "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
-         "Decision Tree": DecisionTreeClassifier(random_state=42),
-         "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
-         "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
-         "SVC": SVC(random_state=42, probability=True),
-         "Multinomial Naive Bayes": MultinomialNB(),
-         "Gaussian Naive Bayes": GaussianNB()
-     }
-
-     if model_name not in models_dict:
-         return None, None, None
-
-     model = models_dict[model_name]
-
-     # Special handling for Gaussian NB (needs dense array)
-     if model_name == "Gaussian Naive Bayes":
-         X_train_model = X_train.toarray()
-         X_test_model = X_test.toarray()
-     else:
-         X_train_model = X_train
-         X_test_model = X_test
-
-     # Train model
-     model.fit(X_train_model, y_train)
-
-     # Make predictions
-     y_pred = model.predict(X_test_model)
-
-     # Calculate metrics
-     accuracy = accuracy_score(y_test, y_pred)
-     report = classification_report(y_test, y_pred, output_dict=True)
-
-     # Save model
-     os.makedirs("models", exist_ok=True)
-     model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
-     save_artifacts(model, "models", model_filename)
-
-     return model, accuracy, report
+ def load_model(model_name):
+     """Load trained model"""
+     try:
+         with open(os.path.join('models', model_name), 'rb') as f:
+             return pickle.load(f)
+     except FileNotFoundError:
+         st.error(f"Model {model_name} not found. Please train a model first.")
+         return None

  def predict_text(model_name, text, vectorizer_type="tfidf"):
      """Make prediction on new text"""
      try:
          # Load model
-         model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
-         model = load_artifacts("models", model_filename)
+         model = load_model(model_name)
          if model is None:
              return None, None

          # Load vectorizer
-         vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
-         vectorizer = load_artifacts("artifacts", vectorizer_filename)
+         vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
+         vectorizer = load_artifacts("artifacts", vectorizer_file)
          if vectorizer is None:
              return None, None

          # Load label encoder
-         encoder = load_artifacts("artifacts", "label_encoder.pkl")
+         encoder = load_artifacts("artifacts", "encoder.pkl")
          if encoder is None:
              return None, None

          # Clean and vectorize text
-         clean_text_input = clean_text(text)
-         text_vector = vectorizer.transform([clean_text_input])
+         text_cleaner = TextCleaner()
+         clean_text = text_cleaner.clean_text(text)

-         # Special handling for Gaussian NB
-         if "gaussian" in model_name.lower():
-             text_vector = text_vector.toarray()
+         # Transform text using the same vectorizer used during training
+         text_vector = vectorizer.transform([clean_text])

          # Make prediction
          prediction = model.predict(text_vector)
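As rewritten, `predict_text` takes a pickle filename from the `models/` folder (the same strings `os.listdir("models")` returns in the UI) rather than a display name, and delegates cleaning to the package's `TextCleaner`. A hypothetical call, assuming a model was already trained and pickled (the filename and input text are made up for illustration):

    label, proba = predict_text(
        "logistic_regression.pkl",                 # assumption: whatever filename the training step wrote
        "Congratulations, you won a free prize!",  # illustrative input
        vectorizer_type="tfidf",
    )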
@@ -208,12 +68,9 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
          # Get prediction probabilities if available
          if hasattr(model, 'predict_proba'):
              try:
-                 if "gaussian" in model_name.lower():
-                     prediction_proba = model.predict_proba(text_vector)[0]
-                 else:
-                     prediction_proba = model.predict_proba(text_vector)[0]
-             except Exception as e:
-                 st.warning(f"Could not get prediction probabilities: {str(e)}")
+                 prediction_proba = model.predict_proba(text_vector)[0]
+             except:
+                 pass

          # Decode prediction
          predicted_label = encoder.inverse_transform(prediction)[0]
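The `hasattr` guard matters because not every estimator offered by the app exposes `predict_proba`; this is standard scikit-learn behavior, sketched here for reference:

    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC

    hasattr(LogisticRegression(), "predict_proba")  # True  -> probabilities shown in the UI
    hasattr(LinearSVC(), "predict_proba")           # False -> prediction_proba stays unset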
@@ -224,298 +81,256 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
          st.error(f"Error during prediction: {str(e)}")
          return None, None

- # Main App
- st.markdown('<h1 class="main-header">📊 No Code Text Classification App</h1>', unsafe_allow_html=True)
- st.markdown("### Analyze your text data and train machine learning models without coding!")
-
- # Initialize session state
- if 'vectorizer_type' not in st.session_state:
-     st.session_state.vectorizer_type = "tfidf"
- if 'trained_models' not in st.session_state:
-     st.session_state.trained_models = []
+ # Streamlit App
+ st.title('No Code Text Classification App')
+ st.write('Understand the behavior of your text data and train a model to classify the text data')

  # Sidebar
- st.sidebar.markdown("## 📁 Upload Your Dataset")
+ section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])

- # File upload with better error handling
- try:
-     uploaded_file = st.sidebar.file_uploader(
-         "Choose a CSV file",
-         type="csv",
-         help="Upload your training dataset (CSV format)"
-     )
-
-     # Encoding selection
-     encoding = st.sidebar.selectbox(
-         "Select file encoding",
-         ["utf-8", "latin1", "iso-8859-1", "cp1252"],
-         help="Try different encodings if you get reading errors"
-     )
-
- except Exception as e:
-     st.sidebar.error(f"File upload error: {str(e)}")
-     uploaded_file = None
+ # Upload Data
+ st.sidebar.subheader("Upload Your Dataset")
+ train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
+ test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])

- # Navigation
- section = st.sidebar.radio(
-     "Choose Section",
-     ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"],
-     help="Navigate through different sections of the app"
- )
+ # Global variables to store data and settings
+ if 'vectorizer_type' not in st.session_state:
+     st.session_state.vectorizer_type = "tfidf"

- # Main content based on section
- if uploaded_file is not None:
+ if train_data is not None:
      try:
-         # Load data with selected encoding
-         df = pd.read_csv(uploaded_file, encoding=encoding)
-
-         st.sidebar.success(f"✅ Data loaded successfully! Shape: {df.shape}")
+         train_df = pd.read_csv(train_data, encoding='latin1')

-         # Column selection
-         columns = df.columns.tolist()
-         text_column = st.sidebar.selectbox("📝 Select text column:", columns)
-         target_column = st.sidebar.selectbox("🎯 Select target column:", columns)
+         if test_data is not None:
+             test_df = pd.read_csv(test_data, encoding='latin1')
+         else:
+             test_df = None
+
+         st.write("Training Data Preview:")
+         st.write(train_df.head(3))

-         # Data preprocessing
-         df['clean_text'] = df[text_column].apply(clean_text)
-         df['text_length'] = df[text_column].astype(str).apply(len)
+         columns = train_df.columns.tolist()
+         text_data = st.sidebar.selectbox("Choose the text column:", columns)
+         target = st.sidebar.selectbox("Choose the target column:", columns)
+
+         # Process data
+         info = Informations(train_df, text_data, target)
+         train_df['clean_text'] = info.clean_text()
+         train_df['text_length'] = info.text_length()

-         # Process target column
+         # Handle label encoding manually if the class doesn't store encoder
+         from sklearn.preprocessing import LabelEncoder
          label_encoder = LabelEncoder()
-         df['encoded_target'] = label_encoder.fit_transform(df[target_column])
-         save_artifacts(label_encoder, "artifacts", "label_encoder.pkl")
+         train_df['target'] = label_encoder.fit_transform(train_df[target])
+
+         # Save label encoder for later use
+         os.makedirs("artifacts", exist_ok=True)
+         save_artifacts(label_encoder, "artifacts", "encoder.pkl")

      except Exception as e:
-         st.error(f"Error loading data: {str(e)}")
-         st.info("💡 Try selecting a different encoding from the sidebar.")
-         df = None
+         st.error(f"Error loading data: {str(e)}")
+         train_df = None
+         info = None

- # Section: Data Analysis
- if section == "📊 Data Analysis":
-     if uploaded_file is not None and df is not None:
-         st.markdown('<h2 class="section-header">Data Analysis</h2>', unsafe_allow_html=True)
-
-         # Data overview
-         col1, col2, col3 = st.columns(3)
-
-         with col1:
-             st.metric("📋 Total Records", df.shape[0])
-         with col2:
-             st.metric("📊 Features", df.shape[1])
-         with col3:
-             st.metric("🏷️ Classes", df[target_column].nunique())
-
-         # Data preview
-         st.subheader("📖 Data Preview")
-         st.dataframe(df[[text_column, target_column, 'text_length']].head(10))
-
-         # Analysis results
-         analysis = analyze_data(df, text_column, target_column)
-
-         col1, col2 = st.columns(2)
-
-         with col1:
-             st.subheader("📈 Text Statistics")
-             st.write(f"**Average text length:** {analysis['avg_text_length']:.2f}")
-             st.write("**Text length distribution:**")
-             st.write(pd.DataFrame([analysis['text_length_stats']]).T)
-
-         with col2:
-             st.subheader("🏷️ Class Distribution")
-             class_dist = pd.DataFrame(list(analysis['class_distribution'].items()),
-                                       columns=['Class', 'Count'])
-             st.dataframe(class_dist)
-
-         # Visualizations
-         st.subheader("📊 Visualizations")
+ # Data Analysis Section
+ if section == "Data Analysis":
+     if train_data is not None and train_df is not None:
          try:
-             fig = create_visualizations(df, text_column, target_column)
-             st.pyplot(fig)
+             st.subheader("Get Insights from the Data")
+
+             st.write("Data Shape:", info.shape())
+             st.write("Class Imbalance:", info.class_imbalanced())
+             st.write("Missing Values:", info.missing_values())
+
+             st.write("Processed Data Preview:")
+             st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
+
+             st.markdown("**Text Length Analysis**")
+             st.write(info.analysis_text_length('text_length'))
+
+             # Calculate correlation manually since we handled encoding separately
+             correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
+             st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
+
+             st.subheader("Visualizations")
+             vis = Visualizations(train_df, text_data, target)
+             vis.class_distribution()
+             vis.text_length_distribution()
+
          except Exception as e:
-             st.error(f"Error creating visualizations: {str(e)}")
-
+             st.error(f"Error in data analysis: {str(e)}")
      else:
-         st.warning("📁 Please upload a dataset to analyze.")
+         st.warning("Please upload training data to get insights")

- # Section: Train Model
- elif section == "🤖 Train Model":
-     if uploaded_file is not None and df is not None:
-         st.markdown('<h2 class="section-header">Model Training</h2>', unsafe_allow_html=True)
-
-         col1, col2 = st.columns(2)
-
-         with col1:
-             st.subheader("🤖 Select Model")
-             model_name = st.selectbox(
-                 "Choose algorithm:",
-                 ["Logistic Regression", "Decision Tree", "Random Forest",
-                  "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
-             )
-
-         with col2:
-             st.subheader("🔤 Select Vectorizer")
-             vectorizer_choice = st.selectbox(
-                 "Choose text vectorizer:",
-                 ["TF-IDF Vectorizer", "Count Vectorizer"]
-             )
-
-         # Vectorizer parameters
-         max_features = st.slider("Max features", 1000, 50000, 10000)
-         test_size = st.slider("Test size", 0.1, 0.5, 0.2)
-
-         if st.button("🚀 Start Training", type="primary"):
-             with st.spinner("🔄 Training model..."):
-                 try:
-                     # Initialize vectorizer
-                     if vectorizer_choice == "TF-IDF Vectorizer":
-                         vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
-                         st.session_state.vectorizer_type = "tfidf"
-                     else:
-                         vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
-                         st.session_state.vectorizer_type = "count"
-
-                     # Vectorize text
-                     X = vectorizer.fit_transform(df['clean_text'])
-                     y = df['encoded_target']
-
-                     # Split data
-                     X_train, X_test, y_train, y_test = train_test_split(
-                         X, y, test_size=test_size, random_state=42, stratify=y
-                     )
-
-                     # Save vectorizer
-                     vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
-                     save_artifacts(vectorizer, "artifacts", vectorizer_filename)
-
-                     # Train model
-                     model, accuracy, report = train_model(model_name, X_train, X_test, y_train, y_test)
+ # Train Model Section
+ elif section == "Train Model":
+     if train_data is not None and train_df is not None:
+         try:
+             st.subheader("Train a Model")
+
+             # Create two columns for model selection
+             col1, col2 = st.columns(2)
+
+             with col1:
+                 model = st.radio("Choose the Model", [
+                     "Logistic Regression", "Decision Tree",
+                     "Random Forest", "Linear SVC", "SVC",
+                     "Multinomial Naive Bayes", "Gaussian Naive Bayes"
+                 ])
+
+             with col2:
+                 vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
+
+             # Initialize vectorizer
+             if vectorizer_choice == "Tfidf Vectorizer":
+                 vectorizer = TfidfVectorizer(max_features=10000)
+                 st.session_state.vectorizer_type = "tfidf"
+             else:
+                 vectorizer = CountVectorizer(max_features=10000)
+                 st.session_state.vectorizer_type = "count"
+
+             st.write("Training Data Preview:")
+             st.write(train_df[['clean_text', 'target']].head(3))
+
+             # Vectorize text data
+             X = vectorizer.fit_transform(train_df['clean_text'])
+             y = train_df['target']
+
+             # Split data
+             X_train, X_test, y_train, y_test = process.split_data(X, y)
+             st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
+
+             # Save vectorizer for later use
+             vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
+             save_artifacts(vectorizer, "artifacts", vectorizer_filename)
+
+             if st.button("Start Training"):
+                 with st.spinner("Training model..."):
+                     models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

-                     if model is not None:
-                         st.success(f"✅ Model trained successfully!")
-                         st.session_state.trained_models.append(model_name)
-
-                         # Display results
-                         col1, col2 = st.columns(2)
-
-                         with col1:
-                             st.metric("🎯 Accuracy", f"{accuracy:.4f}")
-
-                         with col2:
-                             st.metric("🏷️ Classes", len(report) - 3)  # Exclude avg metrics
-
-                         # Detailed metrics
-                         st.subheader("📊 Detailed Metrics")
-                         metrics_df = pd.DataFrame(report).transpose()
-                         st.dataframe(metrics_df.round(4))
-
-                 except Exception as e:
-                     st.error(f"❌ Training failed: {str(e)}")
+                     # Train selected model
+                     if model == "Logistic Regression":
+                         models.LogisticRegression()
+                     elif model == "Decision Tree":
+                         models.DecisionTree()
+                     elif model == "Linear SVC":
+                         models.LinearSVC()
+                     elif model == "SVC":
+                         models.SVC()
+                     elif model == "Multinomial Naive Bayes":
+                         models.MultinomialNB()
+                     elif model == "Random Forest":
+                         models.RandomForestClassifier()
+                     elif model == "Gaussian Naive Bayes":
+                         models.GaussianNB()
+
+                     st.success("Model training completed!")
+                     st.info("You can now use the 'Predictions' section to classify new text.")
+
+         except Exception as e:
+             st.error(f"Error in model training: {str(e)}")
      else:
-         st.warning("📁 Please upload a dataset to train a model.")
+         st.warning("Please upload training data to train a model")

- # Section: Predictions
- elif section == "🔮 Predictions":
-     st.markdown('<h2 class="section-header">Make Predictions</h2>', unsafe_allow_html=True)
+ # Predictions Section
+ elif section == "Predictions":
+     st.subheader("Perform Predictions on New Text")

- # Check for trained models
+     # Check if models exist
      if os.path.exists("models") and os.listdir("models"):
-         available_models = [f.replace('_model.pkl', '').replace('_', ' ').title()
-                             for f in os.listdir("models") if f.endswith('.pkl')]
+         # Text input for prediction
+         text_input = st.text_area("Enter the text to classify:", height=100)
+
+         # Model selection
+         available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]

          if available_models:
-             # Single prediction
-             st.subheader("🔮 Single Text Prediction")
+             selected_model = st.selectbox("Choose the trained model:", available_models)

-             col1, col2 = st.columns([3, 1])
-
-             with col1:
-                 text_input = st.text_area(
-                     "Enter text to classify:",
-                     height=100,
-                     placeholder="Type or paste your text here..."
-                 )
-
-             with col2:
-                 selected_model = st.selectbox("Select model:", available_models)
-
-             if st.button("🔍 Predict", type="primary"):
-                 if text_input.strip():
-                     with st.spinner("🔄 Making prediction..."):
-                         predicted_label, prediction_proba = predict_text(
-                             selected_model, text_input, st.session_state.get('vectorizer_type', 'tfidf')
-                         )
+             # Prediction button
+             if st.button("Predict", key="single_predict"):
+                 if text_input.strip():
+                     with st.spinner("Making prediction..."):
+                         predicted_label, prediction_proba = predict_text(
+                             selected_model,
+                             text_input,
+                             st.session_state.get('vectorizer_type', 'tfidf')
+                         )
+
+                         if predicted_label is not None:
+                             st.success("Prediction completed!")

-                         if predicted_label is not None:
-                             st.success(" Prediction completed!")
-
-                             # Results
-                             st.markdown("### 📋 Results")
-                             st.info(f"**Predicted Class:** {predicted_label}")
+                             # Display results
+                             st.markdown("### Prediction Results")
+                             st.markdown(f"**Input Text:** {text_input}")
+                             st.markdown(f"**Predicted Class:** {predicted_label}")
+
+                             # Display probabilities if available
+                             if prediction_proba is not None:
+                                 st.markdown("**Class Probabilities:**")

-                             # Probabilities
-                             if prediction_proba is not None:
-                                 encoder = load_artifacts("artifacts", "label_encoder.pkl")
-                                 if encoder is not None:
-                                     classes = encoder.classes_
-                                     prob_df = pd.DataFrame({
-                                         'Class': classes,
-                                         'Probability': prediction_proba
-                                     }).sort_values('Probability', ascending=False)
-
-                                     st.markdown("### 📊 Class Probabilities")
-                                     st.bar_chart(prob_df.set_index('Class'))
-                 else:
-                     st.warning("⚠️ Please enter some text to classify.")
-
-             # Batch predictions
-             st.markdown("---")
-             st.subheader("📦 Batch Predictions")
+                                 # Load encoder to get class names
+                                 encoder = load_artifacts("artifacts", "encoder.pkl")
+                                 if encoder is not None:
+                                     classes = encoder.classes_
+                                     prob_df = pd.DataFrame({
+                                         'Class': classes,
+                                         'Probability': prediction_proba
+                                     }).sort_values('Probability', ascending=False)
+
+                                     st.bar_chart(prob_df.set_index('Class'))
+                                     st.dataframe(prob_df)
+                 else:
+                     st.warning("Please enter some text to classify")
+         else:
+             st.warning("No trained models found. Please train a model first.")
+     else:
+         st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")
+
+     # Option to classify multiple texts
+     st.markdown("---")
+     st.subheader("Batch Predictions")

-             batch_file = st.file_uploader("Upload CSV for batch prediction", type=['csv'])
+     uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])

-             if batch_file is not None:
-                 try:
-                     batch_df = pd.read_csv(batch_file, encoding=encoding)
-                     st.write("📖 Preview:")
-                     st.dataframe(batch_df.head())
-
-                     batch_text_col = st.selectbox("Select text column:", batch_df.columns.tolist())
-                     batch_model = st.selectbox("Select model for batch:", available_models, key="batch_model")
-
-                     if st.button("🚀 Run Batch Predictions"):
-                         with st.spinner("🔄 Processing batch predictions..."):
-                             predictions = []
-                             progress_bar = st.progress(0)
-
-                             for i, text in enumerate(batch_df[batch_text_col]):
-                                 pred, _ = predict_text(
-                                     batch_model, str(text),
-                                     st.session_state.get('vectorizer_type', 'tfidf')
-                                 )
-                                 predictions.append(pred if pred is not None else "Error")
-                                 progress_bar.progress((i + 1) / len(batch_df))
-
-                             batch_df['Predicted_Class'] = predictions
-
-                             st.success("✅ Batch predictions completed!")
-                             st.dataframe(batch_df[[batch_text_col, 'Predicted_Class']])
-
-                             # Download option
-                             csv = batch_df.to_csv(index=False)
-                             st.download_button(
-                                 "📥 Download Results",
-                                 csv,
-                                 "batch_predictions.csv",
-                                 "text/csv"
+     if uploaded_file is not None:
+         try:
+             batch_df = pd.read_csv(uploaded_file, encoding='latin1')
+             st.write("Uploaded data preview:")
+             st.write(batch_df.head())
+
+             # Select text column
+             text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
+
+             if os.path.exists("models") and os.listdir("models"):
+                 available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
+                 batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
+
+                 if st.button("Run Batch Predictions", key="batch_predict"):
+                     with st.spinner("Processing batch predictions..."):
+                         predictions = []
+
+                         for text in batch_df[text_column]:
+                             pred, _ = predict_text(
+                                 batch_model,
+                                 str(text),
+                                 st.session_state.get('vectorizer_type', 'tfidf')
                              )
-
- except Exception as e:
-     st.error(f"❌ Batch prediction error: {str(e)}")
- else:
-     st.warning("⚠️ No trained models found.")
- else:
-     st.warning("⚠️ No models available. Please train a model first.")
-
- # Footer
- st.markdown("---")
- st.markdown("*Built with Streamlit • Text Classification Made Easy*")
+                             predictions.append(pred if pred is not None else "Error")
+
+                         batch_df['Predicted_Class'] = predictions
+
+                         st.success("Batch predictions completed!")
+                         st.write("Results:")
+                         st.write(batch_df[[text_column, 'Predicted_Class']])
+
+                         # Download results
+                         csv = batch_df.to_csv(index=False)
+                         st.download_button(
+                             label="Download predictions as CSV",
+                             data=csv,
+                             file_name="batch_predictions.csv",
+                             mime="text/csv"
+                         )
+         except Exception as e:
+             st.error(f"Error in batch prediction: {str(e)}")