Alamgirapi committed on
Commit
5ba4816
·
verified ·
1 Parent(s): 4d55e84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +445 -233
app.py CHANGED
@@ -2,63 +2,106 @@ import streamlit as st
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import numpy as np
5
- from NoCodeTextClassifier.EDA import Informations, Visualizations
6
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
7
- from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
8
- from NoCodeTextClassifier.models import Models
 
 
 
 
 
 
9
  import os
10
  import pickle
11
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Utility functions
14
- def save_artifacts(obj, folder_name, file_name):
15
- """Save artifacts like encoders and vectorizers"""
16
- os.makedirs(folder_name, exist_ok=True)
17
- with open(os.path.join(folder_name, file_name), 'wb') as f:
18
- pickle.dump(obj, f)
19
 
20
- def load_artifacts(folder_name, file_name):
21
- """Load saved artifacts"""
22
- try:
23
- with open(os.path.join(folder_name, file_name), 'rb') as f:
24
- return pickle.load(f)
25
- except FileNotFoundError:
26
- st.error(f"File {file_name} not found in {folder_name} folder")
27
- return None
28
 
29
- def load_model(model_name):
30
- """Load trained model"""
31
- try:
32
- with open(os.path.join('models', model_name), 'rb') as f:
33
- return pickle.load(f)
34
- except FileNotFoundError:
35
- st.error(f"Model {model_name} not found. Please train a model first.")
36
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- def predict_text(model_name, text, vectorizer_type="tfidf"):
39
  """Make prediction on new text"""
40
  try:
41
- # Load model
42
- model = load_model(model_name)
43
- if model is None:
44
- return None, None
45
-
46
- # Load vectorizer
47
- vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
48
- vectorizer = load_artifacts("artifacts", vectorizer_file)
49
- if vectorizer is None:
50
- return None, None
51
-
52
- # Load label encoder
53
- encoder = load_artifacts("artifacts", "encoder.pkl")
54
- if encoder is None:
55
- return None, None
56
-
57
- # Clean and vectorize text
58
  text_cleaner = TextCleaner()
59
  clean_text = text_cleaner.clean_text(text)
60
 
61
- # Transform text using the same vectorizer used during training
62
  text_vector = vectorizer.transform([clean_text])
63
 
64
  # Make prediction
@@ -81,256 +124,425 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
81
  st.error(f"Error during prediction: {str(e)}")
82
  return None, None
83
 
84
- # Streamlit App
85
- st.title('No Code Text Classification App')
86
- st.write('Understand the behavior of your text data and train a model to classify the text data')
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  # Sidebar
89
- section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
 
 
 
 
 
90
 
91
- # Upload Data
92
- st.sidebar.subheader("Upload Your Dataset")
93
- train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
94
- test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
95
 
96
- # Global variables to store data and settings
97
- if 'vectorizer_type' not in st.session_state:
98
- st.session_state.vectorizer_type = "tfidf"
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
 
100
  if train_data is not None:
101
  try:
102
- train_df = pd.read_csv(train_data, encoding='latin1')
 
 
 
 
 
 
 
103
 
104
  if test_data is not None:
105
- test_df = pd.read_csv(test_data, encoding='latin1')
106
  else:
107
  test_df = None
108
 
109
- st.write("Training Data Preview:")
110
- st.write(train_df.head(3))
111
 
 
112
  columns = train_df.columns.tolist()
113
- text_data = st.sidebar.selectbox("Choose the text column:", columns)
114
- target = st.sidebar.selectbox("Choose the target column:", columns)
115
-
116
- # Process data
117
- info = Informations(train_df, text_data, target)
118
- train_df['clean_text'] = info.clean_text()
119
- train_df['text_length'] = info.text_length()
120
-
121
- # Handle label encoding manually if the class doesn't store encoder
122
- from sklearn.preprocessing import LabelEncoder
123
- label_encoder = LabelEncoder()
124
- train_df['target'] = label_encoder.fit_transform(train_df[target])
125
 
126
- # Save label encoder for later use
127
- os.makedirs("artifacts", exist_ok=True)
128
- save_artifacts(label_encoder, "artifacts", "encoder.pkl")
 
 
 
129
 
130
  except Exception as e:
131
- st.error(f"Error loading data: {str(e)}")
132
- train_df = None
133
- info = None
134
 
135
  # Data Analysis Section
136
- if section == "Data Analysis":
137
- if train_data is not None and train_df is not None:
 
 
138
  try:
139
- st.subheader("Get Insights from the Data")
 
 
140
 
141
- st.write("Data Shape:", info.shape())
142
- st.write("Class Imbalance:", info.class_imbalanced())
143
- st.write("Missing Values:", info.missing_values())
144
-
145
- st.write("Processed Data Preview:")
146
- st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
147
 
148
- st.markdown("**Text Length Analysis**")
149
- st.write(info.analysis_text_length('text_length'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
- # Calculate correlation manually since we handled encoding separately
152
- correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
153
- st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
154
-
155
- st.subheader("Visualizations")
156
- vis = Visualizations(train_df, text_data, target)
157
- vis.class_distribution()
158
- vis.text_length_distribution()
159
-
160
  except Exception as e:
161
- st.error(f"Error in data analysis: {str(e)}")
162
  else:
163
- st.warning("Please upload training data to get insights")
164
 
165
  # Train Model Section
166
- elif section == "Train Model":
167
- if train_data is not None and train_df is not None:
 
 
168
  try:
169
- st.subheader("Train a Model")
170
-
171
- # Create two columns for model selection
 
 
 
 
 
 
 
 
 
 
172
  col1, col2 = st.columns(2)
173
-
174
  with col1:
175
- model = st.radio("Choose the Model", [
 
176
  "Logistic Regression", "Decision Tree",
177
  "Random Forest", "Linear SVC", "SVC",
178
  "Multinomial Naive Bayes", "Gaussian Naive Bayes"
179
  ])
180
 
181
  with col2:
182
- vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
183
-
184
- # Initialize vectorizer
185
- if vectorizer_choice == "Tfidf Vectorizer":
186
- vectorizer = TfidfVectorizer(max_features=10000)
187
- st.session_state.vectorizer_type = "tfidf"
188
- else:
189
- vectorizer = CountVectorizer(max_features=10000)
190
- st.session_state.vectorizer_type = "count"
191
-
192
- st.write("Training Data Preview:")
193
- st.write(train_df[['clean_text', 'target']].head(3))
194
 
195
- # Vectorize text data
196
- X = vectorizer.fit_transform(train_df['clean_text'])
197
- y = train_df['target']
198
 
199
- # Split data
200
- X_train, X_test, y_train, y_test = process.split_data(X, y)
201
- st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
202
 
203
- # Save vectorizer for later use
204
- vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
205
- save_artifacts(vectorizer, "artifacts", vectorizer_filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
- if st.button("Start Training"):
208
- with st.spinner("Training model..."):
209
- models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
210
-
211
- # Train selected model
212
- if model == "Logistic Regression":
213
- models.LogisticRegression()
214
- elif model == "Decision Tree":
215
- models.DecisionTree()
216
- elif model == "Linear SVC":
217
- models.LinearSVC()
218
- elif model == "SVC":
219
- models.SVC()
220
- elif model == "Multinomial Naive Bayes":
221
- models.MultinomialNB()
222
- elif model == "Random Forest":
223
- models.RandomForestClassifier()
224
- elif model == "Gaussian Naive Bayes":
225
- models.GaussianNB()
226
-
227
- st.success("Model training completed!")
228
- st.info("You can now use the 'Predictions' section to classify new text.")
229
-
230
  except Exception as e:
231
- st.error(f"Error in model training: {str(e)}")
232
  else:
233
- st.warning("Please upload training data to train a model")
234
 
235
  # Predictions Section
236
- elif section == "Predictions":
237
- st.subheader("Perform Predictions on New Text")
238
 
239
- # Check if models exist
240
- if os.path.exists("models") and os.listdir("models"):
241
- # Text input for prediction
242
- text_input = st.text_area("Enter the text to classify:", height=100)
243
 
244
- # Model selection
245
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
246
 
247
- if available_models:
248
- selected_model = st.selectbox("Choose the trained model:", available_models)
249
-
250
- # Prediction button
251
- if st.button("Predict", key="single_predict"):
 
 
 
 
252
  if text_input.strip():
253
- with st.spinner("Making prediction..."):
 
 
 
 
254
  predicted_label, prediction_proba = predict_text(
255
- selected_model,
256
- text_input,
257
- st.session_state.get('vectorizer_type', 'tfidf')
258
  )
259
 
260
  if predicted_label is not None:
261
- st.success("Prediction completed!")
262
 
263
  # Display results
264
- st.markdown("### Prediction Results")
265
- st.markdown(f"**Input Text:** {text_input}")
266
- st.markdown(f"**Predicted Class:** {predicted_label}")
267
 
268
  # Display probabilities if available
269
  if prediction_proba is not None:
270
  st.markdown("**Class Probabilities:**")
271
 
272
- # Load encoder to get class names
273
- encoder = load_artifacts("artifacts", "encoder.pkl")
274
- if encoder is not None:
275
- classes = encoder.classes_
276
- prob_df = pd.DataFrame({
277
- 'Class': classes,
278
- 'Probability': prediction_proba
279
- }).sort_values('Probability', ascending=False)
280
-
281
- st.bar_chart(prob_df.set_index('Class'))
282
- st.dataframe(prob_df)
 
 
 
283
  else:
284
- st.warning("Please enter some text to classify")
285
- else:
286
- st.warning("No trained models found. Please train a model first.")
287
- else:
288
- st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")
289
 
290
- # Option to classify multiple texts
291
- st.markdown("---")
292
- st.subheader("Batch Predictions")
293
-
294
- uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
295
-
296
- if uploaded_file is not None:
297
- try:
298
- batch_df = pd.read_csv(uploaded_file, encoding='latin1')
299
- st.write("Uploaded data preview:")
300
- st.write(batch_df.head())
301
-
302
- # Select text column
303
- text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
304
-
305
- if os.path.exists("models") and os.listdir("models"):
306
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
307
- batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
 
 
 
 
308
 
309
- if st.button("Run Batch Predictions", key="batch_predict"):
 
 
 
 
 
 
310
  with st.spinner("Processing batch predictions..."):
311
- predictions = []
312
-
313
- for text in batch_df[text_column]:
314
- pred, _ = predict_text(
315
- batch_model,
316
- str(text),
317
- st.session_state.get('vectorizer_type', 'tfidf')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  )
319
- predictions.append(pred if pred is not None else "Error")
320
-
321
- batch_df['Predicted_Class'] = predictions
322
-
323
- st.success("Batch predictions completed!")
324
- st.write("Results:")
325
- st.write(batch_df[[text_column, 'Predicted_Class']])
326
-
327
- # Download results
328
- csv = batch_df.to_csv(index=False)
329
- st.download_button(
330
- label="Download predictions as CSV",
331
- data=csv,
332
- file_name="batch_predictions.csv",
333
- mime="text/csv"
334
- )
335
- except Exception as e:
336
- st.error(f"Error in batch prediction: {str(e)}")
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import numpy as np
 
5
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
6
+ from sklearn.preprocessing import LabelEncoder
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.linear_model import LogisticRegression
9
+ from sklearn.tree import DecisionTreeClassifier
10
+ from sklearn.ensemble import RandomForestClassifier
11
+ from sklearn.svm import LinearSVC, SVC
12
+ from sklearn.naive_bayes import MultinomialNB, GaussianNB
13
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
14
  import os
15
  import pickle
16
+ import tempfile
17
+ import re
18
+ import string
19
+ from collections import Counter
20
+
21
+ # Text Cleaning Class (replacing the custom module)
22
class TextCleaner:
    """Normalize raw text before it is handed to a vectorizer."""

    # Runs of characters that are neither ASCII lowercase letters nor
    # whitespace. The pattern is applied after lowercasing, so an upper-case
    # range is not needed.
    _NON_LETTERS = re.compile(r'[^a-z\s]+')

    def clean_text(self, text):
        """Return a lowercase, letters-only, single-spaced copy of ``text``.

        Missing values (``None``, NaN, ``pd.NA``) become the empty string.
        Any other value is converted with ``str()`` first.
        """
        if pd.isna(text):
            return ""

        lowered = str(text).lower()

        # Substitute a space instead of deleting, so that words separated only
        # by punctuation or digits ("good,bad", "end.Start") stay separate
        # tokens instead of fusing into one.
        letters_only = self._NON_LETTERS.sub(' ', lowered)

        # split()/join collapses every whitespace run and trims both ends.
        return ' '.join(letters_only.split())
38
+
39
+ # Information Analysis Class (replacing the custom module)
40
class TextInformations:
    """Summary statistics and derived columns for a text-classification frame."""

    def __init__(self, df, text_col, target_col):
        """Keep a reference to ``df`` and the names of its text and label columns."""
        self.df = df
        self.text_col = text_col
        self.target_col = target_col

    def shape(self):
        """Return ``(rows, columns)`` of the wrapped DataFrame."""
        return self.df.shape

    def missing_values(self):
        """Return a ``{column: null_count}`` dict covering every column."""
        return self.df.isnull().sum().to_dict()

    def class_imbalanced(self):
        """Return a ``{label: row_count}`` dict for the target column."""
        return self.df[self.target_col].value_counts().to_dict()

    def clean_text(self):
        """Return the text column with ``TextCleaner.clean_text`` applied per row."""
        cleaner = TextCleaner()
        return self.df[self.text_col].apply(cleaner.clean_text)

    def text_length(self):
        """Return the character length of every entry in the text column.

        Missing entries count as length 0 and non-string entries are measured
        on their ``str()`` form. The ``.str`` accessor would raise on a
        non-string column and yield NaN for missing cells, which then leaks
        into the length histogram and summary statistics.
        """
        return self.df[self.text_col].map(
            lambda value: 0 if pd.isna(value) else len(str(value))
        )
61
 
62
  # Utility functions
63
def save_to_session(obj, key):
    """Stash ``obj`` in ``st.session_state`` under ``key`` rather than in a file."""
    st.session_state[key] = obj
 
 
66
 
67
def load_from_session(key):
    """Fetch the object stored under ``key`` in ``st.session_state``.

    Returns ``None`` when nothing has been stored under that key.
    """
    return st.session_state.get(key)
 
 
 
 
 
70
 
71
+ def train_model(model_name, X_train, X_test, y_train, y_test):
72
+ """Train the selected model"""
73
+ if model_name == "Logistic Regression":
74
+ model = LogisticRegression(random_state=42, max_iter=1000)
75
+ elif model_name == "Decision Tree":
76
+ model = DecisionTreeClassifier(random_state=42)
77
+ elif model_name == "Random Forest":
78
+ model = RandomForestClassifier(random_state=42, n_estimators=100)
79
+ elif model_name == "Linear SVC":
80
+ model = LinearSVC(random_state=42, max_iter=1000)
81
+ elif model_name == "SVC":
82
+ model = SVC(random_state=42, probability=True)
83
+ elif model_name == "Multinomial Naive Bayes":
84
+ model = MultinomialNB()
85
+ elif model_name == "Gaussian Naive Bayes":
86
+ model = GaussianNB()
87
+
88
+ # Train model
89
+ model.fit(X_train, y_train)
90
+
91
+ # Make predictions
92
+ y_pred = model.predict(X_test)
93
+ accuracy = accuracy_score(y_test, y_pred)
94
+
95
+ return model, accuracy
96
 
97
+ def predict_text(text, model, vectorizer, encoder):
98
  """Make prediction on new text"""
99
  try:
100
+ # Clean text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  text_cleaner = TextCleaner()
102
  clean_text = text_cleaner.clean_text(text)
103
 
104
+ # Transform text using the vectorizer
105
  text_vector = vectorizer.transform([clean_text])
106
 
107
  # Make prediction
 
124
  st.error(f"Error during prediction: {str(e)}")
125
  return None, None
126
 
127
+ # Streamlit App Configuration
128
+ st.set_page_config(
129
+ page_title="Text Classification App",
130
+ page_icon="๐Ÿ“",
131
+ layout="wide"
132
+ )
133
+
134
+ st.title('๐Ÿ“ No Code Text Classification App')
135
+ st.markdown('Analyze your text data and train machine learning models for text classification')
136
+
137
+ # Initialize session state
138
+ if 'model_trained' not in st.session_state:
139
+ st.session_state.model_trained = False
140
+ if 'training_data_processed' not in st.session_state:
141
+ st.session_state.training_data_processed = False
142
 
143
  # Sidebar
144
+ st.sidebar.title("Navigation")
145
+ section = st.sidebar.radio(
146
+ "Choose Section",
147
+ ["๐Ÿ“Š Data Analysis", "๐Ÿค– Train Model", "๐Ÿ”ฎ Predictions"],
148
+ index=0
149
+ )
150
 
151
+ # Upload Data Section
152
+ st.sidebar.markdown("---")
153
+ st.sidebar.subheader("๐Ÿ“ Upload Your Dataset")
 
154
 
155
+ # File uploader with better error handling
156
+ try:
157
+ train_data = st.sidebar.file_uploader(
158
+ "Upload training data (CSV)",
159
+ type=["csv"],
160
+ help="Upload a CSV file with text and labels for training"
161
+ )
162
+
163
+ test_data = st.sidebar.file_uploader(
164
+ "Upload test data (CSV, optional)",
165
+ type=["csv"],
166
+ help="Optional: Upload a separate test dataset"
167
+ )
168
+ except Exception as e:
169
+ st.sidebar.error(f"File upload error: {str(e)}")
170
+ st.sidebar.info("Try refreshing the page or using a different browser")
171
 
172
+ # Process uploaded data
173
  if train_data is not None:
174
  try:
175
+ # Add encoding options to handle different CSV formats
176
+ encoding_option = st.sidebar.selectbox(
177
+ "CSV Encoding",
178
+ ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
179
+ help="Try different encodings if you get errors"
180
+ )
181
+
182
+ train_df = pd.read_csv(train_data, encoding=encoding_option)
183
 
184
  if test_data is not None:
185
+ test_df = pd.read_csv(test_data, encoding=encoding_option)
186
  else:
187
  test_df = None
188
 
189
+ st.sidebar.success(f"โœ… Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
 
190
 
191
+ # Column selection
192
  columns = train_df.columns.tolist()
193
+ text_data = st.sidebar.selectbox("๐Ÿ“ Choose the text column:", columns)
194
+ target = st.sidebar.selectbox("๐ŸŽฏ Choose the target column:", columns)
 
 
 
 
 
 
 
 
 
 
195
 
196
+ # Store processed data in session state
197
+ st.session_state.train_df = train_df
198
+ st.session_state.test_df = test_df
199
+ st.session_state.text_col = text_data
200
+ st.session_state.target_col = target
201
+ st.session_state.training_data_processed = True
202
 
203
  except Exception as e:
204
+ st.sidebar.error(f"โŒ Error loading data: {str(e)}")
205
+ st.sidebar.info("Please check your CSV file format and encoding")
 
206
 
207
  # Data Analysis Section
208
+ if section == "๐Ÿ“Š Data Analysis":
209
+ st.header("๐Ÿ“Š Data Analysis")
210
+
211
+ if st.session_state.get('training_data_processed', False):
212
  try:
213
+ train_df = st.session_state.train_df
214
+ text_col = st.session_state.text_col
215
+ target_col = st.session_state.target_col
216
 
217
+ # Create info object
218
+ info = TextInformations(train_df, text_col, target_col)
 
 
 
 
219
 
220
+ # Data preprocessing
221
+ train_df['clean_text'] = info.clean_text()
222
+ train_df['text_length'] = info.text_length()
223
+
224
+ # Display basic information
225
+ col1, col2, col3 = st.columns(3)
226
+
227
+ with col1:
228
+ st.metric("Dataset Shape", f"{info.shape()[0]} ร— {info.shape()[1]}")
229
+
230
+ with col2:
231
+ missing_vals = sum(info.missing_values().values())
232
+ st.metric("Missing Values", missing_vals)
233
+
234
+ with col3:
235
+ unique_classes = len(info.class_imbalanced())
236
+ st.metric("Unique Classes", unique_classes)
237
+
238
+ # Data preview
239
+ st.subheader("๐Ÿ“‹ Data Preview")
240
+ st.dataframe(train_df[[text_col, target_col, 'clean_text', 'text_length']].head(10))
241
+
242
+ # Class distribution
243
+ st.subheader("๐Ÿ“Š Class Distribution")
244
+ class_counts = info.class_imbalanced()
245
+
246
+ col1, col2 = st.columns(2)
247
+
248
+ with col1:
249
+ fig, ax = plt.subplots(figsize=(8, 6))
250
+ classes = list(class_counts.keys())
251
+ counts = list(class_counts.values())
252
+ ax.bar(classes, counts, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8'])
253
+ ax.set_title('Class Distribution')
254
+ ax.set_xlabel('Classes')
255
+ ax.set_ylabel('Count')
256
+ plt.xticks(rotation=45)
257
+ st.pyplot(fig)
258
+
259
+ with col2:
260
+ st.write("**Class Distribution:**")
261
+ for class_name, count in class_counts.items():
262
+ percentage = (count / len(train_df)) * 100
263
+ st.write(f"- {class_name}: {count} ({percentage:.1f}%)")
264
+
265
+ # Text length analysis
266
+ st.subheader("๐Ÿ“ Text Length Analysis")
267
+
268
+ col1, col2 = st.columns(2)
269
+
270
+ with col1:
271
+ fig, ax = plt.subplots(figsize=(8, 6))
272
+ ax.hist(train_df['text_length'], bins=50, alpha=0.7, color='#4ECDC4')
273
+ ax.set_title('Text Length Distribution')
274
+ ax.set_xlabel('Text Length (characters)')
275
+ ax.set_ylabel('Frequency')
276
+ st.pyplot(fig)
277
+
278
+ with col2:
279
+ st.write("**Text Length Statistics:**")
280
+ length_stats = train_df['text_length'].describe()
281
+ for stat, value in length_stats.items():
282
+ st.write(f"- {stat.title()}: {value:.1f}")
283
+
284
+ # Update session state
285
+ st.session_state.processed_train_df = train_df
286
 
 
 
 
 
 
 
 
 
 
287
  except Exception as e:
288
+ st.error(f"โŒ Error in data analysis: {str(e)}")
289
  else:
290
+ st.info("๐Ÿ”„ Please upload training data to perform analysis")
291
 
292
  # Train Model Section
293
+ elif section == "๐Ÿค– Train Model":
294
+ st.header("๐Ÿค– Train Model")
295
+
296
+ if st.session_state.get('training_data_processed', False):
297
  try:
298
+ if 'processed_train_df' in st.session_state:
299
+ train_df = st.session_state.processed_train_df
300
+ else:
301
+ # Process data if not already processed
302
+ train_df = st.session_state.train_df
303
+ text_col = st.session_state.text_col
304
+ target_col = st.session_state.target_col
305
+
306
+ info = TextInformations(train_df, text_col, target_col)
307
+ train_df['clean_text'] = info.clean_text()
308
+ train_df['text_length'] = info.text_length()
309
+
310
+ # Model and vectorizer selection
311
  col1, col2 = st.columns(2)
312
+
313
  with col1:
314
+ st.subheader("๐ŸŽฏ Model Selection")
315
+ model_name = st.selectbox("Choose the Model", [
316
  "Logistic Regression", "Decision Tree",
317
  "Random Forest", "Linear SVC", "SVC",
318
  "Multinomial Naive Bayes", "Gaussian Naive Bayes"
319
  ])
320
 
321
  with col2:
322
+ st.subheader("๐Ÿ“Š Vectorizer Selection")
323
+ vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count"])
 
 
 
 
 
 
 
 
 
 
324
 
325
+ # Training parameters
326
+ st.subheader("โš™๏ธ Training Parameters")
327
+ col1, col2 = st.columns(2)
328
 
329
+ with col1:
330
+ max_features = st.slider("Max Features", 1000, 20000, 10000, 1000)
331
+ test_size = st.slider("Test Size", 0.1, 0.5, 0.2, 0.05)
332
 
333
+ with col2:
334
+ random_state = st.number_input("Random State", 0, 100, 42)
335
+
336
+ # Training button
337
+ if st.button("๐Ÿš€ Start Training", type="primary"):
338
+ with st.spinner("Training model... Please wait"):
339
+ try:
340
+ # Prepare data
341
+ X_text = train_df['clean_text'].fillna('')
342
+ y = train_df[st.session_state.target_col]
343
+
344
+ # Label encoding
345
+ label_encoder = LabelEncoder()
346
+ y_encoded = label_encoder.fit_transform(y)
347
+
348
+ # Vectorization
349
+ if vectorizer_choice == "TF-IDF":
350
+ vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
351
+ else:
352
+ vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
353
+
354
+ X_vectorized = vectorizer.fit_transform(X_text)
355
+
356
+ # Train-test split
357
+ X_train, X_test, y_train, y_test = train_test_split(
358
+ X_vectorized, y_encoded,
359
+ test_size=test_size,
360
+ random_state=random_state,
361
+ stratify=y_encoded
362
+ )
363
+
364
+ # Train model
365
+ model, accuracy = train_model(model_name, X_train, X_test, y_train, y_test)
366
+
367
+ # Save to session state
368
+ save_to_session(model, 'trained_model')
369
+ save_to_session(vectorizer, 'vectorizer')
370
+ save_to_session(label_encoder, 'label_encoder')
371
+ save_to_session(model_name, 'model_name')
372
+ save_to_session(vectorizer_choice, 'vectorizer_type')
373
+
374
+ st.session_state.model_trained = True
375
+
376
+ # Display results
377
+ st.success(f"โœ… Model training completed!")
378
+
379
+ col1, col2 = st.columns(2)
380
+ with col1:
381
+ st.metric("Model Accuracy", f"{accuracy:.4f}")
382
+ with col2:
383
+ st.metric("Training Samples", len(X_train))
384
+
385
+ st.info("๐ŸŽ‰ You can now use the 'Predictions' section to classify new text!")
386
+
387
+ except Exception as e:
388
+ st.error(f"โŒ Error during training: {str(e)}")
389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  except Exception as e:
391
+ st.error(f"โŒ Error in model training setup: {str(e)}")
392
  else:
393
+ st.info("๐Ÿ”„ Please upload and analyze training data first")
394
 
395
  # Predictions Section
396
+ elif section == "๐Ÿ”ฎ Predictions":
397
+ st.header("๐Ÿ”ฎ Make Predictions")
398
 
399
+ if st.session_state.get('model_trained', False):
 
 
 
400
 
401
+ # Single text prediction
402
+ st.subheader("๐Ÿ“ Single Text Prediction")
403
 
404
+ text_input = st.text_area(
405
+ "Enter text to classify:",
406
+ height=120,
407
+ placeholder="Type or paste your text here..."
408
+ )
409
+
410
+ col1, col2 = st.columns([1, 3])
411
+ with col1:
412
+ if st.button("๐Ÿ”ฎ Predict", type="primary"):
413
  if text_input.strip():
414
+ try:
415
+ model = load_from_session('trained_model')
416
+ vectorizer = load_from_session('vectorizer')
417
+ encoder = load_from_session('label_encoder')
418
+
419
  predicted_label, prediction_proba = predict_text(
420
+ text_input, model, vectorizer, encoder
 
 
421
  )
422
 
423
  if predicted_label is not None:
424
+ st.success("โœ… Prediction completed!")
425
 
426
  # Display results
427
+ st.markdown("### ๐Ÿ“Š Results")
428
+ st.markdown(f"**Predicted Class:** `{predicted_label}`")
 
429
 
430
  # Display probabilities if available
431
  if prediction_proba is not None:
432
  st.markdown("**Class Probabilities:**")
433
 
434
+ classes = encoder.classes_
435
+ prob_data = pd.DataFrame({
436
+ 'Class': classes,
437
+ 'Probability': prediction_proba
438
+ }).sort_values('Probability', ascending=False)
439
+
440
+ # Show as bar chart
441
+ st.bar_chart(prob_data.set_index('Class'))
442
+
443
+ # Show as table
444
+ st.dataframe(prob_data, use_container_width=True)
445
+
446
+ except Exception as e:
447
+ st.error(f"โŒ Prediction error: {str(e)}")
448
  else:
449
+ st.warning("โš ๏ธ Please enter some text to classify")
 
 
 
 
450
 
451
+ # Batch predictions
452
+ st.markdown("---")
453
+ st.subheader("๐Ÿ“ Batch Predictions")
454
+
455
+ uploaded_batch = st.file_uploader(
456
+ "Upload CSV file for batch predictions",
457
+ type=['csv'],
458
+ help="Upload a CSV file with text data to classify multiple texts at once"
459
+ )
460
+
461
+ if uploaded_batch is not None:
462
+ try:
463
+ # Load batch data
464
+ encoding_option = st.selectbox(
465
+ "Batch CSV Encoding",
466
+ ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
467
+ key="batch_encoding"
468
+ )
469
+
470
+ batch_df = pd.read_csv(uploaded_batch, encoding=encoding_option)
471
+ st.write("๐Ÿ“‹ **Batch Data Preview:**")
472
+ st.dataframe(batch_df.head())
473
 
474
+ # Select text column
475
+ text_column = st.selectbox(
476
+ "Select the text column:",
477
+ batch_df.columns.tolist()
478
+ )
479
+
480
+ if st.button("๐Ÿš€ Run Batch Predictions", type="primary"):
481
  with st.spinner("Processing batch predictions..."):
482
+ try:
483
+ model = load_from_session('trained_model')
484
+ vectorizer = load_from_session('vectorizer')
485
+ encoder = load_from_session('label_encoder')
486
+
487
+ predictions = []
488
+ confidences = []
489
+
490
+ progress_bar = st.progress(0)
491
+ total_rows = len(batch_df)
492
+
493
+ for idx, text in enumerate(batch_df[text_column]):
494
+ pred, pred_proba = predict_text(
495
+ str(text), model, vectorizer, encoder
496
+ )
497
+ predictions.append(pred if pred is not None else "Error")
498
+
499
+ # Get confidence (max probability)
500
+ if pred_proba is not None:
501
+ confidences.append(max(pred_proba))
502
+ else:
503
+ confidences.append(0.0)
504
+
505
+ progress_bar.progress((idx + 1) / total_rows)
506
+
507
+ batch_df['Predicted_Class'] = predictions
508
+ batch_df['Confidence'] = confidences
509
+
510
+ st.success("โœ… Batch predictions completed!")
511
+
512
+ # Show results
513
+ st.write("๐Ÿ“Š **Prediction Results:**")
514
+ st.dataframe(batch_df[[text_column, 'Predicted_Class', 'Confidence']])
515
+
516
+ # Download results
517
+ csv = batch_df.to_csv(index=False)
518
+ st.download_button(
519
+ label="๐Ÿ“ฅ Download Results as CSV",
520
+ data=csv,
521
+ file_name="batch_predictions.csv",
522
+ mime="text/csv"
523
  )
524
+
525
+ except Exception as e:
526
+ st.error(f"โŒ Batch prediction error: {str(e)}")
527
+
528
+ except Exception as e:
529
+ st.error(f"โŒ Error loading batch file: {str(e)}")
530
+
531
+ else:
532
+ st.info("๐Ÿ”„ Please train a model first before making predictions")
533
+
534
+ # Show model info if available
535
+ if st.session_state.get('training_data_processed', False):
536
+ st.write("๐Ÿ’ก **Tip:** Go to the 'Train Model' section to train a model first!")
537
+
538
+ # Footer
539
+ st.markdown("---")
540
+ st.markdown(
541
+ """
542
+ <div style='text-align: center; color: #666; padding: 20px;'>
543
+ <p>๐Ÿ“ No Code Text Classification App</p>
544
+ <p>Built with Streamlit โ€ข Upload CSV โ†’ Analyze โ†’ Train โ†’ Predict</p>
545
+ </div>
546
+ """,
547
+ unsafe_allow_html=True
548
+ )