Alamgirapi committed on
Commit
3b9b877
·
verified ·
1 Parent(s): b136104

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +295 -431
app.py CHANGED
@@ -2,471 +2,335 @@ import streamlit as st
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import numpy as np
 
5
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
6
- from sklearn.model_selection import train_test_split
7
- from sklearn.linear_model import LogisticRegression
8
- from sklearn.tree import DecisionTreeClassifier
9
- from sklearn.ensemble import RandomForestClassifier
10
- from sklearn.svm import LinearSVC, SVC
11
- from sklearn.naive_bayes import MultinomialNB, GaussianNB
12
- from sklearn.preprocessing import LabelEncoder
13
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
14
- import re
15
- import string
16
- import nltk
17
  import os
18
  import pickle
19
- import io
20
- import base64
21
-
22
# Make sure the NLTK corpora needed for stop-word removal and lemmatization
# are present locally; each one is downloaded quietly only when the lookup
# fails.
for _resource_path, _package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name, quiet=True)
32
 
33
- from nltk.corpus import stopwords
34
- from nltk.stem import WordNetLemmatizer
 
 
 
 
35
 
36
# Page-level Streamlit configuration: title, icon and wide layout.
st.set_page_config(
    layout="wide",
    page_icon="📝",
    page_title="No Code Text Classification",
)

# Session-state defaults. A key is only seeded when it is absent, so values
# stored by an earlier run of the script are left untouched.
_SESSION_DEFAULTS = {
    'trained_model': None,
    'vectorizer': None,
    'label_encoder': None,
    'vectorizer_type': 'tfidf',
    'train_df': None,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
54
 
55
- # Text cleaning class
56
class TextCleaner:
    """Normalize raw text before vectorization.

    Cleaning lowercases the text, strips URLs, @mentions, #hashtags and
    punctuation, collapses whitespace, drops English stop words and
    lemmatizes the remaining tokens.
    """

    # Built once for the class instead of on every ``clean_text`` call, which
    # is invoked once per row when applied to a DataFrame column.
    # ``http\S+`` already covers ``https...`` so no separate alternative is
    # needed.
    _URL_PATTERN = re.compile(r'http\S+|www\S+')
    _MENTION_HASHTAG_PATTERN = re.compile(r'@\w+|#\w+')
    _WHITESPACE_PATTERN = re.compile(r'\s+')
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        """Load the English stop-word list and create the lemmatizer."""
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return the cleaned form of *text*.

        Args:
            text: Value to clean; it is converted with ``str()`` first.

        Returns:
            The cleaned, space-joined tokens, or ``""`` when *text* is
            missing according to ``pd.isna``.
        """
        if pd.isna(text):
            return ""

        # Convert to lowercase
        text = str(text).lower()

        # Remove URLs
        text = self._URL_PATTERN.sub('', text)

        # Remove user mentions and hashtags
        text = self._MENTION_HASHTAG_PATTERN.sub('', text)

        # Remove punctuation
        text = text.translate(self._PUNCTUATION_TABLE)

        # Remove extra whitespace
        text = self._WHITESPACE_PATTERN.sub(' ', text).strip()

        # Remove stopwords and lemmatize
        words = [
            self.lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in self.stop_words
        ]

        return ' '.join(words)
 
 
85
 
86
- # Utility functions
87
def create_download_link(val, filename):
    """Generate an HTML download link that embeds *val* as a base64 data URI.

    Args:
        val: Bytes-like payload to embed in the link.
        filename: Name offered to the browser for the downloaded file; it is
            also shown as the link text.

    Returns:
        An ``<a>`` element as a string. The file name is HTML-escaped so that
        quotes or angle brackets in it cannot break out of the ``download``
        attribute or inject markup.
    """
    import html  # local import: only this helper needs it

    b64 = base64.b64encode(val).decode()
    safe_name = html.escape(str(filename), quote=True)
    return (
        f'<a href="data:application/octet-stream;base64,{b64}" '
        f'download="{safe_name}">Download {safe_name}</a>'
    )
 
 
 
 
 
 
 
 
 
 
91
 
92
def safe_file_read(uploaded_file):
    """Read an uploaded CSV into a DataFrame, trying several encodings.

    Encodings are attempted in the order UTF-8, cp1252, latin1. cp1252 comes
    before latin1 because latin1 maps every byte value and therefore never
    raises a decode error; placing it last makes it the catch-all instead of
    hiding the cp1252 attempt.

    Args:
        uploaded_file: File-like object (or anything ``pd.read_csv``
            accepts) holding CSV data.

    Returns:
        The parsed ``DataFrame``, or ``None`` after reporting the problem
        with ``st.error`` when the file cannot be read.
    """
    last_decode_error = None
    for encoding in ('utf-8', 'cp1252', 'latin1'):
        try:
            # Rewind before every attempt so a previous (partial) read does
            # not leave the parser looking at an exhausted stream.
            if hasattr(uploaded_file, 'seek'):
                uploaded_file.seek(0)
            return pd.read_csv(uploaded_file, encoding=encoding)
        except UnicodeDecodeError as e:
            # Wrong encoding guess: remember the error and try the next one.
            last_decode_error = e
        except Exception as e:
            # Parsing problems do not depend on the encoding, so retrying
            # would only repeat the same failure.
            st.error(f"Error reading file: {str(e)}")
            return None

    st.error(f"Error reading file: {str(last_decode_error)}")
    return None
110
 
111
- # Data Analysis Functions
112
def get_data_insights(df, text_col, target_col):
    """Get basic insights from the data.

    Args:
        df: DataFrame holding the dataset.
        text_col: Name of the column containing the text.
        target_col: Name of the column containing the class labels.

    Returns:
        A dict with the keys ``shape``, ``missing_values`` (per-column null
        counts), ``class_distribution`` (label -> count) and
        ``avg_text_length`` / ``min_text_length`` / ``max_text_length``
        (character lengths of the text column after ``str`` conversion).
    """
    # Lengths are kept in a local Series so the caller's DataFrame is not
    # modified as a side effect of asking for statistics.
    text_lengths = df[text_col].astype(str).str.len()

    return {
        # Basic info
        'shape': df.shape,
        'missing_values': df.isnull().sum().to_dict(),
        # Class distribution
        'class_distribution': df[target_col].value_counts().to_dict(),
        # Text length analysis
        'avg_text_length': text_lengths.mean(),
        'min_text_length': text_lengths.min(),
        'max_text_length': text_lengths.max(),
    }
130
 
131
def create_visualizations(df, text_col, target_col):
    """Render a class-distribution bar chart and a text-length histogram.

    The two plots are drawn side by side on one figure and shown with
    ``st.pyplot``.

    Args:
        df: DataFrame holding the dataset; it is not modified.
        text_col: Name of the column containing the text.
        target_col: Name of the column containing the class labels.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Class distribution bar plot
    class_counts = df[target_col].value_counts()
    ax1.bar(class_counts.index, class_counts.values)
    ax1.set_title('Class Distribution')
    ax1.set_xlabel('Classes')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=45)

    # Text length distribution, computed locally instead of being written
    # back into the caller's DataFrame.
    text_lengths = df[text_col].astype(str).str.len()
    ax2.hist(text_lengths, bins=30, alpha=0.7)
    ax2.set_title('Text Length Distribution')
    ax2.set_xlabel('Text Length')
    ax2.set_ylabel('Frequency')

    plt.tight_layout()
    try:
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is closed; without this
        # each rerun of the script would leave another figure in memory.
        plt.close(fig)
154
 
155
- # Model Training Functions
156
def train_model(X_train, X_test, y_train, y_test, model_name):
    """Fit the estimator named *model_name* and score it on the test split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Display name of the estimator; an unknown name raises
            ``KeyError``.

    Returns:
        A ``(model, accuracy, y_pred)`` tuple: the fitted estimator, its
        accuracy on the test split and the predictions for ``X_test``.
    """
    estimator_factories = {
        'Logistic Regression': lambda: LogisticRegression(random_state=42, max_iter=1000),
        'Decision Tree': lambda: DecisionTreeClassifier(random_state=42),
        'Random Forest': lambda: RandomForestClassifier(random_state=42, n_estimators=100),
        'Linear SVC': lambda: LinearSVC(random_state=42, max_iter=1000),
        'SVC': lambda: SVC(random_state=42, probability=True),
        'Multinomial Naive Bayes': lambda: MultinomialNB(),
        'Gaussian Naive Bayes': lambda: GaussianNB(),
    }
    classifier = estimator_factories[model_name]()

    # Gaussian NB is given dense arrays, so sparse inputs are converted first.
    needs_dense = model_name == 'Gaussian Naive Bayes' and hasattr(X_train, 'toarray')
    if needs_dense:
        X_train, X_test = X_train.toarray(), X_test.toarray()

    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    score = accuracy_score(y_test, predictions)

    return classifier, score, predictions
187
 
188
# Main App
# Top-level Streamlit script: upload a CSV, pick the text/target columns,
# then explore the data, train a classifier and run predictions in three tabs.
st.title('🔤 No Code Text Classification App')
st.markdown('Upload your data, analyze it, train models, and make predictions without writing any code!')

# Sidebar
st.sidebar.header("📁 Data Upload")

# File upload with better error handling
train_data = st.sidebar.file_uploader(
    "Upload training data (CSV)",
    type=["csv"],
    help="Upload a CSV file with text and labels"
)

# Process uploaded data
if train_data is not None:
    try:
        with st.spinner("Loading data..."):
            train_df = safe_file_read(train_data)

        if train_df is not None:
            st.session_state.train_df = train_df

            st.sidebar.success(f"✅ Data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")

            # Column selection
            columns = train_df.columns.tolist()
            text_col = st.sidebar.selectbox("📝 Select text column:", columns, key="text_col")
            target_col = st.sidebar.selectbox("🎯 Select target column:", columns, key="target_col")

            if text_col and target_col and text_col != target_col:
                # Clean and prepare data
                with st.spinner("Preprocessing data..."):
                    text_cleaner = TextCleaner()
                    train_df['clean_text'] = train_df[text_col].apply(text_cleaner.clean_text)

                    # Encode labels
                    label_encoder = LabelEncoder()
                    train_df['encoded_target'] = label_encoder.fit_transform(train_df[target_col])
                    st.session_state.label_encoder = label_encoder

                # Main sections
                tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "🤖 Train Model", "🔍 Predictions"])

                # Data Analysis Tab
                with tab1:
                    st.header("📊 Data Analysis")

                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("📈 Dataset Overview")
                        insights = get_data_insights(train_df, text_col, target_col)

                        st.metric("Total Samples", insights['shape'][0])
                        st.metric("Number of Features", insights['shape'][1])
                        st.metric("Average Text Length", f"{insights['avg_text_length']:.1f}")

                        st.subheader("🎯 Class Distribution")
                        class_dist_df = pd.DataFrame(list(insights['class_distribution'].items()),
                                                     columns=['Class', 'Count'])
                        st.dataframe(class_dist_df, use_container_width=True)

                    with col2:
                        st.subheader("📋 Data Preview")
                        preview_df = train_df[[text_col, target_col]].head()
                        st.dataframe(preview_df, use_container_width=True)

                        st.subheader("🧹 Cleaned Text Preview")
                        cleaned_preview = train_df[['clean_text', target_col]].head()
                        st.dataframe(cleaned_preview, use_container_width=True)

                    st.subheader("📊 Visualizations")
                    create_visualizations(train_df, text_col, target_col)

                # Train Model Tab
                with tab2:
                    st.header("🤖 Train Model")

                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("🔧 Model Selection")
                        model_name = st.selectbox(
                            "Choose a model:",
                            ["Logistic Regression", "Decision Tree", "Random Forest",
                             "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
                        )

                    with col2:
                        st.subheader("📊 Vectorizer Selection")
                        vectorizer_type = st.selectbox(
                            "Choose vectorizer:",
                            ["TF-IDF Vectorizer", "Count Vectorizer"]
                        )

                    # Training parameters
                    st.subheader("⚙️ Training Parameters")
                    col3, col4 = st.columns(2)
                    with col3:
                        test_size = st.slider("Test size", 0.1, 0.5, 0.2, 0.05)
                        max_features = st.number_input("Max features", 1000, 20000, 10000, 1000)

                    if st.button("🚀 Train Model", type="primary"):
                        try:
                            with st.spinner("Training model... This may take a few minutes."):
                                # Initialize vectorizer
                                if vectorizer_type == "TF-IDF Vectorizer":
                                    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                                    st.session_state.vectorizer_type = 'tfidf'
                                else:
                                    vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                                    st.session_state.vectorizer_type = 'count'

                                # Vectorize text
                                X = vectorizer.fit_transform(train_df['clean_text'])
                                y = train_df['encoded_target']

                                # Split data
                                X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=test_size, random_state=42, stratify=y
                                )

                                # Train model
                                model, accuracy, y_pred = train_model(X_train, X_test, y_train, y_test, model_name)

                                # Store in session state
                                st.session_state.trained_model = model
                                st.session_state.vectorizer = vectorizer

                                # Display results
                                st.success("🎉 Model training completed!")

                                col5, col6 = st.columns(2)
                                with col5:
                                    st.metric("🎯 Accuracy", f"{accuracy:.4f}")
                                    # The splits are scipy sparse matrices, for
                                    # which len() raises TypeError; the row
                                    # count comes from shape[0] instead.
                                    st.metric("🏋️ Training Samples", X_train.shape[0])
                                    st.metric("🧪 Test Samples", X_test.shape[0])

                                with col6:
                                    st.subheader("📊 Classification Report")
                                    report = classification_report(y_test, y_pred,
                                                                   target_names=label_encoder.classes_,
                                                                   output_dict=True)
                                    report_df = pd.DataFrame(report).transpose()
                                    st.dataframe(report_df.round(3), use_container_width=True)

                        except Exception as e:
                            st.error(f" Error during training: {str(e)}")

                # Predictions Tab
                with tab3:
                    st.header("🔍 Make Predictions")

                    if st.session_state.trained_model is not None:
                        # Single prediction
                        st.subheader("📝 Single Text Prediction")
                        user_input = st.text_area("Enter text to classify:", height=100)

                        if st.button("🔮 Predict", type="primary"):
                            if user_input.strip():
                                try:
                                    with st.spinner("Making prediction..."):
                                        # Clean and vectorize input
                                        text_cleaner = TextCleaner()
                                        clean_input = text_cleaner.clean_text(user_input)
                                        input_vector = st.session_state.vectorizer.transform([clean_input])

                                        # Handle sparse matrix for Gaussian NB
                                        if isinstance(st.session_state.trained_model, GaussianNB):
                                            input_vector = input_vector.toarray()

                                        # Make prediction
                                        prediction = st.session_state.trained_model.predict(input_vector)[0]
                                        predicted_label = st.session_state.label_encoder.inverse_transform([prediction])[0]

                                        # Get probabilities if available
                                        if hasattr(st.session_state.trained_model, 'predict_proba'):
                                            try:
                                                proba = st.session_state.trained_model.predict_proba(input_vector)[0]

                                                st.success("🎉 Prediction completed!")
                                                st.write(f"**Input:** {user_input}")
                                                st.write(f"**Predicted Class:** {predicted_label}")

                                                # Show probabilities
                                                st.subheader("📊 Class Probabilities")
                                                prob_df = pd.DataFrame({
                                                    'Class': st.session_state.label_encoder.classes_,
                                                    'Probability': proba
                                                }).sort_values('Probability', ascending=False)

                                                st.bar_chart(prob_df.set_index('Class'))
                                                st.dataframe(prob_df.round(4), use_container_width=True)
                                            except Exception:
                                                # Probabilities are optional:
                                                # fall back to the label only.
                                                st.success("🎉 Prediction completed!")
                                                st.write(f"**Predicted Class:** {predicted_label}")
                                        else:
                                            st.success("🎉 Prediction completed!")
                                            st.write(f"**Predicted Class:** {predicted_label}")

                                except Exception as e:
                                    st.error(f"❌ Error during prediction: {str(e)}")
                            else:
                                st.warning("⚠️ Please enter some text to classify")

                        # Batch predictions
                        st.subheader("📊 Batch Predictions")
                        batch_file = st.file_uploader("Upload CSV for batch predictions", type=["csv"])

                        if batch_file is not None:
                            try:
                                batch_df = safe_file_read(batch_file)
                                if batch_df is not None:
                                    st.write("**Preview:**")
                                    st.dataframe(batch_df.head(), use_container_width=True)

                                    batch_text_col = st.selectbox("Select text column for prediction:",
                                                                  batch_df.columns.tolist())

                                    if st.button("🚀 Run Batch Predictions"):
                                        with st.spinner("Processing batch predictions..."):
                                            text_cleaner = TextCleaner()
                                            predictions = []

                                            for text in batch_df[batch_text_col]:
                                                # A failing row is recorded as
                                                # "Error" so that one bad value
                                                # does not abort the batch.
                                                try:
                                                    clean_text = text_cleaner.clean_text(str(text))
                                                    text_vector = st.session_state.vectorizer.transform([clean_text])

                                                    if isinstance(st.session_state.trained_model, GaussianNB):
                                                        text_vector = text_vector.toarray()

                                                    pred = st.session_state.trained_model.predict(text_vector)[0]
                                                    pred_label = st.session_state.label_encoder.inverse_transform([pred])[0]
                                                    predictions.append(pred_label)
                                                except Exception:
                                                    predictions.append("Error")

                                            batch_df['Predicted_Class'] = predictions

                                            st.success("🎉 Batch predictions completed!")
                                            st.dataframe(batch_df, use_container_width=True)

                                            # Download results
                                            csv_data = batch_df.to_csv(index=False)
                                            st.download_button(
                                                label="📥 Download Results",
                                                data=csv_data,
                                                file_name="batch_predictions.csv",
                                                mime="text/csv"
                                            )
                            except Exception as e:
                                st.error(f"❌ Error processing batch file: {str(e)}")
                    else:
                        st.warning("⚠️ No trained model found. Please train a model first in the 'Train Model' tab.")
            else:
                st.warning("⚠️ Please select different columns for text and target.")

    except Exception as e:
        st.error(f"❌ Error loading file: {str(e)}")
        st.info("💡 Try these solutions:")
        st.write("- Check if the file is a valid CSV")
        st.write("- Ensure the file is not corrupted")
        st.write("- Try saving the file with UTF-8 encoding")

else:
    st.info("👆 Please upload a CSV file to get started")

    # Show example data format
    st.subheader("📋 Expected Data Format")
    example_df = pd.DataFrame({
        'text': [
            "This product is amazing! I love it.",
            "Terrible quality, waste of money.",
            "Good value for the price.",
            "Not what I expected, disappointed."
        ],
        'sentiment': ['positive', 'negative', 'positive', 'negative']
    })
    st.dataframe(example_df, use_container_width=True)

# Footer
st.markdown("---")
st.markdown("Built with ❤️ using Streamlit | No Code Text Classification App")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import numpy as np
5
+ from NoCodeTextClassifier.EDA import Informations, Visualizations
6
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
7
+ from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
8
+ from NoCodeTextClassifier.models import Models
 
 
 
 
 
 
 
 
 
9
  import os
10
  import pickle
11
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ # Utility functions
14
def save_artifacts(obj, folder_name, file_name):
    """Save artifacts like encoders and vectorizers.

    Args:
        obj: Picklable object to store.
        folder_name: Directory to write into; created if it does not exist.
            An empty string means the current working directory.
        file_name: Name of the pickle file inside *folder_name*.
    """
    # os.makedirs("") raises FileNotFoundError, so the directory is only
    # created when a folder name was actually given.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)
    with open(os.path.join(folder_name, file_name), 'wb') as f:
        pickle.dump(obj, f)
19
 
20
def load_artifacts(folder_name, file_name):
    """Load saved artifacts.

    Args:
        folder_name: Directory containing the pickle file.
        file_name: Name of the pickle file to load.

    Returns:
        The unpickled object, or ``None`` after reporting the problem with
        ``st.error`` when the file is missing or cannot be unpickled.
    """
    try:
        with open(os.path.join(folder_name, file_name), 'rb') as f:
            # NOTE(security): unpickling can execute arbitrary code; only
            # load files that this application wrote itself.
            return pickle.load(f)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
        return None
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError) as e:
        # A truncated or incompatible pickle is reported the same way as a
        # missing file, so callers only need to check for None.
        st.error(f"File {file_name} in {folder_name} folder could not be loaded: {str(e)}")
        return None
28
 
29
def load_model(model_name):
    """Load trained model.

    Args:
        model_name: File name of the pickled model inside the ``models``
            directory (relative to the current working directory).

    Returns:
        The unpickled model, or ``None`` after reporting the problem with
        ``st.error`` when the file is missing or cannot be unpickled.
    """
    try:
        with open(os.path.join('models', model_name), 'rb') as f:
            # NOTE(security): unpickling can execute arbitrary code; only
            # load model files that this application wrote itself.
            return pickle.load(f)
    except FileNotFoundError:
        st.error(f"Model {model_name} not found. Please train a model first.")
        return None
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError) as e:
        # A truncated or incompatible pickle is reported rather than raised,
        # so callers only need to check for None.
        st.error(f"Model {model_name} could not be loaded: {str(e)}")
        return None
 
 
 
37
 
38
def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Make prediction on new text.

    Loads the pickled model, the ``<vectorizer_type>_vectorizer.pkl``
    vectorizer and ``encoder.pkl`` from disk, cleans and vectorizes *text*
    and decodes the predicted class.

    Args:
        model_name: File name of the pickled model in the ``models`` folder.
        text: Raw text to classify.
        vectorizer_type: Prefix of the vectorizer artifact to load.

    Returns:
        ``(predicted_label, prediction_proba)``. ``prediction_proba`` is
        ``None`` when the model offers no usable ``predict_proba``. Both
        values are ``None`` when an artifact is missing or prediction fails;
        the failure is reported with ``st.error``.
    """
    try:
        # Load model
        model = load_model(model_name)
        if model is None:
            return None, None

        # Load vectorizer
        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_file)
        if vectorizer is None:
            return None, None

        # Load label encoder
        encoder = load_artifacts("artifacts", "encoder.pkl")
        if encoder is None:
            return None, None

        # Clean and vectorize text
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)

        # Transform text using the same vectorizer used during training
        text_vector = vectorizer.transform([clean_text])

        # Make prediction. Some estimators (for example Gaussian NB) reject
        # sparse input with a TypeError; retry once with a dense array.
        try:
            prediction = model.predict(text_vector)
        except TypeError:
            if not hasattr(text_vector, 'toarray'):
                raise
            text_vector = text_vector.toarray()
            prediction = model.predict(text_vector)

        # Get prediction probabilities if available
        prediction_proba = None
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                # Probabilities are optional; the label is still returned.
                prediction_proba = None

        # Decode prediction
        predicted_label = encoder.inverse_transform(prediction)[0]

        return predicted_label, prediction_proba

    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
83
 
84
# Streamlit App
# Top-level script: the sidebar selects one of three sections (data analysis,
# model training, predictions) that operate on an uploaded CSV.
st.title('No Code Text Classification App')
st.write('Understand the behavior of your text data and train a model to classify the text data')

# Sidebar
section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])

# Upload Data
st.sidebar.subheader("Upload Your Dataset")
train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])

# Global variables to store data and settings
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"

# Defined up front so the sections below can test them even when no file has
# been uploaded.
train_df = None
info = None

if train_data is not None:
    try:
        train_df = pd.read_csv(train_data, encoding='latin1')

        if test_data is not None:
            test_df = pd.read_csv(test_data, encoding='latin1')
        else:
            test_df = None

        st.write("Training Data Preview:")
        st.write(train_df.head(3))

        columns = train_df.columns.tolist()
        text_data = st.sidebar.selectbox("Choose the text column:", columns)
        target = st.sidebar.selectbox("Choose the target column:", columns)

        # Process data
        info = Informations(train_df, text_data, target)
        train_df['clean_text'] = info.clean_text()
        train_df['text_length'] = info.text_length()

        # Handle label encoding manually if the class doesn't store encoder
        from sklearn.preprocessing import LabelEncoder
        label_encoder = LabelEncoder()
        train_df['target'] = label_encoder.fit_transform(train_df[target])

        # Save label encoder for later use (save_artifacts creates the folder)
        save_artifacts(label_encoder, "artifacts", "encoder.pkl")

    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        train_df = None
        info = None

# Data Analysis Section
if section == "Data Analysis":
    if train_data is not None and train_df is not None:
        try:
            st.subheader("Get Insights from the Data")

            st.write("Data Shape:", info.shape())
            st.write("Class Imbalance:", info.class_imbalanced())
            st.write("Missing Values:", info.missing_values())

            st.write("Processed Data Preview:")
            st.write(train_df[['clean_text', 'text_length', 'target']].head(3))

            st.markdown("**Text Length Analysis**")
            st.write(info.analysis_text_length('text_length'))

            # Calculate correlation manually since we handled encoding separately
            correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
            st.write(f"Correlation between Text Length and Target: {correlation:.4f}")

            st.subheader("Visualizations")
            vis = Visualizations(train_df, text_data, target)
            vis.class_distribution()
            vis.text_length_distribution()

        except Exception as e:
            st.error(f"Error in data analysis: {str(e)}")
    else:
        st.warning("Please upload training data to get insights")

# Train Model Section
elif section == "Train Model":
    if train_data is not None and train_df is not None:
        try:
            st.subheader("Train a Model")

            # Create two columns for model selection
            col1, col2 = st.columns(2)

            with col1:
                model = st.radio("Choose the Model", [
                    "Logistic Regression", "Decision Tree",
                    "Random Forest", "Linear SVC", "SVC",
                    "Multinomial Naive Bayes", "Gaussian Naive Bayes"
                ])

            with col2:
                vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])

            # Initialize vectorizer. The choice is kept in a local variable
            # here; session state and the pickled vectorizer are only updated
            # once a model has actually been trained with it (see below).
            if vectorizer_choice == "Tfidf Vectorizer":
                vectorizer = TfidfVectorizer(max_features=10000)
                chosen_vectorizer_type = "tfidf"
            else:
                vectorizer = CountVectorizer(max_features=10000)
                chosen_vectorizer_type = "count"

            st.write("Training Data Preview:")
            st.write(train_df[['clean_text', 'target']].head(3))

            # Vectorize text data
            X = vectorizer.fit_transform(train_df['clean_text'])
            y = train_df['target']

            # Split data
            X_train, X_test, y_train, y_test = process.split_data(X, y)
            st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")

            if st.button("Start Training"):
                with st.spinner("Training model..."):
                    models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

                    # Train selected model
                    trainers = {
                        "Logistic Regression": models.LogisticRegression,
                        "Decision Tree": models.DecisionTree,
                        "Linear SVC": models.LinearSVC,
                        "SVC": models.SVC,
                        "Multinomial Naive Bayes": models.MultinomialNB,
                        "Random Forest": models.RandomForestClassifier,
                        "Gaussian Naive Bayes": models.GaussianNB,
                    }
                    trainers[model]()

                # Save vectorizer for later use. Persisting it (and recording
                # its type) only after training keeps the artifacts on disk in
                # step with the model that was trained; merely toggling the
                # radio button no longer overwrites them.
                st.session_state.vectorizer_type = chosen_vectorizer_type
                vectorizer_filename = f"{chosen_vectorizer_type}_vectorizer.pkl"
                save_artifacts(vectorizer, "artifacts", vectorizer_filename)

                st.success("Model training completed!")
                st.info("You can now use the 'Predictions' section to classify new text.")

        except Exception as e:
            st.error(f"Error in model training: {str(e)}")
    else:
        st.warning("Please upload training data to train a model")

# Predictions Section
elif section == "Predictions":
    st.subheader("Perform Predictions on New Text")

    # Check if models exist
    if os.path.exists("models") and os.listdir("models"):
        # Text input for prediction
        text_input = st.text_area("Enter the text to classify:", height=100)

        # Model selection
        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]

        if available_models:
            selected_model = st.selectbox("Choose the trained model:", available_models)

            # Prediction button
            if st.button("Predict", key="single_predict"):
                if text_input.strip():
                    with st.spinner("Making prediction..."):
                        predicted_label, prediction_proba = predict_text(
                            selected_model,
                            text_input,
                            st.session_state.get('vectorizer_type', 'tfidf')
                        )

                    if predicted_label is not None:
                        st.success("Prediction completed!")

                        # Display results
                        st.markdown("### Prediction Results")
                        st.markdown(f"**Input Text:** {text_input}")
                        st.markdown(f"**Predicted Class:** {predicted_label}")

                        # Display probabilities if available
                        if prediction_proba is not None:
                            st.markdown("**Class Probabilities:**")

                            # Load encoder to get class names
                            encoder = load_artifacts("artifacts", "encoder.pkl")
                            if encoder is not None:
                                classes = encoder.classes_
                                prob_df = pd.DataFrame({
                                    'Class': classes,
                                    'Probability': prediction_proba
                                }).sort_values('Probability', ascending=False)

                                st.bar_chart(prob_df.set_index('Class'))
                                st.dataframe(prob_df)
                else:
                    st.warning("Please enter some text to classify")
        else:
            st.warning("No trained models found. Please train a model first.")
    else:
        st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")

    # Option to classify multiple texts
    st.markdown("---")
    st.subheader("Batch Predictions")

    uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])

    if uploaded_file is not None:
        try:
            batch_df = pd.read_csv(uploaded_file, encoding='latin1')
            st.write("Uploaded data preview:")
            st.write(batch_df.head())

            # Select text column
            text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())

            if os.path.exists("models") and os.listdir("models"):
                available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
                batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")

                if st.button("Run Batch Predictions", key="batch_predict"):
                    with st.spinner("Processing batch predictions..."):
                        predictions = []

                        for text in batch_df[text_column]:
                            pred, _ = predict_text(
                                batch_model,
                                str(text),
                                st.session_state.get('vectorizer_type', 'tfidf')
                            )
                            # Rows that could not be classified are marked
                            # instead of aborting the whole batch.
                            predictions.append(pred if pred is not None else "Error")

                        batch_df['Predicted_Class'] = predictions

                    st.success("Batch predictions completed!")
                    st.write("Results:")
                    st.write(batch_df[[text_column, 'Predicted_Class']])

                    # Download results
                    csv = batch_df.to_csv(index=False)
                    st.download_button(
                        label="Download predictions as CSV",
                        data=csv,
                        file_name="batch_predictions.csv",
                        mime="text/csv"
                    )
        except Exception as e:
            st.error(f"Error in batch prediction: {str(e)}")