Alamgirapi committed on
Commit 8d810b6 · verified · 1 Parent(s): cb7e73c

Update app.py

Files changed (1)
  1. app.py +645 -336
app.py CHANGED
@@ -1,336 +1,645 @@
- import streamlit as st
- import pandas as pd
- import matplotlib.pyplot as plt
- import numpy as np
- from NoCodeTextClassifier.EDA import Informations, Visualizations
- from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
- from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
- from NoCodeTextClassifier.models import Models
- import os
- import pickle
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
-
- # Utility functions
- def save_artifacts(obj, folder_name, file_name):
-     """Save artifacts like encoders and vectorizers"""
-     os.makedirs(folder_name, exist_ok=True)
-     with open(os.path.join(folder_name, file_name), 'wb') as f:
-         pickle.dump(obj, f)
-
- def load_artifacts(folder_name, file_name):
-     """Load saved artifacts"""
-     try:
-         with open(os.path.join(folder_name, file_name), 'rb') as f:
-             return pickle.load(f)
-     except FileNotFoundError:
-         st.error(f"File {file_name} not found in {folder_name} folder")
-         return None
-
- def load_model(model_name):
-     """Load trained model"""
-     try:
-         with open(os.path.join('models', model_name), 'rb') as f:
-             return pickle.load(f)
-     except FileNotFoundError:
-         st.error(f"Model {model_name} not found. Please train a model first.")
-         return None
-
- def predict_text(model_name, text, vectorizer_type="tfidf"):
-     """Make prediction on new text"""
-     try:
-         # Load model
-         model = load_model(model_name)
-         if model is None:
-             return None, None
-
-         # Load vectorizer
-         vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
-         vectorizer = load_artifacts("artifacts", vectorizer_file)
-         if vectorizer is None:
-             return None, None
-
-         # Load label encoder
-         encoder = load_artifacts("artifacts", "encoder.pkl")
-         if encoder is None:
-             return None, None
-
-         # Clean and vectorize text
-         text_cleaner = TextCleaner()
-         clean_text = text_cleaner.clean_text(text)
-
-         # Transform text using the same vectorizer used during training
-         text_vector = vectorizer.transform([clean_text])
-
-         # Make prediction
-         prediction = model.predict(text_vector)
-         prediction_proba = None
-
-         # Get prediction probabilities if available
-         if hasattr(model, 'predict_proba'):
-             try:
-                 prediction_proba = model.predict_proba(text_vector)[0]
-             except:
-                 pass
-
-         # Decode prediction
-         predicted_label = encoder.inverse_transform(prediction)[0]
-
-         return predicted_label, prediction_proba
-
-     except Exception as e:
-         st.error(f"Error during prediction: {str(e)}")
-         return None, None
-
- # Streamlit App
- st.title('No Code Text Classification App')
- st.write('Understand the behavior of your text data and train a model to classify the text data')
-
- # Sidebar
- section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
-
- # Upload Data
- st.sidebar.subheader("Upload Your Dataset")
- train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
- test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
-
- # Global variables to store data and settings
- if 'vectorizer_type' not in st.session_state:
-     st.session_state.vectorizer_type = "tfidf"
-
- if train_data is not None:
-     try:
-         train_df = pd.read_csv(train_data, encoding='latin1')
-
-         if test_data is not None:
-             test_df = pd.read_csv(test_data, encoding='latin1')
-         else:
-             test_df = None
-
-         st.write("Training Data Preview:")
-         st.write(train_df.head(3))
-
-         columns = train_df.columns.tolist()
-         text_data = st.sidebar.selectbox("Choose the text column:", columns)
-         target = st.sidebar.selectbox("Choose the target column:", columns)
-
-         # Process data
-         info = Informations(train_df, text_data, target)
-         train_df['clean_text'] = info.clean_text()
-         train_df['text_length'] = info.text_length()
-
-         # Handle label encoding manually if the class doesn't store encoder
-         from sklearn.preprocessing import LabelEncoder
-         label_encoder = LabelEncoder()
-         train_df['target'] = label_encoder.fit_transform(train_df[target])
-
-         # Save label encoder for later use
-         os.makedirs("artifacts", exist_ok=True)
-         save_artifacts(label_encoder, "artifacts", "encoder.pkl")
-
-     except Exception as e:
-         st.error(f"Error loading data: {str(e)}")
-         train_df = None
-         info = None
-
- # Data Analysis Section
- if section == "Data Analysis":
-     if train_data is not None and train_df is not None:
-         try:
-             st.subheader("Get Insights from the Data")
-
-             st.write("Data Shape:", info.shape())
-             st.write("Class Imbalance:", info.class_imbalanced())
-             st.write("Missing Values:", info.missing_values())
-
-             st.write("Processed Data Preview:")
-             st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
-
-             st.markdown("**Text Length Analysis**")
-             st.write(info.analysis_text_length('text_length'))
-
-             # Calculate correlation manually since we handled encoding separately
-             correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
-             st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
-
-             st.subheader("Visualizations")
-             vis = Visualizations(train_df, text_data, target)
-             vis.class_distribution()
-             vis.text_length_distribution()
-
-         except Exception as e:
-             st.error(f"Error in data analysis: {str(e)}")
-     else:
-         st.warning("Please upload training data to get insights")
-
- # Train Model Section
- elif section == "Train Model":
-     if train_data is not None and train_df is not None:
-         try:
-             st.subheader("Train a Model")
-
-             # Create two columns for model selection
-             col1, col2 = st.columns(2)
-
-             with col1:
-                 model = st.radio("Choose the Model", [
-                     "Logistic Regression", "Decision Tree",
-                     "Random Forest", "Linear SVC", "SVC",
-                     "Multinomial Naive Bayes", "Gaussian Naive Bayes"
-                 ])
-
-             with col2:
-                 vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
-
-             # Initialize vectorizer
-             if vectorizer_choice == "Tfidf Vectorizer":
-                 vectorizer = TfidfVectorizer(max_features=10000)
-                 st.session_state.vectorizer_type = "tfidf"
-             else:
-                 vectorizer = CountVectorizer(max_features=10000)
-                 st.session_state.vectorizer_type = "count"
-
-             st.write("Training Data Preview:")
-             st.write(train_df[['clean_text', 'target']].head(3))
-
-             # Vectorize text data
-             X = vectorizer.fit_transform(train_df['clean_text'])
-             y = train_df['target']
-
-             # Split data
-             X_train, X_test, y_train, y_test = process.split_data(X, y)
-             st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
-
-             # Save vectorizer for later use
-             vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
-             save_artifacts(vectorizer, "artifacts", vectorizer_filename)
-
-             if st.button("Start Training"):
-                 with st.spinner("Training model..."):
-                     models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
-
-                     # Train selected model
-                     if model == "Logistic Regression":
-                         models.LogisticRegression()
-                     elif model == "Decision Tree":
-                         models.DecisionTree()
-                     elif model == "Linear SVC":
-                         models.LinearSVC()
-                     elif model == "SVC":
-                         models.SVC()
-                     elif model == "Multinomial Naive Bayes":
-                         models.MultinomialNB()
-                     elif model == "Random Forest":
-                         models.RandomForestClassifier()
-                     elif model == "Gaussian Naive Bayes":
-                         models.GaussianNB()
-
-                     st.success("Model training completed!")
-                     st.info("You can now use the 'Predictions' section to classify new text.")
-
-         except Exception as e:
-             st.error(f"Error in model training: {str(e)}")
-     else:
-         st.warning("Please upload training data to train a model")
-
- # Predictions Section
- elif section == "Predictions":
-     st.subheader("Perform Predictions on New Text")
-
-     # Check if models exist
-     if os.path.exists("models") and os.listdir("models"):
-         # Text input for prediction
-         text_input = st.text_area("Enter the text to classify:", height=100)
-
-         # Model selection
-         available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
-
-         if available_models:
-             selected_model = st.selectbox("Choose the trained model:", available_models)
-
-             # Prediction button
-             if st.button("Predict", key="single_predict"):
-                 if text_input.strip():
-                     with st.spinner("Making prediction..."):
-                         predicted_label, prediction_proba = predict_text(
-                             selected_model,
-                             text_input,
-                             st.session_state.get('vectorizer_type', 'tfidf')
-                         )
-
-                     if predicted_label is not None:
-                         st.success("Prediction completed!")
-
-                         # Display results
-                         st.markdown("### Prediction Results")
-                         st.markdown(f"**Input Text:** {text_input}")
-                         st.markdown(f"**Predicted Class:** {predicted_label}")
-
-                         # Display probabilities if available
-                         if prediction_proba is not None:
-                             st.markdown("**Class Probabilities:**")
-
-                             # Load encoder to get class names
-                             encoder = load_artifacts("artifacts", "encoder.pkl")
-                             if encoder is not None:
-                                 classes = encoder.classes_
-                                 prob_df = pd.DataFrame({
-                                     'Class': classes,
-                                     'Probability': prediction_proba
-                                 }).sort_values('Probability', ascending=False)
-
-                                 st.bar_chart(prob_df.set_index('Class'))
-                                 st.dataframe(prob_df)
-                 else:
-                     st.warning("Please enter some text to classify")
-         else:
-             st.warning("No trained models found. Please train a model first.")
-     else:
-         st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")
-
-     # Option to classify multiple texts
-     st.markdown("---")
-     st.subheader("Batch Predictions")
-
-     uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
-
-     if uploaded_file is not None:
-         try:
-             batch_df = pd.read_csv(uploaded_file, encoding='latin1')
-             st.write("Uploaded data preview:")
-             st.write(batch_df.head())
-
-             # Select text column
-             text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
-
-             if os.path.exists("models") and os.listdir("models"):
-                 available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
-                 batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
-
-                 if st.button("Run Batch Predictions", key="batch_predict"):
-                     with st.spinner("Processing batch predictions..."):
-                         predictions = []
-
-                         for text in batch_df[text_column]:
-                             pred, _ = predict_text(
-                                 batch_model,
-                                 str(text),
-                                 st.session_state.get('vectorizer_type', 'tfidf')
-                             )
-                             predictions.append(pred if pred is not None else "Error")
-
-                         batch_df['Predicted_Class'] = predictions
-
-                         st.success("Batch predictions completed!")
-                         st.write("Results:")
-                         st.write(batch_df[[text_column, 'Predicted_Class']])
-
-                         # Download results
-                         csv = batch_df.to_csv(index=False)
-                         st.download_button(
-                             label="Download predictions as CSV",
-                             data=csv,
-                             file_name="batch_predictions.csv",
-                             mime="text/csv"
-                         )
-         except Exception as e:
-             st.error(f"Error in batch prediction: {str(e)}")
+ import streamlit as st
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import seaborn as sns
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import LabelEncoder
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.svm import LinearSVC, SVC
+ from sklearn.naive_bayes import MultinomialNB, GaussianNB
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+ import os
+ import pickle
+ import re
+ import string
+ from collections import Counter
+ import plotly.express as px
+ import plotly.graph_objects as go
+
+ # Configure Streamlit page
+ st.set_page_config(
+     page_title="Text Classification App",
+     page_icon="📝",
+     layout="wide"
+ )
+
+ # Text preprocessing class
+ class TextCleaner:
+     def __init__(self):
+         self.stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
+
+     def clean_text(self, text):
+         """Clean and preprocess text"""
+         if pd.isna(text):
+             return ""
+
+         text = str(text).lower()
+         text = re.sub(r'http\S+', '', text)  # Remove URLs
+         text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
+         text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
+         text = text.strip()
+
+         # Remove stop words (optional)
+         words = text.split()
+         words = [word for word in words if word not in self.stop_words]
+
+         return ' '.join(words)
+
+ # Data analysis functions
+ def get_data_insights(df, text_col, target_col):
+     """Get basic insights from the dataset"""
+     insights = {
+         'shape': df.shape,
+         'missing_values': df.isnull().sum().to_dict(),
+         'class_distribution': df[target_col].value_counts().to_dict(),
+         'text_length_stats': {
+             'mean': df[text_col].str.len().mean(),
+             'median': df[text_col].str.len().median(),
+             'min': df[text_col].str.len().min(),
+             'max': df[text_col].str.len().max()
+         }
+     }
+     return insights
+
+ # Model training functions
+ def train_model(model_name, X_train, X_test, y_train, y_test):
+     """Train and evaluate a model"""
+     models = {
+         'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
+         'Decision Tree': DecisionTreeClassifier(random_state=42),
+         'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
+         'Linear SVC': LinearSVC(random_state=42, max_iter=1000),
+         'SVC': SVC(random_state=42, probability=True),
+         'Multinomial Naive Bayes': MultinomialNB(),
+         'Gaussian Naive Bayes': GaussianNB()
+     }
+
+     model = models[model_name]
+
+     # For Gaussian NB, convert sparse matrix to dense
+     if model_name == 'Gaussian Naive Bayes':
+         X_train = X_train.toarray()
+         X_test = X_test.toarray()
+
+     # Train model
+     model.fit(X_train, y_train)
+
+     # Make predictions
+     y_pred = model.predict(X_test)
+
+     # Calculate metrics
+     accuracy = accuracy_score(y_test, y_pred)
+
+     # Save model
+     os.makedirs("models", exist_ok=True)
+     model_filename = f"{model_name.replace(' ', '_').lower()}.pkl"
+     with open(os.path.join("models", model_filename), 'wb') as f:
+         pickle.dump(model, f)
+
+     return model, accuracy, y_pred, model_filename
+
+ # Utility functions
+ def save_artifacts(obj, folder_name, file_name):
+     """Save artifacts like encoders and vectorizers"""
+     os.makedirs(folder_name, exist_ok=True)
+     with open(os.path.join(folder_name, file_name), 'wb') as f:
+         pickle.dump(obj, f)
+
+ def load_artifacts(folder_name, file_name):
+     """Load saved artifacts"""
+     try:
+         with open(os.path.join(folder_name, file_name), 'rb') as f:
+             return pickle.load(f)
+     except FileNotFoundError:
+         st.error(f"File {file_name} not found in {folder_name} folder")
+         return None
+
+ def predict_text(model_filename, text, vectorizer_type="tfidf"):
+     """Make prediction on new text"""
+     try:
+         # Load model
+         with open(os.path.join('models', model_filename), 'rb') as f:
+             model = pickle.load(f)
+
+         # Load vectorizer
+         vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
+         vectorizer = load_artifacts("artifacts", vectorizer_file)
+         if vectorizer is None:
+             return None, None
+
+         # Load label encoder
+         encoder = load_artifacts("artifacts", "encoder.pkl")
+         if encoder is None:
+             return None, None
+
+         # Clean and vectorize text
+         text_cleaner = TextCleaner()
+         clean_text = text_cleaner.clean_text(text)
+
+         # Transform text
+         text_vector = vectorizer.transform([clean_text])
+
+         # For Gaussian NB, convert to dense
+         if 'gaussian' in model_filename:
+             text_vector = text_vector.toarray()
+
+         # Make prediction
+         prediction = model.predict(text_vector)
+         prediction_proba = None
+
+         # Get prediction probabilities if available
+         if hasattr(model, 'predict_proba'):
+             try:
+                 prediction_proba = model.predict_proba(text_vector)[0]
+             except Exception:
+                 pass
+
+         # Decode prediction
+         predicted_label = encoder.inverse_transform(prediction)[0]
+
+         return predicted_label, prediction_proba
+
+     except Exception as e:
+         st.error(f"Error during prediction: {str(e)}")
+         return None, None
+
+ # Streamlit App
+ st.title('📝 No Code Text Classification App')
+ st.markdown('---')
+ st.write('Analyze your text data and train machine learning models without coding!')
+
+ # Sidebar
+ st.sidebar.title("Navigation")
+ section = st.sidebar.radio("Choose Section", ["📊 Data Analysis", "🤖 Train Model", "🔮 Predictions"])
+
+ # Upload Data
+ st.sidebar.markdown("---")
+ st.sidebar.subheader("📁 Upload Your Dataset")
+ train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
+ test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
+
+ # Global variables to store data and settings
+ if 'vectorizer_type' not in st.session_state:
+     st.session_state.vectorizer_type = "tfidf"
+
+ if train_data is not None:
+     try:
+         # Try different encodings
+         encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
+         train_df = None
+
+         for encoding in encodings:
+             try:
+                 train_data.seek(0)  # Rewind the upload buffer before each retry
+                 train_df = pd.read_csv(train_data, encoding=encoding)
+                 break
+             except UnicodeDecodeError:
+                 continue
+
+         if train_df is None:
+             st.error("Unable to read the CSV file. Please check the file encoding.")
+         else:
+             if test_data is not None:
+                 test_df = None  # Ensure defined even if every encoding fails
+                 for encoding in encodings:
+                     try:
+                         test_data.seek(0)  # Rewind the upload buffer before each retry
+                         test_df = pd.read_csv(test_data, encoding=encoding)
+                         break
+                     except UnicodeDecodeError:
+                         continue
+             else:
+                 test_df = None
+
+             # Show data preview
+             with st.sidebar.expander("📋 Data Preview", expanded=True):
+                 st.write("Shape:", train_df.shape)
+                 st.write(train_df.head(2))
+
+             columns = train_df.columns.tolist()
+             text_data = st.sidebar.selectbox("📝 Choose the text column:", columns)
+             target = st.sidebar.selectbox("🎯 Choose the target column:", columns)
+
+             # Process data
+             if text_data and target:
+                 # Clean text
+                 text_cleaner = TextCleaner()
+                 train_df['clean_text'] = train_df[text_data].apply(text_cleaner.clean_text)
+                 train_df['text_length'] = train_df[text_data].str.len()
+
+                 # Handle label encoding
+                 label_encoder = LabelEncoder()
+                 train_df['target_encoded'] = label_encoder.fit_transform(train_df[target])
+
+                 # Save label encoder
+                 save_artifacts(label_encoder, "artifacts", "encoder.pkl")
+
+     except Exception as e:
+         st.error(f"Error loading data: {str(e)}")
+         train_df = None
+
+ # Data Analysis Section
+ if section == "📊 Data Analysis":
+     if train_data is not None and 'train_df' in locals() and train_df is not None:
+         st.header("📊 Data Analysis")
+
+         # Get insights
+         insights = get_data_insights(train_df, text_data, target)
+
+         # Display insights in columns
+         col1, col2, col3, col4 = st.columns(4)
+
+         with col1:
+             st.metric("Total Samples", insights['shape'][0])
+
+         with col2:
+             st.metric("Features", insights['shape'][1])
+
+         with col3:
+             st.metric("Classes", len(insights['class_distribution']))
+
+         with col4:
+             st.metric("Avg Text Length", f"{insights['text_length_stats']['mean']:.1f}")
+
+         st.markdown("---")
+
+         # Data quality section
+         col1, col2 = st.columns(2)
+
+         with col1:
+             st.subheader("📋 Dataset Overview")
+             st.write("**Shape:**", insights['shape'])
+             st.write("**Missing Values:**")
+             missing_df = pd.DataFrame.from_dict(insights['missing_values'], orient='index', columns=['Count'])
+             st.dataframe(missing_df[missing_df['Count'] > 0])
+
+             st.write("**Sample Data:**")
+             st.dataframe(train_df[[text_data, target, 'text_length']].head())
+
+         with col2:
+             st.subheader("📊 Class Distribution")
+             class_dist = pd.DataFrame.from_dict(insights['class_distribution'], orient='index', columns=['Count'])
+             st.dataframe(class_dist)
+
+             # Plot class distribution
+             fig = px.bar(
+                 x=class_dist.index,
+                 y=class_dist['Count'],
+                 title="Class Distribution",
+                 labels={'x': 'Class', 'y': 'Count'}
+             )
+             st.plotly_chart(fig, use_container_width=True)
+
+         st.markdown("---")
+
+         # Text analysis section
+         st.subheader("📝 Text Analysis")
+
+         col1, col2 = st.columns(2)
+
+         with col1:
+             # Text length distribution
+             fig = px.histogram(
+                 train_df,
+                 x='text_length',
+                 title="Text Length Distribution",
+                 nbins=30
+             )
+             st.plotly_chart(fig, use_container_width=True)
+
+         with col2:
+             # Text length by class
+             fig = px.box(
+                 train_df,
+                 x=target,
+                 y='text_length',
+                 title="Text Length by Class"
+             )
+             st.plotly_chart(fig, use_container_width=True)
+
+         # Word frequency analysis
+         st.subheader("🔤 Most Common Words")
+         all_text = ' '.join(train_df['clean_text'].astype(str))
+         word_freq = Counter(all_text.split())
+         top_words = word_freq.most_common(20)
+
+         if top_words:
+             words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
+             fig = px.bar(
+                 words_df,
+                 x='Frequency',
+                 y='Word',
+                 orientation='h',
+                 title="Top 20 Most Common Words"
+             )
+             fig.update_layout(yaxis={'categoryorder': 'total ascending'})
+             st.plotly_chart(fig, use_container_width=True)
+
+     else:
+         st.warning("📁 Please upload training data to perform analysis")
+
+ # Train Model Section
+ elif section == "🤖 Train Model":
+     if train_data is not None and 'train_df' in locals() and train_df is not None:
+         st.header("🤖 Train Machine Learning Model")
+
+         col1, col2 = st.columns(2)
+
+         with col1:
+             st.subheader("⚙️ Model Configuration")
+             model_name = st.selectbox("Choose Model", [
+                 "Logistic Regression", "Decision Tree",
+                 "Random Forest", "Linear SVC", "SVC",
+                 "Multinomial Naive Bayes", "Gaussian Naive Bayes"
+             ])
+
+         with col2:
+             st.subheader("📊 Vectorization Method")
+             vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count Vectorizer"])
+
+         # Model parameters
+         st.subheader("🔧 Parameters")
+         col1, col2 = st.columns(2)
+
+         with col1:
+             max_features = st.slider("Max Features", 1000, 20000, 10000, step=1000)
+             test_size = st.slider("Test Size", 0.1, 0.4, 0.2, step=0.05)
+
+         with col2:
+             random_state = st.number_input("Random State", 0, 1000, 42)
+             min_df = st.slider("Min Document Frequency", 1, 10, 1)
+
+         # Initialize vectorizer
+         if vectorizer_choice == "TF-IDF":
+             vectorizer = TfidfVectorizer(
+                 max_features=max_features,
+                 min_df=min_df,
+                 stop_words='english'
+             )
+             st.session_state.vectorizer_type = "tfidf"
+         else:
+             vectorizer = CountVectorizer(
+                 max_features=max_features,
+                 min_df=min_df,
+                 stop_words='english'
+             )
+             st.session_state.vectorizer_type = "count"
+
+         # Show data info
+         st.subheader("📋 Training Data Info")
+         col1, col2, col3 = st.columns(3)
+
+         with col1:
+             st.metric("Total Samples", len(train_df))
+
+         with col2:
+             st.metric("Unique Classes", train_df[target].nunique())
+
+         with col3:
+             st.metric("Avg Text Length", f"{train_df['text_length'].mean():.1f}")
+
+         if st.button("🚀 Start Training", type="primary"):
+             with st.spinner("Training model... This may take a few minutes."):
+                 try:
+                     # Vectorize text data
+                     X = vectorizer.fit_transform(train_df['clean_text'])
+                     y = train_df['target_encoded']
+
+                     # Split data
+                     X_train, X_test, y_train, y_test = train_test_split(
+                         X, y,
+                         test_size=test_size,
+                         random_state=random_state,
+                         stratify=y
+                     )
+
+                     st.success(f"✅ Data split - Train: {X_train.shape}, Test: {X_test.shape}")
+
+                     # Save vectorizer
+                     vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
+                     save_artifacts(vectorizer, "artifacts", vectorizer_filename)
+
+                     # Train model
+                     model, accuracy, y_pred, model_filename = train_model(
+                         model_name, X_train, X_test, y_train, y_test
+                     )
+
+                     st.success("🎉 Model training completed!")
+
+                     # Display results
+                     col1, col2 = st.columns(2)
+
+                     with col1:
+                         st.metric("🎯 Test Accuracy", f"{accuracy:.4f}")
+
+                         # Classification report
+                         st.subheader("📊 Classification Report")
+                         report = classification_report(
+                             y_test, y_pred,
+                             target_names=label_encoder.classes_,
+                             output_dict=True
+                         )
+                         report_df = pd.DataFrame(report).transpose()
+                         st.dataframe(report_df.round(4))
+
+                     with col2:
+                         # Confusion matrix
+                         st.subheader("🔄 Confusion Matrix")
+                         cm = confusion_matrix(y_test, y_pred)
+                         fig = px.imshow(
+                             cm,
+                             text_auto=True,
+                             aspect="auto",
+                             title="Confusion Matrix",
+                             labels=dict(x="Predicted", y="Actual"),
+                             x=label_encoder.classes_,
+                             y=label_encoder.classes_
+                         )
+                         st.plotly_chart(fig, use_container_width=True)
+
+                     st.info(f"✅ Model saved as: {model_filename}")
+                     st.info("🔮 You can now use the 'Predictions' section to classify new text!")
+
+                 except Exception as e:
+                     st.error(f"❌ Error during training: {str(e)}")
+
+     else:
+         st.warning("📁 Please upload training data to train a model")
+
+ # Predictions Section
+ elif section == "🔮 Predictions":
+     st.header("🔮 Text Classification Predictions")
+
+     # Check if models exist
+     if os.path.exists("models") and os.listdir("models"):
+         available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
+
+         if available_models:
+             # Single prediction
+             st.subheader("📝 Single Text Classification")
+
+             col1, col2 = st.columns([2, 1])
+
+             with col1:
+                 text_input = st.text_area("Enter text to classify:", height=150)
+
+             with col2:
+                 selected_model = st.selectbox("Choose model:", available_models)
+                 predict_button = st.button("🔮 Predict", type="primary")
+
+             if predict_button and text_input.strip():
+                 with st.spinner("Making prediction..."):
+                     predicted_label, prediction_proba = predict_text(
+                         selected_model,
+                         text_input,
+                         st.session_state.get('vectorizer_type', 'tfidf')
+                     )
+
+                 if predicted_label is not None:
+                     st.success("✅ Prediction completed!")
+
+                     # Display results
+                     col1, col2 = st.columns(2)
+
+                     with col1:
+                         st.markdown("### 🎯 Results")
+                         st.markdown(f"**Input Text:** {text_input[:200]}{'...' if len(text_input) > 200 else ''}")
+                         st.markdown(f"**Predicted Class:** `{predicted_label}`")
+
+                     with col2:
+                         # Display probabilities if available
+                         if prediction_proba is not None:
+                             st.markdown("### 📊 Class Probabilities")
+
+                             encoder = load_artifacts("artifacts", "encoder.pkl")
+                             if encoder is not None:
+                                 prob_df = pd.DataFrame({
+                                     'Class': encoder.classes_,
+                                     'Probability': prediction_proba
+                                 }).sort_values('Probability', ascending=False)
+
+                                 fig = px.bar(
+                                     prob_df,
+                                     x='Probability',
+                                     y='Class',
+                                     orientation='h',
+                                     title="Prediction Confidence"
+                                 )
+                                 fig.update_layout(yaxis={'categoryorder': 'total ascending'})
+                                 st.plotly_chart(fig, use_container_width=True)
+
+             elif predict_button:
+                 st.warning("⚠️ Please enter some text to classify")
+
+             # Batch predictions
+             st.markdown("---")
+             st.subheader("📊 Batch Predictions")
+
+             uploaded_file = st.file_uploader("Upload CSV file with texts to classify", type=['csv'])
+
+             if uploaded_file is not None:
+                 try:
+                     # Try different encodings for batch file
+                     encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
+                     batch_df = None
+
+                     for encoding in encodings:
+                         try:
+                             uploaded_file.seek(0)  # Rewind the upload buffer before each retry
+                             batch_df = pd.read_csv(uploaded_file, encoding=encoding)
+                             break
+                         except UnicodeDecodeError:
+                             continue
+
+                     if batch_df is not None:
+                         st.write("📋 Uploaded data preview:")
+                         st.dataframe(batch_df.head())
+
+                         col1, col2 = st.columns(2)
+
+                         with col1:
+                             text_column = st.selectbox("Select text column:", batch_df.columns.tolist())
+
+                         with col2:
+                             batch_model = st.selectbox("Choose model:", available_models, key="batch_model")
+
+                         if st.button("🚀 Run Batch Predictions", type="primary"):
+                             with st.spinner("Processing batch predictions..."):
+                                 predictions = []
+                                 confidences = []
+
+                                 progress_bar = st.progress(0)
+                                 total_texts = len(batch_df)
+
+                                 for i, text in enumerate(batch_df[text_column]):
+                                     pred, proba = predict_text(
+                                         batch_model,
+                                         str(text),
+                                         st.session_state.get('vectorizer_type', 'tfidf')
+                                     )
+                                     predictions.append(pred if pred is not None else "Error")
+
+                                     # Get confidence (max probability)
+                                     if proba is not None:
+                                         confidences.append(max(proba))
+                                     else:
+                                         confidences.append(0.0)
+
+                                     progress_bar.progress((i + 1) / total_texts)
+
+                                 batch_df['Predicted_Class'] = predictions
+                                 batch_df['Confidence'] = confidences
+
+                                 st.success("✅ Batch predictions completed!")
+
+                                 # Show results
+                                 st.subheader("📊 Results")
+                                 result_df = batch_df[[text_column, 'Predicted_Class', 'Confidence']]
+                                 st.dataframe(result_df)
+
+                                 # Summary statistics
+                                 st.subheader("📈 Summary")
+                                 col1, col2, col3 = st.columns(3)
+
+                                 with col1:
+                                     st.metric("Total Predictions", len(predictions))
+
+                                 with col2:
+                                     successful_preds = sum(1 for p in predictions if p != "Error")
+                                     st.metric("Successful", successful_preds)
+
+                                 with col3:
+                                     avg_confidence = sum(confidences) / len(confidences) if confidences else 0
+                                     st.metric("Avg Confidence", f"{avg_confidence:.3f}")
+
+                                 # Class distribution of predictions
+                                 pred_counts = pd.Series(predictions).value_counts()
+                                 if len(pred_counts) > 0:
+                                     fig = px.pie(
+                                         values=pred_counts.values,
+                                         names=pred_counts.index,
+                                         title="Distribution of Predictions"
+                                     )
+                                     st.plotly_chart(fig, use_container_width=True)
+
+                                 # Download results
+                                 csv = batch_df.to_csv(index=False)
+                                 st.download_button(
+                                     label="📥 Download Results as CSV",
+                                     data=csv,
+                                     file_name="batch_predictions.csv",
+                                     mime="text/csv"
+                                 )
+                     else:
+                         st.error("❌ Unable to read the CSV file. Please check the file encoding.")
+
+                 except Exception as e:
+                     st.error(f"❌ Error in batch prediction: {str(e)}")
+         else:
+             st.warning("⚠️ No trained models found. Please train a model first.")
+     else:
+         st.warning("⚠️ No models directory found. Please go to 'Train Model' section to train a model first.")
+
+ # Footer
+ st.markdown("---")
+ st.markdown("🚀 Built with Streamlit | 📊 No-Code Text Classification")