Alamgirapi committed · verified
Commit c9acfc2 · Parent: 7a3dbca

Upload app.py

Files changed (1):
  app.py  +336 -0
app.py ADDED
@@ -0,0 +1,336 @@
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from NoCodeTextClassifier.EDA import Informations, Visualizations
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
from NoCodeTextClassifier.models import Models
import os
import pickle
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Utility functions
def save_artifacts(obj, folder_name, file_name):
    """Save artifacts like encoders and vectorizers"""
    os.makedirs(folder_name, exist_ok=True)
    with open(os.path.join(folder_name, file_name), 'wb') as f:
        pickle.dump(obj, f)

def load_artifacts(folder_name, file_name):
    """Load saved artifacts"""
    try:
        with open(os.path.join(folder_name, file_name), 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
        return None

def load_model(model_name):
    """Load trained model"""
    try:
        with open(os.path.join('models', model_name), 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        st.error(f"Model {model_name} not found. Please train a model first.")
        return None

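# Assumed on-disk layout, inferred from the calls in this file (NoCodeTextClassifier
# itself may write more):
#   artifacts/encoder.pkl                        - fitted LabelEncoder for the target column
#   artifacts/tfidf_vectorizer.pkl (or count_vectorizer.pkl)
#   models/<model_name>.pkl                      - presumably written by the Models class during training
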
def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Make a prediction on new text"""
    try:
        # Load model
        model = load_model(model_name)
        if model is None:
            return None, None

        # Load vectorizer
        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_file)
        if vectorizer is None:
            return None, None

        # Load label encoder
        encoder = load_artifacts("artifacts", "encoder.pkl")
        if encoder is None:
            return None, None

        # Clean and vectorize text
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)

        # Transform text using the same vectorizer used during training
        text_vector = vectorizer.transform([clean_text])

        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None

        # Get prediction probabilities if the model supports them
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                pass

        # Decode the numeric prediction back to the original label
        predicted_label = encoder.inverse_transform(prediction)[0]

        return predicted_label, prediction_proba

    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None

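# Example usage (hypothetical filename - the real one is whatever training
# leaves in models/):
#   label, proba = predict_text("LogisticRegression.pkl", "great product, fast delivery")
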
# Streamlit App
st.title('No Code Text Classification App')
st.write('Understand the behavior of your text data and train a model to classify it')

# Sidebar
section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])

# Upload Data
st.sidebar.subheader("Upload Your Dataset")
train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])

# Session state to remember which vectorizer the model was trained with
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"

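# The app expects a CSV with at least one free-text column and one label column;
# both are picked from the sidebar once a file is uploaded.
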
if train_data is not None:
    try:
        train_df = pd.read_csv(train_data, encoding='latin1')

        if test_data is not None:
            test_df = pd.read_csv(test_data, encoding='latin1')
        else:
            test_df = None

        st.write("Training Data Preview:")
        st.write(train_df.head(3))

        columns = train_df.columns.tolist()
        text_data = st.sidebar.selectbox("Choose the text column:", columns)
        target = st.sidebar.selectbox("Choose the target column:", columns)

        # Process data
        info = Informations(train_df, text_data, target)
        train_df['clean_text'] = info.clean_text()
        train_df['text_length'] = info.text_length()

        # Encode labels here so the fitted encoder can be saved for prediction time
        from sklearn.preprocessing import LabelEncoder
        label_encoder = LabelEncoder()
        train_df['target'] = label_encoder.fit_transform(train_df[target])

        # Save the label encoder for later use (save_artifacts creates the folder)
        save_artifacts(label_encoder, "artifacts", "encoder.pkl")

    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        train_df = None
        info = None

# Data Analysis Section
if section == "Data Analysis":
    if train_data is not None and train_df is not None:
        try:
            st.subheader("Get Insights from the Data")

            st.write("Data Shape:", info.shape())
            st.write("Class Imbalance:", info.class_imbalanced())
            st.write("Missing Values:", info.missing_values())

            st.write("Processed Data Preview:")
            st.write(train_df[['clean_text', 'text_length', 'target']].head(3))

            st.markdown("**Text Length Analysis**")
            st.write(info.analysis_text_length('text_length'))

            # Calculate correlation manually since encoding was handled above
            correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
            st.write(f"Correlation between Text Length and Target: {correlation:.4f}")

            st.subheader("Visualizations")
            vis = Visualizations(train_df, text_data, target)
            vis.class_distribution()
            vis.text_length_distribution()

        except Exception as e:
            st.error(f"Error in data analysis: {str(e)}")
    else:
        st.warning("Please upload training data to get insights")

# Train Model Section
elif section == "Train Model":
    if train_data is not None and train_df is not None:
        try:
            st.subheader("Train a Model")

            # Create two columns for model selection
            col1, col2 = st.columns(2)

            with col1:
                model = st.radio("Choose the Model", [
                    "Logistic Regression", "Decision Tree",
                    "Random Forest", "Linear SVC", "SVC",
                    "Multinomial Naive Bayes", "Gaussian Naive Bayes"
                ])

            with col2:
                vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])

            # Initialize vectorizer
            if vectorizer_choice == "Tfidf Vectorizer":
                vectorizer = TfidfVectorizer(max_features=10000)
                st.session_state.vectorizer_type = "tfidf"
            else:
                vectorizer = CountVectorizer(max_features=10000)
                st.session_state.vectorizer_type = "count"

            st.write("Training Data Preview:")
            st.write(train_df[['clean_text', 'target']].head(3))

            # Vectorize text data
            X = vectorizer.fit_transform(train_df['clean_text'])
            y = train_df['target']

            # Split data
            X_train, X_test, y_train, y_test = process.split_data(X, y)
            st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")

            # Save vectorizer for later use
            vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
            save_artifacts(vectorizer, "artifacts", vectorizer_filename)

            if st.button("Start Training"):
                with st.spinner("Training model..."):
                    models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

                    # Train selected model
                    if model == "Logistic Regression":
                        models.LogisticRegression()
                    elif model == "Decision Tree":
                        models.DecisionTree()
                    elif model == "Linear SVC":
                        models.LinearSVC()
                    elif model == "SVC":
                        models.SVC()
                    elif model == "Multinomial Naive Bayes":
                        models.MultinomialNB()
                    elif model == "Random Forest":
                        models.RandomForestClassifier()
                    elif model == "Gaussian Naive Bayes":
                        models.GaussianNB()
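                        # Assumption: the Models wrapper densifies the input here, since
                        # scikit-learn's GaussianNB cannot fit a sparse matrix directly.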

                    st.success("Model training completed!")
                    st.info("You can now use the 'Predictions' section to classify new text.")

        except Exception as e:
            st.error(f"Error in model training: {str(e)}")
    else:
        st.warning("Please upload training data to train a model")

# Predictions Section
elif section == "Predictions":
    st.subheader("Perform Predictions on New Text")

    # Check if trained models exist
    if os.path.exists("models") and os.listdir("models"):
        # Text input for prediction
        text_input = st.text_area("Enter the text to classify:", height=100)

        # Model selection
        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]

        if available_models:
            selected_model = st.selectbox("Choose the trained model:", available_models)

            # Prediction button
            if st.button("Predict", key="single_predict"):
                if text_input.strip():
                    with st.spinner("Making prediction..."):
                        predicted_label, prediction_proba = predict_text(
                            selected_model,
                            text_input,
                            st.session_state.get('vectorizer_type', 'tfidf')
                        )

                        if predicted_label is not None:
                            st.success("Prediction completed!")

                            # Display results
                            st.markdown("### Prediction Results")
                            st.markdown(f"**Input Text:** {text_input}")
                            st.markdown(f"**Predicted Class:** {predicted_label}")

                            # Display probabilities if available
                            if prediction_proba is not None:
                                st.markdown("**Class Probabilities:**")

                                # Load encoder to get class names
                                encoder = load_artifacts("artifacts", "encoder.pkl")
                                if encoder is not None:
                                    classes = encoder.classes_
                                    prob_df = pd.DataFrame({
                                        'Class': classes,
                                        'Probability': prediction_proba
                                    }).sort_values('Probability', ascending=False)

                                    st.bar_chart(prob_df.set_index('Class'))
                                    st.dataframe(prob_df)
                else:
                    st.warning("Please enter some text to classify")
        else:
            st.warning("No trained models found. Please train a model first.")
    else:
        st.warning("No trained models found. Please go to the 'Train Model' section to train a model first.")

    # Option to classify multiple texts
    st.markdown("---")
    st.subheader("Batch Predictions")

    uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])

    if uploaded_file is not None:
        try:
            batch_df = pd.read_csv(uploaded_file, encoding='latin1')
            st.write("Uploaded data preview:")
            st.write(batch_df.head())

            # Select text column
            text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())

            if os.path.exists("models") and os.listdir("models"):
                available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
                batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")

                if st.button("Run Batch Predictions", key="batch_predict"):
                    with st.spinner("Processing batch predictions..."):
                        predictions = []

                        for text in batch_df[text_column]:
                            pred, _ = predict_text(
                                batch_model,
                                str(text),
                                st.session_state.get('vectorizer_type', 'tfidf')
                            )
                            predictions.append(pred if pred is not None else "Error")

                        batch_df['Predicted_Class'] = predictions

                        st.success("Batch predictions completed!")
                        st.write("Results:")
                        st.write(batch_df[[text_column, 'Predicted_Class']])

                        # Download results
                        csv = batch_df.to_csv(index=False)
                        st.download_button(
                            label="Download predictions as CSV",
                            data=csv,
                            file_name="batch_predictions.csv",
                            mime="text/csv"
                        )
        except Exception as e:
            st.error(f"Error in batch prediction: {str(e)}")
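
To try the app locally (assuming Streamlit, scikit-learn, pandas, matplotlib, and the NoCodeTextClassifier package it imports are installed):

    streamlit run app.py

Training saves the vectorizer and label encoder under artifacts/, and the Predictions section expects trained models under models/, so train and predict from the same working directory.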