Alamgirapi committed on
Commit e7db41b · verified · 1 Parent(s): dfcb201

Delete src

Files changed (1)
  1. src/streamlit_app.py +0 -336
src/streamlit_app.py DELETED
@@ -1,336 +0,0 @@
- import streamlit as st
- import pandas as pd
- import matplotlib.pyplot as plt
- import numpy as np
- from NoCodeTextClassifier.EDA import Informations, Visualizations
- from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
- from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
- from NoCodeTextClassifier.models import Models
- import os
- import pickle
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
-
- # Utility functions
- def save_artifacts(obj, folder_name, file_name):
-     """Save artifacts like encoders and vectorizers"""
-     os.makedirs(folder_name, exist_ok=True)
-     with open(os.path.join(folder_name, file_name), 'wb') as f:
-         pickle.dump(obj, f)
-
- def load_artifacts(folder_name, file_name):
-     """Load saved artifacts"""
-     try:
-         with open(os.path.join(folder_name, file_name), 'rb') as f:
-             return pickle.load(f)
-     except FileNotFoundError:
-         st.error(f"File {file_name} not found in {folder_name} folder")
-         return None
-
- def load_model(model_name):
-     """Load trained model"""
-     try:
-         with open(os.path.join('models', model_name), 'rb') as f:
-             return pickle.load(f)
-     except FileNotFoundError:
-         st.error(f"Model {model_name} not found. Please train a model first.")
-         return None
-
- def predict_text(model_name, text, vectorizer_type="tfidf"):
-     """Make prediction on new text"""
-     try:
-         # Load model
-         model = load_model(model_name)
-         if model is None:
-             return None, None
-
-         # Load vectorizer
-         vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
-         vectorizer = load_artifacts("artifacts", vectorizer_file)
-         if vectorizer is None:
-             return None, None
-
-         # Load label encoder
-         encoder = load_artifacts("artifacts", "encoder.pkl")
-         if encoder is None:
-             return None, None
-
-         # Clean and vectorize text
-         text_cleaner = TextCleaner()
-         clean_text = text_cleaner.clean_text(text)
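-         # NOTE: this assumes TextCleaner.clean_text applies the same cleaning that
-         # Informations.clean_text applied at training time; if the two differ,
-         # the vectorizer will see tokens it was never fitted on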
-
-         # Transform text using the same vectorizer used during training
-         text_vector = vectorizer.transform([clean_text])
-
-         # Make prediction
-         prediction = model.predict(text_vector)
-         prediction_proba = None
-
-         # Get prediction probabilities if available
-         if hasattr(model, 'predict_proba'):
-             try:
-                 prediction_proba = model.predict_proba(text_vector)[0]
-             except Exception:
-                 pass
-
-         # Decode prediction
-         predicted_label = encoder.inverse_transform(prediction)[0]
-
-         return predicted_label, prediction_proba
-
-     except Exception as e:
-         st.error(f"Error during prediction: {str(e)}")
-         return None, None
-
- # Streamlit App
- st.title('No Code Text Classification App')
- st.write('Understand the behavior of your text data and train a model to classify it')
-
- # Sidebar
- section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
-
- # Upload Data
- st.sidebar.subheader("Upload Your Dataset")
- train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
- test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
-
- # Session-state default for the vectorizer setting
- if 'vectorizer_type' not in st.session_state:
-     st.session_state.vectorizer_type = "tfidf"
-
- if train_data is not None:
-     try:
-         train_df = pd.read_csv(train_data, encoding='latin1')
-
-         if test_data is not None:
-             test_df = pd.read_csv(test_data, encoding='latin1')
-         else:
-             test_df = None
-
-         st.write("Training Data Preview:")
-         st.write(train_df.head(3))
-
-         columns = train_df.columns.tolist()
-         text_data = st.sidebar.selectbox("Choose the text column:", columns)
-         target = st.sidebar.selectbox("Choose the target column:", columns)
-
-         # Process data
-         info = Informations(train_df, text_data, target)
-         train_df['clean_text'] = info.clean_text()
-         train_df['text_length'] = info.text_length()
-
-         # Handle label encoding manually in case the class doesn't store an encoder
-         from sklearn.preprocessing import LabelEncoder
-         label_encoder = LabelEncoder()
-         train_df['target'] = label_encoder.fit_transform(train_df[target])
-
-         # Save label encoder for later use
-         os.makedirs("artifacts", exist_ok=True)
-         save_artifacts(label_encoder, "artifacts", "encoder.pkl")
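-         # encoder.pkl is read back by predict_text, which calls inverse_transform
-         # to map numeric predictions back to the original class names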
-
-     except Exception as e:
-         st.error(f"Error loading data: {str(e)}")
-         train_df = None
-         info = None
-
- # Data Analysis Section
- if section == "Data Analysis":
-     if train_data is not None and train_df is not None:
-         try:
-             st.subheader("Get Insights from the Data")
-
-             st.write("Data Shape:", info.shape())
-             st.write("Class Imbalance:", info.class_imbalanced())
-             st.write("Missing Values:", info.missing_values())
-
-             st.write("Processed Data Preview:")
-             st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
-
-             st.markdown("**Text Length Analysis**")
-             st.write(info.analysis_text_length('text_length'))
-
-             # Calculate correlation manually since we handled encoding separately
-             correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
-             st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
-
-             st.subheader("Visualizations")
-             vis = Visualizations(train_df, text_data, target)
-             vis.class_distribution()
-             vis.text_length_distribution()
-
-         except Exception as e:
-             st.error(f"Error in data analysis: {str(e)}")
-     else:
-         st.warning("Please upload training data to get insights")
-
- # Train Model Section
- elif section == "Train Model":
-     if train_data is not None and train_df is not None:
-         try:
-             st.subheader("Train a Model")
-
-             # Create two columns for model selection
-             col1, col2 = st.columns(2)
-
-             with col1:
-                 model = st.radio("Choose the Model", [
-                     "Logistic Regression", "Decision Tree",
-                     "Random Forest", "Linear SVC", "SVC",
-                     "Multinomial Naive Bayes", "Gaussian Naive Bayes"
-                 ])
-
-             with col2:
-                 vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
-
-             # Initialize vectorizer
-             if vectorizer_choice == "Tfidf Vectorizer":
-                 vectorizer = TfidfVectorizer(max_features=10000)
-                 st.session_state.vectorizer_type = "tfidf"
-             else:
-                 vectorizer = CountVectorizer(max_features=10000)
-                 st.session_state.vectorizer_type = "count"
-
-             st.write("Training Data Preview:")
-             st.write(train_df[['clean_text', 'target']].head(3))
-
-             # Vectorize text data
-             X = vectorizer.fit_transform(train_df['clean_text'])
-             y = train_df['target']
-
-             # Split data
-             X_train, X_test, y_train, y_test = process.split_data(X, y)
-             st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
-
-             # Save vectorizer for later use
-             vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
-             save_artifacts(vectorizer, "artifacts", vectorizer_filename)
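-             # NOTE: the filename encodes the vectorizer type; predict_text looks the
-             # type up via st.session_state, so a fresh browser session falls back to
-             # the "tfidf" default even if the model was trained with CountVectorizer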
-
-             if st.button("Start Training"):
-                 with st.spinner("Training model..."):
-                     models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
-
-                     # Train selected model
-                     if model == "Logistic Regression":
-                         models.LogisticRegression()
-                     elif model == "Decision Tree":
-                         models.DecisionTree()
-                     elif model == "Linear SVC":
-                         models.LinearSVC()
-                     elif model == "SVC":
-                         models.SVC()
-                     elif model == "Multinomial Naive Bayes":
-                         models.MultinomialNB()
-                     elif model == "Random Forest":
-                         models.RandomForestClassifier()
-                     elif model == "Gaussian Naive Bayes":
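-                         # NOTE: sklearn's GaussianNB requires a dense array; if Models.GaussianNB()
-                         # passes the sparse TF-IDF/count matrix through unchanged, the input may
-                         # need .toarray() first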
-                         models.GaussianNB()
-
-                     st.success("Model training completed!")
-                     st.info("You can now use the 'Predictions' section to classify new text.")
-
-         except Exception as e:
-             st.error(f"Error in model training: {str(e)}")
-     else:
-         st.warning("Please upload training data to train a model")
-
- # Predictions Section
- elif section == "Predictions":
-     st.subheader("Perform Predictions on New Text")
-
-     # Check if models exist
-     if os.path.exists("models") and os.listdir("models"):
-         # Text input for prediction
-         text_input = st.text_area("Enter the text to classify:", height=100)
-
-         # Model selection
-         available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
-
-         if available_models:
-             selected_model = st.selectbox("Choose the trained model:", available_models)
-
-             # Prediction button
-             if st.button("Predict", key="single_predict"):
-                 if text_input.strip():
-                     with st.spinner("Making prediction..."):
-                         predicted_label, prediction_proba = predict_text(
-                             selected_model,
-                             text_input,
-                             st.session_state.get('vectorizer_type', 'tfidf')
-                         )
-
-                     if predicted_label is not None:
-                         st.success("Prediction completed!")
-
-                         # Display results
-                         st.markdown("### Prediction Results")
-                         st.markdown(f"**Input Text:** {text_input}")
-                         st.markdown(f"**Predicted Class:** {predicted_label}")
-
-                         # Display probabilities if available
-                         if prediction_proba is not None:
-                             st.markdown("**Class Probabilities:**")
-
-                             # Load encoder to get class names
-                             encoder = load_artifacts("artifacts", "encoder.pkl")
-                             if encoder is not None:
-                                 classes = encoder.classes_
-                                 prob_df = pd.DataFrame({
-                                     'Class': classes,
-                                     'Probability': prediction_proba
-                                 }).sort_values('Probability', ascending=False)
-
-                                 st.bar_chart(prob_df.set_index('Class'))
-                                 st.dataframe(prob_df)
-                 else:
-                     st.warning("Please enter some text to classify")
-         else:
-             st.warning("No trained models found. Please train a model first.")
-     else:
-         st.warning("No trained models found. Please go to the 'Train Model' section to train a model first.")
-
-     # Option to classify multiple texts
-     st.markdown("---")
-     st.subheader("Batch Predictions")
-
-     uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
-
-     if uploaded_file is not None:
-         try:
-             batch_df = pd.read_csv(uploaded_file, encoding='latin1')
-             st.write("Uploaded data preview:")
-             st.write(batch_df.head())
-
-             # Select text column
-             text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
-
-             if os.path.exists("models") and os.listdir("models"):
-                 available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
-                 batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
-
-                 if st.button("Run Batch Predictions", key="batch_predict"):
-                     with st.spinner("Processing batch predictions..."):
-                         predictions = []
-
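-                         # NOTE: predict_text reloads the model and vectorizer from disk on
-                         # every call; fine for small files, but vectorizer.transform accepts
-                         # a list of texts, so a single batched transform would be much faster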
-                         for text in batch_df[text_column]:
-                             pred, _ = predict_text(
-                                 batch_model,
-                                 str(text),
-                                 st.session_state.get('vectorizer_type', 'tfidf')
-                             )
-                             predictions.append(pred if pred is not None else "Error")
-
-                         batch_df['Predicted_Class'] = predictions
-
-                         st.success("Batch predictions completed!")
-                         st.write("Results:")
-                         st.write(batch_df[[text_column, 'Predicted_Class']])
-
-                         # Download results
-                         csv = batch_df.to_csv(index=False)
-                         st.download_button(
-                             label="Download predictions as CSV",
-                             data=csv,
-                             file_name="batch_predictions.csv",
-                             mime="text/csv"
-                         )
-         except Exception as e:
-             st.error(f"Error in batch prediction: {str(e)}")