Tom committed on
Commit
4ee4ca0
1 Parent(s): 7d6fc0e
Files changed (2) hide show
  1. app.py +152 -356
  2. requirements.txt +1 -0
app.py CHANGED
@@ -3,17 +3,21 @@ import numpy as np
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  import warnings
6
- import io
7
- import base64
8
- import os
9
  import tempfile
 
 
 
 
 
10
  from sklearn.model_selection import train_test_split
11
  from sklearn.ensemble import RandomForestClassifier
12
- from sklearn.metrics import classification_report
13
- from sklearn.preprocessing import StandardScaler
14
- from sklearn.cluster import KMeans
 
 
15
  from sklearn.decomposition import PCA
16
- import gradio as gr
17
 
18
  # Suppress specific FutureWarnings
19
  warnings.filterwarnings("ignore", category=FutureWarning)
@@ -21,373 +25,165 @@ warnings.filterwarnings("ignore", category=FutureWarning)
21
  # Set seaborn style for better aesthetics
22
  sns.set(style="whitegrid")
23
 
24
- def clean_data(df):
25
- df = df.drop_duplicates()
26
- df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-', '_')
27
-
 
28
  categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
29
- numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
30
-
31
- df[categorical_cols] = df[categorical_cols].fillna('Unknown')
32
- df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
33
-
34
- attendance_cols = [col for col in df.columns if 'attendance' in col]
35
- for col in attendance_cols:
36
- df[col] = df[col].astype(str).str.rstrip('%').replace('Unknown', '0').astype(float) / 100.0
37
-
38
- df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], format='%d/%m/%Y', errors='coerce')
39
- df = df.dropna(subset=['date_of_birth'])
40
- df['age'] = (pd.Timestamp('today') - df['date_of_birth']).dt.days // 365
41
-
42
- df = df.drop(columns=['nsn'], errors='ignore')
43
-
44
- category_cols = [
45
- 'gender', 'ethnicity', 'year_level', 'contributing_primary_school',
46
- 'year_11_english_teacher', 'year_11_maths_teacher', 'year_12_english_teacher',
47
- 'year_12_maths_teacher', 'form_teacher', 'leaving_date', 'primary_language',
48
- 'first_language', 'secondary_language', 'term_1_intervention',
49
- 'term_2_intervention', 'term_3_intervention', 'term_4_intervention',
50
- 'major_life_event', 'learning_difficulty', 'pastoral_care_incident',
51
- 'pastoral_care_action_taken', 'pastoral_care_follow_up'
52
- ]
53
- for col in category_cols:
54
- if col in df.columns:
55
- df[col] = df[col].astype('category')
56
-
57
- if 'ncea_results' in df.columns:
58
- ncea_results = []
59
- for idx, row in df.iterrows():
60
- try:
61
- ncea_data = eval(row['ncea_results'])
62
- total_credits = sum([result.get('Credits', 0) for result in ncea_data])
63
- ncea_results.append({'Index': idx, 'Total Credits': total_credits})
64
- except:
65
- ncea_results.append({'Index': idx, 'Total Credits': 0})
66
-
67
- ncea_df = pd.DataFrame(ncea_results)
68
- df = df.merge(ncea_df, left_index=True, right_on='Index', how='left')
69
- df = df.drop(columns=['Index', 'ncea_results'], errors='ignore')
70
- else:
71
- df['Total Credits'] = 0
72
-
73
- if 'pastoral_care_follow_up' in df.columns:
74
- df['action_effective'] = df['pastoral_care_follow_up'].apply(
75
- lambda x: 'Effective' if 'resolved' in str(x).lower() else 'Not Effective'
76
- )
77
-
78
- df['credit_threshold'] = df['year_level'].apply(lambda x: 80 if x == 'Year 11' else 60)
79
- df['credit_achievement_rate'] = df['Total Credits'] / df['credit_threshold']
80
-
81
- return df
82
-
83
- def plt_to_file():
84
- with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmpfile:
85
- plt.savefig(tmpfile.name)
86
- plt.close()
87
- return tmpfile.name
88
-
89
- def identify_at_risk_students(df):
90
- def prepare_data_for_modeling(df):
91
- df_model = df.drop(columns=[
92
- 'first_name', 'last_name', 'date_of_birth', 'form_teacher',
93
- 'leaving_date', 'pastoral_care', 'pastoral_care_follow_up',
94
- 'pastoral_care_action_taken', 'pastoral_care_incident',
95
- 'extra_curricular_activities', 'contributing_primary_school',
96
- 'year_11_english_teacher', 'year_11_maths_teacher',
97
- 'year_12_english_teacher', 'year_12_maths_teacher', 'primary_language',
98
- 'first_language', 'secondary_language', 'action_effective'
99
- ], errors='ignore')
100
-
101
- categorical_cols = df_model.select_dtypes(include=['object', 'category']).columns
102
- df_encoded = pd.get_dummies(df_model, columns=categorical_cols, drop_first=True)
103
- df_encoded = df_encoded.fillna(0)
104
-
105
- features = df_encoded.drop(['Total Credits', 'credit_threshold', 'credit_achievement_rate'], axis=1, errors='ignore')
106
- target = (df_encoded['credit_achievement_rate'] < 1).astype(int)
107
-
108
- return features, target
109
-
110
- features, target = prepare_data_for_modeling(df)
111
- X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
112
-
113
- model = RandomForestClassifier(n_estimators=100, random_state=42)
114
- model.fit(X_train, y_train)
115
- y_pred = model.predict(X_test)
116
-
117
- report = classification_report(y_test, y_pred)
118
-
119
- importances = model.feature_importances_
120
- feature_names = features.columns
121
- feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
122
- feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
123
-
124
- graphs = []
125
- tables = {}
126
-
127
- tables['classification_report'] = report
128
- tables['feature_importance'] = feature_importance_df.head(10).to_string()
129
-
130
- if feature_importance_df['Importance'].sum() > 0:
131
- plt.figure(figsize=(12, 6))
132
- sns.barplot(data=feature_importance_df.head(10), x='Importance', y='Feature', palette='viridis')
133
- plt.title('Top 10 Important Features for Predicting At-Risk Students', fontsize=14)
134
- plt.xlabel('Importance', fontsize=12)
135
- plt.ylabel('Feature', fontsize=12)
136
- plt.tight_layout()
137
- graphs.append(plt_to_file())
138
-
139
- return graphs, tables
140
-
141
- def process_extra_curricular(df):
142
- df['extra_curricular_activities'] = df['extra_curricular_activities'].apply(
143
- lambda x: eval(x) if isinstance(x, str) else []
144
- )
145
- activities = df['extra_curricular_activities'].explode().unique()
146
- activities = [activity for activity in activities if activity is not None]
147
- for activity in activities:
148
- df[activity] = df['extra_curricular_activities'].apply(lambda x: int(activity in x))
149
  return df
150
 
151
- def analyze_extra_curricular_impact(df):
152
- graphs = []
153
- activity_cols = [col for col in df.columns if col in ['Cricket', 'Debating', 'Football', 'Art Club', 'Drama Club', 'Rugby']]
154
- for activity in activity_cols:
155
- if activity in df.columns:
156
- data = df.copy()
157
- group = data.groupby(activity)['credit_achievement_rate'].mean().reset_index()
158
- group[activity] = group[activity].map({0: 'Not Involved', 1: 'Involved'})
159
- plt.figure(figsize=(6, 4))
160
- sns.barplot(data=group, x=activity, y='credit_achievement_rate', palette='Set2', edgecolor='w', errorbar=None)
161
- plt.title(f'Impact of {activity} on Credit Achievement Rate', fontsize=14)
162
- plt.xlabel('Participation Status', fontsize=12)
163
- plt.ylabel('Average Credit Achievement Rate', fontsize=12)
164
- plt.tight_layout()
165
- graphs.append(plt_to_file())
166
- return graphs
167
-
168
- def analyze_teacher_performance(df):
169
- graphs = []
170
- tables = {}
171
- teacher_year_levels = {
172
- 'year_11_english_teacher': 'Year 11',
173
- 'year_11_maths_teacher': 'Year 11',
174
- 'year_12_english_teacher': 'Year 12',
175
- 'year_12_maths_teacher': 'Year 12'
176
- }
177
- for col, year_level in teacher_year_levels.items():
178
- data = df[(df[col] != 'Unknown') & (df['year_level'] == year_level)]
179
- if not data.empty:
180
- group = data.groupby(col)['credit_achievement_rate'].mean().reset_index()
181
- plt.figure(figsize=(10, 6))
182
- sns.barplot(data=group, x=col, y='credit_achievement_rate', palette='Set3', edgecolor='w', errorbar=None)
183
- plt.title(f'Average Credit Achievement Rate by {col.replace("_", " ").title()} ({year_level})', fontsize=14)
184
- plt.xlabel('Teacher', fontsize=12)
185
- plt.ylabel('Average Credit Achievement Rate', fontsize=12)
186
- plt.xticks(rotation=45)
187
- plt.tight_layout()
188
- graphs.append(plt_to_file())
189
- else:
190
- tables[f"{col}_{year_level}"] = f"No data available for {col} in {year_level}."
191
- return graphs, tables
192
-
193
- def analyze_language_impact(df):
194
- graphs = []
195
- tables = {}
196
- data = df[df['primary_language'] != 'Unknown']
197
- if not data.empty:
198
- group = data.groupby('primary_language')['credit_achievement_rate'].mean().reset_index()
199
- plt.figure(figsize=(10, 6))
200
- sns.barplot(data=group, x='primary_language', y='credit_achievement_rate', palette='Pastel1', edgecolor='w', errorbar=None)
201
- plt.title('Average Credit Achievement Rate by Primary Language', fontsize=14)
202
- plt.xlabel('Primary Language', fontsize=12)
203
- plt.ylabel('Average Credit Achievement Rate', fontsize=12)
204
- plt.xticks(rotation=45)
205
- plt.tight_layout()
206
- graphs.append(plt_to_file())
207
- else:
208
- tables['language_impact'] = "No data available for primary languages."
209
- return graphs, tables
210
 
211
  def perform_clustering(df):
212
- graphs = []
213
- tables = {}
214
- attendance_cols = [col for col in df.columns if 'attendance' in col]
215
- features = df[['credit_achievement_rate', 'age'] + attendance_cols]
216
- features = features.fillna(0)
217
  scaler = StandardScaler()
218
- scaled_features = scaler.fit_transform(features)
219
- pca = PCA(n_components=2)
220
- principal_components = pca.fit_transform(scaled_features)
221
- kmeans = KMeans(n_clusters=3, random_state=42)
222
- clusters = kmeans.fit_predict(principal_components)
223
- df['Cluster'] = clusters
224
- cluster_analysis = df.groupby('Cluster')[['credit_achievement_rate', 'age'] + attendance_cols].mean()
225
- tables['cluster_analysis'] = cluster_analysis.to_string()
226
- plt.figure(figsize=(8, 6))
227
- sns.scatterplot(x=principal_components[:,0], y=principal_components[:,1], hue=clusters, palette='Set1', s=100, alpha=0.7)
228
- plt.title('Student Clusters', fontsize=14)
229
- plt.xlabel('Principal Component 1', fontsize=12)
230
- plt.ylabel('Principal Component 2', fontsize=12)
231
- plt.legend(title='Cluster')
232
- plt.tight_layout()
233
- graphs.append(plt_to_file())
234
- return graphs, tables
235
-
236
- def plot_correlation_with_credit_achievement(df):
237
- graphs = []
238
- tables = {}
239
- numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
240
- corr_matrix = df[numeric_cols].corr()
241
-
242
- if 'credit_achievement_rate' not in corr_matrix.columns:
243
- tables['correlation_error'] = "Error: 'credit_achievement_rate' column not found in the dataset."
244
- return graphs, tables
245
-
246
- corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=1, errors='ignore')
247
- corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=0, errors='ignore')
248
-
249
- correlation_with_credit = corr_matrix[['credit_achievement_rate']].sort_values(by='credit_achievement_rate', ascending=False)
250
-
251
- plt.figure(figsize=(8, 10))
252
- sns.heatmap(correlation_with_credit, annot=True, cmap='coolwarm', fmt='.2f', annot_kws={"size": 10}, cbar=True)
253
- plt.title('Correlation with Credit Achievement Rate', fontsize=16)
254
- plt.xticks(rotation=45, ha='right', fontsize=10)
255
- plt.tight_layout()
256
- graphs.append(plt_to_file())
257
-
258
- tables['correlation_with_credit'] = correlation_with_credit.to_string()
259
-
260
- corr_matrix_clean = corr_matrix.replace([np.inf, -np.inf], np.nan).fillna(0)
261
-
262
- plt.figure(figsize=(12, 12))
263
- sns.clustermap(corr_matrix_clean, annot=False, cmap='coolwarm', figsize=(12, 12), method='average')
264
- plt.title('Cluster Map of Feature Correlations (excluding credit_threshold, Total Credits)', fontsize=16)
265
- graphs.append(plt_to_file())
266
 
267
- return graphs, tables
 
 
268
 
269
- def plot_top_features_vs_credit(df):
270
- graphs = []
271
- tables = {}
272
- numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
273
- corr_matrix = df[numeric_cols].corr()
274
 
275
- corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=0, errors='ignore')
276
- corr_matrix = corr_matrix.drop(['credit_threshold', 'Total Credits'], axis=1, errors='ignore')
277
 
278
- top_corr_features = corr_matrix['credit_achievement_rate'].abs().sort_values(ascending=False).index[1:6]
279
-
280
- tables['top_corr_features'] = f"Top features most correlated with Credit Achievement Rate:\n{', '.join(top_corr_features)}"
281
-
282
- for feature in top_corr_features:
283
- if pd.api.types.is_numeric_dtype(df[feature]):
284
- df_sorted = df[[feature, 'credit_achievement_rate']].sort_values(by=feature)
285
-
286
- plt.figure(figsize=(10, 6))
287
- sns.lineplot(x=df_sorted[feature], y=df_sorted['credit_achievement_rate'], marker='o')
288
- plt.title(f'Line Graph: {feature} vs Credit Achievement Rate', fontsize=14)
289
- plt.xlabel(feature.replace('_', ' ').title(), fontsize=12)
290
- plt.ylabel('Credit Achievement Rate', fontsize=12)
291
- plt.tight_layout()
292
- graphs.append(plt_to_file())
293
- elif pd.api.types.is_categorical_dtype(df[feature]) or pd.api.types.is_object_dtype(df[feature]):
294
- group = df.groupby(feature)['credit_achievement_rate'].mean().reset_index()
295
-
296
- plt.figure(figsize=(10, 6))
297
- sns.barplot(x=group[feature], y=group['credit_achievement_rate'], palette='Set2')
298
- plt.title(f'Bar Plot: {feature} vs Credit Achievement Rate', fontsize=14)
299
- plt.xlabel(feature.replace('_', ' ').title(), fontsize=12)
300
- plt.ylabel('Average Credit Achievement Rate', fontsize=12)
301
- plt.xticks(rotation=45)
302
- plt.tight_layout()
303
- graphs.append(plt_to_file())
304
-
305
- return graphs, tables
306
 
307
- def perform_comprehensive_analysis(df):
308
- all_graphs = []
309
- all_tables = {}
 
 
310
 
311
- # 1. Identifying At-Risk Students
312
- graphs, tables = identify_at_risk_students(df)
313
- all_graphs.extend(graphs)
314
- all_tables.update(tables)
315
 
316
- # 2. Analyzing Impact of Extra-Curricular Activities
317
- df = process_extra_curricular(df)
318
- graphs = analyze_extra_curricular_impact(df)
319
- all_graphs.extend(graphs)
320
 
321
- # 3. Analyzing Teacher Performance
322
- graphs, tables = analyze_teacher_performance(df)
323
- all_graphs.extend(graphs)
324
- all_tables.update(tables)
325
 
326
- # 4. Analyzing Language Proficiency Impact
327
- graphs, tables = analyze_language_impact(df)
328
- all_graphs.extend(graphs)
329
- all_tables.update(tables)
330
 
331
- # 5. Performing Cluster Analysis
332
- graphs, tables = perform_clustering(df)
333
- all_graphs.extend(graphs)
334
- all_tables.update(tables)
 
335
 
336
- # 6. Correlation Analysis for Credit Achievement Rate
337
- graphs, tables = plot_correlation_with_credit_achievement(df)
338
- all_graphs.extend(graphs)
339
- all_tables.update(tables)
 
340
 
341
- # 7. Plotting Top Features vs Credit Achievement Rate
342
- graphs, tables = plot_top_features_vs_credit(df)
343
- all_graphs.extend(graphs)
344
- all_tables.update(tables)
345
 
346
- return all_graphs, all_tables
347
 
348
- def gradio_wrapper(file):
349
- df = pd.read_csv(file.name)
350
- df = clean_data(df)
351
- graphs, tables = perform_comprehensive_analysis(df)
352
-
353
- # Convert tables to a list of strings for easier display
354
- table_outputs = [
355
- f"### {k}\n```\n{v}\n```" for k, v in tables.items()
356
- ]
357
 
358
- # Ensure we have exactly 7 table outputs (plus the gallery of graphs)
359
- while len(table_outputs) < 7:
360
- table_outputs.append("No data available for this section.")
361
-
362
- return [graphs] + table_outputs
363
-
364
- # Create Gradio interface
365
- iface = gr.Interface(
366
- fn=gradio_wrapper,
367
- inputs=gr.File(label="Upload CSV"),
368
- outputs=[
369
- gr.Gallery(label="Graphs", columns=2, rows=3, height="auto"),
370
- gr.Markdown(label="Classification Report"),
371
- gr.Markdown(label="Feature Importance"),
372
- gr.Markdown(label="Teacher Performance"),
373
- gr.Markdown(label="Language Impact"),
374
- gr.Markdown(label="Cluster Analysis"),
375
- gr.Markdown(label="Correlation with Credit Achievement Rate"),
376
- gr.Markdown(label="Top Correlated Features")
377
- ],
378
- title="Comprehensive Student Data Analysis",
379
- description="Upload a CSV file to analyze student data. The analysis includes identifying at-risk students, impact of extra-curricular activities, teacher performance, language proficiency impact, cluster analysis, and correlation analysis."
380
- )
381
-
382
- # Launch the interface
383
- iface.launch()
384
-
385
- # Clean up temporary files
386
- def cleanup_temp_files():
387
- for filename in os.listdir(tempfile.gettempdir()):
388
- if filename.endswith(".png"):
389
- os.remove(os.path.join(tempfile.gettempdir(), filename))
390
-
391
- # Register the cleanup function to be called when the script exits
392
- import atexit
393
- atexit.register(cleanup_temp_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  import warnings
 
 
 
6
  import tempfile
7
+ import os
8
+ import dash
9
+ import dash_core_components as dcc
10
+ import dash_html_components as html
11
+ import dash_table
12
  from sklearn.model_selection import train_test_split
13
  from sklearn.ensemble import RandomForestClassifier
14
+ from sklearn.linear_model import LogisticRegression
15
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
16
+ from sklearn.feature_extraction.text import TfidfVectorizer
17
+ from sklearn.cluster import KMeans, DBSCAN
18
+ from sklearn.metrics import classification_report, accuracy_score, silhouette_score
19
  from sklearn.decomposition import PCA
20
+ from sklearn.manifold import TSNE
21
 
22
  # Suppress specific FutureWarnings
23
  warnings.filterwarnings("ignore", category=FutureWarning)
 
25
  # Set seaborn style for better aesthetics
26
  sns.set(style="whitegrid")
27
 
28
def enhanced_preprocessing(df):
    """Fill missing values and turn every non-numeric column into numbers.

    Steps, in order:

    1. Numeric columns: missing values are replaced by the column median
       (or 0 when the whole column is missing), so they stay numeric.
    2. Object (string) columns other than ``interventions``: missing values
       become ``'Unknown'``; columns with fewer than 20 distinct values are
       label-encoded, the rest are one-hot encoded and the original column
       is dropped.
    3. ``interventions`` (free text), when present: replaced by TF-IDF
       columns named ``interventions_tfidf_<token>``.

    Args:
        df: Raw input DataFrame. It is not modified; a copy is processed.

    Returns:
        A new DataFrame with the encoded columns.
    """
    text_col = 'interventions'
    df = df.copy()

    # Fill numeric gaps with a number. A blanket fillna('Unknown') would turn
    # every numeric column containing a gap into an object column, which the
    # loop below would then encode as if it were categorical.
    for col in df.select_dtypes(include=['number']).columns:
        if df[col].isna().any():
            median = df[col].median()
            df[col] = df[col].fillna(0 if pd.isna(median) else median)

    # The free-text column is kept out of the categorical encoding; otherwise
    # it is already encoded (or dropped) before the TF-IDF step can see it.
    categorical_cols = [
        col for col in df.select_dtypes(include=['object']).columns
        if col != text_col
    ]
    for col in categorical_cols:
        df[col] = df[col].fillna('Unknown')
        if df[col].nunique() < 20:  # Label encoding for low-cardinality columns
            # astype(str) keeps LabelEncoder from failing on mixed value types.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))
        else:  # One-hot encoding for high-cardinality columns
            one_hot = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, one_hot], axis=1).drop(col, axis=1)

    # Vectorize the free-text column.
    if text_col in df.columns:
        text = df[text_col].fillna('Unknown').astype(str)
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(text)
        tfidf_df = pd.DataFrame(
            tfidf_matrix.toarray(),
            # Prefix the tokens so they cannot collide with existing columns.
            columns=[f'{text_col}_tfidf_{token}' for token in tfidf.get_feature_names_out()],
            # Share df's index so concat aligns row by row instead of by label.
            index=df.index,
        )
        df = pd.concat([df, tfidf_df], axis=1).drop(text_col, axis=1)

    return df
50
 
51
def calculate_correlations(df, threshold=0.3):
    """List every pair of distinct columns whose correlation is strong.

    Args:
        df: DataFrame to analyse. Only numeric and boolean columns take part;
            other columns are ignored instead of making ``corr()`` fail.
        threshold: A pair is reported when the absolute value of its
            correlation coefficient is strictly greater than this value.

    Returns:
        DataFrame with columns ``'Feature 1'``, ``'Feature 2'`` and
        ``'Correlation'``. Each qualifying pair appears in both orders
        (A, B) and (B, A); pairs with an undefined (NaN) correlation are
        omitted. The frame is empty when nothing qualifies.
    """
    correlations = df.select_dtypes(include=['number', 'bool']).corr()
    values = correlations.to_numpy()

    # NaN compares False, so undefined correlations drop out here without
    # relying on stack() discarding them.
    strong = np.abs(values) > threshold
    # Exclude the diagonal (each column against itself).
    strong &= ~np.eye(len(correlations), dtype=bool)

    # np.nonzero walks the matrix row by row, i.e. grouped by 'Feature 1'.
    rows, cols = np.nonzero(strong)
    return pd.DataFrame({
        'Feature 1': correlations.index[rows],
        'Feature 2': correlations.columns[cols],
        'Correlation': values[rows, cols],
    })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
def perform_clustering(df):
    """Cluster the rows with K-Means and DBSCAN and keep the better result.

    All columns are standardised, both algorithms are fitted on the scaled
    data, and the labelling with the higher silhouette score is stored in a
    ``'Cluster'`` column (DBSCAN is used when K-Means is not strictly
    better). A 2-component PCA projection of the scaled data is stored in
    ``'PCA1'`` / ``'PCA2'`` for plotting.

    Args:
        df: Fully numeric DataFrame. It is not modified; the extra columns
            are added to a copy.

    Returns:
        Tuple ``(clustered_df, best_model)`` where ``best_model`` is
        ``'K-Means'`` or ``'DBSCAN'``.
    """
    def _silhouette_or_floor(data, labels):
        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1;
        # outside that range report the worst possible score instead of raising.
        n_labels = len(set(labels))
        if 2 <= n_labels <= len(labels) - 1:
            return silhouette_score(data, labels)
        return -1

    # Work on a copy so the caller's DataFrame does not silently gain columns.
    df = df.copy()

    # Normalize the data for clustering
    df_scaled = StandardScaler().fit_transform(df)

    kmeans_labels = KMeans(n_clusters=4, random_state=42).fit_predict(df_scaled)
    dbscan_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(df_scaled)

    kmeans_score = _silhouette_or_floor(df_scaled, kmeans_labels)
    dbscan_score = _silhouette_or_floor(df_scaled, dbscan_labels)

    if kmeans_score > dbscan_score:
        df['Cluster'] = kmeans_labels
        best_model = 'K-Means'
    else:
        df['Cluster'] = dbscan_labels
        best_model = 'DBSCAN'

    # Use PCA for visualization
    pca_components = PCA(n_components=2).fit_transform(df_scaled)
    df['PCA1'] = pca_components[:, 0]
    df['PCA2'] = pca_components[:, 1]

    return df, best_model
 
 
 
88
 
89
def perform_predictions(df):
    """Train two classifiers per target column and report the more accurate.

    For each of ``'skip_class'`` and ``'final_grade'`` present in ``df`` an
    80/20 train/test split is made, a random forest and a logistic
    regression are fitted, and the model with the higher test accuracy is
    recorded (logistic regression is recorded when the random forest is not
    strictly better).

    Args:
        df: Fully numeric DataFrame containing the features and, optionally,
            the target columns.

    Returns:
        List of dicts with keys ``'Target'``, ``'Model'`` and ``'Accuracy'``,
        one per target column found; empty when no target column is present.
    """
    candidate_targets = ('skip_class', 'final_grade')
    # Columns added by perform_clustering. They are computed from the whole
    # frame, target columns included, so using them as features would leak
    # the target into the model.
    derived_cols = ['Cluster', 'PCA1', 'PCA2']

    results = []
    for target in [col for col in df.columns if col in candidate_targets]:
        X = df.drop(columns=[target] + derived_cols, errors='ignore')
        y = df[target]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        models = {
            'Random Forest': RandomForestClassifier(random_state=42),
            'Logistic Regression': LogisticRegression(max_iter=1000),
        }
        accuracies = {}
        for model_name, model in models.items():
            model.fit(X_train, y_train)
            accuracies[model_name] = accuracy_score(y_test, model.predict(X_test))

        # The random forest only wins when it is strictly more accurate.
        if accuracies['Random Forest'] > accuracies['Logistic Regression']:
            best_name = 'Random Forest'
        else:
            best_name = 'Logistic Regression'
        results.append({'Target': target, 'Model': best_name, 'Accuracy': accuracies[best_name]})

    return results
117
 
118
def create_dashboard(df, correlation_data, clustering_data, prediction_results, debug=True):
    """Build the Dash app showing the analysis results and start its server.

    The page has three sections: a table of correlated feature pairs, a
    scatter plot of the PCA projection coloured by cluster, and a table of
    the best prediction model per target. The call blocks while the server
    is running.

    Args:
        df: DataFrame with ``'PCA1'``, ``'PCA2'`` and ``'Cluster'`` columns.
        correlation_data: DataFrame rendered as the correlation table.
        clustering_data: Mapping with a ``'best_model'`` entry naming the
            chosen clustering algorithm.
        prediction_results: DataFrame rendered as the prediction table.
        debug: Passed to the Dash server. Defaults to True, the value that
            used to be hard-coded; pass False outside local development.
    """
    # Since Dash 2.0 the components live inside the dash package itself; the
    # standalone dash_core_components / dash_html_components / dash_table
    # packages are deprecated.
    from dash import dash_table, dcc, html

    app = dash.Dash(__name__)

    correlation_section = html.Div([
        html.H2('Correlation Analysis'),
        dash_table.DataTable(
            id='correlation_table',
            columns=[{'name': i, 'id': i} for i in correlation_data.columns],
            data=correlation_data.to_dict('records')
        )
    ])

    # Plain lists keep the figure dict JSON-serialisable without relying on
    # the encoder's pandas support.
    cluster_labels = df['Cluster'].tolist()
    clustering_section = html.Div([
        html.H2('Clustering Analysis'),
        html.P(f'Best Clustering Algorithm: {clustering_data["best_model"]}'),
        dcc.Graph(
            id='clustering_scatter',
            figure={
                'data': [
                    {
                        'x': df['PCA1'].tolist(),
                        'y': df['PCA2'].tolist(),
                        'mode': 'markers',
                        'marker': {'color': cluster_labels, 'colorscale': 'Viridis', 'size': 10},
                        'text': cluster_labels,
                        'type': 'scatter'
                    }
                ],
                'layout': {
                    'title': 'Cluster Visualization using PCA',
                    'xaxis': {'title': 'PCA Component 1'},
                    'yaxis': {'title': 'PCA Component 2'}
                }
            }
        )
    ])

    prediction_section = html.Div([
        html.H2('Prediction Models'),
        dash_table.DataTable(
            id='prediction_table',
            columns=[{'name': i, 'id': i} for i in prediction_results.columns],
            data=prediction_results.to_dict('records')
        )
    ])

    app.layout = html.Div([
        html.H1('Comprehensive Student Data Analysis'),
        correlation_section,
        clustering_section,
        prediction_section,
    ])

    # app.run supersedes app.run_server in current Dash releases; fall back
    # to run_server only on older versions that do not have app.run yet.
    run = getattr(app, 'run', None) or app.run_server
    run(debug=debug)
169
+
170
# Script entry point: load, preprocess, analyse, then serve the dashboard.
if __name__ == "__main__":
    # Read the raw records (replace the path with your own CSV file)
    df = pd.read_csv('student_data.csv')

    # Encode and clean the columns
    df = enhanced_preprocessing(df)

    # Correlations are computed before clustering adds its helper columns
    correlation_data = calculate_correlations(df)

    # Cluster the rows and remember which algorithm was selected
    df, best_model = perform_clustering(df)
    clustering_data = dict(best_model=best_model)

    # Evaluate the prediction models and tabulate the winners
    prediction_rows = perform_predictions(df)
    prediction_results = pd.DataFrame(prediction_rows)

    # Build the dashboard and start serving it
    create_dashboard(
        df,
        correlation_data,
        clustering_data,
        prediction_results,
    )
requirements.txt CHANGED
@@ -4,3 +4,4 @@ matplotlib
4
  seaborn
5
  scikit-learn
6
  gradio
 
 
4
  seaborn
5
  scikit-learn
6
  gradio
7
+ dash