ProfessorLeVesseur committed on
Commit
b92ff14
1 Parent(s): 90d5f4e

Update data_processor.py

Browse files
Files changed (1) hide show
  1. data_processor.py +1 -184
data_processor.py CHANGED
@@ -1,172 +1,3 @@
1
- # import pandas as pd
2
- # import os
3
- # import re
4
- # from huggingface_hub import InferenceClient
5
-
6
- # class DataProcessor:
7
- # INTERVENTION_COLUMN = 'Did the intervention happen today?'
8
- # ENGAGED_STR = 'Engaged (Respect, Responsibility, Effort)'
9
- # PARTIALLY_ENGAGED_STR = 'Partially Engaged (about 50%)'
10
- # NOT_ENGAGED_STR = 'Not Engaged (less than 50%)'
11
-
12
- # def __init__(self):
13
- # self.hf_api_key = os.getenv('HF_API_KEY')
14
- # if not self.hf_api_key:
15
- # raise ValueError("HF_API_KEY not set in environment variables")
16
- # self.client = InferenceClient(api_key=self.hf_api_key)
17
-
18
- # def read_excel(self, uploaded_file):
19
- # return pd.read_excel(uploaded_file)
20
-
21
- # def format_session_data(self, df):
22
- # df['Date of Session'] = self.safe_convert_to_datetime(df['Date of Session'], '%m/%d/%Y')
23
- # df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
24
- # df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
25
- # df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
26
- # df = df[['Date of Session', 'Timestamp'] + [col for col in df.columns if col not in ['Date of Session', 'Timestamp']]]
27
- # return df
28
-
29
- # def safe_convert_to_time(self, series, format_str='%I:%M %p'):
30
- # try:
31
- # converted = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
32
- # if format_str:
33
- # return converted.dt.strftime(format_str)
34
- # return converted
35
- # except Exception as e:
36
- # print(f"Error converting series to time: {e}")
37
- # return series
38
-
39
- # def safe_convert_to_datetime(self, series, format_str=None):
40
- # try:
41
- # converted = pd.to_datetime(series, errors='coerce')
42
- # if format_str:
43
- # return converted.dt.strftime(format_str)
44
- # return converted
45
- # except Exception as e:
46
- # print(f"Error converting series to datetime: {e}")
47
- # return series
48
-
49
- # def replace_student_names_with_initials(self, df):
50
- # updated_columns = []
51
- # for col in df.columns:
52
- # if col.startswith('Student Attendance'):
53
- # match = re.match(r'Student Attendance \[(.+?)\]', col)
54
- # if match:
55
- # name = match.group(1)
56
- # name_parts = name.split()
57
- # if len(name_parts) == 1:
58
- # initials = name_parts[0][0]
59
- # else:
60
- # initials = ''.join([part[0] for part in name_parts])
61
- # updated_columns.append(f'Student Attendance [{initials}]')
62
- # else:
63
- # updated_columns.append(col)
64
- # else:
65
- # updated_columns.append(col)
66
- # df.columns = updated_columns
67
- # return df
68
-
69
- # def compute_intervention_statistics(self, df):
70
- # total_days = len(df)
71
- # sessions_held = df[self.INTERVENTION_COLUMN].str.strip().str.lower().eq('yes').sum()
72
- # sessions_not_held = df[self.INTERVENTION_COLUMN].str.strip().str.lower().eq('no').sum()
73
- # intervention_frequency = (sessions_held / total_days) * 100 if total_days > 0 else 0
74
- # intervention_frequency = round(intervention_frequency, 0)
75
-
76
- # stats = {
77
- # 'Intervention Frequency (%)': [intervention_frequency],
78
- # 'Intervention Sessions Held': [sessions_held],
79
- # 'Intervention Sessions Not Held': [sessions_not_held],
80
- # 'Total Number of Days Available': [total_days]
81
- # }
82
- # return pd.DataFrame(stats)
83
-
84
- # def compute_student_metrics(self, df):
85
- # intervention_df = df[df[self.INTERVENTION_COLUMN].str.strip().str.lower() == 'yes']
86
- # intervention_sessions_held = len(intervention_df)
87
- # student_columns = [col for col in df.columns if col.startswith('Student Attendance')]
88
-
89
- # student_metrics = {}
90
- # for col in student_columns:
91
- # student_name = col.replace('Student Attendance [', '').replace(']', '').strip()
92
- # student_data = intervention_df[[col]].copy()
93
- # student_data[col] = student_data[col].fillna('Absent')
94
-
95
- # attendance_values = student_data[col].apply(lambda x: 1 if x in [
96
- # self.ENGAGED_STR,
97
- # self.PARTIALLY_ENGAGED_STR,
98
- # self.NOT_ENGAGED_STR
99
- # ] else 0)
100
-
101
- # sessions_attended = attendance_values.sum()
102
- # attendance_pct = (sessions_attended / intervention_sessions_held) * 100 if intervention_sessions_held > 0 else 0
103
- # attendance_pct = round(attendance_pct)
104
-
105
- # engagement_counts = {
106
- # 'Engaged': 0,
107
- # 'Partially Engaged': 0,
108
- # 'Not Engaged': 0,
109
- # 'Absent': 0
110
- # }
111
-
112
- # for x in student_data[col]:
113
- # if x == self.ENGAGED_STR:
114
- # engagement_counts['Engaged'] += 1
115
- # elif x == self.PARTIALLY_ENGAGED_STR:
116
- # engagement_counts['Partially Engaged'] += 1
117
- # elif x == self.NOT_ENGAGED_STR:
118
- # engagement_counts['Not Engaged'] += 1
119
- # else:
120
- # engagement_counts['Absent'] += 1 # Count as Absent if not engaged
121
-
122
- # # Calculate percentages for engagement states
123
- # total_sessions = sum(engagement_counts.values())
124
-
125
- # # Engagement (%)
126
- # engagement_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
127
- # engagement_pct = round(engagement_pct)
128
-
129
- # engaged_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
130
- # engaged_pct = round(engaged_pct)
131
-
132
- # partially_engaged_pct = (engagement_counts['Partially Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
133
- # partially_engaged_pct = round(partially_engaged_pct)
134
-
135
- # not_engaged_pct = (engagement_counts['Not Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
136
- # not_engaged_pct = round(not_engaged_pct)
137
-
138
- # absent_pct = (engagement_counts['Absent'] / total_sessions * 100) if total_sessions > 0 else 0
139
- # absent_pct = round(absent_pct)
140
-
141
- # # Store metrics in the required order
142
- # student_metrics[student_name] = {
143
- # 'Attendance (%)': attendance_pct,
144
- # 'Attendance #': sessions_attended, # Raw number of sessions attended
145
- # 'Engagement (%)': engagement_pct,
146
- # 'Engaged (%)': engaged_pct,
147
- # 'Partially Engaged (%)': partially_engaged_pct,
148
- # 'Not Engaged (%)': not_engaged_pct,
149
- # 'Absent (%)': absent_pct
150
- # }
151
-
152
- # # Create a DataFrame from student_metrics
153
- # student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
154
- # student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
155
- # return student_metrics_df
156
-
157
- # def compute_average_metrics(self, student_metrics_df):
158
- # # Calculate the attendance and engagement average percentages across students
159
- # attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Calculate the average attendance percentage
160
- # engagement_avg_stats = student_metrics_df['Engagement (%)'].mean() # Calculate the average engagement percentage
161
-
162
- # # Round the averages to make them whole numbers
163
- # attendance_avg_stats = round(attendance_avg_stats)
164
- # engagement_avg_stats = round(engagement_avg_stats)
165
-
166
- # return attendance_avg_stats, engagement_avg_stats
167
-
168
-
169
-
170
  import pandas as pd
171
  import os
172
  import re
@@ -340,18 +171,4 @@ class DataProcessor:
340
  return "Address Attendance"
341
  elif row["Engagement ≥ 80%"] == "No":
342
  return "Address Engagement"
343
- return "Consider addressing logistical barriers, improving fidelity, and/or collecting progress monitoring data"
344
-
345
- # def build_tree_diagram(self, row):
346
- # dot = Digraph()
347
- # dot.node("Q1", "Has the student attended ≥ 90% of interventions?")
348
- # dot.node("Q2", "Has the student been engaged ≥ 80% of intervention time?")
349
- # dot.node("A1", "Address Attendance", shape="box")
350
- # dot.node("A2", "Address Engagement", shape="box")
351
- # dot.node("A3", "Consider addressing logistical barriers", shape="box")
352
- # if row["Attended ≥ 90%"] == "No":
353
- # dot.edge("Q1", "A1", label="No")
354
- # else:
355
- # dot.edge("Q1", "Q2", label="Yes")
356
- # dot.edge("Q2", "A2" if row["Engagement ≥ 80%"] == "No" else "A3", label="Yes")
357
- # return dot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
  import os
3
  import re
 
171
  return "Address Attendance"
172
  elif row["Engagement ≥ 80%"] == "No":
173
  return "Address Engagement"
174
+ return "Consider addressing logistical barriers, improving fidelity, and/or collecting progress monitoring data"