ProfessorLeVesseur committed on
Commit
823b52c
1 Parent(s): e993cf9

Create data_processor.py

Browse files
Files changed (1) hide show
  1. data_processor.py +137 -0
data_processor.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import re
4
+ from huggingface_hub import InferenceClient
5
+
6
class DataProcessor:
    """Clean session spreadsheets and derive attendance/engagement metrics.

    The constructor requires the ``HF_API_KEY`` environment variable and
    builds a Hugging Face ``InferenceClient`` from it; the data helpers below
    operate on pandas DataFrames only and never touch that client.
    """

    # Column holding the per-row "was the intervention held" answer; values
    # are compared case-insensitively against 'yes' / 'no'.
    INTERVENTION_COLUMN = 'Did the intervention happen today?'
    # Exact cell values that count as a student having attended a session.
    ENGAGED_STR = 'Engaged (Respect, Responsibility, Effort)'
    PARTIALLY_ENGAGED_STR = 'Partially Engaged (about 50%)'
    NOT_ENGAGED_STR = 'Not Engaged (less than 50%)'

    # Captures the student name inside 'Student Attendance [<name>]'.
    _STUDENT_COLUMN_RE = re.compile(r'Student Attendance \[(.+?)\]')

    def __init__(self):
        """Read ``HF_API_KEY`` from the environment and create the client.

        Raises:
            ValueError: if ``HF_API_KEY`` is unset or empty.
        """
        self.hf_api_key = os.getenv('HF_API_KEY')
        if not self.hf_api_key:
            raise ValueError("HF_API_KEY not set in environment variables")
        self.client = InferenceClient(api_key=self.hf_api_key)

    def read_excel(self, uploaded_file):
        """Return ``pd.read_excel(uploaded_file)`` unchanged."""
        return pd.read_excel(uploaded_file)

    def format_session_data(self, df):
        """Format the date/time columns as strings and move two columns first.

        'Date of Session' becomes ``MM/DD/YYYY``; 'Timestamp',
        'Session Start Time' and 'Session End Time' become ``HH:MM AM/PM``.
        Values that cannot be parsed become missing.

        Note: the four columns are reassigned on ``df`` itself, so the
        caller's frame is modified; the returned frame is a re-ordered
        selection with 'Date of Session' and 'Timestamp' leading.

        Raises:
            KeyError: if any of the four columns is absent.
        """
        df['Date of Session'] = self.safe_convert_to_datetime(df['Date of Session'], '%m/%d/%Y')
        df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
        df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
        df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')

        leading = ['Date of Session', 'Timestamp']
        trailing = [col for col in df.columns if col not in leading]
        return df[leading + trailing]

    def safe_convert_to_time(self, series, format_str='%I:%M %p'):
        """Parse ``HH:MM:SS`` values and optionally re-format them.

        Args:
            series: values to parse; unparseable entries are coerced to NaT.
            format_str: ``strftime`` pattern for the output strings. When
                falsy, the parsed datetimes are returned instead.

        Returns:
            The converted series, or ``series`` untouched if conversion
            raised (best effort: the error is printed, not propagated).
        """
        try:
            converted = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
            if format_str:
                return converted.dt.strftime(format_str)
            return converted
        except Exception as e:
            print(f"Error converting series to time: {e}")
            return series

    def safe_convert_to_datetime(self, series, format_str=None):
        """Parse values with pandas' inferred format and optionally re-format.

        Args:
            series: values to parse; unparseable entries are coerced to NaT.
            format_str: ``strftime`` pattern for the output strings. When
                falsy, the parsed datetimes are returned instead.

        Returns:
            The converted series, or ``series`` untouched if conversion
            raised (best effort: the error is printed, not propagated).
        """
        try:
            converted = pd.to_datetime(series, errors='coerce')
            if format_str:
                return converted.dt.strftime(format_str)
            return converted
        except Exception as e:
            print(f"Error converting series to datetime: {e}")
            return series

    def replace_student_names_with_initials(self, df):
        """Rename 'Student Attendance [Full Name]' columns to use initials.

        Initials are the first character of each whitespace-separated word of
        the name. If several students produce the same initials, the second
        and later ones get a numeric suffix ('JD', 'JD2', ...) so that column
        labels stay unique. All other columns, including non-string labels,
        are left as they are.

        The columns are replaced on ``df`` in place and ``df`` is returned.
        """
        updated_columns = []
        times_seen = {}
        for col in df.columns:
            # Non-string labels (e.g. numeric headers) cannot be attendance columns.
            match = self._STUDENT_COLUMN_RE.match(col) if isinstance(col, str) else None
            if match is None:
                updated_columns.append(col)
                continue

            initials = ''.join(part[0] for part in match.group(1).split())
            times_seen[initials] = times_seen.get(initials, 0) + 1
            if times_seen[initials] > 1:
                # Duplicate labels would merge two students downstream.
                initials = f'{initials}{times_seen[initials]}'
            updated_columns.append(f'Student Attendance [{initials}]')

        df.columns = updated_columns
        return df

    def _intervention_answers(self, df):
        """Return the intervention column as stripped, lower-cased strings.

        Casting to ``str`` first keeps the ``.str`` accessor usable when the
        column holds no strings at all (e.g. every cell is blank/NaN).
        """
        return df[self.INTERVENTION_COLUMN].astype(str).str.strip().str.lower()

    def compute_intervention_statistics(self, df):
        """Summarise how often the intervention was held.

        Returns:
            A one-row DataFrame with the frequency (held / total rows, as a
            percentage rounded to 0 decimals), the count of 'yes' rows, the
            count of 'no' rows and the total number of rows. The frequency is
            0 when ``df`` has no rows.
        """
        answers = self._intervention_answers(df)
        total_days = len(df)
        sessions_held = answers.eq('yes').sum()
        sessions_not_held = answers.eq('no').sum()

        if total_days > 0:
            intervention_frequency = round((sessions_held / total_days) * 100, 0)
        else:
            intervention_frequency = 0

        stats = {
            'Intervention Frequency (%)': [intervention_frequency],
            'Intervention Sessions Held': [sessions_held],
            'Intervention Sessions Not Held': [sessions_not_held],
            'Total Number of Days Available': [total_days],
        }
        return pd.DataFrame(stats)

    def compute_student_metrics(self, df):
        """Compute per-student attendance and engagement over held sessions.

        Only rows whose intervention answer is 'yes' are considered. For each
        'Student Attendance ...' column, a session counts as attended when
        the cell equals one of the three engagement strings; anything else,
        including a blank cell, counts as absent.

        Returns:
            A DataFrame with columns 'Student', 'Attendance (%)',
            'Attendance #' and 'Engagement (%)'. Both percentages use the
            number of held sessions as denominator and are 0 when no session
            was held. The columns are present even when there are no students.
        """
        held_rows = df[self._intervention_answers(df).eq('yes')]
        sessions_held = len(held_rows)
        attended_labels = [
            self.ENGAGED_STR,
            self.PARTIALLY_ENGAGED_STR,
            self.NOT_ENGAGED_STR,
        ]

        student_metrics = {}
        for col in df.columns:
            if not (isinstance(col, str) and col.startswith('Student Attendance')):
                continue

            student_name = col.replace('Student Attendance [', '').replace(']', '').strip()
            responses = held_rows[col].fillna('Absent')
            sessions_attended = responses.isin(attended_labels).sum()
            sessions_engaged = responses.eq(self.ENGAGED_STR).sum()

            if sessions_held > 0:
                attendance_pct = round((sessions_attended / sessions_held) * 100)
                engagement_pct = round(sessions_engaged / sessions_held * 100)
            else:
                attendance_pct = 0
                engagement_pct = 0

            student_metrics[student_name] = {
                'Attendance (%)': attendance_pct,
                'Attendance #': sessions_attended,
                'Engagement (%)': engagement_pct,
            }

        rows = [{'Student': name, **values} for name, values in student_metrics.items()]
        # Explicit columns keep the schema stable when there are no students.
        return pd.DataFrame(
            rows,
            columns=['Student', 'Attendance (%)', 'Attendance #', 'Engagement (%)'],
        )

    def compute_average_metrics(self, student_metrics_df):
        """Return the rounded mean attendance and engagement percentages.

        Returns:
            ``(attendance_avg, engagement_avg)``. An average is 0 when there
            is nothing to average (empty frame or all values missing), since
            ``round()`` cannot handle the resulting NaN.
        """
        if student_metrics_df.empty:
            return 0, 0

        attendance_avg_stats = student_metrics_df['Attendance (%)'].mean()
        engagement_avg_stats = student_metrics_df['Engagement (%)'].mean()
        attendance_avg = 0 if pd.isna(attendance_avg_stats) else round(attendance_avg_stats)
        engagement_avg = 0 if pd.isna(engagement_avg_stats) else round(engagement_avg_stats)
        return attendance_avg, engagement_avg