Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -252,26 +252,10 @@ class LearningStyleQuiz:
|
|
252 |
# Initialize learning style quiz
|
253 |
learning_style_quiz = LearningStyleQuiz()
|
254 |
|
255 |
-
# ========== TRANSCRIPT PARSER ==========
|
256 |
-
class
|
257 |
def __init__(self):
|
258 |
-
|
259 |
-
self.format1_patterns = {
|
260 |
-
'student_info': re.compile(
|
261 |
-
r"(\d{7}) - (.*?)\s*\|\s*Current Grade:\s*(\d+)\s*\|\s*YOG\s*(\d{4})"
|
262 |
-
r"\s*\|\s*Weighted GPA\s*([\d.]+)\s*\|\s*Comm Serv Date\s*(\d{2}/\d{2}/\d{4})"
|
263 |
-
r"\s*\|\s*Total Credits Earned\s*([\d.]+)"
|
264 |
-
),
|
265 |
-
'requirement': re.compile(
|
266 |
-
r"([A-Z]-[A-Za-z ]+)\s*\|\s*([^|]+)\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([^|]+)%"
|
267 |
-
),
|
268 |
-
'course': re.compile(
|
269 |
-
r"([A-Z]-[A-Za-z ]+)\s*\|\s*(\d{4}-\d{4})\s*\|\s*(\d{2})\s*\|\s*([A-Z0-9]+)\s*\|\s*([^|]+)\|"
|
270 |
-
r"\s*([A-Z0-9])\s*\|\s*(\d+)\s*\|\s*([A-Z])\s*\|\s*([A-Z])\s*\|\s*([\d.]+|inProgress)"
|
271 |
-
)
|
272 |
-
}
|
273 |
-
|
274 |
-
self.format2_patterns = {
|
275 |
'student_info': re.compile(
|
276 |
r"LEGAL NAME:\s*([A-Z]+,\s*[A-Z]+).*?"
|
277 |
r"GRADE LEVEL:\s*(\d+).*?"
|
@@ -297,98 +281,42 @@ class MiamiDadeTranscriptParser:
|
|
297 |
r"BIOLOGY ASSESSMENT PASSED|"
|
298 |
r"DISTRICT COMM/VOL SERVICE RQMT MET:\s*(YES).*?HRS:\s*(\d+)",
|
299 |
re.DOTALL
|
|
|
|
|
|
|
|
|
300 |
)
|
301 |
}
|
302 |
-
|
303 |
def parse_transcript(self, file_path: str) -> Dict:
|
304 |
-
"""Parse Miami-Dade transcript PDF
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
else:
|
318 |
-
raise ValueError("Unrecognized transcript format")
|
319 |
-
|
320 |
-
def _parse_format1(self, text: str) -> Dict:
|
321 |
-
"""Parse the first transcript format"""
|
322 |
-
parsed_data = {
|
323 |
-
'student_info': self._parse_format1_student_info(text),
|
324 |
-
'requirements': self._parse_format1_requirements(text),
|
325 |
-
'course_history': self._parse_format1_courses(text),
|
326 |
-
'format': 'progress_summary'
|
327 |
-
}
|
328 |
-
return parsed_data
|
329 |
-
|
330 |
-
def _parse_format1_student_info(self, text: str) -> Dict:
|
331 |
-
"""Extract student information from format 1"""
|
332 |
-
match = self.format1_patterns['student_info'].search(text)
|
333 |
-
if not match:
|
334 |
-
return {}
|
335 |
-
|
336 |
-
return {
|
337 |
-
'id': match.group(1),
|
338 |
-
'name': match.group(2).strip(),
|
339 |
-
'grade': match.group(3),
|
340 |
-
'year_of_graduation': match.group(4),
|
341 |
-
'weighted_gpa': float(match.group(5)),
|
342 |
-
'community_service_date': match.group(6),
|
343 |
-
'total_credits': float(match.group(7)),
|
344 |
-
'district': 'Miami-Dade'
|
345 |
-
}
|
346 |
-
|
347 |
-
def _parse_format1_requirements(self, text: str) -> Dict:
|
348 |
-
"""Parse graduation requirements section from format 1"""
|
349 |
-
requirements = {}
|
350 |
-
for match in self.format1_patterns['requirement'].finditer(text):
|
351 |
-
requirements[match.group(1).strip()] = {
|
352 |
-
'description': match.group(2).strip(),
|
353 |
-
'required': float(match.group(3)),
|
354 |
-
'waived': float(match.group(4)),
|
355 |
-
'completed': float(match.group(5)),
|
356 |
-
'percent_complete': float(match.group(6))
|
357 |
-
}
|
358 |
-
return requirements
|
359 |
-
|
360 |
-
def _parse_format1_courses(self, text: str) -> List[Dict]:
|
361 |
-
"""Parse course history section from format 1"""
|
362 |
-
courses = []
|
363 |
-
for match in self.format1_patterns['course'].finditer(text):
|
364 |
-
courses.append({
|
365 |
-
'requirement': match.group(1).strip(),
|
366 |
-
'school_year': match.group(2),
|
367 |
-
'grade_level': match.group(3),
|
368 |
-
'course_code': match.group(4),
|
369 |
-
'description': match.group(5).strip(),
|
370 |
-
'term': match.group(6),
|
371 |
-
'district_number': match.group(7),
|
372 |
-
'included': match.group(8),
|
373 |
-
'credits': 0 if 'inProgress' in match.group(9) else float(match.group(9)),
|
374 |
-
'status': 'In Progress' if 'inProgress' in match.group(9) else 'Completed'
|
375 |
-
})
|
376 |
-
return courses
|
377 |
|
378 |
-
def
|
379 |
-
"""Parse the
|
380 |
parsed_data = {
|
381 |
-
'student_info': self.
|
382 |
-
'academic_summary': self.
|
383 |
-
'course_history': self.
|
384 |
-
'assessments': self.
|
385 |
-
'format': '
|
386 |
}
|
387 |
return parsed_data
|
388 |
|
389 |
-
def
|
390 |
-
"""Extract student information
|
391 |
-
match = self.
|
392 |
if not match:
|
393 |
return {}
|
394 |
|
@@ -411,10 +339,11 @@ class MiamiDadeTranscriptParser:
|
|
411 |
eth_match = re.search(r"ETHNICITY:\s*([^\n]+)", text)
|
412 |
return eth_match.group(1).strip() if eth_match else None
|
413 |
|
414 |
-
def
|
415 |
-
"""Parse academic summary section
|
416 |
-
gpa_match = self.
|
417 |
-
credits_matches = self.
|
|
|
418 |
|
419 |
summary = {
|
420 |
'gpa': {
|
@@ -422,7 +351,10 @@ class MiamiDadeTranscriptParser:
|
|
422 |
'state': float(gpa_match.group(2)) if gpa_match else None
|
423 |
},
|
424 |
'credits': {},
|
425 |
-
'class_rank':
|
|
|
|
|
|
|
426 |
}
|
427 |
|
428 |
for match in credits_matches:
|
@@ -435,21 +367,10 @@ class MiamiDadeTranscriptParser:
|
|
435 |
|
436 |
return summary
|
437 |
|
438 |
-
def
|
439 |
-
"""
|
440 |
-
rank_match = re.search(
|
441 |
-
r"\*\s+PERCENTILE:\s*(\d+)\s*\*\s*TOTAL NUMBER IN CLASS:\s*(\d+)",
|
442 |
-
text
|
443 |
-
)
|
444 |
-
return {
|
445 |
-
'percentile': int(rank_match.group(1)) if rank_match else None,
|
446 |
-
'class_size': int(rank_match.group(2)) if rank_match else None
|
447 |
-
}
|
448 |
-
|
449 |
-
def _parse_format2_courses(self, text: str) -> List[Dict]:
|
450 |
-
"""Parse course history section from format 2"""
|
451 |
courses = []
|
452 |
-
for match in self.
|
453 |
courses.append({
|
454 |
'term': match.group(1),
|
455 |
'course_code': match.group(2),
|
@@ -463,9 +384,9 @@ class MiamiDadeTranscriptParser:
|
|
463 |
})
|
464 |
return courses
|
465 |
|
466 |
-
def
|
467 |
-
"""Parse assessment and requirement information
|
468 |
-
matches = self.
|
469 |
assessments = {
|
470 |
'ela_passed_date': None,
|
471 |
'algebra_passed': False,
|
@@ -491,8 +412,8 @@ class MiamiDadeTranscriptParser:
|
|
491 |
|
492 |
return assessments
|
493 |
|
494 |
-
# Initialize
|
495 |
-
transcript_parser =
|
496 |
|
497 |
# ========== ACADEMIC ANALYZER ==========
|
498 |
class AcademicAnalyzer:
|
@@ -600,7 +521,6 @@ class AcademicAnalyzer:
|
|
600 |
|
601 |
try:
|
602 |
if parsed_data.get('format') == 'progress_summary':
|
603 |
-
# Format 1 analysis
|
604 |
total_match = re.search(r'Total\s*\|\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)%', text)
|
605 |
if total_match:
|
606 |
analysis['completion_percentage'] = float(total_match.group(4))
|
@@ -628,7 +548,6 @@ class AcademicAnalyzer:
|
|
628 |
if req and float(req.get('completed', 0)) < float(req.get('required', 0))
|
629 |
]
|
630 |
else:
|
631 |
-
# Format 2 analysis
|
632 |
credits = parsed_data.get('academic_summary', {}).get('credits', {})
|
633 |
total_required = sum(
|
634 |
v.get('required', 0)
|
@@ -1466,7 +1385,7 @@ class EnhancedTeachingAssistant:
|
|
1466 |
service_hours = transcript.get('student_info', {}).get('community_service_hours', 0)
|
1467 |
else:
|
1468 |
gpa = transcript.get('academic_summary', {}).get('gpa', {}).get('district', None)
|
1469 |
-
service_hours = transcript.get('assessments', {}).get('community_service', {}).get('hours', 0)
|
1470 |
|
1471 |
learning_style = re.search(r"Your primary learning style is\s*\*\*(.*?)\*\*",
|
1472 |
profile.get('learning_style', ''))
|
|
|
252 |
# Initialize learning style quiz
|
253 |
learning_style_quiz = LearningStyleQuiz()
|
254 |
|
255 |
+
# ========== ENHANCED TRANSCRIPT PARSER ==========
|
256 |
+
class EnhancedMiamiDadeTranscriptParser:
|
257 |
def __init__(self):
|
258 |
+
self.patterns = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
'student_info': re.compile(
|
260 |
r"LEGAL NAME:\s*([A-Z]+,\s*[A-Z]+).*?"
|
261 |
r"GRADE LEVEL:\s*(\d+).*?"
|
|
|
281 |
r"BIOLOGY ASSESSMENT PASSED|"
|
282 |
r"DISTRICT COMM/VOL SERVICE RQMT MET:\s*(YES).*?HRS:\s*(\d+)",
|
283 |
re.DOTALL
|
284 |
+
),
|
285 |
+
'class_rank': re.compile(
|
286 |
+
r"\*\s+PERCENTILE:\s*(\d+)\s*\*\s*TOTAL NUMBER IN CLASS:\s*(\d+)",
|
287 |
+
re.DOTALL
|
288 |
)
|
289 |
}
|
290 |
+
|
291 |
def parse_transcript(self, file_path: str) -> Dict:
    """Parse a Miami-Dade transcript PDF with enhanced pattern matching.

    Args:
        file_path: Location of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        The dictionary produced by ``self._parse_format`` from the
        whitespace-normalized text of all pages.

    Raises:
        ValueError: If opening, text extraction or parsing fails. The
            original exception is logged and chained as ``__cause__``.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may give None for a page without extractable
            # text (depends on the pdfplumber version); fall back to "" so
            # the join cannot fail with TypeError on such pages.
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)

        # Clean up text: collapse every whitespace run (newlines included)
        # into one space, then remove whitespace between adjacent digits.
        # NOTE(review): the second pass also fuses two separate numbers
        # ("11 2024" -> "112024"), and after the first pass no "\n" remains,
        # which changes what patterns such as r"ETHNICITY:\s*([^\n]+)" can
        # capture -- confirm both are intended.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'(?<=\d)\s+(?=\d)', '', text)

        return self._parse_format(text)
    except Exception as e:
        logger.error(f"Error parsing transcript: {e}")
        # Chain the cause so the underlying traceback is not lost.
        raise ValueError(f"Error processing transcript: {e}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
|
306 |
+
def _parse_format(self, text: str) -> Dict:
    """Assemble the parsed transcript from the per-section helpers.

    Every helper receives the same ``text``. The returned dictionary holds
    their results under 'student_info', 'academic_summary',
    'course_history' and 'assessments', plus the fixed 'format' tag
    'cumulative_summary_v2'.
    """
    # Helpers run in the same order as the keys appear in the result.
    student = self._parse_student_info(text)
    summary = self._parse_academic_summary(text)
    history = self._parse_courses(text)
    tests = self._parse_assessments(text)
    return {
        'student_info': student,
        'academic_summary': summary,
        'course_history': history,
        'assessments': tests,
        'format': 'cumulative_summary_v2',
    }
|
316 |
|
317 |
+
def _parse_student_info(self, text: str) -> Dict:
|
318 |
+
"""Extract student information"""
|
319 |
+
match = self.patterns['student_info'].search(text)
|
320 |
if not match:
|
321 |
return {}
|
322 |
|
|
|
339 |
eth_match = re.search(r"ETHNICITY:\s*([^\n]+)", text)
|
340 |
return eth_match.group(1).strip() if eth_match else None
|
341 |
|
342 |
+
def _parse_academic_summary(self, text: str) -> Dict:
|
343 |
+
"""Parse academic summary section"""
|
344 |
+
gpa_match = self.patterns['gpa'].search(text)
|
345 |
+
credits_matches = self.patterns['credits'].finditer(text)
|
346 |
+
rank_match = self.patterns['class_rank'].search(text)
|
347 |
|
348 |
summary = {
|
349 |
'gpa': {
|
|
|
351 |
'state': float(gpa_match.group(2)) if gpa_match else None
|
352 |
},
|
353 |
'credits': {},
|
354 |
+
'class_rank': {
|
355 |
+
'percentile': int(rank_match.group(1)) if rank_match else None,
|
356 |
+
'class_size': int(rank_match.group(2)) if rank_match else None
|
357 |
+
}
|
358 |
}
|
359 |
|
360 |
for match in credits_matches:
|
|
|
367 |
|
368 |
return summary
|
369 |
|
370 |
+
def _parse_courses(self, text: str) -> List[Dict]:
|
371 |
+
"""Parse course history section"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
courses = []
|
373 |
+
for match in self.patterns['course'].finditer(text):
|
374 |
courses.append({
|
375 |
'term': match.group(1),
|
376 |
'course_code': match.group(2),
|
|
|
384 |
})
|
385 |
return courses
|
386 |
|
387 |
+
def _parse_assessments(self, text: str) -> Dict:
|
388 |
+
"""Parse assessment and requirement information"""
|
389 |
+
matches = self.patterns['assessment'].finditer(text)
|
390 |
assessments = {
|
391 |
'ela_passed_date': None,
|
392 |
'algebra_passed': False,
|
|
|
412 |
|
413 |
return assessments
|
414 |
|
415 |
+
# Initialize the enhanced parser
|
416 |
+
transcript_parser = EnhancedMiamiDadeTranscriptParser()
|
417 |
|
418 |
# ========== ACADEMIC ANALYZER ==========
|
419 |
class AcademicAnalyzer:
|
|
|
521 |
|
522 |
try:
|
523 |
if parsed_data.get('format') == 'progress_summary':
|
|
|
524 |
total_match = re.search(r'Total\s*\|\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)%', text)
|
525 |
if total_match:
|
526 |
analysis['completion_percentage'] = float(total_match.group(4))
|
|
|
548 |
if req and float(req.get('completed', 0)) < float(req.get('required', 0))
|
549 |
]
|
550 |
else:
|
|
|
551 |
credits = parsed_data.get('academic_summary', {}).get('credits', {})
|
552 |
total_required = sum(
|
553 |
v.get('required', 0)
|
|
|
1385 |
service_hours = transcript.get('student_info', {}).get('community_service_hours', 0)
|
1386 |
else:
|
1387 |
gpa = transcript.get('academic_summary', {}).get('gpa', {}).get('district', None)
|
1388 |
+
# Service hours are read from assessments -> community_service -> hours in this branch; 0 when any level is missing.
service_hours = transcript.get('assessments', {}).get('community_service', {}).get('hours', 0)
|
1389 |
|
1390 |
learning_style = re.search(r"Your primary learning style is\s*\*\*(.*?)\*\*",
|
1391 |
profile.get('learning_style', ''))
|