Spaces:
Sleeping
Sleeping
Update utility/utils.py
Browse files
- utility/utils.py +9 -45
utility/utils.py
CHANGED
|
@@ -289,7 +289,7 @@ def get_paddle_ocr():
|
|
| 289 |
global _PADDLE_OCR
|
| 290 |
if _PADDLE_OCR is None:
|
| 291 |
try:
|
| 292 |
-
_PADDLE_OCR = PaddleOCR(use_angle_cls=True, lang='en'
|
| 293 |
except Exception as e:
|
| 294 |
logging.error(f"Failed to initialize PaddleOCR: {e}")
|
| 295 |
return None
|
|
@@ -384,7 +384,6 @@ def extract_contact_details(text):
|
|
| 384 |
# Phone numbers with at least 5 digits in any segment
|
| 385 |
combined_phone_regex = re.compile(r'''
|
| 386 |
(?:
|
| 387 |
-
#(?:(?:\+91[-.\s]?)?\d{5}[-.\s]?\d{5})|(?:\+?\d{1,3})?[-.\s()]?\d{5,}[-.\s()]?\d{5,}[-.\s()]?\d{1,9} | /^[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{4})$/ |
|
| 388 |
\+1\s\(\d{3}\)\s\d{3}-\d{4} | # USA/Canada Intl +1 (XXX) XXX-XXXX
|
| 389 |
\(\d{3}\)\s\d{3}-\d{4} | # USA/Canada STD (XXX) XXX-XXXX
|
| 390 |
\(\d{3}\)\s\d{3}\s\d{4} | # USA/Canada (XXX) XXX XXXX
|
|
@@ -413,13 +412,13 @@ def extract_contact_details(text):
|
|
| 413 |
\d{5}-\d{5} | # India XXXXX-XXXXX
|
| 414 |
0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
|
| 415 |
\+91\d{10} | # +91 XXXXXXXXXX
|
| 416 |
-
\d{10} | # XXXXXXXXXX
|
| 417 |
\d{6}-\d{4} | # XXXXXX-XXXX
|
| 418 |
\d{4}-\d{6} | # XXXX-XXXXXX
|
| 419 |
\d{3}\s\d{3}\s\d{4} | # XXX XXX XXXX
|
| 420 |
\d{3}-\d{3}-\d{4} | # XXX-XXX-XXXX
|
| 421 |
\d{4}\s\d{3}\s\d{3} | # XXXX XXX XXX
|
| 422 |
-
\d{4}-\d{3}-\d{3} | # XXXX-XXX-XXX
|
| 423 |
\+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
|
| 424 |
\+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
|
| 425 |
0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
|
|
@@ -469,55 +468,19 @@ def extract_contact_details(text):
|
|
| 469 |
\+54\s9\s\d{3}\s\d{3}\s\d{4} | # Argentina Intl +54 9 XXX XXX XXXX
|
| 470 |
\+54\s\d{1}\s\d{4}\s\d{4} | # Argentina Intl +54 X XXXX XXXX
|
| 471 |
0\d{3}\s\d{4} | # Argentina STD 0XXX XXXX
|
| 472 |
-
\+54\d{10} | # +54 9 XXXXXXXXXX
|
| 473 |
\+54\d{9} | # +54 XXXXXXXXX
|
| 474 |
0\d{7} | # 0XXXXXXX
|
| 475 |
\+966\s\d\s\d{3}\s\d{4} | # Saudi Intl +966 X XXX XXXX
|
| 476 |
0\d\s\d{3}\s\d{4} | # Saudi STD 0X XXX XXXX
|
| 477 |
\+966\d{8} | # +966 XXXXXXXX
|
| 478 |
0\d{8} | # 0XXXXXXXX
|
| 479 |
-
\+
|
| 480 |
-
\+1\s\d{3}\s\d{3}\s\d{
|
| 481 |
-
\d{5}\s\d{5} | # XXXXX XXXXX
|
| 482 |
-
\d{10} | # XXXXXXXXXX
|
| 483 |
-
\+44\d{10} | # +44 XXXXXXXXXX
|
| 484 |
-
0\d{10} | # 0XXXXXXXXXX
|
| 485 |
-
\+61\d{9} | # +61 XXXXXXXXX
|
| 486 |
-
0\d{9} | # 0XXXXXXXXX
|
| 487 |
-
\+91\d{10} | # +91 XXXXXXXXXX
|
| 488 |
-
\+49\d{12} | # +49 XXXXXXXXXXXX
|
| 489 |
-
\+49\d{10} | # +49 XXXXXXXXXX
|
| 490 |
-
0\d{11} | # 0XXXXXXXXXXX
|
| 491 |
-
\+86\d{11} | # +86 XXXXXXXXXXX
|
| 492 |
-
\+81\d{10} | # +81 XXXXXXXXXX
|
| 493 |
-
\+81\d{9} | # +81 XXXXXXXXX
|
| 494 |
-
0\d{9} | # 0XXXXXXXXX
|
| 495 |
-
\+55\d{11} | # +55 XXXXXXXXXXX
|
| 496 |
-
\+55\d{10} | # +55 XXXXXXXXXX
|
| 497 |
-
0\d{10} | # 0XXXXXXXXXX
|
| 498 |
-
\+33\d{9} | # +33 XXXXXXXXX
|
| 499 |
-
0\d{9} | # 0XXXXXXXXX
|
| 500 |
-
\+7\d{10} | # +7 XXXXXXXXXX
|
| 501 |
-
8\d{10} | # 8 XXXXXXXXXX
|
| 502 |
-
\+27\d{9} | # +27 XXXXXXXXX
|
| 503 |
-
0\d{9} | # 0XXXXXXXXX (South Africa STD)
|
| 504 |
-
\+52\d{10} | # +52 XXXXXXXXXX
|
| 505 |
-
01\d{7} | # 01 XXXXXXX
|
| 506 |
-
\+234\d{10} | # +234 XXXXXXXXXX
|
| 507 |
-
0\d{10} | # 0XXXXXXXXXX
|
| 508 |
-
\+971\d{8} | # +971 XXXXXXXX
|
| 509 |
-
0\d{8} | # 0XXXXXXXX
|
| 510 |
-
\+54\s9\s\d{10} | # +54 9 XXXXXXXXXX
|
| 511 |
-
\+54\d{9} | # +54 XXXXXXXXX
|
| 512 |
-
0\d{7} | # 0XXXXXXX
|
| 513 |
-
\+966\d{8} | # +966 XXXXXXXX
|
| 514 |
-
0\d{8} # 0XXXXXXXX
|
| 515 |
-
\+\d{3}-\d{3}-\d{4}
|
| 516 |
)
|
|
|
|
| 517 |
|
| 518 |
|
| 519 |
-
''',re.VERBOSE)
|
| 520 |
-
|
| 521 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
| 522 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
| 523 |
|
|
@@ -618,7 +581,8 @@ def process_resume_data(LLMdata, cont_data, extracted_text):
|
|
| 618 |
"Location": LLMdata.get("Address", []),
|
| 619 |
"Link": LLMdata.get("Link", []),
|
| 620 |
"Company": LLMdata.get("Company", []),
|
| 621 |
-
"extracted_text": extracted_text
|
|
|
|
| 622 |
}
|
| 623 |
|
| 624 |
for key in ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]:
|
|
|
|
| 289 |
global _PADDLE_OCR
|
| 290 |
if _PADDLE_OCR is None:
|
| 291 |
try:
|
| 292 |
+
_PADDLE_OCR = PaddleOCR(use_angle_cls=True, lang='en')
|
| 293 |
except Exception as e:
|
| 294 |
logging.error(f"Failed to initialize PaddleOCR: {e}")
|
| 295 |
return None
|
|
|
|
| 384 |
# Phone numbers with at least 5 digits in any segment
|
| 385 |
combined_phone_regex = re.compile(r'''
|
| 386 |
(?:
|
|
|
|
| 387 |
\+1\s\(\d{3}\)\s\d{3}-\d{4} | # USA/Canada Intl +1 (XXX) XXX-XXXX
|
| 388 |
\(\d{3}\)\s\d{3}-\d{4} | # USA/Canada STD (XXX) XXX-XXXX
|
| 389 |
\(\d{3}\)\s\d{3}\s\d{4} | # USA/Canada (XXX) XXX XXXX
|
|
|
|
| 412 |
\d{5}-\d{5} | # India XXXXX-XXXXX
|
| 413 |
0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
|
| 414 |
\+91\d{10} | # +91 XXXXXXXXXX
|
| 415 |
+
\d{10} | # XXXXXXXXXX
|
| 416 |
\d{6}-\d{4} | # XXXXXX-XXXX
|
| 417 |
\d{4}-\d{6} | # XXXX-XXXXXX
|
| 418 |
\d{3}\s\d{3}\s\d{4} | # XXX XXX XXXX
|
| 419 |
\d{3}-\d{3}-\d{4} | # XXX-XXX-XXXX
|
| 420 |
\d{4}\s\d{3}\s\d{3} | # XXXX XXX XXX
|
| 421 |
+
\d{4}-\d{3}-\d{3} | # XXXX-XXX-XXX
|
| 422 |
\+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
|
| 423 |
\+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
|
| 424 |
0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
|
|
|
|
| 468 |
\+54\s9\s\d{3}\s\d{3}\s\d{4} | # Argentina Intl +54 9 XXX XXX XXXX
|
| 469 |
\+54\s\d{1}\s\d{4}\s\d{4} | # Argentina Intl +54 X XXXX XXXX
|
| 470 |
0\d{3}\s\d{4} | # Argentina STD 0XXX XXXX
|
| 471 |
+
\+54\d{10} | # Argentina +54 XXXXXXXXXX
|
| 472 |
\+54\d{9} | # +54 XXXXXXXXX
|
| 473 |
0\d{7} | # 0XXXXXXX
|
| 474 |
\+966\s\d\s\d{3}\s\d{4} | # Saudi Intl +966 X XXX XXXX
|
| 475 |
0\d\s\d{3}\s\d{4} | # Saudi STD 0X XXX XXXX
|
| 476 |
\+966\d{8} | # +966 XXXXXXXX
|
| 477 |
0\d{8} | # 0XXXXXXXX
|
| 478 |
+
\+\d{3}-\d{3}-\d{4} | # Generic +XXX-XXX-XXXX
|
| 479 |
+
(?:\+?\d{1,3})?[-.\s()]?\d{3,5}[-.\s()]?\d{3,5}[-.\s()]?\d{3,5} # Highly flexible generic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
)
|
| 481 |
+
''', re.VERBOSE)
|
| 482 |
|
| 483 |
|
|
|
|
|
|
|
| 484 |
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
| 485 |
link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
|
| 486 |
|
|
|
|
| 581 |
"Location": LLMdata.get("Address", []),
|
| 582 |
"Link": LLMdata.get("Link", []),
|
| 583 |
"Company": LLMdata.get("Company", []),
|
| 584 |
+
"extracted_text": extracted_text,
|
| 585 |
+
"status_message": f"Source: {LLMdata.get('meta', 'Primary+Backup')}"
|
| 586 |
}
|
| 587 |
|
| 588 |
for key in ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]:
|