WebashalarForML committed on
Commit
bab53e7
·
verified ·
1 Parent(s): bdbbac1

Update utility/utils.py

Browse files
Files changed (1) hide show
  1. utility/utils.py +9 -45
utility/utils.py CHANGED
@@ -289,7 +289,7 @@ def get_paddle_ocr():
289
  global _PADDLE_OCR
290
  if _PADDLE_OCR is None:
291
  try:
292
- _PADDLE_OCR = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
293
  except Exception as e:
294
  logging.error(f"Failed to initialize PaddleOCR: {e}")
295
  return None
@@ -384,7 +384,6 @@ def extract_contact_details(text):
384
  # Phone numbers with at least 5 digits in any segment
385
  combined_phone_regex = re.compile(r'''
386
  (?:
387
- #(?:(?:\+91[-.\s]?)?\d{5}[-.\s]?\d{5})|(?:\+?\d{1,3})?[-.\s()]?\d{5,}[-.\s()]?\d{5,}[-.\s()]?\d{1,9} | /^[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{4})$/ |
388
  \+1\s\(\d{3}\)\s\d{3}-\d{4} | # USA/Canada Intl +1 (XXX) XXX-XXXX
389
  \(\d{3}\)\s\d{3}-\d{4} | # USA/Canada STD (XXX) XXX-XXXX
390
  \(\d{3}\)\s\d{3}\s\d{4} | # USA/Canada (XXX) XXX XXXX
@@ -413,13 +412,13 @@ def extract_contact_details(text):
413
  \d{5}-\d{5} | # India XXXXX-XXXXX
414
  0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
415
  \+91\d{10} | # +91 XXXXXXXXXX
416
- \d{10} | # XXXXXXXXXX # Here is the regex to handle all possible combination of the contact
417
  \d{6}-\d{4} | # XXXXXX-XXXX
418
  \d{4}-\d{6} | # XXXX-XXXXXX
419
  \d{3}\s\d{3}\s\d{4} | # XXX XXX XXXX
420
  \d{3}-\d{3}-\d{4} | # XXX-XXX-XXXX
421
  \d{4}\s\d{3}\s\d{3} | # XXXX XXX XXX
422
- \d{4}-\d{3}-\d{3} | # XXXX-XXX-XXX #-----
423
  \+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
424
  \+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
425
  0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
@@ -469,55 +468,19 @@ def extract_contact_details(text):
469
  \+54\s9\s\d{3}\s\d{3}\s\d{4} | # Argentina Intl +54 9 XXX XXX XXXX
470
  \+54\s\d{1}\s\d{4}\s\d{4} | # Argentina Intl +54 X XXXX XXXX
471
  0\d{3}\s\d{4} | # Argentina STD 0XXX XXXX
472
- \+54\d{10} | # +54 9 XXXXXXXXXX
473
  \+54\d{9} | # +54 XXXXXXXXX
474
  0\d{7} | # 0XXXXXXX
475
  \+966\s\d\s\d{3}\s\d{4} | # Saudi Intl +966 X XXX XXXX
476
  0\d\s\d{3}\s\d{4} | # Saudi STD 0X XXX XXXX
477
  \+966\d{8} | # +966 XXXXXXXX
478
  0\d{8} | # 0XXXXXXXX
479
- \+1\d{10} | # +1 XXXXXXXXXX
480
- \+1\s\d{3}\s\d{3}\s\d{4} | # +1 XXX XXX XXXX
481
- \d{5}\s\d{5} | # XXXXX XXXXX
482
- \d{10} | # XXXXXXXXXX
483
- \+44\d{10} | # +44 XXXXXXXXXX
484
- 0\d{10} | # 0XXXXXXXXXX
485
- \+61\d{9} | # +61 XXXXXXXXX
486
- 0\d{9} | # 0XXXXXXXXX
487
- \+91\d{10} | # +91 XXXXXXXXXX
488
- \+49\d{12} | # +49 XXXXXXXXXXXX
489
- \+49\d{10} | # +49 XXXXXXXXXX
490
- 0\d{11} | # 0XXXXXXXXXXX
491
- \+86\d{11} | # +86 XXXXXXXXXXX
492
- \+81\d{10} | # +81 XXXXXXXXXX
493
- \+81\d{9} | # +81 XXXXXXXXX
494
- 0\d{9} | # 0XXXXXXXXX
495
- \+55\d{11} | # +55 XXXXXXXXXXX
496
- \+55\d{10} | # +55 XXXXXXXXXX
497
- 0\d{10} | # 0XXXXXXXXXX
498
- \+33\d{9} | # +33 XXXXXXXXX
499
- 0\d{9} | # 0XXXXXXXXX
500
- \+7\d{10} | # +7 XXXXXXXXXX
501
- 8\d{10} | # 8 XXXXXXXXXX
502
- \+27\d{9} | # +27 XXXXXXXXX
503
- 0\d{9} | # 0XXXXXXXXX (South Africa STD)
504
- \+52\d{10} | # +52 XXXXXXXXXX
505
- 01\d{7} | # 01 XXXXXXX
506
- \+234\d{10} | # +234 XXXXXXXXXX
507
- 0\d{10} | # 0XXXXXXXXXX
508
- \+971\d{8} | # +971 XXXXXXXX
509
- 0\d{8} | # 0XXXXXXXX
510
- \+54\s9\s\d{10} | # +54 9 XXXXXXXXXX
511
- \+54\d{9} | # +54 XXXXXXXXX
512
- 0\d{7} | # 0XXXXXXX
513
- \+966\d{8} | # +966 XXXXXXXX
514
- 0\d{8} # 0XXXXXXXX
515
- \+\d{3}-\d{3}-\d{4}
516
  )
 
517
 
518
 
519
- ''',re.VERBOSE)
520
-
521
  email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
522
  link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
523
 
@@ -618,7 +581,8 @@ def process_resume_data(LLMdata, cont_data, extracted_text):
618
  "Location": LLMdata.get("Address", []),
619
  "Link": LLMdata.get("Link", []),
620
  "Company": LLMdata.get("Company", []),
621
- "extracted_text": extracted_text
 
622
  }
623
 
624
  for key in ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]:
 
289
  global _PADDLE_OCR
290
  if _PADDLE_OCR is None:
291
  try:
292
+ _PADDLE_OCR = PaddleOCR(use_angle_cls=True, lang='en')
293
  except Exception as e:
294
  logging.error(f"Failed to initialize PaddleOCR: {e}")
295
  return None
 
384
  # Phone numbers with at least 5 digits in any segment
385
  combined_phone_regex = re.compile(r'''
386
  (?:
 
387
  \+1\s\(\d{3}\)\s\d{3}-\d{4} | # USA/Canada Intl +1 (XXX) XXX-XXXX
388
  \(\d{3}\)\s\d{3}-\d{4} | # USA/Canada STD (XXX) XXX-XXXX
389
  \(\d{3}\)\s\d{3}\s\d{4} | # USA/Canada (XXX) XXX XXXX
 
412
  \d{5}-\d{5} | # India XXXXX-XXXXX
413
  0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
414
  \+91\d{10} | # +91 XXXXXXXXXX
415
+ \d{10} | # XXXXXXXXXX
416
  \d{6}-\d{4} | # XXXXXX-XXXX
417
  \d{4}-\d{6} | # XXXX-XXXXXX
418
  \d{3}\s\d{3}\s\d{4} | # XXX XXX XXXX
419
  \d{3}-\d{3}-\d{4} | # XXX-XXX-XXXX
420
  \d{4}\s\d{3}\s\d{3} | # XXXX XXX XXX
421
+ \d{4}-\d{3}-\d{3} | # XXXX-XXX-XXX
422
  \+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
423
  \+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
424
  0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
 
468
  \+54\s9\s\d{3}\s\d{3}\s\d{4} | # Argentina Intl +54 9 XXX XXX XXXX
469
  \+54\s\d{1}\s\d{4}\s\d{4} | # Argentina Intl +54 X XXXX XXXX
470
  0\d{3}\s\d{4} | # Argentina STD 0XXX XXXX
471
+ \+54\d{10} | # Argentina +54 XXXXXXXXXX (10 digits, no separators)
472
  \+54\d{9} | # +54 XXXXXXXXX
473
  0\d{7} | # 0XXXXXXX
474
  \+966\s\d\s\d{3}\s\d{4} | # Saudi Intl +966 X XXX XXXX
475
  0\d\s\d{3}\s\d{4} | # Saudi STD 0X XXX XXXX
476
  \+966\d{8} | # +966 XXXXXXXX
477
  0\d{8} | # 0XXXXXXXX
478
+ \+\d{3}-\d{3}-\d{4} | # Generic +XXX-XXX-XXXX
479
+ (?:\+?\d{1,3})?[-.\s()]?\d{3,5}[-.\s()]?\d{3,5}[-.\s()]?\d{3,5} # Highly flexible generic
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  )
481
+ ''', re.VERBOSE)
482
 
483
 
 
 
484
  email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
485
  link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
486
 
 
581
  "Location": LLMdata.get("Address", []),
582
  "Link": LLMdata.get("Link", []),
583
  "Company": LLMdata.get("Company", []),
584
+ "extracted_text": extracted_text,
585
+ "status_message": f"Source: {LLMdata.get('meta', 'Primary+Backup')}"
586
  }
587
 
588
  for key in ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]: