WebashalarForML committed on
Commit
5194ace
·
verified ·
1 Parent(s): 28a746e

Update utility/utils.py

Browse files
Files changed (1) hide show
  1. utility/utils.py +168 -88
utility/utils.py CHANGED
@@ -514,107 +514,187 @@ def remove_duplicates_case_insensitive(data_dict):
514
  data_dict[key] = unique_list
515
  return data_dict
516
 
517
- # Process the model output for parsed result
518
- def process_resume_data(LLMdata,cont_data,extracted_text):
519
-
520
- # # Removing duplicate emails
521
- # unique_emails = []
522
- # for email in cont_data['emails']:
523
- # if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
524
- # unique_emails.append(email)
525
 
526
- # # Removing duplicate links (case insensitive)
527
- # unique_links = []
528
- # for link in cont_data['links_RE']:
529
- # if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
530
- # unique_links.append(link)
531
 
532
- # # Removing duplicate phone numbers
533
- # normalized_contact = [num[-10:] for num in LLMdata['Contact']]
534
- # unique_numbers = []
535
- # for num in cont_data['phone_numbers']:
536
- # if num[-10:] not in normalized_contact:
537
- # unique_numbers.append(num)
538
 
539
- # # Add unique emails, links, and phone numbers to the original LLMdata
540
- # LLMdata['Email'] += unique_emails
541
- # LLMdata['Link'] += unique_links
542
- # LLMdata['Contact'] += unique_numbers
543
- # Ensure keys exist (CRITICAL FIX)
544
- LLMdata['Email'] = LLMdata.get('Email', []) or []
545
- LLMdata['Link'] = LLMdata.get('Link', []) or []
546
- LLMdata['Contact'] = LLMdata.get('Contact', []) or []
547
 
548
- # Removing duplicate emails
549
- unique_emails = []
550
- for email in cont_data.get('emails', []):
551
- if not any(email.lower() == str(existing_email).lower() for existing_email in LLMdata['Email']):
552
- unique_emails.append(email)
553
 
554
- # Removing duplicate links
555
- unique_links = []
556
- for link in cont_data.get('links_RE', []):
557
- if not any(link.lower() == str(existing_link).lower() for existing_link in LLMdata['Link']):
558
- unique_links.append(link)
559
 
560
- # Normalize existing contacts safely
561
- normalized_contact = [
562
- str(num)[-10:] for num in LLMdata['Contact'] if num
563
- ]
564
 
565
- # Removing duplicate phone numbers
566
- unique_numbers = []
567
- for num in cont_data.get('phone_numbers', []):
568
- if str(num)[-10:] not in normalized_contact:
569
- unique_numbers.append(num)
570
 
571
- # Merge safely
572
- LLMdata['Email'].extend(unique_emails)
573
- LLMdata['Link'].extend(unique_links)
574
- LLMdata['Contact'].extend(unique_numbers)
575
 
576
 
577
- # Apply the function to the data
578
- LLMdata=remove_duplicates_case_insensitive(LLMdata)
579
 
580
- # Initialize the processed data dictionary
581
- processed_data = {
582
- "name": [],
583
- "contact_number": [],
584
- "Designation":[],
585
- "email": [],
586
- "Location": [],
587
- "Link": [],
588
- "Company":[],
589
- "extracted_text": extracted_text
590
- }
591
- #LLM
592
 
593
- processed_data['name'].extend(LLMdata.get('Name', None))
594
- #processed_data['contact_number'].extend(LLMdata.get('Contact', []))
595
- processed_data['Designation'].extend(LLMdata.get('Designation', []))
596
- #processed_data['email'].extend(LLMdata.get("Email", []))
597
- processed_data['Location'].extend(LLMdata.get('Address', []))
598
- #processed_data['Link'].extend(LLMdata.get('Link', []))
599
- processed_data['Company'].extend(LLMdata.get('Company', []))
600
 
601
- #Contact
602
- #processed_data['email'].extend(cont_data.get("emails", []))
603
- #processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
604
- #processed_data['Link'].extend(cont_data.get("links_RE", []))
605
 
606
- #New_merge_data
607
- processed_data['email'].extend(LLMdata['Email'])
608
- processed_data['contact_number'].extend(LLMdata['Contact'])
609
- processed_data['Link'].extend(LLMdata['Link'])
610
-
611
- #to remove not found fields
612
- # List of keys to check for 'Not found'
613
- keys_to_check = ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]
614
-
615
- # Replace 'Not found' with an empty list for each key
616
- for key in keys_to_check:
617
- if processed_data[key] == ['Not found'] or processed_data[key] == ['not found']:
618
- processed_data[key] = []
619
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
  return processed_data
 
514
  data_dict[key] = unique_list
515
  return data_dict
516
 
517
+ # # Process the model output for parsed result
518
+ # def process_resume_data(LLMdata,cont_data,extracted_text):
519
+
520
+ # # # Removing duplicate emails
521
+ # # unique_emails = []
522
+ # # for email in cont_data['emails']:
523
+ # # if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
524
+ # # unique_emails.append(email)
525
 
526
+ # # # Removing duplicate links (case insensitive)
527
+ # # unique_links = []
528
+ # # for link in cont_data['links_RE']:
529
+ # # if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
530
+ # # unique_links.append(link)
531
 
532
+ # # # Removing duplicate phone numbers
533
+ # # normalized_contact = [num[-10:] for num in LLMdata['Contact']]
534
+ # # unique_numbers = []
535
+ # # for num in cont_data['phone_numbers']:
536
+ # # if num[-10:] not in normalized_contact:
537
+ # # unique_numbers.append(num)
538
 
539
+ # # # Add unique emails, links, and phone numbers to the original LLMdata
540
+ # # LLMdata['Email'] += unique_emails
541
+ # # LLMdata['Link'] += unique_links
542
+ # # LLMdata['Contact'] += unique_numbers
543
+ # # Ensure keys exist (CRITICAL FIX)
544
+ # LLMdata['Email'] = LLMdata.get('Email', []) or []
545
+ # LLMdata['Link'] = LLMdata.get('Link', []) or []
546
+ # LLMdata['Contact'] = LLMdata.get('Contact', []) or []
547
 
548
+ # # Removing duplicate emails
549
+ # unique_emails = []
550
+ # for email in cont_data.get('emails', []):
551
+ # if not any(email.lower() == str(existing_email).lower() for existing_email in LLMdata['Email']):
552
+ # unique_emails.append(email)
553
 
554
+ # # Removing duplicate links
555
+ # unique_links = []
556
+ # for link in cont_data.get('links_RE', []):
557
+ # if not any(link.lower() == str(existing_link).lower() for existing_link in LLMdata['Link']):
558
+ # unique_links.append(link)
559
 
560
+ # # Normalize existing contacts safely
561
+ # normalized_contact = [
562
+ # str(num)[-10:] for num in LLMdata['Contact'] if num
563
+ # ]
564
 
565
+ # # Removing duplicate phone numbers
566
+ # unique_numbers = []
567
+ # for num in cont_data.get('phone_numbers', []):
568
+ # if str(num)[-10:] not in normalized_contact:
569
+ # unique_numbers.append(num)
570
 
571
+ # # Merge safely
572
+ # LLMdata['Email'].extend(unique_emails)
573
+ # LLMdata['Link'].extend(unique_links)
574
+ # LLMdata['Contact'].extend(unique_numbers)
575
 
576
 
577
+ # # Apply the function to the data
578
+ # LLMdata=remove_duplicates_case_insensitive(LLMdata)
579
 
580
+ # # Initialize the processed data dictionary
581
+ # processed_data = {
582
+ # "name": [],
583
+ # "contact_number": [],
584
+ # "Designation":[],
585
+ # "email": [],
586
+ # "Location": [],
587
+ # "Link": [],
588
+ # "Company":[],
589
+ # "extracted_text": extracted_text
590
+ # }
591
+ # #LLM
592
 
593
+ # processed_data['name'].extend(LLMdata.get('Name', None))
594
+ # #processed_data['contact_number'].extend(LLMdata.get('Contact', []))
595
+ # processed_data['Designation'].extend(LLMdata.get('Designation', []))
596
+ # #processed_data['email'].extend(LLMdata.get("Email", []))
597
+ # processed_data['Location'].extend(LLMdata.get('Address', []))
598
+ # #processed_data['Link'].extend(LLMdata.get('Link', []))
599
+ # processed_data['Company'].extend(LLMdata.get('Company', []))
600
 
601
+ # #Contact
602
+ # #processed_data['email'].extend(cont_data.get("emails", []))
603
+ # #processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
604
+ # #processed_data['Link'].extend(cont_data.get("links_RE", []))
605
 
606
+ # #New_merge_data
607
+ # processed_data['email'].extend(LLMdata['Email'])
608
+ # processed_data['contact_number'].extend(LLMdata['Contact'])
609
+ # processed_data['Link'].extend(LLMdata['Link'])
610
+
611
+ # #to remove not found fields
612
+ # # List of keys to check for 'Not found'
613
+ # keys_to_check = ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]
614
+
615
+ # # Replace 'Not found' with an empty list for each key
616
+ # for key in keys_to_check:
617
+ # if processed_data[key] == ['Not found'] or processed_data[key] == ['not found']:
618
+ # processed_data[key] = []
619
 
620
+ # return processed_data
621
+ def process_resume_data(LLMdata, cont_data, extracted_text):
622
+
623
+ # -------------------------------
624
+ # ✅ STEP 1: Normalize LLM Schema
625
+ # -------------------------------
626
+ expected_keys = ["Name", "Designation", "Company", "Contact", "Address", "Email", "Link"]
627
+
628
+ for key in expected_keys:
629
+ if key not in LLMdata or LLMdata[key] is None:
630
+ LLMdata[key] = []
631
+ elif not isinstance(LLMdata[key], list):
632
+ LLMdata[key] = [LLMdata[key]]
633
+
634
+ # -------------------------------
635
+ # ✅ STEP 2: Normalize cont_data
636
+ # -------------------------------
637
+ cont_data = cont_data or {}
638
+ cont_data.setdefault("emails", [])
639
+ cont_data.setdefault("phone_numbers", [])
640
+ cont_data.setdefault("links_RE", [])
641
+
642
+ # -------------------------------
643
+ # ✅ STEP 3: Normalize existing contacts
644
+ # -------------------------------
645
+ normalized_llm_numbers = {
646
+ str(num)[-10:] for num in LLMdata["Contact"] if num
647
+ }
648
+
649
+ # -------------------------------
650
+ # ✅ STEP 4: Merge Emails
651
+ # -------------------------------
652
+ for email in cont_data["emails"]:
653
+ if not any(email.lower() == str(e).lower() for e in LLMdata["Email"]):
654
+ LLMdata["Email"].append(email)
655
+
656
+ # -------------------------------
657
+ # ✅ STEP 5: Merge Links
658
+ # -------------------------------
659
+ for link in cont_data["links_RE"]:
660
+ if not any(link.lower() == str(l).lower() for l in LLMdata["Link"]):
661
+ LLMdata["Link"].append(link)
662
+
663
+ # -------------------------------
664
+ # ✅ STEP 6: Merge Phone Numbers
665
+ # -------------------------------
666
+ for num in cont_data["phone_numbers"]:
667
+ norm = str(num)[-10:]
668
+ if norm not in normalized_llm_numbers:
669
+ LLMdata["Contact"].append(num)
670
+ normalized_llm_numbers.add(norm)
671
+
672
+ # -------------------------------
673
+ # ✅ STEP 7: Remove duplicates (case-insensitive)
674
+ # -------------------------------
675
+ LLMdata = remove_duplicates_case_insensitive(LLMdata)
676
+
677
+ # -------------------------------
678
+ # ✅ STEP 8: Build final structure
679
+ # -------------------------------
680
+ processed_data = {
681
+ "name": LLMdata["Name"],
682
+ "contact_number": LLMdata["Contact"],
683
+ "Designation": LLMdata["Designation"],
684
+ "email": LLMdata["Email"],
685
+ "Location": LLMdata["Address"],
686
+ "Link": LLMdata["Link"],
687
+ "Company": LLMdata["Company"],
688
+ "extracted_text": extracted_text
689
+ }
690
+
691
+ # -------------------------------
692
+ # ✅ STEP 9: Clean "Not found"
693
+ # -------------------------------
694
+ for key in ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]:
695
+ processed_data[key] = [
696
+ v for v in processed_data[key]
697
+ if str(v).lower() != "not found"
698
+ ]
699
+
700
  return processed_data