ImageDataExtractor2

Running

App Files Files Community

WebashalarForML committed 3 days ago

Commit

5194ace

verified ·

1 Parent(s): 28a746e

Update utility/utils.py

Browse files

Files changed (1) hide show

utility/utils.py +168 -88

utility/utils.py CHANGED Viewed

@@ -514,107 +514,187 @@ def remove_duplicates_case_insensitive(data_dict):
         data_dict[key] = unique_list
     return data_dict
-# Process the model output for parsed result
def process_resume_data(LLMdata, cont_data, extracted_text):
    """Merge LLM-parsed fields with regex-extracted contact details.

    Args:
        LLMdata: dict produced by the model. The keys read here are
            ``Name``, ``Designation``, ``Company``, ``Contact``, ``Address``,
            ``Email`` and ``Link``. Missing/``None`` values become ``[]`` and
            scalar values are wrapped in a list. The dict is updated in place.
        cont_data: dict of regex hits, read for the keys ``emails``,
            ``phone_numbers`` and ``links_RE``. ``None`` is treated as empty.
        extracted_text: stored unchanged under ``"extracted_text"``.

    Returns:
        dict with the keys ``name``, ``contact_number``, ``Designation``,
        ``email``, ``Location``, ``Link``, ``Company`` (all lists) and
        ``extracted_text``.

    Note:
        Relies on the module-level ``remove_duplicates_case_insensitive``;
        presumably it de-duplicates each list value case-insensitively and
        returns the dict (only its tail is visible from here -- confirm).
    """

    def _as_list(value):
        # ``None`` -> [], containers -> list copy, any scalar -> [scalar].
        # Without this, ``.extend(None)`` raises TypeError and extending with
        # a bare string would add it one character at a time.
        if value is None:
            return []
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]

    cont_data = cont_data or {}

    # Make every field the rest of the function touches a real list.
    for field in ('Name', 'Designation', 'Company', 'Contact',
                  'Address', 'Email', 'Link'):
        LLMdata[field] = _as_list(LLMdata.get(field))

    # Regex emails/links that the LLM did not already report
    # (case-insensitive comparison).
    known_emails = {str(existing).lower() for existing in LLMdata['Email']}
    unique_emails = [
        email for email in _as_list(cont_data.get('emails'))
        if str(email).lower() not in known_emails
    ]

    known_links = {str(existing).lower() for existing in LLMdata['Link']}
    unique_links = [
        link for link in _as_list(cont_data.get('links_RE'))
        if str(link).lower() not in known_links
    ]

    # Phone numbers are compared on their last 10 characters so that a
    # leading country code does not defeat the comparison.
    normalized_contact = {str(num)[-10:] for num in LLMdata['Contact'] if num}
    unique_numbers = [
        num for num in _as_list(cont_data.get('phone_numbers'))
        if str(num)[-10:] not in normalized_contact
    ]

    LLMdata['Email'].extend(unique_emails)
    LLMdata['Link'].extend(unique_links)
    LLMdata['Contact'].extend(unique_numbers)

    LLMdata = remove_duplicates_case_insensitive(LLMdata)

    processed_data = {
        "name": _as_list(LLMdata.get('Name')),
        "contact_number": _as_list(LLMdata.get('Contact')),
        "Designation": _as_list(LLMdata.get('Designation')),
        "email": _as_list(LLMdata.get('Email')),
        "Location": _as_list(LLMdata.get('Address')),
        "Link": _as_list(LLMdata.get('Link')),
        "Company": _as_list(LLMdata.get('Company')),
        "extracted_text": extracted_text,
    }

    # Drop the model's "Not found" placeholder entry by entry; checking only
    # for a list equal to ['Not found'] would leave the placeholder in place
    # once regex values have been merged into the same list.
    for key in ("name", "contact_number", "Designation", "email",
                "Location", "Link", "Company"):
        processed_data[key] = [
            value for value in processed_data[key]
            if str(value).lower() != 'not found'
        ]
    return processed_data

         data_dict[key] = unique_list
     return data_dict
+# # Process the model output for parsed result
+# def process_resume_data(LLMdata,cont_data,extracted_text):
+#     # # Removing duplicate emails
+#     # unique_emails = []
+#     # for email in cont_data['emails']:
+#     #     if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
+#     #         unique_emails.append(email)
+#     # # Removing duplicate links (case insensitive)
+#     # unique_links = []
+#     # for link in cont_data['links_RE']:
+#     #     if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
+#     #         unique_links.append(link)
+#     # # Removing duplicate phone numbers
+#     # normalized_contact = [num[-10:] for num in LLMdata['Contact']]
+#     # unique_numbers = []
+#     # for num in cont_data['phone_numbers']:
+#     #     if num[-10:] not in normalized_contact:
+#     #         unique_numbers.append(num)
+#     # # Add unique emails, links, and phone numbers to the original LLMdata
+#     # LLMdata['Email'] += unique_emails
+#     # LLMdata['Link'] += unique_links
+#     # LLMdata['Contact'] += unique_numbers
+#     # Ensure keys exist (CRITICAL FIX)
+#     LLMdata['Email'] = LLMdata.get('Email', []) or []
+#     LLMdata['Link'] = LLMdata.get('Link', []) or []
+#     LLMdata['Contact'] = LLMdata.get('Contact', []) or []
+#     # Removing duplicate emails
+#     unique_emails = []
+#     for email in cont_data.get('emails', []):
+#         if not any(email.lower() == str(existing_email).lower() for existing_email in LLMdata['Email']):
+#             unique_emails.append(email)
+#     # Removing duplicate links
+#     unique_links = []
+#     for link in cont_data.get('links_RE', []):
+#         if not any(link.lower() == str(existing_link).lower() for existing_link in LLMdata['Link']):
+#             unique_links.append(link)
+#     # Normalize existing contacts safely
+#     normalized_contact = [
+#         str(num)[-10:] for num in LLMdata['Contact'] if num
+#     ]
+#     # Removing duplicate phone numbers
+#     unique_numbers = []
+#     for num in cont_data.get('phone_numbers', []):
+#         if str(num)[-10:] not in normalized_contact:
+#             unique_numbers.append(num)
+#     # Merge safely
+#     LLMdata['Email'].extend(unique_emails)
+#     LLMdata['Link'].extend(unique_links)
+#     LLMdata['Contact'].extend(unique_numbers)
+#     # Apply the function to the data
+#     LLMdata=remove_duplicates_case_insensitive(LLMdata)
+#     # Initialize the processed data dictionary
+#     processed_data = {
+#             "name": [],
+#             "contact_number": [],
+#             "Designation":[],
+#             "email": [],
+#             "Location": [],
+#             "Link": [],
+#             "Company":[],
+#             "extracted_text": extracted_text
+#             }
+#     #LLM
+#     processed_data['name'].extend(LLMdata.get('Name', None))
+#     #processed_data['contact_number'].extend(LLMdata.get('Contact', []))
+#     processed_data['Designation'].extend(LLMdata.get('Designation', []))
+#     #processed_data['email'].extend(LLMdata.get("Email", []))
+#     processed_data['Location'].extend(LLMdata.get('Address', []))
+#     #processed_data['Link'].extend(LLMdata.get('Link', []))
+#     processed_data['Company'].extend(LLMdata.get('Company', []))
+#     #Contact
+#     #processed_data['email'].extend(cont_data.get("emails", []))
+#     #processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
+#     #processed_data['Link'].extend(cont_data.get("links_RE", []))
+#     #New_merge_data
+#     processed_data['email'].extend(LLMdata['Email'])
+#     processed_data['contact_number'].extend(LLMdata['Contact'])
+#     processed_data['Link'].extend(LLMdata['Link'])
+#     #to remove not found fields
+#     # List of keys to check for 'Not found'
+#     keys_to_check = ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]
+#     # Replace 'Not found' with an empty list for each key
+#     for key in keys_to_check:
+#         if processed_data[key] == ['Not found'] or processed_data[key] == ['not found']:
+#             processed_data[key] = []
+#     return processed_data
def _resume_to_list(value):
    """Return *value* as a list: None -> [], container -> list, scalar -> [scalar]."""
    if value is None:
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]


def _resume_phone_key(number):
    """Return the comparison key for a phone number: its last 10 digits."""
    text = str(number)
    digits = "".join(ch for ch in text if ch.isdigit())
    # Fall back to the raw text for entries without digits (e.g. the
    # "Not found" placeholder) so they still compare equal to themselves.
    return digits[-10:] if digits else text.strip().lower()


def _resume_text_key(value):
    """Return the case-insensitive comparison key for an email or link."""
    return str(value).strip().lower()


def _resume_merge_unique(target, candidates, key):
    """Append to *target* every non-empty candidate whose key is not yet present."""
    seen = {key(item) for item in target if item}
    for item in candidates:
        if not item:
            continue
        item_key = key(item)
        if item_key not in seen:
            target.append(item)
            seen.add(item_key)


def process_resume_data(LLMdata, cont_data, extracted_text):
    """Merge LLM-parsed fields with regex-extracted contact details.

    Args:
        LLMdata: dict produced by the model (``None`` is treated as empty).
            The keys ``Name``, ``Designation``, ``Company``, ``Contact``,
            ``Address``, ``Email`` and ``Link`` are normalised to lists and
            the dict is updated in place.
        cont_data: dict of regex hits, read for the keys ``emails``,
            ``phone_numbers`` and ``links_RE``. ``None``, missing keys and
            ``None`` values are treated as empty; the dict is not modified.
        extracted_text: stored unchanged under ``"extracted_text"``.

    Returns:
        dict with the keys ``name``, ``contact_number``, ``Designation``,
        ``email``, ``Location``, ``Link``, ``Company`` (lists with the
        "Not found" placeholder removed) and ``extracted_text``.

    Note:
        Relies on the module-level ``remove_duplicates_case_insensitive``;
        presumably it de-duplicates each list value case-insensitively and
        returns the dict (only its tail is visible from here -- confirm).
    """
    if LLMdata is None:
        LLMdata = {}
    cont_data = cont_data or {}

    # Step 1: give every expected LLM field a list value.
    for field in ("Name", "Designation", "Company", "Contact",
                  "Address", "Email", "Link"):
        LLMdata[field] = _resume_to_list(LLMdata.get(field))

    # Step 2: fold the regex hits into the LLM lists, skipping duplicates.
    # Emails and links compare case-insensitively; phone numbers compare on
    # their last 10 *digits*, so "+91 98765-43210" matches "9876543210".
    _resume_merge_unique(
        LLMdata["Email"], _resume_to_list(cont_data.get("emails")),
        _resume_text_key,
    )
    _resume_merge_unique(
        LLMdata["Link"], _resume_to_list(cont_data.get("links_RE")),
        _resume_text_key,
    )
    _resume_merge_unique(
        LLMdata["Contact"], _resume_to_list(cont_data.get("phone_numbers")),
        _resume_phone_key,
    )

    # Step 3: file-wide case-insensitive de-duplication helper.
    LLMdata = remove_duplicates_case_insensitive(LLMdata)

    # Step 4: map LLM field names onto the output schema and drop the
    # model's "Not found" placeholder wherever it appears.
    field_map = (
        ("name", "Name"),
        ("contact_number", "Contact"),
        ("Designation", "Designation"),
        ("email", "Email"),
        ("Location", "Address"),
        ("Link", "Link"),
        ("Company", "Company"),
    )
    processed_data = {
        out_key: [
            value for value in _resume_to_list(LLMdata.get(src_key))
            if str(value).strip().lower() != "not found"
        ]
        for out_key, src_key in field_map
    }
    processed_data["extracted_text"] = extracted_text
    return processed_data