ImageDataExtractor2

Running

App Files Community

WebashalarForML committed 4 days ago

Commit

28a746e

verified ·

1 Parent(s): 8151bf1

Update utility/utils.py

Browse files

Files changed (1) hide show

utility/utils.py +139 -34

utility/utils.py CHANGED Viewed

@@ -200,33 +200,105 @@ def extract_text_from_images(image_paths):
     return all_extracted_texts, all_extracted_imgs_json
 # Function to call the Gemma model and process the output as Json
-def Data_Extractor(data, client=client):
-    text = f'''Act as a  Text extractor for the following text given in text: {data}
-    extract text in the following output JSON string:
-    {{
-    "Name": ["Identify and Extract All the person's name from the text."],
-    "Designation": ["Extract All the designation or job title mentioned in the text."],
-    "Company": ["Extract All the company or organization name if mentioned."],
-    "Contact": ["Extract All phone number, including country codes if present."],
-    "Address": ["Extract All the full postal address or location mentioned in the text."],
-    "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
-    "Link": ["Identify and Extract any website URLs or social media links present in the text."]
-    }}
-    Output:
-    '''
-    # Call the API for inference
-    response = client.text_generation(text, max_new_tokens=1000)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
-    print("parse in text ---:",response)
-    # Convert the response text to JSON
     try:
-        json_data = json.loads(response)
-        print("Json_data-------------->",json_data)
         return json_data
     except json.JSONDecodeError as e:
-        return {"error": f"Error decoding JSON: {e}"}
 # For have text compatible to the llm
 def json_to_llm_str(textJson):
@@ -445,29 +517,62 @@ def remove_duplicates_case_insensitive(data_dict):
 # Process the model output for parsed result
 def process_resume_data(LLMdata,cont_data,extracted_text):
     # Removing duplicate emails
     unique_emails = []
-    for email in cont_data['emails']:
-        if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
             unique_emails.append(email)
-    # Removing duplicate links (case insensitive)
     unique_links = []
-    for link in cont_data['links_RE']:
-        if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
             unique_links.append(link)
     # Removing duplicate phone numbers
-    normalized_contact = [num[-10:] for num in LLMdata['Contact']]
     unique_numbers = []
-    for num in cont_data['phone_numbers']:
-        if num[-10:] not in normalized_contact:
             unique_numbers.append(num)
-    # Add unique emails, links, and phone numbers to the original LLMdata
-    LLMdata['Email'] += unique_emails
-    LLMdata['Link'] += unique_links
-    LLMdata['Contact'] += unique_numbers
     # Apply the function to the data
     LLMdata=remove_duplicates_case_insensitive(LLMdata)

     return all_extracted_texts, all_extracted_imgs_json
 # Function to call the Gemma model and process the output as Json
+# def Data_Extractor(data, client=client):
+#     text = f'''Act as a  Text extractor for the following text given in text: {data}
+#     extract text in the following output JSON string:
+#     {{
+#     "Name": ["Identify and Extract All the person's name from the text."],
+#     "Designation": ["Extract All the designation or job title mentioned in the text."],
+#     "Company": ["Extract All the company or organization name if mentioned."],
+#     "Contact": ["Extract All phone number, including country codes if present."],
+#     "Address": ["Extract All the full postal address or location mentioned in the text."],
+#     "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
+#     "Link": ["Identify and Extract any website URLs or social media links present in the text."]
+#     }}
+#     Output:
+#     '''
+#     # Call the API for inference
+#     response = client.text_generation(text, max_new_tokens=1000)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
+#     print("parse in text ---:",response)
+#     # Convert the response text to JSON
+#     try:
+#         json_data = json.loads(response)
+#         print("Json_data-------------->",json_data)
+#         return json_data
+#     except json.JSONDecodeError as e:
+#         return {"error": f"Error decoding JSON: {e}"}
+def Data_Extractor(data):
+    url = "https://api.groq.com/openai/v1/chat/completions"
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.getenv('GROQ_API_KEY')}"
+    }
+    prompt = f"""
+You are a strict JSON generator.
+Extract structured data from the following text.
+Return ONLY valid JSON. No explanation. No markdown.
+Schema:
+{{
+    "Name": [],
+    "Designation": [],
+    "Company": [],
+    "Contact": [],
+    "Address": [],
+    "Email": [],
+    "Link": []
+}}
+Rules:
+- Always return all keys
+- If nothing found → return empty list []
+- Do NOT return "Not found"
+- Ensure valid JSON format
+Text:
+{data}
+"""
+    payload = {
+        "model": "llama-3.3-70b-versatile",
+        "messages": [
+            {"role": "user", "content": prompt}
+        ],
+        "temperature": 0.2,   # 🔥 IMPORTANT: lower = more structured
+        "max_tokens": 1024,
+        "top_p": 1,
+        "stream": False
+    }
+    response = requests.post(url, headers=headers, json=payload)
+    if response.status_code != 200:
+        return {"error": response.text}
+    result = response.json()
+    # Extract model output
+    content = result["choices"][0]["message"]["content"]
+    print("RAW LLM OUTPUT:\n", content)
+    # 🔧 Clean response (important)
+    content = content.strip()
+    # Remove markdown if model adds ```json
+    if content.startswith("```"):
+        content = content.split("```")[1]
     try:
+        json_data = json.loads(content)
         return json_data
     except json.JSONDecodeError as e:
+        print("JSON ERROR:", e)
+        return {"error": "Invalid JSON from model", "raw": content}
 # For have text compatible to the llm
 def json_to_llm_str(textJson):
 # Process the model output for parsed result
 def process_resume_data(LLMdata,cont_data,extracted_text):
+    # # Removing duplicate emails
+    # unique_emails = []
+    # for email in cont_data['emails']:
+    #     if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
+    #         unique_emails.append(email)
+    # # Removing duplicate links (case insensitive)
+    # unique_links = []
+    # for link in cont_data['links_RE']:
+    #     if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
+    #         unique_links.append(link)
+    # # Removing duplicate phone numbers
+    # normalized_contact = [num[-10:] for num in LLMdata['Contact']]
+    # unique_numbers = []
+    # for num in cont_data['phone_numbers']:
+    #     if num[-10:] not in normalized_contact:
+    #         unique_numbers.append(num)
+    # # Add unique emails, links, and phone numbers to the original LLMdata
+    # LLMdata['Email'] += unique_emails
+    # LLMdata['Link'] += unique_links
+    # LLMdata['Contact'] += unique_numbers
+    # Ensure keys exist (CRITICAL FIX)
+    LLMdata['Email'] = LLMdata.get('Email', []) or []
+    LLMdata['Link'] = LLMdata.get('Link', []) or []
+    LLMdata['Contact'] = LLMdata.get('Contact', []) or []
     # Removing duplicate emails
     unique_emails = []
+    for email in cont_data.get('emails', []):
+        if not any(email.lower() == str(existing_email).lower() for existing_email in LLMdata['Email']):
             unique_emails.append(email)
+    # Removing duplicate links
     unique_links = []
+    for link in cont_data.get('links_RE', []):
+        if not any(link.lower() == str(existing_link).lower() for existing_link in LLMdata['Link']):
             unique_links.append(link)
+    # Normalize existing contacts safely
+    normalized_contact = [
+        str(num)[-10:] for num in LLMdata['Contact'] if num
+    ]
     # Removing duplicate phone numbers
     unique_numbers = []
+    for num in cont_data.get('phone_numbers', []):
+        if str(num)[-10:] not in normalized_contact:
             unique_numbers.append(num)
+    # Merge safely
+    LLMdata['Email'].extend(unique_emails)
+    LLMdata['Link'].extend(unique_links)
+    LLMdata['Contact'].extend(unique_numbers)
     # Apply the function to the data
     LLMdata=remove_duplicates_case_insensitive(LLMdata)