Spaces:
Running
Running
Update utility/utils.py
Browse files- utility/utils.py +139 -34
utility/utils.py
CHANGED
|
@@ -200,33 +200,105 @@ def extract_text_from_images(image_paths):
|
|
| 200 |
return all_extracted_texts, all_extracted_imgs_json
|
| 201 |
|
| 202 |
# Function to call the LLM and parse its output as JSON
|
| 203 |
-
def Data_Extractor(data, client=client):
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
-
# Convert the response text to JSON
|
| 224 |
try:
|
| 225 |
-
json_data = json.loads(
|
| 226 |
-
print("Json_data-------------->",json_data)
|
| 227 |
return json_data
|
| 228 |
except json.JSONDecodeError as e:
|
| 229 |
-
|
|
|
|
| 230 |
|
| 231 |
# Make the text compatible with the LLM
|
| 232 |
def json_to_llm_str(textJson):
|
|
@@ -445,29 +517,62 @@ def remove_duplicates_case_insensitive(data_dict):
|
|
| 445 |
# Process the model output for parsed result
|
| 446 |
def process_resume_data(LLMdata,cont_data,extracted_text):
|
| 447 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
# Removing duplicate emails
|
| 449 |
unique_emails = []
|
| 450 |
-
for email in cont_data
|
| 451 |
-
if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
|
| 452 |
unique_emails.append(email)
|
| 453 |
|
| 454 |
-
# Removing duplicate links
|
| 455 |
unique_links = []
|
| 456 |
-
for link in cont_data
|
| 457 |
-
if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
|
| 458 |
unique_links.append(link)
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
# Removing duplicate phone numbers
|
| 461 |
-
normalized_contact = [num[-10:] for num in LLMdata['Contact']]
|
| 462 |
unique_numbers = []
|
| 463 |
-
for num in cont_data
|
| 464 |
-
if num[-10:] not in normalized_contact:
|
| 465 |
unique_numbers.append(num)
|
| 466 |
-
|
| 467 |
-
#
|
| 468 |
-
LLMdata['Email']
|
| 469 |
-
LLMdata['Link']
|
| 470 |
-
LLMdata['Contact']
|
|
|
|
| 471 |
|
| 472 |
# Apply the function to the data
|
| 473 |
LLMdata=remove_duplicates_case_insensitive(LLMdata)
|
|
|
|
| 200 |
return all_extracted_texts, all_extracted_imgs_json
|
| 201 |
|
| 202 |
# Function to call the LLM and parse its output as JSON
|
| 203 |
+
# def Data_Extractor(data, client=client):
|
| 204 |
+
# text = f'''Act as a Text extractor for the following text given in text: {data}
|
| 205 |
+
# extract text in the following output JSON string:
|
| 206 |
+
# {{
|
| 207 |
+
# "Name": ["Identify and Extract All the person's name from the text."],
|
| 208 |
+
# "Designation": ["Extract All the designation or job title mentioned in the text."],
|
| 209 |
+
# "Company": ["Extract All the company or organization name if mentioned."],
|
| 210 |
+
# "Contact": ["Extract All phone number, including country codes if present."],
|
| 211 |
+
# "Address": ["Extract All the full postal address or location mentioned in the text."],
|
| 212 |
+
# "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
|
| 213 |
+
# "Link": ["Identify and Extract any website URLs or social media links present in the text."]
|
| 214 |
+
# }}
|
| 215 |
+
# Output:
|
| 216 |
+
# '''
|
| 217 |
|
| 218 |
+
# # Call the API for inference
|
| 219 |
+
# response = client.text_generation(text, max_new_tokens=1000)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
|
| 220 |
|
| 221 |
+
# print("parse in text ---:",response)
|
| 222 |
+
|
| 223 |
+
# # Convert the response text to JSON
|
| 224 |
+
# try:
|
| 225 |
+
# json_data = json.loads(response)
|
| 226 |
+
# print("Json_data-------------->",json_data)
|
| 227 |
+
# return json_data
|
| 228 |
+
# except json.JSONDecodeError as e:
|
| 229 |
+
# return {"error": f"Error decoding JSON: {e}"}
|
| 230 |
+
def _strip_code_fence(content):
    """Return *content* with any surrounding Markdown code fence removed.

    Handles a bare fence as well as one carrying a ``json`` language tag.
    Text that does not start with a fence is returned stripped but
    otherwise unchanged.
    """
    content = content.strip()
    if not content.startswith("```"):
        return content

    # Keep only what sits between the opening fence and the next fence.
    inner = content.split("```")[1].strip()

    # The language tag shares the opening fence line, so it ends up at the
    # start of the inner text and would make json.loads fail if left there.
    if inner[:4].lower() == "json":
        inner = inner[4:]
    return inner.strip()


def Data_Extractor(data, *, model="llama-3.3-70b-versatile", timeout=60):
    """Extract structured fields from *data* via Groq's chat-completions API.

    The prompt asks the model for a JSON object with the keys ``Name``,
    ``Designation``, ``Company``, ``Contact``, ``Address``, ``Email`` and
    ``Link``, each holding a list.

    Args:
        data: Text to extract from; interpolated verbatim into the prompt.
        model: Groq model identifier sent in the request payload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The value parsed from the model's JSON reply. On any failure a dict
        with an ``"error"`` key is returned instead; when the reply (or the
        HTTP body) could not be interpreted, a ``"raw"`` key carries the
        offending text.
    """
    api_key = os.getenv('GROQ_API_KEY')
    if not api_key:
        # Fail fast rather than sending "Bearer None" to the API.
        return {"error": "GROQ_API_KEY environment variable is not set"}

    url = "https://api.groq.com/openai/v1/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    prompt = f"""
You are a strict JSON generator.

Extract structured data from the following text.

Return ONLY valid JSON. No explanation. No markdown.

Schema:
{{
"Name": [],
"Designation": [],
"Company": [],
"Contact": [],
"Address": [],
"Email": [],
"Link": []
}}

Rules:
- Always return all keys
- If nothing found → return empty list []
- Do NOT return "Not found"
- Ensure valid JSON format

Text:
{data}
"""

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.2,  # low temperature: less random, more consistently structured output
        "max_tokens": 1024,
        "top_p": 1,
        "stream": False
    }

    try:
        # Without a timeout, requests would wait forever on a stalled connection.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        return {"error": f"Request to Groq API failed: {e}"}

    if response.status_code != 200:
        return {"error": response.text}

    # Extract model output
    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        return {"error": "Unexpected response format from Groq API", "raw": response.text}

    print("RAW LLM OUTPUT:\n", content)

    # The model may wrap its answer in a Markdown fence despite the prompt;
    # a null message content is treated as an empty reply.
    content = _strip_code_fence(content or "")

    try:
        json_data = json.loads(content)
        return json_data
    except json.JSONDecodeError as e:
        print("JSON ERROR:", e)
        return {"error": "Invalid JSON from model", "raw": content}
|
| 302 |
|
| 303 |
# Make the text compatible with the LLM
|
| 304 |
def json_to_llm_str(textJson):
|
|
|
|
| 517 |
# Process the model output for parsed result
|
| 518 |
def process_resume_data(LLMdata,cont_data,extracted_text):
|
| 519 |
|
| 520 |
+
# # Removing duplicate emails
|
| 521 |
+
# unique_emails = []
|
| 522 |
+
# for email in cont_data['emails']:
|
| 523 |
+
# if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
|
| 524 |
+
# unique_emails.append(email)
|
| 525 |
+
|
| 526 |
+
# # Removing duplicate links (case insensitive)
|
| 527 |
+
# unique_links = []
|
| 528 |
+
# for link in cont_data['links_RE']:
|
| 529 |
+
# if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
|
| 530 |
+
# unique_links.append(link)
|
| 531 |
+
|
| 532 |
+
# # Removing duplicate phone numbers
|
| 533 |
+
# normalized_contact = [num[-10:] for num in LLMdata['Contact']]
|
| 534 |
+
# unique_numbers = []
|
| 535 |
+
# for num in cont_data['phone_numbers']:
|
| 536 |
+
# if num[-10:] not in normalized_contact:
|
| 537 |
+
# unique_numbers.append(num)
|
| 538 |
+
|
| 539 |
+
# # Add unique emails, links, and phone numbers to the original LLMdata
|
| 540 |
+
# LLMdata['Email'] += unique_emails
|
| 541 |
+
# LLMdata['Link'] += unique_links
|
| 542 |
+
# LLMdata['Contact'] += unique_numbers
|
| 543 |
+
# Ensure keys exist (CRITICAL FIX)
|
| 544 |
+
LLMdata['Email'] = LLMdata.get('Email', []) or []
|
| 545 |
+
LLMdata['Link'] = LLMdata.get('Link', []) or []
|
| 546 |
+
LLMdata['Contact'] = LLMdata.get('Contact', []) or []
|
| 547 |
+
|
| 548 |
# Removing duplicate emails
|
| 549 |
unique_emails = []
|
| 550 |
+
for email in cont_data.get('emails', []):
|
| 551 |
+
if not any(email.lower() == str(existing_email).lower() for existing_email in LLMdata['Email']):
|
| 552 |
unique_emails.append(email)
|
| 553 |
|
| 554 |
+
# Removing duplicate links
|
| 555 |
unique_links = []
|
| 556 |
+
for link in cont_data.get('links_RE', []):
|
| 557 |
+
if not any(link.lower() == str(existing_link).lower() for existing_link in LLMdata['Link']):
|
| 558 |
unique_links.append(link)
|
| 559 |
|
| 560 |
+
# Normalize existing contacts safely
|
| 561 |
+
normalized_contact = [
|
| 562 |
+
str(num)[-10:] for num in LLMdata['Contact'] if num
|
| 563 |
+
]
|
| 564 |
+
|
| 565 |
# Removing duplicate phone numbers
|
|
|
|
| 566 |
unique_numbers = []
|
| 567 |
+
for num in cont_data.get('phone_numbers', []):
|
| 568 |
+
if str(num)[-10:] not in normalized_contact:
|
| 569 |
unique_numbers.append(num)
|
| 570 |
+
|
| 571 |
+
# Merge safely
|
| 572 |
+
LLMdata['Email'].extend(unique_emails)
|
| 573 |
+
LLMdata['Link'].extend(unique_links)
|
| 574 |
+
LLMdata['Contact'].extend(unique_numbers)
|
| 575 |
+
|
| 576 |
|
| 577 |
# Apply the function to the data
|
| 578 |
LLMdata=remove_duplicates_case_insensitive(LLMdata)
|