WebashalarForML committed on
Commit
28a746e
·
verified ·
1 Parent(s): 8151bf1

Update utility/utils.py

Browse files
Files changed (1) hide show
  1. utility/utils.py +139 -34
utility/utils.py CHANGED
@@ -200,33 +200,105 @@ def extract_text_from_images(image_paths):
200
  return all_extracted_texts, all_extracted_imgs_json
201
 
202
  # Function to call the Gemma model and process the output as Json
203
- def Data_Extractor(data, client=client):
204
- text = f'''Act as a Text extractor for the following text given in text: {data}
205
- extract text in the following output JSON string:
206
- {{
207
- "Name": ["Identify and Extract All the person's name from the text."],
208
- "Designation": ["Extract All the designation or job title mentioned in the text."],
209
- "Company": ["Extract All the company or organization name if mentioned."],
210
- "Contact": ["Extract All phone number, including country codes if present."],
211
- "Address": ["Extract All the full postal address or location mentioned in the text."],
212
- "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
213
- "Link": ["Identify and Extract any website URLs or social media links present in the text."]
214
- }}
215
- Output:
216
- '''
217
 
218
- # Call the API for inference
219
- response = client.text_generation(text, max_new_tokens=1000)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
220
 
221
- print("parse in text ---:",response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- # Convert the response text to JSON
224
  try:
225
- json_data = json.loads(response)
226
- print("Json_data-------------->",json_data)
227
  return json_data
228
  except json.JSONDecodeError as e:
229
- return {"error": f"Error decoding JSON: {e}"}
 
230
 
231
  # For have text compatible to the llm
232
  def json_to_llm_str(textJson):
@@ -445,29 +517,62 @@ def remove_duplicates_case_insensitive(data_dict):
445
  # Process the model output for parsed result
446
  def process_resume_data(LLMdata,cont_data,extracted_text):
447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
  # Removing duplicate emails
449
  unique_emails = []
450
- for email in cont_data['emails']:
451
- if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
452
  unique_emails.append(email)
453
 
454
- # Removing duplicate links (case insensitive)
455
  unique_links = []
456
- for link in cont_data['links_RE']:
457
- if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
458
  unique_links.append(link)
459
 
 
 
 
 
 
460
  # Removing duplicate phone numbers
461
- normalized_contact = [num[-10:] for num in LLMdata['Contact']]
462
  unique_numbers = []
463
- for num in cont_data['phone_numbers']:
464
- if num[-10:] not in normalized_contact:
465
  unique_numbers.append(num)
466
-
467
- # Add unique emails, links, and phone numbers to the original LLMdata
468
- LLMdata['Email'] += unique_emails
469
- LLMdata['Link'] += unique_links
470
- LLMdata['Contact'] += unique_numbers
 
471
 
472
  # Apply the function to the data
473
  LLMdata=remove_duplicates_case_insensitive(LLMdata)
 
200
  return all_extracted_texts, all_extracted_imgs_json
201
 
202
  # Function to call the Gemma model and process the output as Json
203
+ # def Data_Extractor(data, client=client):
204
+ # text = f'''Act as a Text extractor for the following text given in text: {data}
205
+ # extract text in the following output JSON string:
206
+ # {{
207
+ # "Name": ["Identify and Extract All the person's name from the text."],
208
+ # "Designation": ["Extract All the designation or job title mentioned in the text."],
209
+ # "Company": ["Extract All the company or organization name if mentioned."],
210
+ # "Contact": ["Extract All phone number, including country codes if present."],
211
+ # "Address": ["Extract All the full postal address or location mentioned in the text."],
212
+ # "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
213
+ # "Link": ["Identify and Extract any website URLs or social media links present in the text."]
214
+ # }}
215
+ # Output:
216
+ # '''
217
 
218
+ # # Call the API for inference
219
+ # response = client.text_generation(text, max_new_tokens=1000)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
220
 
221
+ # print("parse in text ---:",response)
222
+
223
+ # # Convert the response text to JSON
224
+ # try:
225
+ # json_data = json.loads(response)
226
+ # print("Json_data-------------->",json_data)
227
+ # return json_data
228
+ # except json.JSONDecodeError as e:
229
+ # return {"error": f"Error decoding JSON: {e}"}
230
+ def Data_Extractor(data):
231
+ url = "https://api.groq.com/openai/v1/chat/completions"
232
+
233
+ headers = {
234
+ "Content-Type": "application/json",
235
+ "Authorization": f"Bearer {os.getenv('GROQ_API_KEY')}"
236
+ }
237
+
238
+ prompt = f"""
239
+ You are a strict JSON generator.
240
+
241
+ Extract structured data from the following text.
242
+
243
+ Return ONLY valid JSON. No explanation. No markdown.
244
+
245
+ Schema:
246
+ {{
247
+ "Name": [],
248
+ "Designation": [],
249
+ "Company": [],
250
+ "Contact": [],
251
+ "Address": [],
252
+ "Email": [],
253
+ "Link": []
254
+ }}
255
+
256
+ Rules:
257
+ - Always return all keys
258
+ - If nothing found → return empty list []
259
+ - Do NOT return "Not found"
260
+ - Ensure valid JSON format
261
+
262
+ Text:
263
+ {data}
264
+ """
265
+
266
+ payload = {
267
+ "model": "llama-3.3-70b-versatile",
268
+ "messages": [
269
+ {"role": "user", "content": prompt}
270
+ ],
271
+ "temperature": 0.2, # 🔥 IMPORTANT: lower = more structured
272
+ "max_tokens": 1024,
273
+ "top_p": 1,
274
+ "stream": False
275
+ }
276
+
277
+ response = requests.post(url, headers=headers, json=payload)
278
+
279
+ if response.status_code != 200:
280
+ return {"error": response.text}
281
+
282
+ result = response.json()
283
+
284
+ # Extract model output
285
+ content = result["choices"][0]["message"]["content"]
286
+
287
+ print("RAW LLM OUTPUT:\n", content)
288
+
289
+ # 🔧 Clean response (important)
290
+ content = content.strip()
291
+
292
+ # Remove markdown if model adds ```json
293
+ if content.startswith("```"):
294
+ content = content.split("```")[1]
295
 
 
296
  try:
297
+ json_data = json.loads(content)
 
298
  return json_data
299
  except json.JSONDecodeError as e:
300
+ print("JSON ERROR:", e)
301
+ return {"error": "Invalid JSON from model", "raw": content}
302
 
303
  # For have text compatible to the llm
304
  def json_to_llm_str(textJson):
 
517
  # Process the model output for parsed result
518
  def process_resume_data(LLMdata,cont_data,extracted_text):
519
 
520
+ # # Removing duplicate emails
521
+ # unique_emails = []
522
+ # for email in cont_data['emails']:
523
+ # if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
524
+ # unique_emails.append(email)
525
+
526
+ # # Removing duplicate links (case insensitive)
527
+ # unique_links = []
528
+ # for link in cont_data['links_RE']:
529
+ # if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
530
+ # unique_links.append(link)
531
+
532
+ # # Removing duplicate phone numbers
533
+ # normalized_contact = [num[-10:] for num in LLMdata['Contact']]
534
+ # unique_numbers = []
535
+ # for num in cont_data['phone_numbers']:
536
+ # if num[-10:] not in normalized_contact:
537
+ # unique_numbers.append(num)
538
+
539
+ # # Add unique emails, links, and phone numbers to the original LLMdata
540
+ # LLMdata['Email'] += unique_emails
541
+ # LLMdata['Link'] += unique_links
542
+ # LLMdata['Contact'] += unique_numbers
543
+ # Ensure keys exist (CRITICAL FIX)
544
+ LLMdata['Email'] = LLMdata.get('Email', []) or []
545
+ LLMdata['Link'] = LLMdata.get('Link', []) or []
546
+ LLMdata['Contact'] = LLMdata.get('Contact', []) or []
547
+
548
  # Removing duplicate emails
549
  unique_emails = []
550
+ for email in cont_data.get('emails', []):
551
+ if not any(email.lower() == str(existing_email).lower() for existing_email in LLMdata['Email']):
552
  unique_emails.append(email)
553
 
554
+ # Removing duplicate links
555
  unique_links = []
556
+ for link in cont_data.get('links_RE', []):
557
+ if not any(link.lower() == str(existing_link).lower() for existing_link in LLMdata['Link']):
558
  unique_links.append(link)
559
 
560
+ # Normalize existing contacts safely
561
+ normalized_contact = [
562
+ str(num)[-10:] for num in LLMdata['Contact'] if num
563
+ ]
564
+
565
  # Removing duplicate phone numbers
 
566
  unique_numbers = []
567
+ for num in cont_data.get('phone_numbers', []):
568
+ if str(num)[-10:] not in normalized_contact:
569
  unique_numbers.append(num)
570
+
571
+ # Merge safely
572
+ LLMdata['Email'].extend(unique_emails)
573
+ LLMdata['Link'].extend(unique_links)
574
+ LLMdata['Contact'].extend(unique_numbers)
575
+
576
 
577
  # Apply the function to the data
578
  LLMdata=remove_duplicates_case_insensitive(LLMdata)