WebashalarForML committed on
Commit
00227b0
·
verified ·
1 Parent(s): 75824d4

Update utility/utils.py

Browse files
Files changed (1) hide show
  1. utility/utils.py +378 -401
utility/utils.py CHANGED
@@ -1,508 +1,485 @@
1
  # libraries
2
  import os
3
- from huggingface_hub import InferenceClient
4
- from dotenv import load_dotenv
5
  import json
6
  import re
7
- #import easyocr
8
- from PIL import Image, ImageEnhance, ImageDraw
9
- import cv2
10
- import numpy as np
11
- from paddleocr import PaddleOCR
12
  import logging
13
  from datetime import datetime
14
 
 
 
 
 
 
 
15
  # Configure logging
16
  logging.basicConfig(
17
  level=logging.INFO,
18
- handlers=[
19
- logging.StreamHandler() # Remove FileHandler and log only to the console
20
- ]
21
  )
22
 
23
- # Set the PaddleOCR home directory to a writable location
24
- import os
25
-
26
- os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
27
-
28
- RESULT_FOLDER = 'static/results/'
29
- JSON_FOLDER = 'static/json/'
30
 
31
- if not os.path.exists('/tmp/.paddleocr'):
32
- os.makedirs(RESULT_FOLDER, exist_ok=True)
 
 
33
 
34
- # Check if PaddleOCR home directory is writable
35
- if not os.path.exists('/tmp/.paddleocr'):
36
- os.makedirs('/tmp/.paddleocr', exist_ok=True)
37
- logging.info("Created PaddleOCR home directory.")
38
- else:
39
- logging.info("PaddleOCR home directory exists.")
40
 
41
- # Load environment variables from .env file
42
- load_dotenv()
43
 
44
- # Authenticate with Hugging Face
45
- HFT = os.getenv('HF_TOKEN')
 
 
46
 
47
- # Initialize the InferenceClient
48
- client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
49
 
50
  def load_image(image_path):
51
  ext = os.path.splitext(image_path)[1].lower()
52
- if ext in ['.png', '.jpg', '.jpeg', '.webp', '.tiff']:
53
  image = cv2.imread(image_path)
54
  if image is None:
55
- raise ValueError(f"Failed to load image from {image_path}. The file may be corrupted or unreadable.")
56
  return image
57
- else:
58
- raise ValueError(f"Unsupported image format: {ext}")
59
-
60
- # Function for upscaling image using OpenCV's INTER_CUBIC
61
  def upscale_image(image, scale=2):
62
  height, width = image.shape[:2]
63
- upscaled_image = cv2.resize(image, (width * scale, height * scale), interpolation=cv2.INTER_CUBIC)
64
- return upscaled_image
65
 
66
- # Function to denoise the image (reduce noise)
67
  def reduce_noise(image):
68
  return cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)
69
 
70
- # Function to sharpen the image
71
  def sharpen_image(image):
72
- kernel = np.array([[0, -1, 0],
73
- [-1, 5, -1],
74
- [0, -1, 0]])
75
- sharpened_image = cv2.filter2D(image, -1, kernel)
76
- return sharpened_image
 
 
77
 
78
- # Function to increase contrast and enhance details without changing color
79
  def enhance_image(image):
80
  pil_img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
81
  enhancer = ImageEnhance.Contrast(pil_img)
82
  enhanced_image = enhancer.enhance(1.5)
83
- enhanced_image_bgr = cv2.cvtColor(np.array(enhanced_image), cv2.COLOR_RGB2BGR)
84
- return enhanced_image_bgr
85
 
86
- # Complete function to process image
87
  def process_image(image_path, scale=2):
88
- # Load the image
89
  image = load_image(image_path)
90
-
91
- # Upscale the image
92
  upscaled_image = upscale_image(image, scale)
93
-
94
- # Reduce noise
95
  denoised_image = reduce_noise(upscaled_image)
96
-
97
- # Sharpen the image
98
  sharpened_image = sharpen_image(denoised_image)
99
-
100
- # Enhance the image contrast and details without changing color
101
  final_image = enhance_image(sharpened_image)
102
-
103
  return final_image
104
 
105
- # Function for OCR with PaddleOCR, returning both text and bounding boxes
106
- def ocr_with_paddle(img):
107
- final_text = ''
108
- boxes = []
109
-
110
- # Initialize PaddleOCR
111
- ocr = PaddleOCR(
112
- lang='en',
113
- use_angle_cls=False
114
- # det_model_dir=os.path.join(os.environ['PADDLEOCR_HOME'], 'whl/det'),
115
- # rec_model_dir=os.path.join(os.environ['PADDLEOCR_HOME'], 'whl/rec/en/en_PP-OCRv4_rec_infer'),
116
- # cls_model_dir=os.path.join(os.environ['PADDLEOCR_HOME'], 'whl/cls/ch_ppocr_mobile_v2.0_cls_infer')
117
- )
118
- # ocr = PaddleOCR(
119
- # use_angle_cls=True,
120
- # lang='en',
121
- # det_model_dir='/app/paddleocr_models/whl/det/ch_ppocr_mobile_v2.0_det_infer',
122
- # rec_model_dir='/app/paddleocr_models/whl/rec/ch_ppocr_mobile_v2.0_rec_infer',
123
- # cls_model_dir='/app/paddleocr_models/whl/cls/ch_ppocr_mobile_v2.0_cls_infer'
124
- # )
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
- # Check if img is a file path or an image array
128
- if isinstance(img, str):
129
- img = cv2.imread(img)
130
 
131
- # Perform OCR
132
- result = ocr.ocr(img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- # Iterate through the OCR result
135
- for line in result[0]:
136
- # Check how many values are returned (2 or 3) and unpack accordingly
137
- if len(line) == 3:
138
- box, (text, confidence), _ = line # When 3 values are returned
139
- elif len(line) == 2:
140
- box, (text, confidence) = line # When only 2 values are returned
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
- # Store the recognized text and bounding boxes
143
- final_text += ' ' + text # Extract the text from the tuple
144
- boxes.append(box)
145
 
146
- # Draw the bounding box
147
- points = [(int(point[0]), int(point[1])) for point in box]
148
- cv2.polylines(img, [np.array(points)], isClosed=True, color=(0, 255, 0), thickness=2)
 
 
 
 
149
 
150
- # Store the image with bounding boxes in a variable
151
- img_with_boxes = img
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
- return final_text, img_with_boxes
154
 
155
  def extract_text_from_images(image_paths):
 
 
 
 
 
 
 
 
156
  all_extracted_texts = {}
157
  all_extracted_imgs = {}
 
158
  for image_path in image_paths:
159
  try:
160
- # Enhance the image before OCR
161
  enhanced_image = process_image(image_path, scale=2)
162
 
163
- # Perform OCR on the enhanced image and get boxes
164
- result, img_with_boxes = ocr_with_paddle(enhanced_image)
165
-
166
- # Draw bounding boxes on the processed image
167
- img_result = Image.fromarray(enhanced_image)
168
- #img_with_boxes = draw_boxes(img_result, boxes)
169
-
170
- # genrating unique id to save the images
171
- # Get the current date and time
172
  current_time = datetime.now()
173
-
174
- # Format it as a string to create a unique ID
175
  unique_id = current_time.strftime("%Y%m%d%H%M%S%f")
 
 
 
 
 
 
176
 
177
- #print(unique_id)
 
 
 
178
 
179
- # Save the image with boxes
180
- result_image_path = os.path.join(RESULT_FOLDER, f'result_{unique_id}_{os.path.basename(image_path)}')
181
- #img_with_boxes.save(result_image_path)
182
- cv2.imwrite(result_image_path, img_with_boxes)
183
 
184
- # Store the text and image result paths
185
- all_extracted_texts[image_path] = result
186
  all_extracted_imgs[image_path] = result_image_path
187
- except ValueError as ve:
188
- print(f"Error processing image {image_path}: {ve}")
189
- continue # Continue to the next image if there's an error
190
-
191
- # Convert to JSON-compatible structure
192
- all_extracted_imgs_json = {str(k): str(v) for k, v in all_extracted_imgs.items()}
193
- return all_extracted_texts, all_extracted_imgs_json
194
-
195
- # Function to call the Gemma model and process the output as Json
196
- def Data_Extractor(data, client=client):
197
- text = f'''Act as a Text extractor for the following text given in text: {data}
198
- extract text in the following output JSON string:
199
- {{
200
- "Name": ["Identify and Extract All the person's name from the text."],
201
- "Designation": ["Extract All the designation or job title mentioned in the text."],
202
- "Company": ["Extract All the company or organization name if mentioned."],
203
- "Contact": ["Extract All phone number, including country codes if present."],
204
- "Address": ["Extract All the full postal address or location mentioned in the text."],
205
- "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
206
- "Link": ["Identify and Extract any website URLs or social media links present in the text."]
207
- }}
208
- Output:
209
- '''
210
-
211
- # Call the API for inference
212
- response = client.text_generation(text, max_new_tokens=1000)#, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
213
-
214
- print("parse in text ---:",response)
215
-
216
- # Convert the response text to JSON
217
- try:
218
- json_data = json.loads(response)
219
- print("Json_data-------------->",json_data)
220
- return json_data
221
- except json.JSONDecodeError as e:
222
- return {"error": f"Error decoding JSON: {e}"}
223
 
224
- # For have text compatible to the llm
225
- def json_to_llm_str(textJson):
226
- str=''
227
- for file,item in textJson.items():
228
- str+=item + ' '
229
- return str
 
 
230
 
231
- # Define the RE for extracting the contact details like number, mail , portfolio, website etc
232
  def extract_contact_details(text):
233
- # Regex patterns
234
- # Phone numbers with at least 5 digits in any segment
235
  combined_phone_regex = re.compile(r'''
236
- (?:
237
- #(?:(?:\+91[-.\s]?)?\d{5}[-.\s]?\d{5})|(?:\+?\d{1,3})?[-.\s()]?\d{5,}[-.\s()]?\d{5,}[-.\s()]?\d{1,9} | /^[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{4})$/ |
238
- \+1\s\(\d{3}\)\s\d{3}-\d{4} | # USA/Canada Intl +1 (XXX) XXX-XXXX
239
- \(\d{3}\)\s\d{3}-\d{4} | # USA/Canada STD (XXX) XXX-XXXX
240
- \(\d{3}\)\s\d{3}\s\d{4} | # USA/Canada (XXX) XXX XXXX
241
- \(\d{3}\)\s\d{3}\s\d{3} | # USA/Canada (XXX) XXX XXX
242
- \+1\d{10} | # +1 XXXXXXXXXX
243
- \d{10} | # XXXXXXXXXX
244
- \+44\s\d{4}\s\d{6} | # UK Intl +44 XXXX XXXXXX
245
- \+44\s\d{3}\s\d{3}\s\d{4} | # UK Intl +44 XXX XXX XXXX
246
- 0\d{4}\s\d{6} | # UK STD 0XXXX XXXXXX
247
- 0\d{3}\s\d{3}\s\d{4} | # UK STD 0XXX XXX XXXX
248
- \+44\d{10} | # +44 XXXXXXXXXX
249
- 0\d{10} | # 0XXXXXXXXXX
250
- \+61\s\d\s\d{4}\s\d{4} | # Australia Intl +61 X XXXX XXXX
251
- 0\d\s\d{4}\s\d{4} | # Australia STD 0X XXXX XXXX
252
- \+61\d{9} | # +61 XXXXXXXXX
253
- 0\d{9} | # 0XXXXXXXXX
254
- \+91\s\d{5}-\d{5} | # India Intl +91 XXXXX-XXXXX
255
- \+91\s\d{4}-\d{6} | # India Intl +91 XXXX-XXXXXX
256
- \+91\s\d{10} | # India Intl +91 XXXXXXXXXX
257
- \+91\s\d{3}\s\d{3}\s\d{4} | # India Intl +91 XXX XXX XXXX
258
- \+91\s\d{3}-\d{3}-\d{4} | # India Intl +91 XXX-XXX-XXXX
259
- \+91\s\d{2}\s\d{4}\s\d{4} | # India Intl +91 XX XXXX XXXX
260
- \+91\s\d{2}-\d{4}-\d{4} | # India Intl +91 XX-XXXX-XXXX
261
- \+91\s\d{5}\s\d{5} | # India Intl +91 XXXXX XXXXX
262
- \d{5}\s\d{5} | # India XXXXX XXXXX
263
- \d{5}-\d{5} | # India XXXXX-XXXXX
264
- 0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
265
- \+91\d{10} | # +91 XXXXXXXXXX
266
- \d{10} | # XXXXXXXXXX # Here is the regex to handle all possible combination of the contact
267
- \d{6}-\d{4} | # XXXXXX-XXXX
268
- \d{4}-\d{6} | # XXXX-XXXXXX
269
- \d{3}\s\d{3}\s\d{4} | # XXX XXX XXXX
270
- \d{3}-\d{3}-\d{4} | # XXX-XXX-XXXX
271
- \d{4}\s\d{3}\s\d{3} | # XXXX XXX XXX
272
- \d{4}-\d{3}-\d{3} | # XXXX-XXX-XXX #-----
273
- \+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
274
- \+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
275
- 0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
276
- \+49\d{12} | # +49 XXXXXXXXXXXX
277
- \+49\d{10} | # +49 XXXXXXXXXX
278
- 0\d{11} | # 0XXXXXXXXXXX
279
- \+86\s\d{3}\s\d{4}\s\d{4} | # China Intl +86 XXX XXXX XXXX
280
- 0\d{3}\s\d{4}\s\d{4} | # China STD 0XXX XXXX XXXX
281
- \+86\d{11} | # +86 XXXXXXXXXXX
282
- \+81\s\d\s\d{4}\s\d{4} | # Japan Intl +81 X XXXX XXXX
283
- \+81\s\d{2}\s\d{4}\s\d{4} | # Japan Intl +81 XX XXXX XXXX
284
- 0\d\s\d{4}\s\d{4} | # Japan STD 0X XXXX XXXX
285
- \+81\d{10} | # +81 XXXXXXXXXX
286
- \+81\d{9} | # +81 XXXXXXXXX
287
- 0\d{9} | # 0XXXXXXXXX
288
- \+55\s\d{2}\s\d{5}-\d{4} | # Brazil Intl +55 XX XXXXX-XXXX
289
- \+55\s\d{2}\s\d{4}-\d{4} | # Brazil Intl +55 XX XXXX-XXXX
290
- 0\d{2}\s\d{4}\s\d{4} | # Brazil STD 0XX XXXX XXXX
291
- \+55\d{11} | # +55 XXXXXXXXXXX
292
- \+55\d{10} | # +55 XXXXXXXXXX
293
- 0\d{10} | # 0XXXXXXXXXX
294
- \+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} | # France Intl +33 X XX XX XX XX
295
- 0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} | # France STD 0X XX XX XX XX
296
- \+33\d{9} | # +33 XXXXXXXXX
297
- 0\d{9} | # 0XXXXXXXXX
298
- \+7\s\d{3}\s\d{3}-\d{2}-\d{2} | # Russia Intl +7 XXX XXX-XX-XX
299
- 8\s\d{3}\s\d{3}-\d{2}-\d{2} | # Russia STD 8 XXX XXX-XX-XX
300
- \+7\d{10} | # +7 XXXXXXXXXX
301
- 8\d{10} | # 8 XXXXXXXXXX
302
- \+27\s\d{2}\s\d{3}\s\d{4} | # South Africa Intl +27 XX XXX XXXX
303
- 0\d{2}\s\d{3}\s\d{4} | # South Africa STD 0XX XXX XXXX
304
- \+27\d{9} | # +27 XXXXXXXXX
305
- 0\d{9} | # 0XXXXXXXXX
306
- \+52\s\d{3}\s\d{3}\s\d{4} | # Mexico Intl +52 XXX XXX XXXX
307
- \+52\s\d{2}\s\d{4}\s\d{4} | # Mexico Intl +52 XX XXXX XXXX
308
- 01\s\d{3}\s\d{4} | # Mexico STD 01 XXX XXXX
309
- \+52\d{10} | # +52 XXXXXXXXXX
310
- 01\d{7} | # 01 XXXXXXX
311
- \+234\s\d{3}\s\d{3}\s\d{4} | # Nigeria Intl +234 XXX XXX XXXX
312
- 0\d{3}\s\d{3}\s\d{4} | # Nigeria STD 0XXX XXX XXXX
313
- \+234\d{10} | # +234 XXXXXXXXXX
314
- 0\d{10} | # 0XXXXXXXXXX
315
- \+971\s\d\s\d{3}\s\d{4} | # UAE Intl +971 X XXX XXXX
316
- 0\d\s\d{3}\s\d{4} | # UAE STD 0X XXX XXXX
317
- \+971\d{8} | # +971 XXXXXXXX
318
- 0\d{8} | # 0XXXXXXXX
319
- \+54\s9\s\d{3}\s\d{3}\s\d{4} | # Argentina Intl +54 9 XXX XXX XXXX
320
- \+54\s\d{1}\s\d{4}\s\d{4} | # Argentina Intl +54 X XXXX XXXX
321
- 0\d{3}\s\d{4} | # Argentina STD 0XXX XXXX
322
- \+54\d{10} | # +54 9 XXXXXXXXXX
323
- \+54\d{9} | # +54 XXXXXXXXX
324
- 0\d{7} | # 0XXXXXXX
325
- \+966\s\d\s\d{3}\s\d{4} | # Saudi Intl +966 X XXX XXXX
326
- 0\d\s\d{3}\s\d{4} | # Saudi STD 0X XXX XXXX
327
- \+966\d{8} | # +966 XXXXXXXX
328
- 0\d{8} | # 0XXXXXXXX
329
- \+1\d{10} | # +1 XXXXXXXXXX
330
- \+1\s\d{3}\s\d{3}\s\d{4} | # +1 XXX XXX XXXX
331
- \d{5}\s\d{5} | # XXXXX XXXXX
332
- \d{10} | # XXXXXXXXXX
333
- \+44\d{10} | # +44 XXXXXXXXXX
334
- 0\d{10} | # 0XXXXXXXXXX
335
- \+61\d{9} | # +61 XXXXXXXXX
336
- 0\d{9} | # 0XXXXXXXXX
337
- \+91\d{10} | # +91 XXXXXXXXXX
338
- \+49\d{12} | # +49 XXXXXXXXXXXX
339
- \+49\d{10} | # +49 XXXXXXXXXX
340
- 0\d{11} | # 0XXXXXXXXXXX
341
- \+86\d{11} | # +86 XXXXXXXXXXX
342
- \+81\d{10} | # +81 XXXXXXXXXX
343
- \+81\d{9} | # +81 XXXXXXXXX
344
- 0\d{9} | # 0XXXXXXXXX
345
- \+55\d{11} | # +55 XXXXXXXXXXX
346
- \+55\d{10} | # +55 XXXXXXXXXX
347
- 0\d{10} | # 0XXXXXXXXXX
348
- \+33\d{9} | # +33 XXXXXXXXX
349
- 0\d{9} | # 0XXXXXXXXX
350
- \+7\d{10} | # +7 XXXXXXXXXX
351
- 8\d{10} | # 8 XXXXXXXXXX
352
- \+27\d{9} | # +27 XXXXXXXXX
353
- 0\d{9} | # 0XXXXXXXXX (South Africa STD)
354
- \+52\d{10} | # +52 XXXXXXXXXX
355
- 01\d{7} | # 01 XXXXXXX
356
- \+234\d{10} | # +234 XXXXXXXXXX
357
- 0\d{10} | # 0XXXXXXXXXX
358
- \+971\d{8} | # +971 XXXXXXXX
359
- 0\d{8} | # 0XXXXXXXX
360
- \+54\s9\s\d{10} | # +54 9 XXXXXXXXXX
361
- \+54\d{9} | # +54 XXXXXXXXX
362
- 0\d{7} | # 0XXXXXXX
363
- \+966\d{8} | # +966 XXXXXXXX
364
- 0\d{8} # 0XXXXXXXX
365
  \+\d{3}-\d{3}-\d{4}
366
- )
 
367
 
368
- ''',re.VERBOSE)
369
-
370
- # Email regex
371
  email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
372
-
373
- # URL and links regex, updated to avoid conflicts with email domains
374
  link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
375
-
376
- # Find all matches in the text
377
  phone_numbers = [num for num in combined_phone_regex.findall(text) if len(num) >= 5]
378
-
379
  emails = email_regex.findall(text)
380
-
381
- links_RE = [link for link in link_regex.findall(text) if len(link)>=11]
382
-
383
- # Remove profile links that might conflict with emails
384
  links_RE = [link for link in links_RE if not any(email in link for email in emails)]
385
-
386
  return {
387
  "phone_numbers": phone_numbers,
388
  "emails": emails,
389
  "links_RE": links_RE
390
- }
 
391
 
392
- # preprocessing the data
393
  def process_extracted_text(extracted_text):
394
- # Load JSON data
395
- data = json.dumps(extracted_text, indent=4)
396
- data = json.loads(data)
397
 
398
- # Create a single dictionary to hold combined results
399
  combined_results = {
400
  "phone_numbers": [],
401
  "emails": [],
402
  "links_RE": []
403
  }
404
 
405
- # Process each text entry
406
  for filename, text in data.items():
407
  contact_details = extract_contact_details(text)
408
- # Extend combined results with the details from this file
409
  combined_results["phone_numbers"].extend(contact_details["phone_numbers"])
410
  combined_results["emails"].extend(contact_details["emails"])
411
  combined_results["links_RE"].extend(contact_details["links_RE"])
412
 
413
- # Convert the combined results to JSON
414
- #combined_results_json = json.dumps(combined_results, indent=4)
415
- combined_results_json = combined_results
416
-
417
- # Print the final JSON results
418
  print("Combined contact details in JSON format:")
419
- print(combined_results_json)
 
 
420
 
421
- return combined_results_json
422
 
423
- # Function to remove duplicates (case-insensitive) from each list in the dictionary
424
  def remove_duplicates_case_insensitive(data_dict):
425
  for key, value_list in data_dict.items():
 
 
 
426
  seen = set()
427
  unique_list = []
428
-
429
  for item in value_list:
430
- if item.lower() not in seen:
431
- unique_list.append(item) # Add original item (preserving its case)
432
- seen.add(item.lower()) # Track lowercase version
433
-
434
- # Update the dictionary with unique values
 
435
  data_dict[key] = unique_list
 
436
  return data_dict
437
 
438
- # Process the model output for parsed result
439
- def process_resume_data(LLMdata,cont_data,extracted_text):
440
-
441
- # Removing duplicate emails
442
- unique_emails = []
443
- for email in cont_data['emails']:
444
- if not any(email.lower() == existing_email.lower() for existing_email in LLMdata['Email']):
445
- unique_emails.append(email)
446
-
447
- # Removing duplicate links (case insensitive)
448
- unique_links = []
449
- for link in cont_data['links_RE']:
450
- if not any(link.lower() == existing_link.lower() for existing_link in LLMdata['Link']):
451
- unique_links.append(link)
452
-
453
- # Removing duplicate phone numbers
454
- normalized_contact = [num[-10:] for num in LLMdata['Contact']]
455
- unique_numbers = []
456
- for num in cont_data['phone_numbers']:
457
- if num[-10:] not in normalized_contact:
458
- unique_numbers.append(num)
459
-
460
- # Add unique emails, links, and phone numbers to the original LLMdata
461
- LLMdata['Email'] += unique_emails
462
- LLMdata['Link'] += unique_links
463
- LLMdata['Contact'] += unique_numbers
464
-
465
- # Apply the function to the data
466
- LLMdata=remove_duplicates_case_insensitive(LLMdata)
467
-
468
- # Initialize the processed data dictionary
469
- processed_data = {
470
- "name": [],
471
- "contact_number": [],
472
- "Designation":[],
473
- "email": [],
474
- "Location": [],
475
- "Link": [],
476
- "Company":[],
477
- "extracted_text": extracted_text
478
- }
479
- #LLM
480
-
481
- processed_data['name'].extend(LLMdata.get('Name', None))
482
- #processed_data['contact_number'].extend(LLMdata.get('Contact', []))
483
- processed_data['Designation'].extend(LLMdata.get('Designation', []))
484
- #processed_data['email'].extend(LLMdata.get("Email", []))
485
- processed_data['Location'].extend(LLMdata.get('Address', []))
486
- #processed_data['Link'].extend(LLMdata.get('Link', []))
487
- processed_data['Company'].extend(LLMdata.get('Company', []))
488
-
489
- #Contact
490
- #processed_data['email'].extend(cont_data.get("emails", []))
491
- #processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
492
- #processed_data['Link'].extend(cont_data.get("links_RE", []))
493
-
494
- #New_merge_data
495
- processed_data['email'].extend(LLMdata['Email'])
496
- processed_data['contact_number'].extend(LLMdata['Contact'])
497
- processed_data['Link'].extend(LLMdata['Link'])
498
-
499
- #to remove not found fields
500
- # List of keys to check for 'Not found'
501
- keys_to_check = ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]
502
-
503
- # Replace 'Not found' with an empty list for each key
504
- for key in keys_to_check:
505
- if processed_data[key] == ['Not found'] or processed_data[key] == ['not found']:
506
- processed_data[key] = []
507
-
508
- return processed_data
 
1
  # libraries
2
  import os
3
+ import base64
 
4
  import json
5
  import re
 
 
 
 
 
6
  import logging
7
  from datetime import datetime
8
 
9
+ import cv2
10
+ import numpy as np
11
+ import requests
12
+ from dotenv import load_dotenv
13
+ from PIL import Image, ImageEnhance
14
+
15
# Logging goes to the console only: a single StreamHandler at INFO level.
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)

# Read key/value pairs from a local .env file into the process environment
# before any setting below is looked up.
load_dotenv()

# Groq chat-completions settings used by the vision-model call.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"
GROQ_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"

# Output locations; both are created at import time if they do not exist.
RESULT_FOLDER = "static/results/"
JSON_FOLDER = "static/json/"

os.makedirs(RESULT_FOLDER, exist_ok=True)
os.makedirs(JSON_FOLDER, exist_ok=True)

# The main extraction path no longer uses PaddleOCR; the home directory is
# still exported and created in case another module imports it.
os.environ["PADDLEOCR_HOME"] = "/tmp/.paddleocr"
os.makedirs(os.environ["PADDLEOCR_HOME"], exist_ok=True)
39
 
 
 
40
 
41
def load_image(image_path):
    """Read an image file from disk with OpenCV.

    Args:
        image_path: Path to the image file; its extension decides whether
            the file is accepted.

    Returns:
        The array returned by ``cv2.imread`` for the file.

    Raises:
        ValueError: If the extension is not an accepted format, or if
            ``cv2.imread`` returns ``None`` for the path.
    """
    _, suffix = os.path.splitext(image_path)
    ext = suffix.lower()

    # Reject unknown formats before touching the file at all.
    if ext not in [".png", ".jpg", ".jpeg", ".webp", ".tiff", ".bmp"]:
        raise ValueError(f"Unsupported image format: {ext}")

    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Failed to load image from {image_path}")
    return image
49
+
50
+
 
51
def upscale_image(image, scale=2):
    """Enlarge an image with bicubic interpolation.

    Args:
        image: Image array whose first two ``shape`` entries are height and
            width.
        scale: Positive enlargement factor. Fractional values are accepted;
            the target size is rounded to whole pixels.

    Returns:
        The resized image produced by ``cv2.resize`` with ``INTER_CUBIC``.

    Raises:
        ValueError: If ``scale`` is not greater than zero.
    """
    if scale <= 0:
        raise ValueError(f"scale must be positive, got {scale}")

    height, width = image.shape[:2]
    # cv2.resize needs integer pixel dimensions. Rounding leaves integer
    # scales unchanged and makes fractional scales (e.g. 1.5) usable.
    target_size = (
        max(1, round(width * scale)),
        max(1, round(height * scale)),
    )
    return cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)
54
+
55
 
 
56
def reduce_noise(image):
    """Denoise a colour image with OpenCV's non-local-means filter.

    Args:
        image: Colour image array to denoise.

    Returns:
        The denoised image returned by ``cv2.fastNlMeansDenoisingColored``.
    """
    # Positional arguments after the (unused) destination: h, hColor,
    # templateWindowSize, searchWindowSize, per the OpenCV signature.
    denoised = cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)
    return denoised
58
 
59
+
60
def sharpen_image(image):
    """Sharpen an image by convolving it with a fixed 3x3 kernel.

    Args:
        image: Image array to filter.

    Returns:
        The filtered image returned by ``cv2.filter2D``.
    """
    # Centre weight 5 with -1 on the four direct neighbours.
    sharpen_kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    # A depth of -1 asks OpenCV to keep the source image's depth.
    sharpened = cv2.filter2D(image, -1, sharpen_kernel)
    return sharpened
67
+
68
 
 
69
def enhance_image(image):
    """Raise the contrast of a BGR image by a factor of 1.5.

    The image is converted to RGB for Pillow's contrast enhancer and back
    to BGR afterwards.

    Args:
        image: BGR image array.

    Returns:
        The contrast-enhanced image as a BGR array.
    """
    rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    boosted = ImageEnhance.Contrast(Image.fromarray(rgb_pixels)).enhance(1.5)
    return cv2.cvtColor(np.array(boosted), cv2.COLOR_RGB2BGR)
74
+
75
 
 
76
def process_image(image_path, scale=2):
    """Load an image and run the full enhancement pipeline on it.

    The stages run in this order: load, upscale, denoise, sharpen,
    contrast-enhance.

    Args:
        image_path: Path of the image file to process.
        scale: Enlargement factor handed to ``upscale_image``.

    Returns:
        The image produced by the last pipeline stage.
    """
    result = upscale_image(load_image(image_path), scale)
    # Each remaining stage takes the previous stage's output as its only input.
    for stage in (reduce_noise, sharpen_image, enhance_image):
        result = stage(result)
    return result
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
def image_to_base64(image):
    """Encode an OpenCV image as a base64 JPEG string.

    Args:
        image: OpenCV BGR numpy array.

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded to ``str``.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    encoded_ok, jpeg_buffer = cv2.imencode(".jpg", image)
    if encoded_ok:
        return base64.b64encode(jpeg_buffer).decode("utf-8")
    raise ValueError("Failed to encode image to JPEG.")
94
+
95
+
96
+ def _empty_schema():
97
+ return {
98
+ "Name": [],
99
+ "Designation": [],
100
+ "Company": [],
101
+ "Contact": [],
102
+ "Address": [],
103
+ "Email": [],
104
+ "Link": []
105
+ }
106
 
 
 
 
107
 
108
+ def _coerce_list(value):
109
+ if value is None:
110
+ return []
111
+ if isinstance(value, list):
112
+ return [v for v in value if v is not None and str(v).strip() != ""]
113
+ if isinstance(value, tuple):
114
+ return [v for v in value if v is not None and str(v).strip() != ""]
115
+ if isinstance(value, str):
116
+ s = value.strip()
117
+ return [] if s == "" else [s]
118
+ return [value]
119
+
120
+
121
+ def _strip_code_fences(text):
122
+ if not isinstance(text, str):
123
+ return text
124
+ text = text.strip()
125
+ if text.startswith("```"):
126
+ text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.IGNORECASE)
127
+ text = re.sub(r"\s*```$", "", text)
128
+ return text.strip()
129
+
130
+
131
+ def _parse_json_content(content):
132
+ """
133
+ Parses Groq response content into dict.
134
+ Handles:
135
+ - plain JSON string
136
+ - fenced JSON
137
+ - accidental text around JSON
138
+ """
139
+ if isinstance(content, dict):
140
+ return content
141
+
142
+ if content is None:
143
+ return {}
144
+
145
+ content = _strip_code_fences(str(content))
146
 
147
+ try:
148
+ return json.loads(content)
149
+ except json.JSONDecodeError:
150
+ # Try to recover a JSON object embedded in text
151
+ match = re.search(r"\{.*\}", content, flags=re.DOTALL)
152
+ if match:
153
+ return json.loads(match.group(0))
154
+ raise
155
+
156
+
157
def normalize_llm_schema(data):
    """Normalize model output to the fixed extraction schema.

    The result always has exactly these keys, each mapped to a list::

        Name, Designation, Company, Contact, Address, Email, Link

    Args:
        data: Parsed model output. A dict may contain nulls, bare strings
            or alternate key spellings. Anything that is not a dict
            (``None``, a JSON list, a bare string) is treated as carrying
            no fields instead of being indexed by key.

    Returns:
        A new dict with the canonical keys. For each key, the first alias
        present in ``data`` with a non-``None`` value is taken and coerced
        to a list; keys with no such alias map to ``[]``.
    """
    # A list or string would pass a plain truthiness check, then either
    # raise on ``data[alias]`` or match aliases as list items / substrings.
    if not isinstance(data, dict):
        data = {}

    # Common alternate keys seen in model outputs, in priority order.
    key_aliases = {
        "Name": ["Name", "name", "FullName", "full_name", "person_name"],
        "Designation": ["Designation", "designation", "Title", "title", "Role", "role"],
        "Company": ["Company", "company", "Organization", "organization", "Org", "org"],
        "Contact": ["Contact", "contact", "Phone", "phone", "Mobile", "mobile", "PhoneNumber", "phone_number"],
        "Address": ["Address", "address", "Location", "location"],
        "Email": ["Email", "email", "E-mail", "e_mail"],
        "Link": ["Link", "link", "URL", "url", "Website", "website", "Portfolio", "portfolio"],
    }

    normalized = {}
    for canonical_key, aliases in key_aliases.items():
        chosen = []
        for alias in aliases:
            if data.get(alias) is not None:
                chosen = _coerce_list(data[alias])
                break
        normalized[canonical_key] = chosen

    return normalized
195
+
196
+
197
def call_groq_vlm(image_bgr, prompt, timeout=120, retries=2):
    """Send one image and a prompt to the Groq vision model.

    Args:
        image_bgr: OpenCV BGR image; it is sent as a base64 JPEG data URL.
        prompt: Instruction text placed next to the image in the user
            message.
        timeout: Timeout in seconds passed to ``requests.post``.
        retries: Number of additional attempts after the first failure.
            Negative values are treated as 0.

    Returns:
        The model's JSON answer normalized by ``normalize_llm_schema``.

    Raises:
        ValueError: If ``GROQ_API_KEY`` is not set.
        Exception: The error from the last failed attempt (HTTP error,
            network error, unexpected response shape or undecodable JSON).
    """
    import time  # local import: only needed for the retry back-off

    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY is missing from environment variables.")

    base64_image = image_to_base64(image_bgr)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {GROQ_API_KEY}",
    }

    payload = {
        "model": GROQ_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a strict information extraction engine. "
                    "Return only valid JSON and no markdown."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        },
                    },
                ],
            },
        ],
        "temperature": 0.1,
        "top_p": 1,
        "max_completion_tokens": 1024,
        "stream": False,
        "response_format": {"type": "json_object"},
    }

    # Always make at least one attempt: with a negative ``retries`` the loop
    # would otherwise not run and the final statement would be ``raise None``.
    total_attempts = max(0, int(retries)) + 1
    last_error = None

    for attempt in range(1, total_attempts + 1):
        try:
            resp = requests.post(GROQ_URL, headers=headers, json=payload, timeout=timeout)
            resp.raise_for_status()
            content = resp.json()["choices"][0]["message"]["content"]
            return normalize_llm_schema(_parse_json_content(content))
        except Exception as exc:
            last_error = exc
            logging.exception("Groq VLM request failed on attempt %d", attempt)

            # Client errors other than timeout / rate-limit cannot succeed on
            # a retry (bad key, bad payload), so stop immediately.
            status = getattr(getattr(exc, "response", None), "status_code", None)
            if status is not None and 400 <= status < 500 and status not in (408, 429):
                break

            if attempt < total_attempts:
                # Short exponential back-off (1s, 2s, 4s, capped at 8s).
                time.sleep(min(2 ** (attempt - 1), 8))

    raise last_error
256
+
257
+
258
def build_vlm_prompt():
    """Return the instruction text sent with every image to the vision model.

    The prompt names the seven schema fields and requires a JSON-only answer
    in which every value is an array.
    """
    prompt = """
Extract structured text from this image and return ONLY valid JSON.

Schema:
{
  "Name": [],
  "Designation": [],
  "Company": [],
  "Contact": [],
  "Address": [],
  "Email": [],
  "Link": []
}

Rules:
- Always return all keys.
- Every value must be a JSON array.
- If a field is not found, return [].
- Do not return null.
- Do not add explanations or markdown.
- Extract all visible text from the image, including business card text, printed labels, logos, URLs, and contact details.
"""
    return prompt
281
 
 
282
 
283
def extract_text_from_images(image_paths):
    """Run the Groq VLM single-pass extraction over a set of images.

    Each image is enhanced, saved under ``RESULT_FOLDER`` and sent to the
    vision model. An image that fails at any step is logged and skipped, so
    one bad file does not abort the batch.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        A 3-tuple ``(merged_llm_data, all_extracted_texts,
        all_extracted_imgs)``:

        * ``merged_llm_data``: schema dict with the values of all images
          appended per field.
        * ``all_extracted_texts``: ``path -> JSON string`` of that image's
          extraction result.
        * ``all_extracted_imgs``: ``path -> path of the saved processed
          image``.
    """
    merged_llm_data = _empty_schema()
    all_extracted_texts = {}
    all_extracted_imgs = {}

    # The prompt is the same for every image, so build it once.
    prompt = build_vlm_prompt()

    for image_path in image_paths:
        try:
            enhanced_image = process_image(image_path, scale=2)

            # A microsecond timestamp keeps result file names unique.
            unique_id = datetime.now().strftime("%Y%m%d%H%M%S%f")
            result_image_path = os.path.join(
                RESULT_FOLDER,
                f"result_{unique_id}_{os.path.basename(image_path)}",
            )

            # cv2.imwrite signals failure by returning False, not by
            # raising; surface it so a dangling path can be diagnosed.
            if not cv2.imwrite(result_image_path, enhanced_image):
                logging.warning(
                    "Could not write processed image to %s", result_image_path
                )

            single_data = call_groq_vlm(enhanced_image, prompt)

            # Merge into the combined schema.
            for key in merged_llm_data:
                merged_llm_data[key].extend(_coerce_list(single_data.get(key)))

            # Keep the per-image JSON as text for downstream regex processing.
            all_extracted_texts[image_path] = json.dumps(single_data, ensure_ascii=False)
            all_extracted_imgs[image_path] = result_image_path

            logging.info("Processed image: %s", image_path)

        except Exception:
            # Best effort: log with traceback and move on to the next image.
            logging.exception("Error processing image %s", image_path)
            continue

    return merged_llm_data, all_extracted_texts, all_extracted_imgs
328
+
329
 
 
330
# Phone formats, tried left to right at each position. ``0\d{10}`` must come
# before ``\d{10}``: otherwise an 11-digit number with a leading 0 is always
# consumed by the 10-digit branch and loses its last digit.
_PHONE_REGEX = re.compile(r'''
    (?:
        \+1\s\(\d{3}\)\s\d{3}-\d{4} |
        \(\d{3}\)\s\d{3}-\d{4} |
        \(\d{3}\)\s\d{3}\s\d{4} |
        \+1\d{10} |
        \+44\s\d{4}\s\d{6} |
        \+44\s\d{3}\s\d{3}\s\d{4} |
        0\d{4}\s\d{6} |
        0\d{3}\s\d{3}\s\d{4} |
        \+44\d{10} |
        0\d{10} |
        \d{10} |
        \+91\s\d{5}-\d{5} |
        \+91\s\d{4}-\d{6} |
        \+91\s\d{10} |
        \+91\s\d{3}\s\d{3}\s\d{4} |
        \+91\s\d{3}-\d{3}-\d{4} |
        \+91\s\d{2}\s\d{4}\s\d{4} |
        \+91\s\d{2}-\d{4}-\d{4} |
        \+91\s\d{5}\s\d{5} |
        \d{5}\s\d{5} |
        \d{5}-\d{5} |
        0\d{2}-\d{7} |
        \+91\d{10} |
        \d{6}-\d{4} |
        \d{4}-\d{6} |
        \d{3}\s\d{3}\s\d{4} |
        \d{3}-\d{3}-\d{4} |
        \d{4}\s\d{3}\s\d{3} |
        \d{4}-\d{3}-\d{3} |
        \+\d{3}-\d{3}-\d{4}
    )
''', re.VERBOSE)

_EMAIL_REGEX = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# Only links that contain "www." are matched (the group is not optional).
_LINK_REGEX = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')


def extract_contact_details(text):
    """Find phone numbers, e-mail addresses and web links in a text.

    Args:
        text: The text to scan.

    Returns:
        A dict with three lists of matched strings, in order of appearance:
        ``"phone_numbers"``, ``"emails"`` and ``"links_RE"``. Links shorter
        than 11 characters, and links that contain one of the found e-mail
        addresses, are dropped.
    """
    phone_numbers = [num for num in _PHONE_REGEX.findall(text) if len(num) >= 5]
    emails = _EMAIL_REGEX.findall(text)
    links_RE = [link for link in _LINK_REGEX.findall(text) if len(link) >= 11]
    links_RE = [link for link in links_RE if not any(email in link for email in emails)]

    return {
        "phone_numbers": phone_numbers,
        "emails": emails,
        "links_RE": links_RE,
    }
381
+
382
 
 
383
def process_extracted_text(extracted_text):
    """Collect regex-detected contact details from all extracted texts.

    Args:
        extracted_text: Mapping of file name to extracted text, or ``None``.
            ``None`` entries are skipped and non-string entries are
            converted with ``str()`` before scanning.

    Returns:
        A dict with the lists ``"phone_numbers"``, ``"emails"`` and
        ``"links_RE"`` holding the matches of every entry, in input order.
    """
    combined_results = {
        "phone_numbers": [],
        "emails": [],
        "links_RE": [],
    }

    # The file names are not needed; only the texts are scanned.
    for text in (extracted_text or {}).values():
        if text is None:
            continue
        contact_details = extract_contact_details(
            text if isinstance(text, str) else str(text)
        )
        for field, matches in combined_results.items():
            matches.extend(contact_details[field])

    logging.info("Combined contact details: %s", combined_results)

    return combined_results
402
 
 
403
 
 
404
def remove_duplicates_case_insensitive(data_dict):
    """Drop case-insensitive duplicates from every list value, in place.

    Items are compared by the lower-cased ``str()`` of each item. The first
    occurrence is kept unchanged and the order is preserved. Values that
    are not lists are left alone.

    Args:
        data_dict: Dict whose list values should be de-duplicated.

    Returns:
        The same ``data_dict`` object, after modification.
    """
    for field, values in data_dict.items():
        if isinstance(values, list):
            # setdefault keeps the first item seen for each lowered key;
            # dicts preserve insertion order.
            first_seen = {}
            for entry in values:
                first_seen.setdefault(str(entry).lower(), entry)
            data_dict[field] = list(first_seen.values())
    return data_dict
422
 
423
+
424
def _contact_key(number):
    """Return the comparison key for a phone number: its last 10 digits."""
    digits = re.sub(r"\D", "", str(number))
    # Entries without any digit fall back to their text, so that they do
    # not all collapse onto the same empty key.
    return digits[-10:] if digits else str(number).strip().lower()


def process_resume_data(LLMdata, cont_data, extracted_text):
    """Merge model output with regex-detected contact details.

    This is the final merge step; the output structure is the one consumed
    by result.html.

    Args:
        LLMdata: Model output; normalized with ``normalize_llm_schema``.
        cont_data: Dict with the optional lists ``"emails"``,
            ``"phone_numbers"`` and ``"links_RE"``, or ``None``. It is not
            modified.
        extracted_text: Passed through unchanged as ``"extracted_text"``.

    Returns:
        A dict with the list fields ``name``, ``contact_number``,
        ``Designation``, ``email``, ``Location``, ``Link``, ``Company`` and
        the ``extracted_text`` entry. Placeholder values ("not found",
        "none", "null", blank) are removed from the list fields.
    """
    LLMdata = normalize_llm_schema(LLMdata)

    # Read with defaults instead of setdefault so the caller's dict is
    # left untouched.
    cont_data = cont_data or {}
    regex_emails = cont_data.get("emails") or []
    regex_numbers = cont_data.get("phone_numbers") or []
    regex_links = cont_data.get("links_RE") or []

    # Merge regex-detected emails (case-insensitive).
    existing_emails = {str(e).lower() for e in LLMdata["Email"]}
    for email in regex_emails:
        if str(email).lower() not in existing_emails:
            LLMdata["Email"].append(email)
            existing_emails.add(str(email).lower())

    # Merge regex-detected links (case-insensitive).
    existing_links = {str(link).lower() for link in LLMdata["Link"]}
    for link in regex_links:
        if str(link).lower() not in existing_links:
            LLMdata["Link"].append(link)
            existing_links.add(str(link).lower())

    # Merge regex-detected contacts. Comparing the last 10 *digits* makes
    # "+91 98765-43210" and "9876543210" count as the same number; the last
    # 10 characters would differ because of the separators.
    normalized_contacts = {_contact_key(num) for num in LLMdata["Contact"] if num}
    for num in regex_numbers:
        norm = _contact_key(num)
        if norm not in normalized_contacts:
            LLMdata["Contact"].append(num)
            normalized_contacts.add(norm)

    LLMdata = remove_duplicates_case_insensitive(LLMdata)

    processed_data = {
        "name": LLMdata.get("Name", []),
        "contact_number": LLMdata.get("Contact", []),
        "Designation": LLMdata.get("Designation", []),
        "email": LLMdata.get("Email", []),
        "Location": LLMdata.get("Address", []),
        "Link": LLMdata.get("Link", []),
        "Company": LLMdata.get("Company", []),
        "extracted_text": extracted_text,
    }

    # Drop placeholder answers the model sometimes returns for missing fields.
    placeholders = {"not found", "none", "null", ""}
    for key in ["name", "contact_number", "Designation", "email", "Location", "Link", "Company"]:
        processed_data[key] = [
            v for v in processed_data[key]
            if str(v).strip().lower() not in placeholders
        ]

    return processed_data
478
+
479
+
480
+ # Optional compatibility helper; no longer needed by the main flow.
481
def json_to_llm_str(textJson):
    """Join all values of a dict into one space-terminated string.

    Args:
        textJson: Dict whose values are converted with ``str()``.

    Returns:
        Every value followed by a single space, in dict order; ``""`` for
        an empty dict.
    """
    return "".join(str(item) + " " for item in textJson.values())