WebashalarForML committed on
Commit
ab72f2c
·
verified ·
1 Parent(s): f74e1b2

Update utility/utils.py

Browse files
Files changed (1) hide show
  1. utility/utils.py +135 -34
utility/utils.py CHANGED
@@ -328,42 +328,143 @@ def extract_text_from_images(image_paths):
328
 
329
 
330
  def extract_contact_details(text):
331
- # Keep your existing regex logic here exactly as-is.
332
- # This function is unchanged from your current file.
333
  combined_phone_regex = re.compile(r'''
334
- (?:
335
- \+1\s\(\d{3}\)\s\d{3}-\d{4} |
336
- \(\d{3}\)\s\d{3}-\d{4} |
337
- \(\d{3}\)\s\d{3}\s\d{4} |
338
- \+1\d{10} |
339
- \d{10} |
340
- \+44\s\d{4}\s\d{6} |
341
- \+44\s\d{3}\s\d{3}\s\d{4} |
342
- 0\d{4}\s\d{6} |
343
- 0\d{3}\s\d{3}\s\d{4} |
344
- \+44\d{10} |
345
- 0\d{10} |
346
- \+91\s\d{5}-\d{5} |
347
- \+91\s\d{4}-\d{6} |
348
- \+91\s\d{10} |
349
- \+91\s\d{3}\s\d{3}\s\d{4} |
350
- \+91\s\d{3}-\d{3}-\d{4} |
351
- \+91\s\d{2}\s\d{4}\s\d{4} |
352
- \+91\s\d{2}-\d{4}-\d{4} |
353
- \+91\s\d{5}\s\d{5} |
354
- \d{5}\s\d{5} |
355
- \d{5}-\d{5} |
356
- 0\d{2}-\d{7} |
357
- \+91\d{10} |
358
- \d{6}-\d{4} |
359
- \d{4}-\d{6} |
360
- \d{3}\s\d{3}\s\d{4} |
361
- \d{3}-\d{3}-\d{4} |
362
- \d{4}\s\d{3}\s\d{3} |
363
- \d{4}-\d{3}-\d{3} |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  \+\d{3}-\d{3}-\d{4}
365
- )
366
- ''', re.VERBOSE)
 
 
367
 
368
  email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
369
  link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')
 
328
 
329
 
330
  def extract_contact_details(text):
331
+ # Regex patterns
332
+ # Phone numbers with at least 5 digits in any segment
333
  combined_phone_regex = re.compile(r'''
334
+ (?:
335
+ #(?:(?:\+91[-.\s]?)?\d{5}[-.\s]?\d{5})|(?:\+?\d{1,3})?[-.\s()]?\d{5,}[-.\s()]?\d{5,}[-.\s()]?\d{1,9} | /^[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{3})[\.-)( ]*([0-9]{4})$/ |
336
+ \+1\s\(\d{3}\)\s\d{3}-\d{4} | # USA/Canada Intl +1 (XXX) XXX-XXXX
337
+ \(\d{3}\)\s\d{3}-\d{4} | # USA/Canada STD (XXX) XXX-XXXX
338
+ \(\d{3}\)\s\d{3}\s\d{4} | # USA/Canada (XXX) XXX XXXX
339
+ \(\d{3}\)\s\d{3}\s\d{3} | # USA/Canada (XXX) XXX XXX
340
+ \+1\d{10} | # +1 XXXXXXXXXX
341
+ \d{10} | # XXXXXXXXXX
342
+ \+44\s\d{4}\s\d{6} | # UK Intl +44 XXXX XXXXXX
343
+ \+44\s\d{3}\s\d{3}\s\d{4} | # UK Intl +44 XXX XXX XXXX
344
+ 0\d{4}\s\d{6} | # UK STD 0XXXX XXXXXX
345
+ 0\d{3}\s\d{3}\s\d{4} | # UK STD 0XXX XXX XXXX
346
+ \+44\d{10} | # +44 XXXXXXXXXX
347
+ 0\d{10} | # 0XXXXXXXXXX
348
+ \+61\s\d\s\d{4}\s\d{4} | # Australia Intl +61 X XXXX XXXX
349
+ 0\d\s\d{4}\s\d{4} | # Australia STD 0X XXXX XXXX
350
+ \+61\d{9} | # +61 XXXXXXXXX
351
+ 0\d{9} | # 0XXXXXXXXX
352
+ \+91\s\d{5}-\d{5} | # India Intl +91 XXXXX-XXXXX
353
+ \+91\s\d{4}-\d{6} | # India Intl +91 XXXX-XXXXXX
354
+ \+91\s\d{10} | # India Intl +91 XXXXXXXXXX
355
+ \+91\s\d{3}\s\d{3}\s\d{4} | # India Intl +91 XXX XXX XXXX
356
+ \+91\s\d{3}-\d{3}-\d{4} | # India Intl +91 XXX-XXX-XXXX
357
+ \+91\s\d{2}\s\d{4}\s\d{4} | # India Intl +91 XX XXXX XXXX
358
+ \+91\s\d{2}-\d{4}-\d{4} | # India Intl +91 XX-XXXX-XXXX
359
+ \+91\s\d{5}\s\d{5} | # India Intl +91 XXXXX XXXXX
360
+ \d{5}\s\d{5} | # India XXXXX XXXXX
361
+ \d{5}-\d{5} | # India XXXXX-XXXXX
362
+ 0\d{2}-\d{7} | # India STD 0XX-XXXXXXX
363
+ \+91\d{10} | # +91 XXXXXXXXXX
364
+ \d{10} | # XXXXXXXXXX # Here is the regex to handle all possible combination of the contact
365
+ \d{6}-\d{4} | # XXXXXX-XXXX
366
+ \d{4}-\d{6} | # XXXX-XXXXXX
367
+ \d{3}\s\d{3}\s\d{4} | # XXX XXX XXXX
368
+ \d{3}-\d{3}-\d{4} | # XXX-XXX-XXXX
369
+ \d{4}\s\d{3}\s\d{3} | # XXXX XXX XXX
370
+ \d{4}-\d{3}-\d{3} | # XXXX-XXX-XXX #-----
371
+ \+49\s\d{4}\s\d{8} | # Germany Intl +49 XXXX XXXXXXXX
372
+ \+49\s\d{3}\s\d{7} | # Germany Intl +49 XXX XXXXXXX
373
+ 0\d{3}\s\d{8} | # Germany STD 0XXX XXXXXXXX
374
+ \+49\d{12} | # +49 XXXXXXXXXXXX
375
+ \+49\d{10} | # +49 XXXXXXXXXX
376
+ 0\d{11} | # 0XXXXXXXXXXX
377
+ \+86\s\d{3}\s\d{4}\s\d{4} | # China Intl +86 XXX XXXX XXXX
378
+ 0\d{3}\s\d{4}\s\d{4} | # China STD 0XXX XXXX XXXX
379
+ \+86\d{11} | # +86 XXXXXXXXXXX
380
+ \+81\s\d\s\d{4}\s\d{4} | # Japan Intl +81 X XXXX XXXX
381
+ \+81\s\d{2}\s\d{4}\s\d{4} | # Japan Intl +81 XX XXXX XXXX
382
+ 0\d\s\d{4}\s\d{4} | # Japan STD 0X XXXX XXXX
383
+ \+81\d{10} | # +81 XXXXXXXXXX
384
+ \+81\d{9} | # +81 XXXXXXXXX
385
+ 0\d{9} | # 0XXXXXXXXX
386
+ \+55\s\d{2}\s\d{5}-\d{4} | # Brazil Intl +55 XX XXXXX-XXXX
387
+ \+55\s\d{2}\s\d{4}-\d{4} | # Brazil Intl +55 XX XXXX-XXXX
388
+ 0\d{2}\s\d{4}\s\d{4} | # Brazil STD 0XX XXXX XXXX
389
+ \+55\d{11} | # +55 XXXXXXXXXXX
390
+ \+55\d{10} | # +55 XXXXXXXXXX
391
+ 0\d{10} | # 0XXXXXXXXXX
392
+ \+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} | # France Intl +33 X XX XX XX XX
393
+ 0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2} | # France STD 0X XX XX XX XX
394
+ \+33\d{9} | # +33 XXXXXXXXX
395
+ 0\d{9} | # 0XXXXXXXXX
396
+ \+7\s\d{3}\s\d{3}-\d{2}-\d{2} | # Russia Intl +7 XXX XXX-XX-XX
397
+ 8\s\d{3}\s\d{3}-\d{2}-\d{2} | # Russia STD 8 XXX XXX-XX-XX
398
+ \+7\d{10} | # +7 XXXXXXXXXX
399
+ 8\d{10} | # 8 XXXXXXXXXX
400
+ \+27\s\d{2}\s\d{3}\s\d{4} | # South Africa Intl +27 XX XXX XXXX
401
+ 0\d{2}\s\d{3}\s\d{4} | # South Africa STD 0XX XXX XXXX
402
+ \+27\d{9} | # +27 XXXXXXXXX
403
+ 0\d{9} | # 0XXXXXXXXX
404
+ \+52\s\d{3}\s\d{3}\s\d{4} | # Mexico Intl +52 XXX XXX XXXX
405
+ \+52\s\d{2}\s\d{4}\s\d{4} | # Mexico Intl +52 XX XXXX XXXX
406
+ 01\s\d{3}\s\d{4} | # Mexico STD 01 XXX XXXX
407
+ \+52\d{10} | # +52 XXXXXXXXXX
408
+ 01\d{7} | # 01 XXXXXXX
409
+ \+234\s\d{3}\s\d{3}\s\d{4} | # Nigeria Intl +234 XXX XXX XXXX
410
+ 0\d{3}\s\d{3}\s\d{4} | # Nigeria STD 0XXX XXX XXXX
411
+ \+234\d{10} | # +234 XXXXXXXXXX
412
+ 0\d{10} | # 0XXXXXXXXXX
413
+ \+971\s\d\s\d{3}\s\d{4} | # UAE Intl +971 X XXX XXXX
414
+ 0\d\s\d{3}\s\d{4} | # UAE STD 0X XXX XXXX
415
+ \+971\d{8} | # +971 XXXXXXXX
416
+ 0\d{8} | # 0XXXXXXXX
417
+ \+54\s9\s\d{3}\s\d{3}\s\d{4} | # Argentina Intl +54 9 XXX XXX XXXX
418
+ \+54\s\d{1}\s\d{4}\s\d{4} | # Argentina Intl +54 X XXXX XXXX
419
+ 0\d{3}\s\d{4} | # Argentina STD 0XXX XXXX
420
+ \+54\d{10} | # +54 9 XXXXXXXXXX
421
+ \+54\d{9} | # +54 XXXXXXXXX
422
+ 0\d{7} | # 0XXXXXXX
423
+ \+966\s\d\s\d{3}\s\d{4} | # Saudi Intl +966 X XXX XXXX
424
+ 0\d\s\d{3}\s\d{4} | # Saudi STD 0X XXX XXXX
425
+ \+966\d{8} | # +966 XXXXXXXX
426
+ 0\d{8} | # 0XXXXXXXX
427
+ \+1\d{10} | # +1 XXXXXXXXXX
428
+ \+1\s\d{3}\s\d{3}\s\d{4} | # +1 XXX XXX XXXX
429
+ \d{5}\s\d{5} | # XXXXX XXXXX
430
+ \d{10} | # XXXXXXXXXX
431
+ \+44\d{10} | # +44 XXXXXXXXXX
432
+ 0\d{10} | # 0XXXXXXXXXX
433
+ \+61\d{9} | # +61 XXXXXXXXX
434
+ 0\d{9} | # 0XXXXXXXXX
435
+ \+91\d{10} | # +91 XXXXXXXXXX
436
+ \+49\d{12} | # +49 XXXXXXXXXXXX
437
+ \+49\d{10} | # +49 XXXXXXXXXX
438
+ 0\d{11} | # 0XXXXXXXXXXX
439
+ \+86\d{11} | # +86 XXXXXXXXXXX
440
+ \+81\d{10} | # +81 XXXXXXXXXX
441
+ \+81\d{9} | # +81 XXXXXXXXX
442
+ 0\d{9} | # 0XXXXXXXXX
443
+ \+55\d{11} | # +55 XXXXXXXXXXX
444
+ \+55\d{10} | # +55 XXXXXXXXXX
445
+ 0\d{10} | # 0XXXXXXXXXX
446
+ \+33\d{9} | # +33 XXXXXXXXX
447
+ 0\d{9} | # 0XXXXXXXXX
448
+ \+7\d{10} | # +7 XXXXXXXXXX
449
+ 8\d{10} | # 8 XXXXXXXXXX
450
+ \+27\d{9} | # +27 XXXXXXXXX
451
+ 0\d{9} | # 0XXXXXXXXX (South Africa STD)
452
+ \+52\d{10} | # +52 XXXXXXXXXX
453
+ 01\d{7} | # 01 XXXXXXX
454
+ \+234\d{10} | # +234 XXXXXXXXXX
455
+ 0\d{10} | # 0XXXXXXXXXX
456
+ \+971\d{8} | # +971 XXXXXXXX
457
+ 0\d{8} | # 0XXXXXXXX
458
+ \+54\s9\s\d{10} | # +54 9 XXXXXXXXXX
459
+ \+54\d{9} | # +54 XXXXXXXXX
460
+ 0\d{7} | # 0XXXXXXX
461
+ \+966\d{8} | # +966 XXXXXXXX
462
+ 0\d{8} # 0XXXXXXXX
463
  \+\d{3}-\d{3}-\d{4}
464
+ )
465
+
466
+
467
+ ''',re.VERBOSE)
468
 
469
  email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
470
  link_regex = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')