Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Update utility/utils.py
Browse files- utility/utils.py +46 -17
    	
        utility/utils.py
    CHANGED
    
    | @@ -24,6 +24,8 @@ logging.basicConfig( | |
| 24 | 
             
            os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr' 
         | 
| 25 |  | 
| 26 | 
             
            RESULT_FOLDER = 'static/results/'
         | 
|  | |
|  | |
| 27 | 
             
            if not os.path.exists('/tmp/.paddleocr'):
         | 
| 28 | 
             
                os.makedirs(RESULT_FOLDER, exist_ok=True)
         | 
| 29 |  | 
| @@ -45,11 +47,13 @@ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT) | |
| 45 |  | 
| 46 | 
             
            # Load image using OpenCV
         | 
| 47 | 
             
            def load_image(image_path):
         | 
| 48 | 
            -
                 | 
| 49 | 
            -
                if  | 
| 50 | 
            -
                     | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
|  | |
|  | |
| 53 | 
             
            # Function for upscaling image using OpenCV's INTER_CUBIC
         | 
| 54 | 
             
            def upscale_image(image, scale=2):
         | 
| 55 | 
             
                height, width = image.shape[:2]
         | 
| @@ -171,7 +175,7 @@ def extract_text_from_images(image_paths): | |
| 171 | 
             
            # Function to call the Gemma model and process the output as Json 
         | 
| 172 | 
             
            def Data_Extractor(data, client=client):
         | 
| 173 | 
             
                text = f'''Act as a  Text extractor for the following text given in text: {data} 
         | 
| 174 | 
            -
                 | 
| 175 | 
             
                {{
         | 
| 176 | 
             
                "Name": ["Identify and Extract All the person's name from the text."],
         | 
| 177 | 
             
                "Designation": ["Extract All the designation or job title mentioned in the text."],
         | 
| @@ -180,17 +184,19 @@ def Data_Extractor(data, client=client): | |
| 180 | 
             
                "Address": ["Extract All the full postal address or location mentioned in the text."],
         | 
| 181 | 
             
                "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
         | 
| 182 | 
             
                "Link": ["Identify and Extract any website URLs or social media links present in the text."]
         | 
| 183 | 
            -
                }} | 
| 184 | 
            -
                 | 
|  | |
| 185 | 
             
                '''
         | 
| 186 | 
             
                # Call the API for inference
         | 
| 187 | 
            -
                response = client.text_generation(text, max_new_tokens= | 
| 188 |  | 
| 189 | 
             
                print("parse in text ---:",response)
         | 
| 190 |  | 
| 191 | 
             
                # Convert the response text to JSON
         | 
| 192 | 
             
                try:
         | 
| 193 | 
             
                    json_data = json.loads(response)
         | 
|  | |
| 194 | 
             
                    return json_data
         | 
| 195 | 
             
                except json.JSONDecodeError as e:
         | 
| 196 | 
             
                    return {"error": f"Error decoding JSON: {e}"}   
         | 
| @@ -228,8 +234,22 @@ def extract_contact_details(text): | |
| 228 | 
             
                    \+91\s\d{5}-\d{5} |                         # India Intl +91 XXXXX-XXXXX
         | 
| 229 | 
             
                    \+91\s\d{4}-\d{6} |                         # India Intl +91 XXXX-XXXXXX
         | 
| 230 | 
             
                    \+91\s\d{10} |                              # India Intl +91 XXXXXXXXXX
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 231 | 
             
                    0\d{2}-\d{7} |                              # India STD 0XX-XXXXXXX
         | 
| 232 | 
             
                    \+91\d{10} |                                # +91 XXXXXXXXXX
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 233 | 
             
                    \+49\s\d{4}\s\d{8} |                        # Germany Intl +49 XXXX XXXXXXXX
         | 
| 234 | 
             
                    \+49\s\d{3}\s\d{7} |                        # Germany Intl +49 XXX XXXXXXX
         | 
| 235 | 
             
                    0\d{3}\s\d{8} |                             # Germany STD 0XXX XXXXXXXX
         | 
| @@ -385,16 +405,25 @@ def process_resume_data(LLMdata,cont_data,extracted_text): | |
| 385 |  | 
| 386 | 
             
                # Initialize the processed data dictionary
         | 
| 387 | 
             
                processed_data = {            
         | 
| 388 | 
            -
                        "name": [ | 
| 389 | 
            -
                        "contact_number": [ | 
| 390 | 
            -
                        "Designation":[ | 
| 391 | 
            -
                        "email": [ | 
| 392 | 
            -
                        "Location": [ | 
| 393 | 
            -
                        "Link": [ | 
| 394 | 
            -
                        "Company":[ | 
| 395 | 
             
                        "extracted_text": extracted_text
         | 
| 396 | 
             
                        }
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 397 | 
             
                processed_data['email'].extend(cont_data.get("emails", [])) 
         | 
| 398 | 
             
                processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
         | 
| 399 | 
             
                processed_data['Link'].extend(cont_data.get("links_RE", []))
         | 
| 400 | 
            -
                return processed_data
         | 
|  | |
| 24 | 
             
            os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr' 
         | 
| 25 |  | 
| 26 | 
             
            RESULT_FOLDER = 'static/results/'
         | 
| 27 | 
            +
            JSON_FOLDER = 'static/json/'
         | 
| 28 | 
            +
             | 
| 29 | 
             
            # Create the results folder unconditionally: exist_ok=True already makes
            # this a no-op when the folder is present, and gating it on an unrelated
            # path (the PaddleOCR home directory) would skip creation as soon as that
            # directory exists.
            os.makedirs(RESULT_FOLDER, exist_ok=True)
         | 
| 31 |  | 
|  | |
| 47 |  | 
| 48 | 
             
            # Load image using OpenCV
         | 
| 49 | 
             
            def load_image(image_path):
                """Load an image from disk with OpenCV.

                Args:
                    image_path: Path to the image file. The extension is compared
                        case-insensitively against the supported list below.

                Returns:
                    The image object returned by ``cv2.imread``.

                Raises:
                    ValueError: If the file extension is not supported, or if
                        ``cv2.imread`` could not read the file.
                """
                supported_extensions = ('.png', '.jpg', '.jpeg', '.webp', '.tiff')
                error_message = f"Could not load image from {image_path}. It may be corrupted or the path is incorrect. or in a not supported format"

                ext = os.path.splitext(image_path)[1].lower()
                if ext not in supported_extensions:
                    raise ValueError(error_message)

                image = cv2.imread(image_path)
                # cv2.imread signals failure (missing or unreadable file) by returning
                # None rather than raising, so surface it here instead of letting a
                # None propagate into later image-processing calls.
                if image is None:
                    raise ValueError(error_message)
                return image
         | 
| 56 | 
            +
               
         | 
| 57 | 
             
            # Function for upscaling image using OpenCV's INTER_CUBIC
         | 
| 58 | 
             
            def upscale_image(image, scale=2):
         | 
| 59 | 
             
                height, width = image.shape[:2]
         | 
|  | |
| 175 | 
             
            # Function to call the Mistral-7B-Instruct model (through the InferenceClient) and parse its output as JSON
         | 
| 176 | 
             
            def Data_Extractor(data, client=client):
         | 
| 177 | 
             
                text = f'''Act as a  Text extractor for the following text given in text: {data} 
         | 
| 178 | 
            +
                Extract text in the following output JSON string:
         | 
| 179 | 
             
                {{
         | 
| 180 | 
             
                "Name": ["Identify and Extract All the person's name from the text."],
         | 
| 181 | 
             
                "Designation": ["Extract All the designation or job title mentioned in the text."],
         | 
|  | |
| 184 | 
             
                "Address": ["Extract All the full postal address or location mentioned in the text."],
         | 
| 185 | 
             
                "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
         | 
| 186 | 
             
                "Link": ["Identify and Extract any website URLs or social media links present in the text."]
         | 
| 187 | 
            +
                }}
         | 
| 188 | 
            +
                
         | 
| 189 | 
            +
                Output:    
         | 
| 190 | 
             
                '''
         | 
| 191 | 
             
                # Call the API for inference
         | 
| 192 | 
            +
                response = client.text_generation(text, max_new_tokens=1000, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
         | 
| 193 |  | 
| 194 | 
             
                print("parse in text ---:",response)
         | 
| 195 |  | 
| 196 | 
             
                # Convert the response text to JSON
         | 
| 197 | 
             
                try:
         | 
| 198 | 
             
                    json_data = json.loads(response)
         | 
| 199 | 
            +
                    print("Json_data-------------->",json_data)
         | 
| 200 | 
             
                    return json_data
         | 
| 201 | 
             
                except json.JSONDecodeError as e:
         | 
| 202 | 
             
                    return {"error": f"Error decoding JSON: {e}"}   
         | 
|  | |
| 234 | 
             
                    \+91\s\d{5}-\d{5} |                         # India Intl +91 XXXXX-XXXXX
         | 
| 235 | 
             
                    \+91\s\d{4}-\d{6} |                         # India Intl +91 XXXX-XXXXXX
         | 
| 236 | 
             
                    \+91\s\d{10} |                              # India Intl +91 XXXXXXXXXX
         | 
| 237 | 
            +
                    \+91\s\d{3}\s\d{3}\s\d{4} |                 # India Intl +91 XXX XXX XXXX
         | 
| 238 | 
            +
                    \+91\s\d{3}-\d{3}-\d{4} |                   # India Intl +91 XXX-XXX-XXXX
         | 
| 239 | 
            +
                    \+91\s\d{2}\s\d{4}\s\d{4} |                 # India Intl +91 XX XXXX XXXX
         | 
| 240 | 
            +
                    \+91\s\d{2}-\d{4}-\d{4} |                   # India Intl +91 XX-XXXX-XXXX
         | 
| 241 | 
            +
                    \+91\s\d{5}\s\d{5} |                        # India Intl +91 XXXXX XXXXX 
         | 
| 242 | 
            +
                    \d{5}\s\d{5} |                              # India XXXXX XXXXX 
         | 
| 243 | 
            +
                    \d{5}-\d{5} |                               # India XXXXX-XXXXX 
         | 
| 244 | 
             
                    0\d{2}-\d{7} |                              # India STD 0XX-XXXXXXX
         | 
| 245 | 
             
                    \+91\d{10} |                                # +91 XXXXXXXXXX
         | 
| 246 | 
            +
                    \d{10} |                                    # XXXXXXXXXX   # Here is the regex to handle all possible combination of the contact
         | 
| 247 | 
            +
                    \d{6}-\d{4} |                               # XXXXXX-XXXX
         | 
| 248 | 
            +
                    \d{4}-\d{6} |                               # XXXX-XXXXXX
         | 
| 249 | 
            +
                    \d{3}\s\d{3}\s\d{4} |                       # XXX XXX XXXX
         | 
| 250 | 
            +
                    \d{3}-\d{3}-\d{4} |                         # XXX-XXX-XXXX
         | 
| 251 | 
            +
                    \d{4}\s\d{3}\s\d{3} |                       # XXXX XXX XXX
         | 
| 252 | 
            +
                    \d{4}-\d{3}-\d{3} |                         # XXXX-XXX-XXX #-----
         | 
| 253 | 
             
                    \+49\s\d{4}\s\d{8} |                        # Germany Intl +49 XXXX XXXXXXXX
         | 
| 254 | 
             
                    \+49\s\d{3}\s\d{7} |                        # Germany Intl +49 XXX XXXXXXX
         | 
| 255 | 
             
                    0\d{3}\s\d{8} |                             # Germany STD 0XXX XXXXXXXX
         | 
|  | |
| 405 |  | 
| 406 | 
             
                # Initialize the processed data dictionary
         | 
| 407 | 
             
                processed_data = {            
         | 
| 408 | 
            +
                        "name": [],
         | 
| 409 | 
            +
                        "contact_number": [],
         | 
| 410 | 
            +
                        "Designation":[],
         | 
| 411 | 
            +
                        "email": [],
         | 
| 412 | 
            +
                        "Location": [],
         | 
| 413 | 
            +
                        "Link": [],
         | 
| 414 | 
            +
                        "Company":[],
         | 
| 415 | 
             
                        "extracted_text": extracted_text
         | 
| 416 | 
             
                        }
         | 
| 417 | 
            +
                #LLM
         | 
| 418 | 
            +
                processed_data['name'].extend(LLMdata.get('Name', []))
         | 
| 419 | 
            +
                processed_data['contact_number'].extend(LLMdata.get('Contact', []))
         | 
| 420 | 
            +
                processed_data['Designation'].extend(LLMdata.get('Designation', []))
         | 
| 421 | 
            +
                processed_data['email'].extend(LLMdata.get("Email", []))
         | 
| 422 | 
            +
                processed_data['Location'].extend(LLMdata.get('Address', []))
         | 
| 423 | 
            +
                processed_data['Link'].extend(LLMdata.get('Link', []))
         | 
| 424 | 
            +
                processed_data['Company'].extend(LLMdata.get('Company', []))
         | 
| 425 | 
            +
                #Contact
         | 
| 426 | 
             
                processed_data['email'].extend(cont_data.get("emails", [])) 
         | 
| 427 | 
             
                processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
         | 
| 428 | 
             
                processed_data['Link'].extend(cont_data.get("links_RE", []))
         | 
| 429 | 
            +
                return processed_data
         |