Spaces:
No application file
No application file
File size: 1,855 Bytes
37d3a3b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import requests
import time
import json
import os
from dotenv import load_dotenv
load_dotenv()
# Authentication headers passed to the Mathpix API requests made in this module.
# Each credential is read from the environment; when a variable is unset the
# literal placeholder string is used instead.
HEADERS = dict(
    app_id=os.getenv('MATHPIX_APP_ID', 'default_app_id'),
    app_key=os.getenv('MATHPIX_APP_KEY', 'default_app_key'),
)
def extract_text(
    file_path: str,
    *,
    request_timeout: float = 30.0,
    poll_interval: float = 1.0,
    max_wait: float = 600.0,
) -> str:
    """Convert a PDF through the Mathpix PDF API and return the converted text.

    The file is uploaded to ``/v3/pdf``, the conversion status is polled until
    it reports ``completed``, and the ``.mmd`` output is downloaded. A copy of
    the text is also written to ``<cwd>/resume_mmds/<file name>.mmd``.

    Args:
        file_path: Path of the PDF file to upload.
        request_timeout: Seconds to wait on each individual HTTP request.
        poll_interval: Seconds to sleep before each status check.
        max_wait: Maximum total seconds to wait for the conversion to finish.

    Returns:
        The body of the downloaded ``.mmd`` document.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        TimeoutError: If the conversion is not finished within ``max_wait``.
        Exception: If the upload, status check, conversion, or download fails.
    """
    print("Parsing resume")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file at {file_path} does not exist.")
    file_name = os.path.basename(file_path)

    upload_url = 'https://api.mathpix.com/v3/pdf'
    options = {
        "conversion_formats": {"md": True},
        "math_inline_delimiters": ["$", "$"],
        "rm_spaces": True,
    }
    # The upload must happen while the file handle is still open.
    with open(file_path, 'rb') as file:
        upload_resp = requests.post(
            upload_url,
            headers=HEADERS,
            files={'file': file},
            data={'options_json': json.dumps(options)},
            timeout=request_timeout,
        )
    if upload_resp.status_code != 200:
        raise Exception(f"Failed to upload PDF: {upload_resp.text}")
    pdf_id = upload_resp.json().get('pdf_id')
    if not pdf_id:
        raise Exception("Failed to retrieve PDF ID from response.")

    status_url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
    deadline = time.monotonic() + max_wait
    while True:
        # Sleep first so the service has time to register the upload.
        time.sleep(poll_interval)
        status_resp = requests.get(
            status_url, headers=HEADERS, timeout=request_timeout
        )
        if status_resp.status_code != 200:
            raise Exception(
                f"Failed to check conversion status: {status_resp.text}"
            )
        status_payload = status_resp.json()
        status = status_payload.get('status')
        if status == 'completed':
            break
        # A failed conversion never becomes 'completed'; stop instead of
        # polling forever.
        if status == 'error':
            raise Exception(f"Mathpix PDF conversion failed: {status_payload}")
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"PDF conversion did not complete within {max_wait} seconds."
            )

    download_url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.mmd'
    contents = requests.get(
        download_url, headers=HEADERS, timeout=request_timeout
    )
    if contents.status_code != 200:
        raise Exception(f"Failed to download converted file: {contents.text}")

    # Create the output directory on demand and write as UTF-8 so non-ASCII
    # characters in the Markdown do not depend on the platform's locale.
    output_dir = os.path.join(os.getcwd(), "resume_mmds")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, file_name + '.mmd')
    with open(output_path, "w", encoding="utf-8") as mmd_file:
        mmd_file.write(contents.text)
    return contents.text
|