carlafileai committed on
Commit
8b21338
·
1 Parent(s): 566718b

Added more error handling, improved logging, and added timeout protection

Browse files
Files changed (4) hide show
  1. .DS_Store +0 -0
  2. .gitignore +1 -0
  3. .gradio/certificate.pem +31 -0
  4. app.py +80 -50
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.gitignore CHANGED
@@ -5,3 +5,4 @@ env/
5
  .Python
6
  .ipynb_checkpoints
7
  __pycache__
 
 
5
  .Python
6
  .ipynb_checkpoints
7
  __pycache__
8
+ app_old.py
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
app.py CHANGED
@@ -19,31 +19,25 @@ def perform_ocr(uploaded_file):
19
  max_size = 10 * 1024 * 1024 # 10 MB limit
20
  if os.path.getsize(uploaded_file) > max_size:
21
  return "File is too large. Please upload a file smaller than 10 MB."
22
- files={}
23
 
24
  try:
25
  # Read and encode the image file
26
  print("Processing file:", uploaded_file)
27
- with open(uploaded_file, "rb") as image_file:
28
- if uploaded_file.endswith(('.pdf', '.PDF')):
29
- files['file'] = (os.path.basename(uploaded_file), image_file, 'application/pdf')
30
- elif uploaded_file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
31
- mime_type, _ = mimetypes.guess_type(uploaded_file)
32
- files['file'] = (os.path.basename(uploaded_file), image_file, mime_type)
33
-
34
  filename = os.path.basename(uploaded_file)
35
-
36
- with open(uploaded_file.name, 'rb') as f:
37
- # Read the file content
38
- file_content = f.read()
39
- files_to_send = {
40
- 'file': (filename, file_content, files['file'][2]) # (filename, file_object, mime_type)
41
- }
 
 
 
42
 
43
  payload = {
44
- "fileName": files['file'][0],
45
- # Use the correct MIME type for the file
46
- "fileType": files['file'][2],
47
  "isSplit": False,
48
  "callbackURL": "https://example.com/callback",
49
  "ocrModel": "Beethoven_ENG_G5.0",
@@ -57,21 +51,22 @@ def perform_ocr(uploaded_file):
57
  # Step 1: Request upload URL and uploadId
58
  response = requests.post(os.getenv("UPLOAD_URL"), json=payload, headers=headers)
59
  # Check if request was successful
60
- if response.status_code != 201:
61
- return result.get("uploadId", "No upload ID found in the response.")
 
62
 
63
  # STEP 2: upload the file
64
  result = response.json()
65
  upload_id = result['uploadId'] #will need this later to get the file id
66
  url = result['presignedUploadURL']
67
  headers_put = {
68
- "Content-Type": "application/pdf"
69
  }
70
- with open(uploaded_file.name, "rb") as f:
 
71
  put_response = requests.put(url, data=f, headers=headers_put)
72
  if put_response.status_code != 200:
73
- print("Error uploading file:", put_response.status_code, put_response.text)
74
- exit(1)
75
 
76
 
77
  # Step 3: Poll the files endpoint to find the fileId with the matching uploadId
@@ -81,41 +76,76 @@ def perform_ocr(uploaded_file):
81
  }
82
  files_endpoint = os.getenv("GET_FILES_URL")
83
  poll_interval = 4 # seconds
 
84
  start_time = time.time()
85
  file_id = None
86
- print("Searching for fileId matching uploadId...")
87
  while True:
88
- resp = requests.get(files_endpoint, headers=headers_get)
89
- files = resp.json().get('files', [])
90
- for file in files:
91
- if file.get('uploadId') == upload_id:
92
- file_id = file.get('fileId')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  break
94
- if file_id:
95
- break
96
- print("File not found yet, retrying...")
97
- time.sleep(poll_interval)
 
 
98
 
99
  # Step 4: Poll for OCR result using fileId
100
  ocr_endpoint = f"{os.getenv('GET_OCR_URL')}/{file_id}/ocr"
 
 
101
  while True:
102
- response_ocr = requests.get(ocr_endpoint, headers=headers_get)
103
- if response_ocr.status_code == 200:
104
- ocr_result = response_ocr.json()
105
- # Format the OCR result nicely for the user
106
- if "ocr" in ocr_result and isinstance(ocr_result["ocr"], list):
107
- formatted_text = ""
108
- for page in ocr_result["ocr"]:
109
- page_num = page.get("page_number", "Unknown")
110
- page_text = page.get("text", "")
111
- formatted_text += f"--- Page {page_num} ---\n{page_text.strip()}\n\n"
112
- return formatted_text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
113
  else:
114
- return ocr_result.get("text", "")
115
-
116
- else:
117
- print("OCR not ready yet, retrying...")
118
- time.sleep(poll_interval)
 
 
119
 
120
  except requests.exceptions.Timeout:
121
  return "Request timed out. Please try again."
 
19
  max_size = 10 * 1024 * 1024 # 10 MB limit
20
  if os.path.getsize(uploaded_file) > max_size:
21
  return "File is too large. Please upload a file smaller than 10 MB."
 
22
 
23
  try:
24
  # Read and encode the image file
25
  print("Processing file:", uploaded_file)
 
 
 
 
 
 
 
26
  filename = os.path.basename(uploaded_file)
27
+ if uploaded_file.endswith(('.pdf', '.PDF')):
28
+ mime_type = 'application/pdf'
29
+ elif uploaded_file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
30
+ mime_type, _ = mimetypes.guess_type(uploaded_file)
31
+ if not mime_type:
32
+ mime_type = 'image/jpeg'
33
+ elif uploaded_file.endswith('.txt'):
34
+ mime_type = 'text/plain'
35
+ else:
36
+ return "Unsupported file type. Please upload a PDF, image, or text file."
37
 
38
  payload = {
39
+ "fileName": filename,
40
+ "fileType": mime_type,
 
41
  "isSplit": False,
42
  "callbackURL": "https://example.com/callback",
43
  "ocrModel": "Beethoven_ENG_G5.0",
 
51
  # Step 1: Request upload URL and uploadId
52
  response = requests.post(os.getenv("UPLOAD_URL"), json=payload, headers=headers)
53
  # Check if request was successful
54
+ if response.status_code != 201:
55
+ return f"Failed to get upload URL: {response.status_code} - {response.text}"
56
+
57
 
58
  # STEP 2: upload the file
59
  result = response.json()
60
  upload_id = result['uploadId'] #will need this later to get the file id
61
  url = result['presignedUploadURL']
62
  headers_put = {
63
+ "Content-Type": mime_type # Use the actual MIME type
64
  }
65
+
66
+ with open(uploaded_file, "rb") as f:
67
  put_response = requests.put(url, data=f, headers=headers_put)
68
  if put_response.status_code != 200:
69
+ return f"Error uploading file: {put_response.status_code} - {put_response.text}"
 
70
 
71
 
72
  # Step 3: Poll the files endpoint to find the fileId with the matching uploadId
 
76
  }
77
  files_endpoint = os.getenv("GET_FILES_URL")
78
  poll_interval = 4 # seconds
79
+ max_wait_time = 120 # 2minutes timeout
80
  start_time = time.time()
81
  file_id = None
82
+ print(f"Searching for fileId matching uploadId: {upload_id}")
83
  while True:
84
+ current_time = time.time() #___ADDED___
85
+ if current_time - start_time > max_wait_time:
86
+ return f"Timeout: File with uploadId {upload_id} not found after {max_wait_time} seconds"
87
+
88
+ try:
89
+ resp = requests.get(files_endpoint, headers=headers_get, timeout=10) #adding tiemout
90
+ if resp.status_code != 200:
91
+ print(f"Error getting files list: {resp.status_code} - {resp.text}")
92
+ time.sleep(poll_interval)
93
+ continue
94
+
95
+ files_data = resp.json()
96
+ files = files_data.get('files', [])
97
+ print(f"Found {len(files)} files in the system")
98
+ for file in files:
99
+ if file.get('uploadId') == upload_id:
100
+ file_id = file.get('fileId')
101
+ print(f"Found matching file with fileId: {file_id}")
102
+ break
103
+
104
+ if file_id:
105
  break
106
+ print(f"File not found yet, retrying in {poll_interval} seconds... (elapsed: {current_time - start_time:.1f}s)")
107
+ time.sleep(poll_interval)
108
+ except requests.exceptions.RequestException as e:
109
+ print(f"Network error while searching for file: {e}")
110
+ time.sleep(poll_interval)
111
+ continue
112
 
113
  # Step 4: Poll for OCR result using fileId
114
  ocr_endpoint = f"{os.getenv('GET_OCR_URL')}/{file_id}/ocr"
115
+ ocr_start_time = time.time()
116
+ ocr_max_wait = 180
117
  while True:
118
+ current_time = time.time()
119
+ if current_time - ocr_start_time > ocr_max_wait:
120
+ return f"Timeout: OCR processing took longer than {ocr_max_wait} seconds"
121
+ try:
122
+ response_ocr = requests.get(ocr_endpoint, headers=headers_get, timeout=15)
123
+
124
+ if response_ocr.status_code == 200:
125
+ ocr_result = response_ocr.json()
126
+
127
+ # Format the OCR result nicely for the user
128
+ if "ocr" in ocr_result and isinstance(ocr_result["ocr"], list):
129
+ formatted_text = ""
130
+ for page in ocr_result["ocr"]:
131
+ page_num = page.get("page_number", "Unknown")
132
+ page_text = page.get("text", "")
133
+ formatted_text += f"--- Page {page_num} ---\n{page_text.strip()}\n\n"
134
+ return formatted_text.strip()
135
+ else:
136
+ return ocr_result.get("text", "No text found")
137
+
138
+ elif response_ocr.status_code == 404:
139
+ return f"OCR result not found for fileId: {file_id}"
140
+
141
  else:
142
+ print(f"OCR not ready yet (status: {response_ocr.status_code}), retrying in {poll_interval} seconds...")
143
+ time.sleep(poll_interval)
144
+
145
+ except requests.exceptions.RequestException as e:
146
+ print(f"Network error while getting OCR result: {e}")
147
+ time.sleep(poll_interval)
148
+ continue
149
 
150
  except requests.exceptions.Timeout:
151
  return "Request timed out. Please try again."