Spaces:

file-ai
/

fileAI

Running

App Files Files Community

carlafileai committed on Jul 9

Commit

8b21338

1 Parent(s): 566718b

Added more error handling, improved logging, and added timeout protection

Browse files

Files changed (4) hide show

.DS_Store +0 -0
.gitignore +1 -0
.gradio/certificate.pem +31 -0
app.py +80 -50

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

.gitignore CHANGED Viewed

@@ -5,3 +5,4 @@ env/
 .Python
 .ipynb_checkpoints
 __pycache__

 .Python
 .ipynb_checkpoints
 __pycache__
+app_old.py

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

app.py CHANGED Viewed

@@ -19,31 +19,25 @@ def perform_ocr(uploaded_file):
     max_size = 10 * 1024 * 1024  # 10 MB limit
     if os.path.getsize(uploaded_file) > max_size:
         return "File is too large. Please upload a file smaller than 10 MB."
-    files={}
     try:
         # Read and encode the image file
         print("Processing file:", uploaded_file)
-        with open(uploaded_file, "rb") as image_file:
-            if uploaded_file.endswith(('.pdf', '.PDF')):
-                files['file'] = (os.path.basename(uploaded_file), image_file, 'application/pdf')
-            elif uploaded_file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
-                mime_type, _ = mimetypes.guess_type(uploaded_file)
-                files['file'] = (os.path.basename(uploaded_file), image_file, mime_type)
         filename = os.path.basename(uploaded_file)
-        with open(uploaded_file.name, 'rb') as f:
-            # Read the file content
-            file_content = f.read()
-        files_to_send = {
-            'file': (filename, file_content, files['file'][2])  # (filename, file_object, mime_type)
-        }
         payload = {
-            "fileName": files['file'][0],
-            # Use the correct MIME type for the file
-            "fileType": files['file'][2],
             "isSplit": False,
             "callbackURL": "https://example.com/callback",
             "ocrModel": "Beethoven_ENG_G5.0",
@@ -57,21 +51,22 @@ def perform_ocr(uploaded_file):
         # Step 1: Request upload URL and uploadId
         response = requests.post(os.getenv("UPLOAD_URL"), json=payload, headers=headers)
         # Check if request was successful
-        if response.status_code != 201:
-            return result.get("uploadId", "No upload ID found in the response.")
         # STEP 2: upload the file
         result = response.json()
         upload_id = result['uploadId'] #will need this later to get the file id
         url = result['presignedUploadURL']
         headers_put = {
-        "Content-Type": "application/pdf"
         }
-        with open(uploaded_file.name, "rb") as f:
             put_response = requests.put(url, data=f, headers=headers_put)
         if put_response.status_code != 200:
-            print("Error uploading file:", put_response.status_code, put_response.text)
-            exit(1)
         # Step 3: Poll the files endpoint to find the fileId with the matching uploadId
@@ -81,41 +76,76 @@ def perform_ocr(uploaded_file):
         }
         files_endpoint = os.getenv("GET_FILES_URL")
         poll_interval = 4  # seconds
         start_time = time.time()
         file_id = None
-        print("Searching for fileId matching uploadId...")
         while True:
-            resp = requests.get(files_endpoint, headers=headers_get)
-            files = resp.json().get('files', [])
-            for file in files:
-                if file.get('uploadId') == upload_id:
-                    file_id = file.get('fileId')
                     break
-            if file_id:
-                break
-            print("File not found yet, retrying...")
-            time.sleep(poll_interval)
         # Step 4: Poll for OCR result using fileId
         ocr_endpoint = f"{os.getenv('GET_OCR_URL')}/{file_id}/ocr"
         while True:
-            response_ocr = requests.get(ocr_endpoint, headers=headers_get)
-            if response_ocr.status_code == 200:
-                ocr_result = response_ocr.json()
-                # Format the OCR result nicely for the user
-                if "ocr" in ocr_result and isinstance(ocr_result["ocr"], list):
-                    formatted_text = ""
-                    for page in ocr_result["ocr"]:
-                        page_num = page.get("page_number", "Unknown")
-                        page_text = page.get("text", "")
-                        formatted_text += f"--- Page {page_num} ---\n{page_text.strip()}\n\n"
-                    return formatted_text.strip()
                 else:
-                    return ocr_result.get("text", "")
-            else:
-                print("OCR not ready yet, retrying...")
-                time.sleep(poll_interval)
     except requests.exceptions.Timeout:
         return "Request timed out. Please try again."

     max_size = 10 * 1024 * 1024  # 10 MB limit
     if os.path.getsize(uploaded_file) > max_size:
         return "File is too large. Please upload a file smaller than 10 MB."
     try:
         # Read and encode the image file
         print("Processing file:", uploaded_file)
         filename = os.path.basename(uploaded_file)
+        if uploaded_file.endswith(('.pdf', '.PDF')):
+            mime_type = 'application/pdf'
+        elif uploaded_file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
+            mime_type, _ = mimetypes.guess_type(uploaded_file)
+            if not mime_type:
+                mime_type = 'image/jpeg'
+        elif uploaded_file.endswith('.txt'):
+            mime_type = 'text/plain'
+        else:
+            return "Unsupported file type. Please upload a PDF, image, or text file."
         payload = {
+            "fileName": filename,
+            "fileType": mime_type,
             "isSplit": False,
             "callbackURL": "https://example.com/callback",
             "ocrModel": "Beethoven_ENG_G5.0",
         # Step 1: Request upload URL and uploadId
         response = requests.post(os.getenv("UPLOAD_URL"), json=payload, headers=headers)
         # Check if request was successful
+        if response.status_code != 201:
+            return f"Failed to get upload URL: {response.status_code} - {response.text}"
         # STEP 2: upload the file
         result = response.json()
         upload_id = result['uploadId'] #will need this later to get the file id
         url = result['presignedUploadURL']
         headers_put = {
+            "Content-Type": mime_type  # Use the actual MIME type
         }
+        with open(uploaded_file, "rb") as f:
             put_response = requests.put(url, data=f, headers=headers_put)
         if put_response.status_code != 200:
+            return f"Error uploading file: {put_response.status_code} - {put_response.text}"
         # Step 3: Poll the files endpoint to find the fileId with the matching uploadId
         }
         files_endpoint = os.getenv("GET_FILES_URL")
         poll_interval = 4  # seconds
+        max_wait_time = 120  # 2minutes timeout
         start_time = time.time()
         file_id = None
+        print(f"Searching for fileId matching uploadId: {upload_id}")
         while True:
+            current_time = time.time()  #___ADDED___
+            if current_time - start_time > max_wait_time:
+                return f"Timeout: File with uploadId {upload_id} not found after {max_wait_time} seconds"
+            try:
+                resp = requests.get(files_endpoint, headers=headers_get, timeout=10) #adding tiemout
+                if resp.status_code != 200:
+                    print(f"Error getting files list: {resp.status_code} - {resp.text}")
+                    time.sleep(poll_interval)
+                    continue
+                files_data = resp.json()
+                files = files_data.get('files', [])
+                print(f"Found {len(files)} files in the system")
+                for file in files:
+                    if file.get('uploadId') == upload_id:
+                        file_id = file.get('fileId')
+                        print(f"Found matching file with fileId: {file_id}")
+                        break
+                if file_id:
                     break
+                print(f"File not found yet, retrying in {poll_interval} seconds... (elapsed: {current_time - start_time:.1f}s)")
+                time.sleep(poll_interval)
+            except requests.exceptions.RequestException as e:
+                print(f"Network error while searching for file: {e}")
+                time.sleep(poll_interval)
+                continue
         # Step 4: Poll for OCR result using fileId
         ocr_endpoint = f"{os.getenv('GET_OCR_URL')}/{file_id}/ocr"
+        ocr_start_time = time.time()
+        ocr_max_wait = 180
         while True:
+            current_time = time.time()
+            if current_time - ocr_start_time > ocr_max_wait:
+                return f"Timeout: OCR processing took longer than {ocr_max_wait} seconds"
+            try:
+                response_ocr = requests.get(ocr_endpoint, headers=headers_get, timeout=15)
+                if response_ocr.status_code == 200:
+                    ocr_result = response_ocr.json()
+                    # Format the OCR result nicely for the user
+                    if "ocr" in ocr_result and isinstance(ocr_result["ocr"], list):
+                        formatted_text = ""
+                        for page in ocr_result["ocr"]:
+                            page_num = page.get("page_number", "Unknown")
+                            page_text = page.get("text", "")
+                            formatted_text += f"--- Page {page_num} ---\n{page_text.strip()}\n\n"
+                        return formatted_text.strip()
+                    else:
+                        return ocr_result.get("text", "No text found")
+                elif response_ocr.status_code == 404:
+                    return f"OCR result not found for fileId: {file_id}"
                 else:
+                    print(f"OCR not ready yet (status: {response_ocr.status_code}), retrying in {poll_interval} seconds...")
+                    time.sleep(poll_interval)
+            except requests.exceptions.RequestException as e:
+                print(f"Network error while getting OCR result: {e}")
+                time.sleep(poll_interval)
+                continue
     except requests.exceptions.Timeout:
         return "Request timed out. Please try again."