Spaces:

MJobe
/

document-vqa-v2

Sleeping

App Files Files Community

MJobe committed on Dec 17, 2023

Commit

f660b8b

1 Parent(s): 0f0bcd2

Update main.py

Browse files

Files changed (1) hide show

main.py +27 -31

main.py CHANGED Viewed

@@ -73,46 +73,42 @@ async def pdf_question_answering(
         # Read the uploaded file as bytes
         contents = await file.read()
-        # Initialize an empty string to store the text content of the PDF
-        all_text = ""
-        # Use PyMuPDF to process the PDF and extract text
         pdf_document = fitz.open_from_bytes(contents)
-        # Loop through each page and perform OCR
         for page_num in range(pdf_document.page_count):
             page = pdf_document.load_page(page_num)
-            print(f"Processing page {page_num + 1}...")
-            text = page.get_text()
-            all_text += text + '\n'
-        # Print or do something with the collected text
-        print(all_text)
-        # List of questions
-        question_list = questions.split(',')
-        # Initialize an empty dictionary to store questions and answers
-        qa_dict = {}
-        # Get answers for each question with the same context
-        for question in question_list:
-            result = nlp_qa({
-                'question': question,
-                'context': all_text
-            })
-            # Access the 'answer' key from the result
-            answer = result['answer']
-            # Store the question and answer in the dictionary
-            qa_dict[question] = answer
-        return qa_dict
     except Exception as e:
         return JSONResponse(content=f"Error processing PDF file: {str(e)}", status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins
 app.add_middleware(

         # Read the uploaded file as bytes
         contents = await file.read()
+        # Initialize an empty list to store image bytes
+        images = []
+        # Use PyMuPDF to process the PDF and convert each page to an image
         pdf_document = fitz.open_from_bytes(contents)
         for page_num in range(pdf_document.page_count):
             page = pdf_document.load_page(page_num)
+            print(f"Converting page {page_num + 1} to image...")
+            # Convert the page to an image
+            image = Image.frombytes("RGB", page.get_size(), page.get_pixmap().samples)
+            # Convert the image to bytes
+            img_byte_array = BytesIO()
+            image.save(img_byte_array, format='PNG')
+            images.append(img_byte_array.getvalue())
+        # Perform document question answering for each image
+        answers_dict = {}
+        for idx, image_bytes in enumerate(images):
+            image = Image.open(BytesIO(image_bytes))
+            for question in questions.split(','):
+                result = nlp_qa(
+                    image,
+                    question.strip()
+                )
+                answer = result[0]['answer']
+                formatted_question = f"{question.strip('[]')} (Page {idx + 1})"
+                answers_dict[formatted_question] = answer
+        return answers_dict
     except Exception as e:
         return JSONResponse(content=f"Error processing PDF file: {str(e)}", status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins
 app.add_middleware(