Spaces:

MJobe
/

document-vqa-v2

Running

App Files Community

MJobe committed on Dec 18, 2023

Commit

0ddbc70

•

1 Parent(s): 1106695

Update main.py

Browse files

Files changed (1) hide show

main.py +21 -42

main.py CHANGED Viewed

@@ -21,6 +21,7 @@ app.add_middleware(
 )
 nlp_qa = pipeline("document-question-answering", model="jinhybr/OCR-DocVQA-Donut")
 description = """
 ## Image-based Document QA
@@ -65,8 +66,8 @@ async def perform_document_qa(
     except Exception as e:
         return JSONResponse(content=f"Error processing file: {str(e)}", status_code=500)
-@app.post("/pdfQA/", description="Provide a PDF file to extract text and answer provided questions.")
-async def pdf_question_answering(
     file: UploadFile = File(...),
     questions: str = Form(...),
 ):
@@ -74,50 +75,28 @@ async def pdf_question_answering(
         # Read the uploaded file as bytes
         contents = await file.read()
-        # Save the PDF bytes to a temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
-            temp_pdf.write(contents)
-            temp_pdf_path = temp_pdf.name
-        # Initialize an empty list to store image bytes
-        images = []
-        # Use PyMuPDF to process the PDF and convert each page to an image
-        pdf_document = fitz.open(temp_pdf_path)
-        for page_num in range(pdf_document.page_count):
-            page = pdf_document.load_page(page_num)
-            print(f"Converting page {page_num + 1} to image...")
-            # Convert the page to an image
-            image = Image.frombytes("RGB", page.get_size(), page.get_pixmap().samples)
-            # Convert the image to bytes
-            img_byte_array = BytesIO()
-            image.save(img_byte_array, format='PNG')
-            images.append(img_byte_array.getvalue())
-        # Perform document question answering for each image
         answers_dict = {}
-        for idx, image_bytes in enumerate(images):
-            image = Image.open(BytesIO(image_bytes))
-            for question in questions.split(','):
-                result = nlp_qa(
-                    image,
-                    question.strip()
-                )
-                answer = result[0]['answer']
-                formatted_question = f"{question.strip('[]')} (Page {idx + 1})"
-                answers_dict[formatted_question] = answer
-        # Delete the temporary PDF file
-        temp_pdf.close()
-        os.remove(temp_pdf_path)
-        return answers_dict
     except Exception as e:
-        return JSONResponse(content=f"Error processing PDF file: {str(e)}", status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins

 )
 nlp_qa = pipeline("document-question-answering", model="jinhybr/OCR-DocVQA-Donut")
+nlp_qa_v2 = pipeline("document-question-answering", model="fxmarty/tiny-doc-qa-vision-encoder-decoder")
 description = """
 ## Image-based Document QA
     except Exception as e:
         return JSONResponse(content=f"Error processing file: {str(e)}", status_code=500)
+@app.post("/uploadfilev2/", description="Upload an image file to extract text and answer provided questions.")
+async def perform_document_qa(
     file: UploadFile = File(...),
     questions: str = Form(...),
 ):
         # Read the uploaded file as bytes
         contents = await file.read()
+        # Open the image using PIL
+        image = Image.open(BytesIO(contents))
+        # Perform document question answering for each question using the nlp_qa_v2 pipeline
         answers_dict = {}
+        for question in questions.split(','):
+            result = nlp_qa_v2(
+                image,
+                question.strip()
+            )
+            # Access the 'answer' key from the first item in the result list
+            answer = result[0]['answer']
+            # Format the question as a string without extra characters
+            formatted_question = question.strip("[]")
+            answers_dict[formatted_question] = answer
+        return answers_dict
     except Exception as e:
+        return JSONResponse(content=f"Error processing file: {str(e)}", status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins