Spaces:

MJobe
/

document-vqa-v2

Running

File size: 3,716 Bytes

8700a34
574f9e3
6bbd3ca
 
c39e604
 
af17670
 
836458e
86a0b7a
6bbd3ca
574f9e3
 
 
 
 
 
 
 
 
 
4e3cfd3
574f9e3
 
 
 
 
 
 
 
 
6bbd3ca
574f9e3
6bbd3ca
574f9e3
 
6bbd3ca
 
 
 
a82199b
6bbd3ca
 
c39e604
 
 
574f9e3
 
 
 
 
 
 
420d3c9
574f9e3
 
41d335c
574f9e3
 
41d335c
574f9e3
f198fb3
574f9e3
f8ec4b3
574f9e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86a0b7a
574f9e3
 
 
 
 
 
86a0b7a
574f9e3
 
86a0b7a
574f9e3
 
f8ec4b3
574f9e3

import fitz
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from transformers import pipeline
from PIL import Image
from io import BytesIO
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware

# Markdown description shown on the interactive documentation page.
description = """
## Image-based Document QA
This API performs document question answering using a LayoutLMv2-based model.

### Endpoints:
- **POST /uploadfile/:** Upload an image file to extract text and answer provided questions.
- **POST /pdfQA/:** Provide a PDF file to extract text and answer provided questions.
"""

# Create the application exactly once. Previously a first FastAPI() instance
# received the CORS middleware and was then replaced by a second instance, so
# the routes were served without any CORS configuration.
app = FastAPI(docs_url="/", description=description)

# Set up CORS middleware
origins = ["*"]  # or specify your list of allowed origins
# NOTE(review): wildcard origins together with allow_credentials=True is
# broader than this API appears to need -- confirm, or list explicit origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Document question-answering pipeline shared by all endpoints; constructed at
# import time so it is built once rather than per request.
nlp_qa = pipeline("document-question-answering", model="tiennvcs/layoutlmv2-base-uncased-finetuned-infovqa")

@app.post("/uploadfile/", description="Upload an image file to extract text and answer provided questions.")
async def perform_document_qa(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    """Answer comma-separated questions about an uploaded document image.

    Args:
        file: Uploaded image; it must be in a format PIL can open.
        questions: Comma-separated questions. Surrounding whitespace and
            square brackets are removed from each entry, and entries that
            are empty after cleaning are skipped.

    Returns:
        A dict mapping each cleaned question to the model's answer. On any
        failure, a ``JSONResponse`` with status 500 and an error message.
    """
    try:
        # Read the uploaded file as bytes and decode it with PIL.
        contents = await file.read()
        image = Image.open(BytesIO(contents))

        answers_dict = {}
        for raw_question in questions.split(','):
            # Normalise once so the text sent to the model and the key in the
            # response are identical (no stray spaces or list brackets).
            question = raw_question.strip().strip("[]").strip()
            if not question:
                # e.g. a trailing comma or ",," in the form field
                continue

            result = nlp_qa(image, question)

            # Accept either a list of candidates or a single candidate dict;
            # an empty list yields an empty answer rather than an IndexError.
            candidates = result if isinstance(result, list) else [result]
            answers_dict[question] = candidates[0]['answer'] if candidates else ""

        return answers_dict
    except Exception as e:
        return JSONResponse(content=f"Error processing file: {str(e)}", status_code=500)

@app.post("/pdfQA/", description="Provide a PDF file to extract text and answer provided questions.")
async def pdf_question_answering(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    """Answer comma-separated questions about an uploaded PDF.

    The shared pipeline is a document (image) question-answering pipeline, so
    each PDF page is rendered to an image and queried separately; for every
    question the highest-scoring answer across all pages is returned.

    Args:
        file: Uploaded PDF document.
        questions: Comma-separated questions. Each is stripped of
            surrounding whitespace and blank entries are skipped.

    Returns:
        A dict mapping each question to its best answer (an empty string if
        no page produced one). On any failure, a ``JSONResponse`` with
        status 500 and an error message.
    """
    try:
        # Read the uploaded file as bytes
        contents = await file.read()

        # Render every page to a PIL image. PyMuPDF opens in-memory data via
        # the ``stream`` argument; the context manager closes the document.
        page_images = []
        with fitz.open(stream=contents, filetype="pdf") as pdf_document:
            # 2x zoom (about 144 dpi): the 72 dpi default is coarse for
            # reading text off the rendered page.
            zoom = fitz.Matrix(2, 2)
            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                pixmap = page.get_pixmap(matrix=zoom)
                page_images.append(Image.open(BytesIO(pixmap.tobytes("png"))))

        # Initialize an empty dictionary to store questions and answers
        qa_dict = {}

        for raw_question in questions.split(','):
            question = raw_question.strip()
            if not question:
                continue

            best_answer = ""
            best_score = float("-inf")
            for image in page_images:
                result = nlp_qa(image, question)
                # Accept either a list of candidates or a single dict.
                candidates = result if isinstance(result, list) else [result]
                for candidate in candidates:
                    score = candidate.get('score', 0.0)
                    if score > best_score:
                        best_answer = candidate['answer']
                        best_score = score

            # Store the question and its best answer in the dictionary
            qa_dict[question] = best_answer

        return qa_dict

    except Exception as e:
        return JSONResponse(content=f"Error processing PDF file: {str(e)}", status_code=500)