# document-vqa-v2 / main.py
import fitz
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from transformers import pipeline
from PIL import Image
from io import BytesIO
from starlette.middleware.cors import CORSMiddleware
# Load the Donut-based, OCR-free document question answering pipeline
nlp_qa = pipeline("document-question-answering", model="jinhybr/OCR-DocVQA-Donut")
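# Sketch of the pipeline's return value (illustrative: the example answer is made up,
# and the exact fields can vary by transformers version, but the handlers below only
# rely on the 'answer' key of the top prediction):
#     nlp_qa(image, "What is the invoice number?")
#     -> [{'answer': 'us-001'}]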
description = """
## Image-based Document QA
This API performs document question answering using the Donut-based OCR-free DocVQA model `jinhybr/OCR-DocVQA-Donut`.
### Endpoints:
- **POST /uploadfile/:** Upload an image file to extract text and answer provided questions.
- **POST /pdfQA/:** Provide a PDF file to extract text and answer provided questions.
"""
app = FastAPI(docs_url="/", description=description)
@app.post("/uploadfile/", description="Upload an image file to extract text and answer provided questions.")
async def perform_document_qa(
file: UploadFile = File(...),
questions: str = Form(...),
):
try:
# Read the uploaded file as bytes
contents = await file.read()
# Open the image using PIL
image = Image.open(BytesIO(contents))
# Perform document question answering for each question using LayoutLMv2-based model
answers_dict = {}
for question in questions.split(','):
result = nlp_qa(
image,
question.strip()
)
# Access the 'answer' key from the first item in the result list
answer = result[0]['answer']
# Format the question as a string without extra characters
formatted_question = question.strip("[]")
answers_dict[formatted_question] = answer
return answers_dict
except Exception as e:
return JSONResponse(content=f"Error processing file: {str(e)}", status_code=500)
@app.post("/pdfQA/", description="Provide a PDF file to extract text and answer provided questions.")
async def pdf_question_answering(
file: UploadFile = File(...),
questions: str = Form(...),
):
try:
# Read the uploaded file as bytes
contents = await file.read()
# Initialize an empty list to store image bytes
images = []
# Use PyMuPDF to process the PDF and convert each page to an image
pdf_document = fitz.open_from_bytes(contents)
for page_num in range(pdf_document.page_count):
page = pdf_document.load_page(page_num)
print(f"Converting page {page_num + 1} to image...")
# Convert the page to an image
image = Image.frombytes("RGB", page.get_size(), page.get_pixmap().samples)
# Convert the image to bytes
img_byte_array = BytesIO()
image.save(img_byte_array, format='PNG')
images.append(img_byte_array.getvalue())
# Perform document question answering for each image
answers_dict = {}
for idx, image_bytes in enumerate(images):
image = Image.open(BytesIO(image_bytes))
for question in questions.split(','):
result = nlp_qa(
image,
question.strip()
)
answer = result[0]['answer']
formatted_question = f"{question.strip('[]')} (Page {idx + 1})"
answers_dict[formatted_question] = answer
return answers_dict
except Exception as e:
return JSONResponse(content=f"Error processing PDF file: {str(e)}", status_code=500)
# Set up CORS middleware
origins = ["*"]  # or specify your list of allowed origins
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
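# Minimal local entry point (a sketch): assumes `uvicorn` is installed; the host and
# port are illustrative defaults (Hugging Face Spaces conventionally expose port 7860).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)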