Spaces:
Running
Running
import fitz | |
from fastapi import FastAPI, File, UploadFile, Form | |
from fastapi.responses import JSONResponse | |
from transformers import pipeline | |
from PIL import Image | |
from io import BytesIO | |
from starlette.middleware import Middleware | |
from starlette.middleware.cors import CORSMiddleware | |
# Document-QA pipeline, loaded once at import time so every request reuses
# the same model instance instead of reloading the checkpoint.
nlp_qa = pipeline("document-question-answering", model="jinhybr/OCR-DocVQA-Donut")

# Markdown shown on the interactive docs page.
description = """
## Image-based Document QA
This API performs document question answering using a Donut-based model.
### Endpoints:
- **POST /uploadfile/:** Upload an image file to extract text and answer provided questions.
- **POST /pdfQA/:** Provide a PDF file to extract text and answer provided questions.
"""

# Create the application exactly once. An earlier `app = FastAPI()` together
# with its CORS middleware used to be built here and then thrown away when
# `app` was rebound; the CORS middleware for this instance is registered
# further down in the file.
app = FastAPI(docs_url="/", description=description)
# NOTE(review): no route decorator is visible on this handler in the file as
# seen here, although the API description advertises "POST /uploadfile/" —
# confirm the route is registered somewhere.
async def perform_document_qa(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    """Answer comma-separated questions about an uploaded image.

    Args:
        file: Uploaded image, in any format PIL can open.
        questions: Comma-separated questions. Surrounding whitespace and
            square brackets are removed from each entry, and entries that
            are empty after cleaning are skipped.

    Returns:
        A dict mapping each cleaned question to the answer of the first
        result returned by the QA pipeline, or a ``JSONResponse`` with
        status 500 carrying an error message if anything fails.
    """
    try:
        # Read the uploaded file as bytes and decode it with PIL.
        contents = await file.read()
        image = Image.open(BytesIO(contents))

        answers_dict = {}
        for raw_question in questions.split(","):
            # Normalise once so the text sent to the model and the key in
            # the response are the same string (previously the key kept
            # leading whitespace and the model input kept the brackets).
            question = raw_question.strip().strip("[]").strip()
            if not question:
                # e.g. a trailing comma or "[]" — nothing to ask.
                continue
            result = nlp_qa(image, question)
            # The pipeline returns a list of candidates; use the first one.
            answers_dict[question] = result[0]["answer"]
        return answers_dict
    except Exception as e:
        return JSONResponse(content=f"Error processing file: {str(e)}", status_code=500)
# NOTE(review): no route decorator is visible on this handler in the file as
# seen here, although the API description advertises "POST /pdfQA/" —
# confirm the route is registered somewhere.
async def pdf_question_answering(
    file: UploadFile = File(...),
    questions: str = Form(...),
):
    """Answer comma-separated questions against every page of an uploaded PDF.

    Each page is rendered to an RGB image with PyMuPDF and passed to the QA
    pipeline once per question.

    Args:
        file: Uploaded PDF document.
        questions: Comma-separated questions. Surrounding whitespace and
            square brackets are removed from each entry, and entries that
            are empty after cleaning are skipped.

    Returns:
        A dict keyed by ``"<question> (Page <n>)"`` (pages are 1-based)
        holding the answer of the first pipeline result, or a
        ``JSONResponse`` with status 500 carrying an error message if
        anything fails.
    """
    try:
        # Read the uploaded file as bytes.
        contents = await file.read()

        # Clean the questions once instead of once per page.
        cleaned = (q.strip().strip("[]").strip() for q in questions.split(","))
        question_list = [q for q in cleaned if q]

        answers_dict = {}
        # Open the PDF from memory; the context manager closes the document
        # even if rendering or inference raises.
        with fitz.open(stream=contents, filetype="pdf") as pdf_document:
            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                print(f"Converting page {page_num + 1} to image...")
                # Render without an alpha channel so the sample buffer is
                # plain 3-byte RGB, matching the PIL mode below.
                pixmap = page.get_pixmap(alpha=False)
                image = Image.frombytes(
                    "RGB", (pixmap.width, pixmap.height), pixmap.samples
                )
                for question in question_list:
                    result = nlp_qa(image, question)
                    # The pipeline returns a list of candidates; use the first.
                    answers_dict[f"{question} (Page {page_num + 1})"] = result[0]["answer"]
        return answers_dict
    except Exception as e:
        return JSONResponse(content=f"Error processing PDF file: {str(e)}", status_code=500)
# Register CORS handling on the application so browser clients served from
# other origins can call the API.
# NOTE(review): a wildcard origin list combined with allow_credentials=True
# is a very permissive setup — confirm credentialed cross-origin requests
# are really required, or list the allowed origins explicitly.
origins = ["*"]  # or specify your list of allowed origins
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)