# document-vqa-v2 / main.py
import fitz
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from transformers import pipeline
from PIL import Image
from io import BytesIO
from starlette.middleware.cors import CORSMiddleware
# Load the Donut-based, OCR-free document question answering pipeline
nlp_qa = pipeline("document-question-answering", model="jinhybr/OCR-DocVQA-Donut")
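# Sketch of the pipeline's return value (illustrative: the example answer is made up,
# and the exact fields can vary by transformers version, but the handlers below only
# rely on the 'answer' key of the top prediction):
#     nlp_qa(image, "What is the invoice number?")
#     -> [{'answer': 'us-001'}]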
description = """
## Image-based Document QA
This API performs document question answering using the Donut-based OCR-free DocVQA model `jinhybr/OCR-DocVQA-Donut`.
### Endpoints:
- **POST /uploadfile/:** Upload an image file to extract text and answer provided questions.
- **POST /pdfQA/:** Provide a PDF file to extract text and answer provided questions.
"""
app = FastAPI(docs_url="/", description=description)
@app.post("/uploadfile/", description="Upload an image file to extract text and answer provided questions.")
async def perform_document_qa(
file: UploadFile = File(...),
questions: str = Form(...),
):
try:
# Read the uploaded file as bytes
contents = await file.read()
# Open the image using PIL
image = Image.open(BytesIO(contents))
# Perform document question answering for each question using LayoutLMv2-based model
answers_dict = {}
for question in questions.split(','):
result = nlp_qa(
image,
question.strip()
)
# Access the 'answer' key from the first item in the result list
answer = result[0]['answer']
# Format the question as a string without extra characters
formatted_question = question.strip("[]")
answers_dict[formatted_question] = answer
return answers_dict
except Exception as e:
return JSONResponse(content=f"Error processing file: {str(e)}", status_code=500)
@app.post("/pdfQA/", description="Provide a PDF file to extract text and answer provided questions.")
async def pdf_question_answering(
file: UploadFile = File(...),
questions: str = Form(...),
):
try:
# Read the uploaded file as bytes
contents = await file.read()
# Initialize an empty list to store image bytes
images = []
# Use PyMuPDF to process the PDF and convert each page to an image
pdf_document = fitz.open_from_bytes(contents)
for page_num in range(pdf_document.page_count):
page = pdf_document.load_page(page_num)
print(f"Converting page {page_num + 1} to image...")
# Convert the page to an image
image = Image.frombytes("RGB", page.get_size(), page.get_pixmap().samples)
# Convert the image to bytes
img_byte_array = BytesIO()
image.save(img_byte_array, format='PNG')
images.append(img_byte_array.getvalue())
# Perform document question answering for each image
answers_dict = {}
for idx, image_bytes in enumerate(images):
image = Image.open(BytesIO(image_bytes))
for question in questions.split(','):
result = nlp_qa(
image,
question.strip()
)
answer = result[0]['answer']
formatted_question = f"{question.strip('[]')} (Page {idx + 1})"
answers_dict[formatted_question] = answer
return answers_dict
except Exception as e:
return JSONResponse(content=f"Error processing PDF file: {str(e)}", status_code=500)
# Set up CORS middleware
origins = ["*"]  # or specify your list of allowed origins
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
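# Minimal local entry point (a sketch): assumes `uvicorn` is installed; the host and
# port are illustrative defaults (Hugging Face Spaces conventionally expose port 7860).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)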