Spaces: Runtime error
import fitz  # PyMuPDF
from fastapi import FastAPI, File, UploadFile
#from pyngrok import ngrok
from typing import List
import pytesseract
import requests
from io import BytesIO
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from top2vec import Top2Vec
from llama_index.node_parser import SimpleNodeParser
from llama_index import Document  # needed to wrap the raw text before node parsing
description = """
## DocQA
This app shows how to do Document Question Answering.
Check out the docs for the `/predict` endpoint below to try it out!
"""
app = FastAPI(docs_url="/", description=description)
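# Hedged note (not part of the original Space): the imports above imply these
# pip dependencies: fastapi, uvicorn, requests, PyMuPDF (imported as fitz),
# pytesseract, transformers, torch, top2vec[sentence_encoders], and llama-index.
# If any of them is missing from requirements.txt, the app fails at import time,
# which is one common cause of the "Runtime error" status shown above.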
def doc_chunk(data):
    # Wrap the raw text in a llama_index Document so the node parser can chunk it
    node_parser = SimpleNodeParser.from_defaults(chunk_size=256)
    nodes = node_parser.get_nodes_from_documents([Document(text=data)])
    return nodes
def create_train_data(nodes):
    # Collect the plain-text content of each chunk for Top2Vec
    data = []
    for node in nodes:
        #print(node.get_content())
        data.append(node.get_content())
    return data
def get_model(data):
    # Embed the chunks with the universal-sentence-encoder backend
    # (requires the top2vec[sentence_encoders] extra and enough chunks to cluster)
    model = Top2Vec(data, embedding_model='universal-sentence-encoder')
    return model
def get_search_result(model, question):
    # Return the single best-matching chunk as a plain string (not an array),
    # so it can be passed directly as the QA pipeline's context
    documents, doc_scores, doc_ids = model.query_documents(question, 1)
    return documents[0]
# pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
# @app.post("/predict")
# def predict(image_file: bytes = File(...), question: str = Form(...)):
# (a sketch of a working /predict route is included at the end of this file)
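# Hedged sketch (not in the original code): pytesseract is imported above, but the
# extraction loop below only reads the embedded text layer via page.get_text().
# For scanned pages with no text layer, a page could be rasterised and OCR'd as
# shown here; the helper name ocr_page is an assumption, not part of the original app.
from PIL import Image

def ocr_page(page):
    # Render the PyMuPDF page to an RGB pixmap and run Tesseract on it
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return pytesseract.image_to_string(img)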
def load_file(file_url: str, sentences: List[str]):
    # URL to the PDF file
    pdf_url = file_url
    # Initialize an empty variable to store the extracted text
    all_text = ''
    # Download the PDF from the URL
    response = requests.get(pdf_url)
    if response.status_code == 200:
        pdf_bytes = BytesIO(response.content)
        # Open the PDF file using PyMuPDF
        pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
        # Loop through each page and extract its embedded text
        # (page.get_text() reads the text layer only; see ocr_page above for scanned pages)
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            print(f"Processing page {page_num + 1}...")
            text = page.get_text()
            all_text += text + '\n'
    # Print or do something with the collected text
    print(all_text)
    # Load the extractive question-answering model
    model_name = "deepset/roberta-base-squad2"
    nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
    ##########################
    # Chunk the extracted text, embed the chunks, and build the Top2Vec index
    nodes = doc_chunk(all_text)
    data = create_train_data(nodes)
    model = get_model(data)
    #context = get_search_result(model, question)
    # Define the common context
    #context = all_text
    # List of questions
    questions = sentences
    # Initialize an empty dictionary to store questions and answers
    qa_dict = {}
    # Retrieve the most relevant chunk for each question and answer against it
    for question in questions:
        context = get_search_result(model, question)
        QA_input = {
            'question': question,
            'context': context
        }
        res = nlp(QA_input)
        print(f"Question: {question}")
        print(f"Answer: {res['answer']}")
        qa_dict[question] = res['answer']
    return qa_dict
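# Hedged sketch (an assumption, not the original author's endpoint): the
# description promises a /predict route, but only a commented-out stub exists
# above. One way to wire load_file up is shown below; the request model and its
# field names (file_url, questions) are invented here for illustration.
from pydantic import BaseModel

class PredictRequest(BaseModel):
    file_url: str         # URL of the PDF to download
    questions: List[str]  # questions to answer against that PDF

@app.post("/predict")
def predict(request: PredictRequest):
    # load_file returns a {question: answer} dictionary
    return load_file(request.file_url, request.questions)

# Example call once the app is running (also an illustration):
# requests.post("<space-url>/predict",
#               json={"file_url": "<pdf-url>", "questions": ["Who is the author?"]})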