# main.py — Document question-answering service (FastAPI)
import fitz
from fastapi import FastAPI, File, UploadFile
#from pyngrok import ngrok
from typing import List
import pytesseract
import requests
from io import BytesIO
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from top2vec import Top2Vec
from llama_index.node_parser import SimpleNodeParser
# User-facing description rendered on the auto-generated Swagger page.
description = """
## DocQA
This app shows how to do Document Question Answering
Check out the docs for the `/predict` endpoint below to try it out!
"""
# Serve the interactive API docs at the root path ("/") instead of "/docs".
app = FastAPI(docs_url="/", description=description)
def doc_chunk(data):
    """Split input text into ~256-token nodes for retrieval.

    Args:
        data: Either a raw text string or a list of llama_index
            ``Document`` objects.

    Returns:
        List of llama_index nodes (text chunks).
    """
    # Bug fix: load_file passes the raw extracted text (a str), but
    # get_nodes_from_documents expects Document objects — wrap it here.
    if isinstance(data, str):
        from llama_index import Document  # function-local; top-level imports untouched
        data = [Document(text=data)]
    node_parser = SimpleNodeParser.from_defaults(chunk_size=256)
    return node_parser.get_nodes_from_documents(data)
def create_train_data(nodes):
    """Extract the plain-text content of each node.

    Args:
        nodes: Iterable of objects exposing ``get_content()``.

    Returns:
        List of text strings, one per node, in input order.
    """
    # Idiomatic comprehension replaces the index-based append loop.
    return [node.get_content() for node in nodes]
def get_model(data):
    """Build a Top2Vec topic model over the given text chunks.

    Args:
        data: List of text strings to index.

    Returns:
        A fitted Top2Vec model using the universal-sentence-encoder.
    """
    return Top2Vec(data, embedding_model='universal-sentence-encoder')
def get_search_result(model, question, num_docs=1):
    """Retrieve the chunk(s) most relevant to a question.

    Args:
        model: A fitted Top2Vec model.
        question: Query string.
        num_docs: Number of top documents to return (default 1,
            matching the previous hard-coded behavior).

    Returns:
        The matched document texts (first element of the
        ``query_documents`` result tuple).
    """
    documents, doc_scores, doc_ids = model.query_documents(question, num_docs)
    return documents
# pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
# @app.post("/predict")
# def predict(image_file: bytes = File(...), question: str = Form(...)):
@app.post("/predict")
def load_file(file_url: str, sentences: List[str]):
    """Download a PDF, index its text, and answer each question.

    Args:
        file_url: URL of the PDF to fetch.
        sentences: List of questions to answer against the document.

    Returns:
        Dict mapping each question to its extracted answer, or an
        ``{"error": ...}`` dict if the PDF could not be downloaded.
    """
    # Download the PDF from the URL.
    response = requests.get(file_url)
    if response.status_code != 200:
        # Bug fix: previously a failed download fell through silently and
        # the QA pipeline ran on an empty string. Fail fast instead.
        return {"error": f"Failed to download PDF (HTTP {response.status_code})"}

    # Extract text from every page; text-layer extraction via PyMuPDF
    # (no OCR is performed despite the pytesseract import).
    pdf_bytes = BytesIO(response.content)
    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        all_text = ''
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            print(f"Processing page {page_num + 1}...")
            all_text += page.get_text() + '\n'
    finally:
        # Bug fix: the document handle was never closed before.
        pdf_document.close()
    print(all_text)

    # Extractive QA pipeline. NOTE(review): rebuilding the pipeline and
    # the Top2Vec index per request is expensive; consider caching at
    # module level if request volume grows.
    model_name = "deepset/roberta-base-squad2"
    nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

    # Chunk the text and build a semantic search index over the chunks.
    nodes = doc_chunk(all_text)
    data = create_train_data(nodes)
    model = get_model(data)

    # Answer each question against its best-matching chunk.
    qa_dict = {}
    for question in sentences:
        context = get_search_result(model, question)
        res = nlp({'question': question, 'context': context})
        print(f"Question: {question}")
        print(f"Answer: {res['answer']}")
        qa_dict[question] = res['answer']
    return qa_dict