import fitz  # PyMuPDF
from fastapi import FastAPI, HTTPException
#from pyngrok import ngrok
from typing import List
import requests
from io import BytesIO
from transformers import pipeline
from top2vec import Top2Vec
from llama_index import Document
from llama_index.node_parser import SimpleNodeParser

description = """
## DocQA

This app shows how to do Document Question Answering.

Check out the docs for the `/predict` endpoint below to try it out!
"""

app = FastAPI(docs_url="/", description=description)


def doc_chunk(text):
    # Wrap the raw text in a llama_index Document and split it into chunks.
    node_parser = SimpleNodeParser.from_defaults(chunk_size=256)
    nodes = node_parser.get_nodes_from_documents([Document(text=text)])
    return nodes


def create_train_data(nodes):
    # Collect the text content of every chunk for Top2Vec.
    return [node.get_content() for node in nodes]


def get_model(data):
    # Build a Top2Vec index over the chunks.
    # Note: Top2Vec needs a reasonable number of chunks to train, so very short
    # documents may not produce a usable model.
    model = Top2Vec(data, embedding_model='universal-sentence-encoder')
    return model


def get_search_result(model, question):
    # Return the single chunk most semantically similar to the question.
    documents, doc_scores, doc_ids = model.query_documents(question, 1)
    return documents[0]


# pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")

# @app.post("/predict")
# def predict(image_file: bytes = File(...), question: str = Form(...)):


@app.post("/predict")
def load_file(file_url: str, sentences: List[str]):
    # Download the PDF from the URL
    response = requests.get(file_url)
    if response.status_code != 200:
        raise HTTPException(status_code=400,
                            detail=f"Could not download PDF (HTTP {response.status_code}).")

    # Open the PDF with PyMuPDF and extract the text of every page
    all_text = ''
    pdf_bytes = BytesIO(response.content)
    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        print(f"Processing page {page_num + 1}...")
        all_text += page.get_text() + '\n'
    pdf_document.close()

    # Print or do something with the collected text
    print(all_text)

    # Extractive question-answering model
    model_name = "deepset/roberta-base-squad2"
    nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

    # Chunk the document and index the chunks with Top2Vec
    nodes = doc_chunk(all_text)
    data = create_train_data(nodes)
    model = get_model(data)

    # Get answers for each question, using its best-matching chunk as the context
    qa_dict = {}
    for question in sentences:
        context = get_search_result(model, question)
        QA_input = {
            'question': question,
            'context': context
        }
        res = nlp(QA_input)
        print(f"Question: {question}")
        print(f"Answer: {res['answer']}")
        qa_dict[question] = res['answer']

    return qa_dict
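

# --- Usage sketch (not part of the original app; assumes the file is saved as
# main.py and that uvicorn and requests are installed) ---
#
# Start the server:
#   uvicorn main:app --reload
#
# FastAPI reads `file_url` from the query string and, since `sentences` is a
# list-typed parameter, expects it as the JSON request body. A hypothetical call:
#
#   import requests
#   resp = requests.post(
#       "http://127.0.0.1:8000/predict",
#       params={"file_url": "https://example.com/sample.pdf"},  # example PDF URL
#       json=["What is the document about?", "Who wrote it?"],  # example questions
#   )
#   print(resp.json())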