import fitz  # PyMuPDF
from fastapi import FastAPI, HTTPException
#from pyngrok import ngrok
from typing import List
import requests
from io import BytesIO
from transformers import pipeline
from top2vec import Top2Vec
from llama_index import Document
from llama_index.node_parser import SimpleNodeParser

description = """
## DocQA

This app shows how to do Document Question Answering.

Check out the docs for the `/predict` endpoint below to try it out!
"""

app = FastAPI(docs_url="/", description=description)


def doc_chunk(text):
    # Wrap the raw text in a llama_index Document and split it into chunks.
    node_parser = SimpleNodeParser.from_defaults(chunk_size=256)
    nodes = node_parser.get_nodes_from_documents([Document(text=text)])
    return nodes


def create_train_data(nodes):
    # Collect the text content of every chunk for Top2Vec.
    return [node.get_content() for node in nodes]


def get_model(data):
    # Build a Top2Vec index over the chunks.
    # Note: Top2Vec needs a reasonable number of chunks to train, so very short
    # documents may not produce a usable model.
    model = Top2Vec(data, embedding_model='universal-sentence-encoder')
    return model


def get_search_result(model, question):
    # Return the single chunk most semantically similar to the question.
    documents, doc_scores, doc_ids = model.query_documents(question, 1)
    return documents[0]


# pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")

# @app.post("/predict")
# def predict(image_file: bytes = File(...), question: str = Form(...)):


@app.post("/predict")
def load_file(file_url: str, sentences: List[str]):
    # Download the PDF from the URL
    response = requests.get(file_url)
    if response.status_code != 200:
        raise HTTPException(status_code=400,
                            detail=f"Could not download PDF (HTTP {response.status_code}).")

    # Open the PDF with PyMuPDF and extract the text of every page
    all_text = ''
    pdf_bytes = BytesIO(response.content)
    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        print(f"Processing page {page_num + 1}...")
        all_text += page.get_text() + '\n'
    pdf_document.close()

    # Print or do something with the collected text
    print(all_text)

    # Extractive question-answering model
    model_name = "deepset/roberta-base-squad2"
    nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

    # Chunk the document and index the chunks with Top2Vec
    nodes = doc_chunk(all_text)
    data = create_train_data(nodes)
    model = get_model(data)

    # Get answers for each question, using its best-matching chunk as the context
    qa_dict = {}
    for question in sentences:
        context = get_search_result(model, question)
        QA_input = {
            'question': question,
            'context': context
        }
        res = nlp(QA_input)
        print(f"Question: {question}")
        print(f"Answer: {res['answer']}")
        qa_dict[question] = res['answer']

    return qa_dict
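

# --- Usage sketch (not part of the original app; assumes the file is saved as
# main.py and that uvicorn and requests are installed) ---
#
# Start the server:
#   uvicorn main:app --reload
#
# FastAPI reads `file_url` from the query string and, since `sentences` is a
# list-typed parameter, expects it as the JSON request body. A hypothetical call:
#
#   import requests
#   resp = requests.post(
#       "http://127.0.0.1:8000/predict",
#       params={"file_url": "https://example.com/sample.pdf"},  # example PDF URL
#       json=["What is the document about?", "Who wrote it?"],  # example questions
#   )
#   print(resp.json())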