import streamlit as st
from PyPDF2 import PdfReader
import pytesseract  # currently unused
from PIL import Image  # currently unused
import fitz  # currently unused
import io  # currently unused
# from transformers import pipeline
import requests
import os
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

# Hugging Face API token, read from the environment
my_token = os.getenv('my_repo_token')


# Function to get embeddings using a pre-trained model
# (the tokenizer and model are reloaded on every call; caching them would speed up repeated queries)
def get_embeddings(texts, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the token embeddings to get one vector per input text
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings


# Function to find the most relevant context (PDF page) for a question using FAISS
def find_most_relevant_context(contexts, question, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    # Get embeddings for the question and the contexts in a single batch
    all_texts = [question] + contexts
    embeddings = get_embeddings(all_texts, model_name=model_name)

    # Separate the question embedding from the context embeddings
    question_embedding = embeddings[0]
    context_embeddings = embeddings[1:]

    # Create a FAISS index and add the context embeddings
    dimension = context_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(context_embeddings)

    # Search for the nearest neighbour to the question embedding
    _, indices = index.search(question_embedding.reshape(1, -1), 1)

    # Return the most relevant context
    most_relevant_index = indices[0][0]
    return contexts[most_relevant_index]


# Hosted inference endpoints; only API_URL_LLMA is used below
API_URL = "https://api-inference.huggingface.co/models/google/gemma-7b"
API_URL_2 = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"
API_URL_LLMA = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {my_token}"}

# pipe = pipeline("text-generation", model="mistralai/Mixtral-8x7B-v0.1", token=my_token)


# Send a text-generation request to the Hugging Face Inference API
def query(payload):
    response = requests.post(API_URL_LLMA, headers=headers, json=payload)
    return response.json()
    # return pipe(payload)


# Answer a question using the most relevant PDF text as context
def answer_question_from_pdf(pdf_text, question):
    answer = query(
        {"inputs": "Based on this content: " + pdf_text + " The Question is: " + question
                   + " Provide the answer with max length of about 1500"}
    )
    answer = answer[0]["generated_text"]
    # Strip everything up to and including the word "Answer", if the model echoed the prompt
    marker = answer.find("Answer")
    if marker != -1:
        answer = answer[marker + len("Answer"):]
    return answer


# Function to extract text from the PDF, one entry per page
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    pdf_arr = []
    for page in pdf_reader.pages:
        # extract_text() can return None for image-only pages
        pdf_arr.append(page.extract_text() or "")
    return pdf_arr


# Streamlit app
st.title("PDF Explorer")

# File uploader
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_arr = extract_text_from_pdf(uploaded_file)
    st.write("PDF uploaded successfully.")

    # Text input for entering a question
    question = st.text_input("Ask a question about the PDF")

    if st.button("Get Answer"):
        if question:
            # Retrieve the most relevant page, then ask the model
            pdf_text = find_most_relevant_context(pdf_arr, question)
            answer = answer_question_from_pdf(pdf_text, question)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file.")
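
# Usage note (a sketch, not part of the app logic): assuming the script is saved as
# app.py and the packages imported above are installed (e.g. streamlit, PyPDF2,
# pytesseract, Pillow, PyMuPDF, requests, faiss-cpu, numpy, transformers, torch),
# set a Hugging Face API token in the environment and launch the app with Streamlit:
#
#   export my_repo_token=<your Hugging Face API token>
#   streamlit run app.py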