import fitz  # PyMuPDF
import gradio as gr
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        pdf_path: Either the path of the PDF as a ``str``, or an object that
            exposes the path through a ``name`` attribute (for example a
            temporary-file wrapper).

    Returns:
        The text of all pages joined in page order. The result is an empty
        string when no page yields any text.
    """
    # The upload may arrive as a plain path string or as a file-like wrapper;
    # only the wrapper carries the path in ``.name``.
    path = pdf_path if isinstance(pdf_path, str) else pdf_path.name
    # The context manager closes the document even when extraction raises,
    # so repeated requests do not accumulate open file handles.
    with fitz.open(path) as document:
        return "".join(page.get_text() for page in document)

# One checkpoint name is shared by the tokenizer and the model weights
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Build the question-answering pipeline once, at module import
qa_pipeline = pipeline(
    task="question-answering",
    model=model,
    tokenizer=tokenizer,
)

# Function to answer questions based on the PDF content
def answer_question(pdf_file, question):
    """Answer a question using the text of an uploaded PDF.

    Args:
        pdf_file: The uploaded PDF, forwarded unchanged to
            ``extract_text_from_pdf``; ``None`` when nothing was uploaded.
        question: The question text.

    Returns:
        The ``'answer'`` field of the question-answering pipeline result, or
        a short message describing which input is missing or unusable.
    """
    # Validate the inputs up front so the user gets a readable message
    # instead of an exception from the extraction or pipeline call.
    if pdf_file is None:
        return "Please upload a PDF file."
    if not question or not question.strip():
        return "Please enter a question."

    # Extract text from the uploaded PDF file
    content = extract_text_from_pdf(pdf_file)
    # A PDF without a text layer (e.g. scanned images) gives nothing to search.
    if not content.strip():
        return "No extractable text was found in the PDF."

    # Get the answer using the question-answering pipeline
    result = qa_pipeline(question=question, context=content)
    return result['answer']

# Build the Gradio interface: a PDF upload plus a question box in, text out
iface = gr.Interface(
    title="DistilBERT Question Answering",
    description="Upload a PDF and ask questions based on the content of the PDF.",
    fn=answer_question,
    inputs=[
        gr.File(label="PDF File", file_types=[".pdf"]),
        gr.Textbox(lines=2, placeholder="Ask a question..."),
    ],
    outputs="text",
)

# Start serving the interface
iface.launch()