import os

import fitz  # PyMuPDF
import gradio as gr
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: A filesystem path (``str`` or ``os.PathLike``) or an object
            whose ``name`` attribute holds the path, such as an uploaded-file
            wrapper.

    Returns:
        The text of all pages joined in page order. The string is empty when
        no page yields any text.
    """
    # Only fall back to ``.name`` for file-like wrappers: on a ``pathlib.Path``
    # that attribute is just the basename, not a usable path.
    if isinstance(pdf_path, (str, os.PathLike)):
        path = os.fspath(pdf_path)
    else:
        path = pdf_path.name

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(path) as document:
        return "".join(page.get_text() for page in document)


# Load the model and tokenizer once at import time so every request reuses them.
model_name = "distilbert-base-cased-distilled-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Initialize the question-answering pipeline.
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


def answer_question(pdf_file, question):
    """Answer a question using the text of an uploaded PDF as context.

    Args:
        pdf_file: The uploaded PDF as handed over by the ``gr.File`` input, or
            ``None`` when nothing was uploaded.
        question: The question text entered by the user.

    Returns:
        The ``'answer'`` field of the pipeline result, or a short explanatory
        message when the file, the question, or the extracted text is missing.
    """
    # Guard clauses: report missing input instead of raising inside the handler.
    if pdf_file is None:
        return "Please upload a PDF file."
    if not question or not question.strip():
        return "Please enter a question."

    # Extract text from the uploaded PDF file.
    content = extract_text_from_pdf(pdf_file)
    if not content.strip():
        # Nothing to search, e.g. a PDF that contains only scanned images.
        return "No extractable text was found in the PDF."

    # Get the answer using the question-answering pipeline.
    result = qa_pipeline(question=question, context=content)
    return result["answer"]


# Define the Gradio interface.
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.File(label="PDF File", file_types=[".pdf"]),
        gr.Textbox(lines=2, placeholder="Ask a question..."),
    ],
    outputs="text",
    title="DistilBERT Question Answering",
    description="Upload a PDF and ask questions based on the content of the PDF.",
)

# Launch the interface.
iface.launch()