"""Gradio app for question answering over uploaded PDFs.

Flow: extract text with PyPDF2, split it into chunks, embed the chunks with a
sentence-transformers model, index them in FAISS, retrieve the chunks most
relevant to the question, and generate an answer with FLAN-T5.
"""

import gradio as gr
from PyPDF2 import PdfReader

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

from transformers import pipeline
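
# Assumed dependencies (indicative, not pinned): gradio, PyPDF2, langchain,
# langchain-community, faiss-cpu, sentence-transformers, transformers, torch.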


def load_llm():
    """Load the FLAN-T5 text2text-generation pipeline, or return None on failure."""
    try:
        pipe = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            max_length=512,
        )
        print("✅ Successfully loaded model: google/flan-t5-base")
        return pipe
    except Exception as e:
        print(f"⚠️ Failed to load model: {e}")
        return None


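# Load the model once at import time so every request reuses the same pipeline.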
llm = load_llm()


def process_pdf(pdf_files):
    """Extract text from the uploaded PDFs and index it in a FAISS vector store."""
    text = ""
    for pdf in pdf_files:
        reader = PdfReader(pdf)
        for page in reader.pages:
            extracted = page.extract_text()
            if extracted:
                text += extracted + "\n"

    # Image-only (scanned) PDFs yield no extractable text.
    if not text.strip():
        return None

    # Split the text into overlapping chunks so retrieved passages stay small.
    splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    texts = splitter.split_text(text)

    # Embed each chunk and index the vectors for similarity search.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_texts(texts, embeddings)

    return db


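# Note: the FAISS index is rebuilt on every question, which keeps the app
# stateless but can be slow for large PDFs; caching the index per upload would
# be a natural optimization.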
def ask_question(pdf_files, question):
    """Answer a question using the most relevant chunks from the uploaded PDFs."""
    try:
        if not pdf_files:
            return "⚠️ Please upload at least one PDF file."

        if not llm:
            return "⚠️ Language model failed to load. Please try again later."

        db = process_pdf(pdf_files)
        if not db:
            return "⚠️ No text found in the uploaded PDF(s)."

        # Retrieve the four chunks most similar to the question.
        retriever = db.as_retriever(search_kwargs={"k": 4})
        docs = retriever.get_relevant_documents(question)

        # Concatenate the retrieved chunks and collapse whitespace.
        context = "\n".join([doc.page_content for doc in docs])
        context = " ".join(context.split())

        prompt = f"""Based on the following information, answer the question clearly and concisely.

Information:
{context}

Question: {question}

Answer:"""

        # Greedy decoding (do_sample=False) keeps answers deterministic.
        result = llm(
            prompt,
            max_length=300,
            num_return_sequences=1,
            do_sample=False,
        )

        response = result[0]['generated_text'].strip()

        # Strip an echoed "Answer:" prefix if the model repeats it.
        if response.startswith("Answer:"):
            response = response.replace("Answer:", "").strip()

        # Treat empty or very short outputs as "no answer found".
        if not response or len(response) < 10:
            return "I couldn't find a clear answer to your question in the provided documents. Please try rephrasing your question or check if the relevant information is in the uploaded PDFs."

        return response

    except Exception as e:
        return f"⚠️ Error: {str(e)}"


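# Gradio UI: PDF upload and question box side by side, answer box below, plus
# example questions to try.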
with gr.Blocks() as demo:
    gr.Markdown("## 📚 PDF Question Answering System")
    gr.Markdown("Upload PDF files and ask questions about their content.")

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(
                label="Upload PDF Files",
                file_types=[".pdf"],
                file_count="multiple"
            )
        with gr.Column():
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What would you like to know about the document?",
                lines=2
            )
            submit_btn = gr.Button("Ask Question", variant="primary")

    with gr.Row():
        output = gr.Textbox(
            label="Answer",
            lines=4,
            interactive=False
        )

    gr.Examples(
        examples=[
            ["What is the main topic of this document?"],
            ["Can you summarize the key points?"],
            ["What are the main findings or conclusions?"],
            ["Who are the authors and what are their credentials?"]
        ],
        inputs=question_input,
        label="Example Questions"
    )

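    # Answer on button click and when Enter is pressed in the question box.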
    submit_btn.click(ask_question, inputs=[pdf_input, question_input], outputs=output)
    question_input.submit(ask_question, inputs=[pdf_input, question_input], outputs=output)

demo.launch()
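
# Run this script with Python and open the local URL that Gradio prints;
# passing share=True to demo.launch() would also create a temporary public link.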