# Hugging Face Space (status: Running) -- page header residue, not part of the program.
import fitz # PyMuPDF | |
import gradio as gr | |
from transformers import pipeline | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
import numpy as np | |
# Model identifiers, kept together so they can be found and swapped in one place.
_SUMMARIZER_MODEL = "facebook/bart-large-cnn"
_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
_QA_MODEL = "distilbert-base-uncased-distilled-squad"

# Abstractive summarization pipeline used by summarize_pdf().
summarizer = pipeline(task="summarization", model=_SUMMARIZER_MODEL)

# Sentence-embedding model used to index and query text chunks.
embedding_model = SentenceTransformer(_EMBEDDING_MODEL)

# Extractive question-answering pipeline used by answer_question().
qa_pipeline = pipeline(task="question-answering", model=_QA_MODEL)
def extract_text_from_pdf(file_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One string holding the text of all pages, in page order.
    """
    # Use the document as a context manager so the underlying file is closed
    # even if text extraction fails part-way through; the previous version
    # never closed it.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
def chunk_text(text, max_chunk_size=500):
    """Split *text* into whitespace-normalised chunks of roughly equal size.

    Words are accumulated until their combined length (counting one separator
    per word) reaches ``max_chunk_size``; the accumulated words are then
    emitted as one chunk. Because the check happens after a word is added, a
    chunk may run past ``max_chunk_size`` by up to one word.

    Args:
        text: The text to split.
        max_chunk_size: Size threshold, in characters, that closes a chunk.

    Returns:
        A list of chunk strings; empty when *text* contains no words.
    """
    pieces = []
    pending = []
    pending_size = 0
    for token in text.split():
        pending.append(token)
        pending_size += len(token) + 1  # one extra for the joining space
        if pending_size < max_chunk_size:
            continue
        # Threshold reached: emit the accumulated words and start afresh.
        pieces.append(" ".join(pending))
        pending, pending_size = [], 0
    # Whatever is left over forms the final, shorter chunk.
    if pending:
        pieces.append(" ".join(pending))
    return pieces
def build_faiss_index(chunks):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks to embed.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the embedding matrix that was added to it (one row per chunk).

    Raises:
        ValueError: If *chunks* is empty, since no embedding dimension can be
            derived from zero vectors.
    """
    if not chunks:
        # Without this guard the failure surfaces as an opaque IndexError on
        # ``embeddings.shape[1]`` below.
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # FAISS expects a C-contiguous float32 matrix; make that explicit instead
    # of relying on the encoder's default output.
    embeddings = np.ascontiguousarray(embedding_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings
def retrieve_relevant_chunks(query, chunks, index, embeddings, top_k=3):
    """Return the chunks whose embeddings are closest to *query*.

    Args:
        query: The question or search text.
        chunks: The chunk strings, in the same order they were indexed.
        index: FAISS index built over the embeddings of *chunks*.
        embeddings: Embedding matrix for *chunks*; unused here, kept so
            existing callers keep working.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings, nearest first. Empty when there are no
        chunks or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with index -1, which would silently select ``chunks[-1]``.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = np.asarray(embedding_model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, k)
    # Defensive filter in case the index holds fewer vectors than ``chunks``.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
def summarize_pdf(file_path):
    """Summarize the text of a PDF, piece by piece.

    The extracted text is cut into fixed-size character windows, each window
    is summarized independently, and the partial summaries are joined with
    single spaces.

    Args:
        file_path: Path of the uploaded PDF, or ``None`` when nothing was
            uploaded.

    Returns:
        The combined summary, or a short user-facing message when no file was
        provided or the PDF yields no text.
    """
    if not file_path:
        # The UI passes None when the button is clicked without an upload.
        return "Please upload a PDF file."

    raw_text = extract_text_from_pdf(file_path)
    max_chunk = 1024  # window size in characters
    partial_summaries = []
    for start in range(0, len(raw_text), max_chunk):
        chunk = raw_text[start:start + max_chunk]
        if not chunk.strip():
            # Nothing to summarize in a whitespace-only window.
            continue
        # truncation=True guards against windows whose token count exceeds
        # the model limit (characters are not tokens).
        res = summarizer(
            chunk,
            max_length=130,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        partial_summaries.append(res[0]['summary_text'])

    if not partial_summaries:
        return "No extractable text was found in the PDF."
    return " ".join(partial_summaries).strip()
def answer_question(file_path, question):
    """Answer *question* using the most relevant passages of a PDF.

    The PDF text is chunked and embedded, the chunks nearest to the question
    are retrieved, and the question-answering pipeline extracts an answer
    from their concatenation.

    Args:
        file_path: Path of the uploaded PDF, or ``None`` when nothing was
            uploaded.
        question: The user's question.

    Returns:
        The extracted answer text, or a short user-facing message when the
        file or question is missing or the PDF yields no text.
    """
    # Validate the inputs up front: the UI passes None for a missing upload,
    # and a blank question would otherwise fail inside the QA pipeline.
    if not file_path:
        return "Please upload a PDF file."
    if not question or not question.strip():
        return "Please enter a question."

    raw_text = extract_text_from_pdf(file_path)
    chunks = chunk_text(raw_text)
    if not chunks:
        # E.g. a scanned PDF without a text layer: nothing to index or search.
        return "No extractable text was found in the PDF."

    index, embeddings = build_faiss_index(chunks)
    relevant_chunks = retrieve_relevant_chunks(question, chunks, index, embeddings)
    context = " ".join(relevant_chunks)
    answer = qa_pipeline(question=question, context=context)
    return answer['answer']
# Gradio UI
# Two tabs share one Blocks app: one summarizes an uploaded PDF, the other
# answers a question about an uploaded PDF.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been flattened -- confirm which components belong in each Row.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Summarizer and Q&A")
    with gr.Tab("Summarization"):
        with gr.Row():
            # type="filepath" makes the handler receive a path string.
            pdf_input = gr.File(type="filepath", label="Upload a PDF")
            summarize_button = gr.Button("Summarize")
        summary_output = gr.Textbox(label="Summary", lines=10)
        # Clicking runs summarize_pdf on the uploaded file and shows the result.
        summarize_button.click(fn=summarize_pdf, inputs=pdf_input, outputs=summary_output)
    with gr.Tab("Question Answering"):
        with gr.Row():
            pdf_input_qa = gr.File(type="filepath", label="Upload a PDF")
            question_input = gr.Textbox(label="Enter your question")
            answer_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer", lines=2)
        # Inputs are passed positionally: (file_path, question).
        answer_button.click(fn=answer_question, inputs=[pdf_input_qa, question_input], outputs=answer_output)
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()