Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import faiss | |
| import numpy as np | |
| import torch | |
| from pypdf import PdfReader | |
| from transformers import AutoTokenizer, AutoModel, pipeline | |
| from langchain.text_splitter import CharacterTextSplitter | |
# Load embedding and QA models
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
QA_MODEL_NAME = "distilbert-base-uncased-distilled-squad"


@st.cache_resource
def load_models():
    """Load the embedding tokenizer/model and the extractive QA pipeline.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects as a shared resource means the models are loaded once
    instead of on every rerun.

    Returns:
        A ``(tokenizer, model, qa_pipeline)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
        AutoModel.from_pretrained(EMBEDDING_MODEL_NAME),
        pipeline('question-answering', model=QA_MODEL_NAME),
    )


# Keep the original module-level names so the helper functions below
# continue to find them as globals.
tokenizer, model, qa_pipeline = load_models()
| # PDF text extraction and text chunking | |
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in a PDF as a single string.

    Args:
        pdf_file: Source handed straight to ``PdfReader`` (the app passes the
            object returned by ``st.file_uploader``).

    Returns:
        The extracted page texts joined by newlines. A page whose extraction
        yields nothing contributes an empty string.
    """
    reader = PdfReader(pdf_file)
    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next. ``or ""`` tolerates a page whose
    # extraction returns nothing instead of failing on ``str + None``.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks sized for embedding.

    ``CharacterTextSplitter`` only splits on its single separator (``"\\n\\n"``
    by default), so text without blank lines -- common for PDF extraction --
    is not cut down to ``chunk_size``. The recursive splitter falls back to
    progressively finer separators, so ``chunk_size`` is actually honoured.

    Args:
        text: The full document text.
        chunk_size: Target maximum number of characters per chunk.
        overlap: Number of characters shared between neighbouring chunks.

    Returns:
        A list of chunk strings; empty when ``text`` is empty or whitespace.
    """
    # Local import: keeps this function self-contained without touching the
    # file's import block (same module the file already imports from).
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    if not text or not text.strip():
        return []
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return splitter.split_text(text)
| # Function to embed text using the embedding model | |
def embed_text(text):
    """Embed one text (or a list of texts) by mean-pooling token embeddings.

    The pooling is weighted by the tokenizer's attention mask so padding
    positions do not dilute the average when several texts of different
    lengths are embedded together. For a single string there is no padding,
    so the result matches a plain mean over the token axis.

    Args:
        text: A string, or a list of strings, to embed. Input longer than the
            tokenizer's maximum length is truncated.

    Returns:
        A NumPy array with one embedding row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state
        # Broadcast the mask over the hidden dimension: 1 for real tokens,
        # 0 for padding.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = summed / token_counts
    return embeddings.numpy()
| # Function to create FAISS index | |
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding every row of ``embeddings``.

    Args:
        embeddings: 2-D array of vectors; its second axis sets the index
            dimensionality.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index
| # Function to answer questions based on retrieved context | |
def answer_question(question, index, chunks, top_k=3):
    """Answer ``question`` from the chunks most similar to it.

    Args:
        question: The user's question.
        index: FAISS index built over the embeddings of ``chunks``.
        chunks: Chunk strings, in the same order they were added to ``index``.
        top_k: Maximum number of chunks to retrieve as context.

    Returns:
        The answer string produced by the QA pipeline, or ``""`` when there
        is no context to answer from.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads the
    # result with -1, and chunks[-1] would silently pick the last chunk.
    k = min(top_k, len(chunks))
    if k < 1:
        return ""

    question_embedding = embed_text(question)
    _, indices = index.search(question_embedding, k)
    # Filter defensively in case the index and ``chunks`` are out of sync.
    context = " ".join(
        chunks[i] for i in indices[0] if 0 <= i < len(chunks)
    )
    if not context.strip():
        return ""

    result = qa_pipeline(question=question, context=context)
    return result['answer']
# Streamlit app layout
st.title("PDF Question-Answering Chatbot with RAG")
st.write("Upload a PDF, and ask questions based on its content.")

# File uploader
pdf_file = st.file_uploader("Upload PDF", type="pdf")

if pdf_file is not None:
    # Extract and split text from PDF, then embed and index the chunks
    with st.spinner("Processing PDF..."):
        text = extract_text_from_pdf(pdf_file)
        chunks = split_text_into_chunks(text)
        index = None
        # np.vstack raises ValueError on an empty list, which is what a PDF
        # without extractable text (e.g. scanned images) would produce.
        if chunks:
            embeddings = np.vstack([embed_text(chunk) for chunk in chunks])
            index = create_faiss_index(embeddings)

    if index is None:
        st.error("No extractable text was found in this PDF.")
    else:
        st.success("PDF processed and indexed successfully!")
        st.write("You can now ask questions based on the content of the PDF.")

        # Input for user question
        question = st.text_input("Ask a question:")
        # Ignore whitespace-only input rather than sending it to the models.
        if question and question.strip():
            with st.spinner("Searching for the answer..."):
                answer = answer_question(question, index, chunks)
            st.write("**Answer:**", answer)