import os

import numpy as np
import pdfplumber
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Hugging Face API token (optional for public models such as flan-t5-xl)
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Set up the generation model: flan-t5-xl loaded as a text2text-generation pipeline
model_name = "google/flan-t5-xl"
generation_kwargs = {"temperature": 0.2, "max_length": 100, "do_sample": True}
generator = pipeline("text2text-generation", model=model_name, token=HUGGINGFACEHUB_API_TOKEN)

# Set up the sentence-transformer model used to embed text chunks
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Function to extract text from the uploaded PDF documents
def extract_text_from_pdfs(pdfs):
    texts = []
    for pdf in pdfs:
        with pdfplumber.open(pdf) as pdf_file:
            for page in pdf_file.pages:
                page_text = page.extract_text()
                if page_text:  # extract_text() can return None for empty pages
                    texts.append(page_text)
    return "\n".join(texts)
# Function to split text into overlapping chunks
def split_text_into_chunks(text):
    # A step of 9000 with a slice of 10000 gives a 1000-character overlap between
    # consecutive chunks, so sentences at chunk boundaries are not lost.
    chunks = []
    for i in range(0, len(text), 9000):
        chunks.append(text[i:i + 10000])
    return chunks
# Function to embed the chunks and save the vectors to disk
def create_vector_db(chunks):
    # encode() accepts a list of texts and returns one embedding per chunk
    vectors = embedder.encode(chunks)
    np.save("vector_db.npy", vectors)
# Function to build the prompt and generate an answer for each chunk
def create_conversational_chain(chunks, question):
    prompt_template = """Answer the question concisely, focusing on the most relevant and important details from the PDF context. Refrain from mentioning any mathematical equations, even if they are present in the provided context. Focus on the textual information available. Please provide direct quotations or references from the PDF to back up your response. If the answer is not found within the PDF, please state "answer is not available in the context."

Context:
{context}

Question:
{question}

Example response format:
Overview: (brief summary or introduction)
Key points:
(point 1: paragraph for key details)
(point 2: paragraph for key details)
...
Use a mix of paragraphs and points to effectively convey the information."""
    responses = []
    for chunk in chunks:
        prompt = prompt_template.format(context=chunk, question=question)
        # The text2text-generation pipeline returns a list of dicts with "generated_text"
        response = generator(prompt, **generation_kwargs)[0]["generated_text"]
        responses.append(response)
    return responses
# Streamlit UI
st.title("PDF Chatbot")
st.write("Upload multiple PDF files and ask a question to get a response based on the content of the PDFs.")
pdfs = st.file_uploader("Select PDF files", type=["pdf"], accept_multiple_files=True)
question = st.text_input("Enter your question")

if st.button("Get Response"):
    if not pdfs or not question:
        st.warning("Please upload at least one PDF and enter a question.")
    else:
        # Extract text from the PDFs and split it into chunks
        text = extract_text_from_pdfs(pdfs)
        chunks = split_text_into_chunks(text)
        # Embed the chunks and save the vectors
        create_vector_db(chunks)
        # Generate an answer for each chunk and display the responses
        responses = create_conversational_chain(chunks, question)
        for response in responses:
            st.write(response)
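
# Usage note (a sketch, assuming the script is saved as app.py): install the
# dependencies (streamlit, pdfplumber, numpy, sentence-transformers, transformers),
# optionally export HUGGINGFACEHUB_API_TOKEN, and start the app with:
#   streamlit run app.py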