# chat-with-docs / app.py
# adi-123's picture
# Create app.py
# ce59d1a verified
import os
from types import SimpleNamespace

import numpy as np
import pdfplumber
import streamlit as st
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer, util

# Set up HuggingFace API token (required; KeyError with a clear name if unset).
HUGGINGFACEHUB_API_TOKEN = os.environ['HUGGINGFACEHUB_API_TOKEN']

# Text-generation model served through the HF Inference API.
# NOTE(review): the original code called huggingface_hub.ModelRepository and
# repo.load_model(), neither of which exists in the library — it crashed at
# import time. InferenceClient is the supported way to query a hosted model.
model_name = "google/flan-t5-xl"
model_kwargs = {"temperature": 0.2, "max_length": 100}

_client = InferenceClient(model=model_name, token=HUGGINGFACEHUB_API_TOKEN)


def _generate(prompt, temperature=0.2, max_length=100):
    """Generate a completion for `prompt` via the hosted model.

    `max_length` is mapped to the Inference API's `max_new_tokens` so the
    original `model_kwargs` keys keep working unchanged.
    """
    return _client.text_generation(
        prompt, temperature=temperature, max_new_tokens=max_length
    )


# Preserve the `model.generate(prompt, **model_kwargs)` call-site contract
# used later in this file.
model = SimpleNamespace(generate=_generate)

# Set up vector database: sentence-embedding model used to vectorize chunks.
vector_db = SentenceTransformer('all-MiniLM-L6-v2')
# Function to extract text from PDF documents
def extract_text_from_pdfs(pdfs):
    """Extract and concatenate the text of every page of every uploaded PDF.

    Args:
        pdfs: iterable of file-like objects (e.g. Streamlit UploadedFile).

    Returns:
        A single string with all extracted page text joined together;
        empty string when no PDFs are given.
    """
    texts = []
    for pdf in pdfs:
        with pdfplumber.open(pdf) as pdf_file:
            for page in pdf_file.pages:
                # extract_text() returns None for pages with no text layer
                # (e.g. scanned images) — skip those instead of letting
                # ''.join() crash on a non-string.
                page_text = page.extract_text()
                if page_text:
                    texts.append(page_text)
    return ''.join(texts)
# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=10000, step=9000):
    """Split `text` into overlapping fixed-size chunks.

    Chunk starts are `step` characters apart while each chunk is
    `chunk_size` characters long, so consecutive chunks overlap by
    `chunk_size - step` characters (1000 with the defaults) — text that
    straddles a boundary appears intact in at least one chunk.

    Args:
        text: the full document text.
        chunk_size: length of each chunk (default 10000, as originally
            hard-coded).
        step: distance between chunk start offsets (default 9000, as
            originally hard-coded).

    Returns:
        List of chunk strings; empty list for empty input.
    """
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
# Function to create and save vector database
def create_vector_db(chunks):
    """Encode each text chunk into an embedding and persist them to disk.

    Embeddings come from the module-level SentenceTransformer (`vector_db`)
    and are written to 'vector_db.npy' in the working directory.

    Args:
        chunks: list of chunk strings.
    """
    # encode() accepts a list and batches internally — one call is far
    # faster than encoding chunks one at a time in a Python loop, and
    # yields the same per-chunk vectors.
    vectors = vector_db.encode(chunks)
    np.save('vector_db.npy', vectors)
# Function to create conversational chain
def create_conversational_chain(chunks, question):
    """Ask the generation model the question once per context chunk.

    Each chunk is substituted into a shared prompt template as context
    together with the user's question; the model's reply for every chunk
    is collected.

    Args:
        chunks: list of context strings.
        question: the user's question.

    Returns:
        List with one model response per chunk, in chunk order.
    """
    prompt_template = """Answer the question concisely, focusing on the most relevant and important details from the PDF context. Refrain from mentioning any mathematical equations, even if they are present in provided context. Focus on the textual information available. Please provide direct quotations or references from PDF to back up your response. If the answer is not found within the PDF, please state "answer is not available in the context."\n\nContext:\n {context}?\nQuestion: \n{question}\nExample response format:Overview: (brief summary or introduction)Key points: (point 1: paragraph for key details)(point 2: paragraph for key details)...Use a mix of paragraphs and points to effectively convey the information."""
    return [
        model.generate(
            prompt_template.format(context=chunk, question=question),
            **model_kwargs,
        )
        for chunk in chunks
    ]
# Streamlit UI creation
# ---- Streamlit UI ----
st.title("PDF Chatbot")
st.write("Upload multiple PDF files and ask a question to get a response based on the content of the PDFs.")
pdfs = st.file_uploader("Select PDF files", type=["pdf"], accept_multiple_files=True)
question = st.text_input("Enter your question")
if st.button("Get Response"):
    # Guard against empty submissions — the original crashed downstream
    # when no files were uploaded or the question was blank.
    if not pdfs:
        st.warning("Please upload at least one PDF file.")
    elif not question.strip():
        st.warning("Please enter a question.")
    else:
        # Extract text from PDFs
        text = extract_text_from_pdfs(pdfs)
        # Split text into (overlapping) chunks
        chunks = split_text_into_chunks(text)
        # Create and save vector database (embeddings persisted for reuse)
        create_vector_db(chunks)
        # Query the model about each chunk
        responses = create_conversational_chain(chunks, question)
        # Display responses
        for response in responses:
            st.write(response)