# chat-with-docs / app.py
# adi-123's picture
# Create app.py
# ce59d1a verified
import os
from types import SimpleNamespace

import numpy as np
import pdfplumber
import streamlit as st
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer, util

# Set up HuggingFace API token (required; KeyError with a clear name if unset).
HUGGINGFACEHUB_API_TOKEN = os.environ['HUGGINGFACEHUB_API_TOKEN']

# Text-generation model served through the HF Inference API.
# NOTE(review): the original code called huggingface_hub.ModelRepository and
# repo.load_model(), neither of which exists in the library — it crashed at
# import time. InferenceClient is the supported way to query a hosted model.
model_name = "google/flan-t5-xl"
model_kwargs = {"temperature": 0.2, "max_length": 100}

_client = InferenceClient(model=model_name, token=HUGGINGFACEHUB_API_TOKEN)


def _generate(prompt, temperature=0.2, max_length=100):
    """Generate a completion for `prompt` via the hosted model.

    `max_length` is mapped to the Inference API's `max_new_tokens` so the
    original `model_kwargs` keys keep working unchanged.
    """
    return _client.text_generation(
        prompt, temperature=temperature, max_new_tokens=max_length
    )


# Preserve the `model.generate(prompt, **model_kwargs)` call-site contract
# used later in this file.
model = SimpleNamespace(generate=_generate)

# Set up vector database: sentence-embedding model used to vectorize chunks.
vector_db = SentenceTransformer('all-MiniLM-L6-v2')
# Function to extract text from PDF documents
def extract_text_from_pdfs(pdfs):
    """Extract and concatenate the text of every page of every uploaded PDF.

    Args:
        pdfs: iterable of file-like objects (e.g. Streamlit UploadedFile).

    Returns:
        A single string with all extracted page text joined together;
        empty string when no PDFs are given.
    """
    texts = []
    for pdf in pdfs:
        with pdfplumber.open(pdf) as pdf_file:
            for page in pdf_file.pages:
                # extract_text() returns None for pages with no text layer
                # (e.g. scanned images) — skip those instead of letting
                # ''.join() crash on a non-string.
                page_text = page.extract_text()
                if page_text:
                    texts.append(page_text)
    return ''.join(texts)
# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=10000, step=9000):
    """Split `text` into overlapping fixed-size chunks.

    Chunk starts are `step` characters apart while each chunk is
    `chunk_size` characters long, so consecutive chunks overlap by
    `chunk_size - step` characters (1000 with the defaults) — text that
    straddles a boundary appears intact in at least one chunk.

    Args:
        text: the full document text.
        chunk_size: length of each chunk (default 10000, as originally
            hard-coded).
        step: distance between chunk start offsets (default 9000, as
            originally hard-coded).

    Returns:
        List of chunk strings; empty list for empty input.
    """
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
# Function to create and save vector database
def create_vector_db(chunks):
    """Encode each text chunk into an embedding and persist them to disk.

    Embeddings come from the module-level SentenceTransformer (`vector_db`)
    and are written to 'vector_db.npy' in the working directory.

    Args:
        chunks: list of chunk strings.
    """
    # encode() accepts a list and batches internally — one call is far
    # faster than encoding chunks one at a time in a Python loop, and
    # yields the same per-chunk vectors.
    vectors = vector_db.encode(chunks)
    np.save('vector_db.npy', vectors)
# Function to create conversational chain
def create_conversational_chain(chunks, question):
    """Ask the generation model the question once per context chunk.

    Each chunk is substituted into a shared prompt template as context
    together with the user's question; the model's reply for every chunk
    is collected.

    Args:
        chunks: list of context strings.
        question: the user's question.

    Returns:
        List with one model response per chunk, in chunk order.
    """
    prompt_template = """Answer the question concisely, focusing on the most relevant and important details from the PDF context. Refrain from mentioning any mathematical equations, even if they are present in provided context. Focus on the textual information available. Please provide direct quotations or references from PDF to back up your response. If the answer is not found within the PDF, please state "answer is not available in the context."\n\nContext:\n {context}?\nQuestion: \n{question}\nExample response format:Overview: (brief summary or introduction)Key points: (point 1: paragraph for key details)(point 2: paragraph for key details)...Use a mix of paragraphs and points to effectively convey the information."""
    return [
        model.generate(
            prompt_template.format(context=chunk, question=question),
            **model_kwargs,
        )
        for chunk in chunks
    ]
# Streamlit UI creation
# ---- Streamlit UI ----
st.title("PDF Chatbot")
st.write("Upload multiple PDF files and ask a question to get a response based on the content of the PDFs.")
pdfs = st.file_uploader("Select PDF files", type=["pdf"], accept_multiple_files=True)
question = st.text_input("Enter your question")
if st.button("Get Response"):
    # Guard against empty submissions — the original crashed downstream
    # when no files were uploaded or the question was blank.
    if not pdfs:
        st.warning("Please upload at least one PDF file.")
    elif not question.strip():
        st.warning("Please enter a question.")
    else:
        # Extract text from PDFs
        text = extract_text_from_pdfs(pdfs)
        # Split text into (overlapping) chunks
        chunks = split_text_into_chunks(text)
        # Create and save vector database (embeddings persisted for reuse)
        create_vector_db(chunks)
        # Query the model about each chunk
        responses = create_conversational_chain(chunks, question)
        # Display responses
        for response in responses:
            st.write(response)