Spaces:

gopalnoutiyal
/

test

Sleeping

App Files Files Community

test / app.py

gopalnoutiyal

Update app.py

e0ffeff verified about 2 months ago

raw

history blame contribute delete

No virus

4.16 kB

	import streamlit as st
	import fitz # PyMuPDF
	import os
	from langchain.vectorstores import FAISS
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.llms import HuggingFaceHub
	from langchain.prompts import ChatPromptTemplate # Use correct import

	api_token = os.environ.get("HF_TOKEN", None)

	# Simple document class
	class Document:
	    """Minimal text container exposing ``page_content`` and ``metadata``.

	    Instances are handed to the text splitter's ``split_documents`` in this
	    file, which presumably reads both attributes (verify against the
	    installed LangChain version).

	    Attributes:
	        page_content: The text wrapped by this document.
	        metadata: Dict of extra information about the text; every instance
	            owns its own dict.
	    """

	    def __init__(self, page_content, metadata=None):
	        """Store the text and an optional metadata mapping.

	        Args:
	            page_content: Text to wrap.
	            metadata: Optional mapping of extra fields. It is copied, so
	                later changes to the caller's mapping do not leak into the
	                document. ``None`` (the default) gives an empty dict.
	        """
	        self.page_content = page_content
	        # Build a fresh dict per instance so documents never share state.
	        self.metadata = dict(metadata) if metadata is not None else {}

	    def __repr__(self):
	        """Return a short representation with long text truncated."""
	        preview = self.page_content
	        # Keep debug output bounded: a whole PDF may be stored in one document.
	        if isinstance(preview, str) and len(preview) > 60:
	            preview = preview[:57] + "..."
	        return (
	            f"{type(self).__name__}(page_content={preview!r}, "
	            f"metadata={self.metadata!r})"
	        )

	# Function to extract text from PDF
	def extract_text_from_pdf(pdf_file):
	    """Return the text of every page of a PDF as one string.

	    Args:
	        pdf_file: Binary file-like object whose ``read()`` returns the PDF
	            bytes; in this file it is the object returned by
	            ``st.file_uploader``.

	    Returns:
	        The text of all pages concatenated in page order.
	    """
	    pdf_bytes = pdf_file.read()
	    # Open the document as a context manager so it is closed even when a
	    # page fails to extract; previously it was never closed.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        # join() builds the result in one pass instead of repeated ``+=``.
	        return "".join(page.get_text() for page in doc)

	# Function to embed PDF text in the vector store
	def pdf_to_vector_store(pdf_text):
	    """Split the given text into chunks and index them in a FAISS store.

	    Args:
	        pdf_text: Full text to index.

	    Returns:
	        The FAISS store built from the chunks, or ``None`` when splitting
	        produced no chunks.
	    """
	    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

	    # The splitter works on document objects, so wrap the raw text first.
	    whole_text = [Document(page_content=pdf_text)]
	    print("Documents before splitting:", whole_text)

	    chunks = splitter.split_documents(whole_text)
	    print("Documents after splitting:", chunks)

	    # Nothing to index: signal that to the caller instead of building a store.
	    if not chunks:
	        return None

	    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
	    return FAISS.from_documents(chunks, embedder)

	# Streamlit app
	def answer_question(query, db, *, model_id="google/flan-t5-large",
	                    temperature=0.7, max_tokens=300, top_k=450, num_chunks=5):
	    """Answer ``query`` using the chunks of ``db`` most similar to it.

	    Args:
	        query: The user's question.
	        db: Vector store offering ``search(query, search_type=..., k=...)``.
	        model_id: Hugging Face Hub repository id of the model to call.
	        temperature: Sampling temperature forwarded to the model.
	        max_tokens: Forwarded to the model as ``max_length``.
	        top_k: Top-k sampling cutoff forwarded to the model.
	        num_chunks: Number of chunks retrieved to build the context.

	    Returns:
	        Whatever the ``HuggingFaceHub`` LLM returns for the built prompt.
	    """
	    # Retrieve the most relevant chunks and merge them into one context string.
	    docs = db.search(query, search_type="similarity", k=num_chunks)
	    context = " ".join(doc.page_content for doc in docs)

	    # Construct the prompt
	    prompt_template = ChatPromptTemplate.from_template(
	        """
	Answer the following question based only on the context from vector store I have provided. Think step by step before providing a detailed answer.
	<context>
	{context}
	</context>
	Question: {input}
	"""
	    )
	    prompt = prompt_template.format(context=context, input=query)

	    # The generation settings used to be assigned to locals and then dropped;
	    # forward them so they actually reach the model.
	    llm = HuggingFaceHub(
	        repo_id=model_id,
	        huggingfacehub_api_token=api_token,
	        model_kwargs={
	            "temperature": temperature,
	            "max_length": max_tokens,
	            "top_k": top_k,
	        },
	    )
	    return llm(prompt)


	# Streamlit app
	# NOTE(review): the title and the answer label say LLAMA, but the default
	# model is google/flan-t5-large — confirm which one is intended.
	st.title("Chat with PDF using LLAMA Model")

	# File uploader
	uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

	if uploaded_file is not None:
	    # NOTE(review): everything below appears to run again on each widget
	    # interaction, including extraction and embedding — consider caching.
	    pdf_text = extract_text_from_pdf(uploaded_file)

	    # Show only the first 100 characters as a preview.
	    st.write("Extracted Text from PDF:")
	    st.write(pdf_text[:100])

	    # Embed PDF text in the vector store
	    st.write("Embedding PDF text into the vector store...")
	    db = pdf_to_vector_store(pdf_text)

	    if db is None:
	        st.write("Failed to setup FAISS and embeddings.")
	    else:
	        st.write("FAISS and embeddings setup completed.")
	        st.write("You can now ask questions about the PDF.")

	        # Only query the model once the user has typed a question.
	        user_question = st.text_input("Enter your question:")
	        if user_question:
	            answer = answer_question(user_question, db)
	            st.write("Answer from LLAMA Model:")
	            st.write(answer)

	# Note: Ensure you handle large PDFs appropriately to avoid performance issues