Spaces:

kpawargi
/

PDF_Query_Chatbot

Sleeping

App Files Files Community

PDF_Query_Chatbot / app.py

kpawargi

Update app.py

3904554 verified 29 days ago

raw

history blame

3.15 kB

	import streamlit as st
	from PyPDF2 import PdfReader
	from langchain.vectorstores.cassandra import Cassandra
	from langchain.indexes.vectorstore import VectorStoreIndexWrapper
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.llms import HuggingFaceHub
	from langchain.text_splitter import CharacterTextSplitter
	import cassio
	from dotenv import load_dotenv
	import os

	load_dotenv()

	# Credentials come from the process environment (optionally populated from a
	# local .env file by load_dotenv() above).
	ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
	ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
	HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

	# Upper bound on the number of text chunks embedded per processed PDF.
	MAX_CHUNKS = 50

	# st.session_state key under which the processed-PDF resources are kept so
	# that they survive the script rerun Streamlit performs on every interaction.
	QA_STATE_KEY = "qa_resources"


	def _extract_pdf_text(pdf_file) -> str:
	    """Return the concatenated text of every page of *pdf_file*.

	    Pages for which ``extract_text()`` yields nothing contribute an empty
	    string, so the result is ``""`` when no page has extractable text.
	    """
	    reader = PdfReader(pdf_file)
	    return "".join(page.extract_text() or "" for page in reader.pages)


	def _build_qa_resources(texts) -> dict:
	    """Embed *texts* into the AstraDB table and return the query resources.

	    Args:
	        texts: Non-empty list of text chunks to embed and store.

	    Returns:
	        A dict with the keys ``"llm"``, ``"vector_store"`` and ``"index"``.
	    """
	    # Initialize AstraDB
	    cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

	    # === Embeddings ===
	    embedding = HuggingFaceEmbeddings(
	        model_name="sentence-transformers/all-MiniLM-L6-v2"
	    )

	    # === Hugging Face LLM ===
	    llm = HuggingFaceHub(
	        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
	        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
	        model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
	    )

	    # === Create vector store and index ===
	    # session/keyspace are left as None exactly as before; presumably they
	    # are resolved from the cassio.init() call above -- TODO confirm.
	    # NOTE(review): the table name is fixed, so chunks from earlier uploads
	    # may still be present in it -- verify whether it should be cleared.
	    vector_store = Cassandra(
	        embedding=embedding,
	        table_name="qa_mini_demo",
	        session=None,
	        keyspace=None,
	    )
	    vector_store.add_texts(texts)

	    return {
	        "llm": llm,
	        "vector_store": vector_store,
	        "index": VectorStoreIndexWrapper(vectorstore=vector_store),
	    }


	# === Streamlit UI Setup ===
	st.set_page_config(page_title="Query PDF with Free Hugging Face Models", layout="wide")
	st.title("📄💬 Query PDF using LangChain + AstraDB (Free Hugging Face Models)")

	# Fail early with a readable message instead of passing None credentials on
	# to the AstraDB / Hugging Face clients.
	missing_settings = [
	    name
	    for name, value in (
	        ("ASTRA_DB_APPLICATION_TOKEN", ASTRA_DB_APPLICATION_TOKEN),
	        ("ASTRA_DB_ID", ASTRA_DB_ID),
	        ("HUGGINGFACEHUB_API_TOKEN", HUGGINGFACEHUB_API_TOKEN),
	    )
	    if not value
	]
	if missing_settings:
	    st.error("Missing required environment variables: " + ", ".join(missing_settings))
	    st.stop()

	# === File Upload ===
	uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

	if uploaded_file:
	    st.success("✅ PDF uploaded successfully!")

	    # st.button is only True on the rerun triggered by the click itself, so
	    # everything built here is stored in st.session_state; otherwise it
	    # would be gone as soon as the user types a question.
	    if st.button("🔄 Process PDF"):
	        raw_text = _extract_pdf_text(uploaded_file)

	        # Split text into chunks
	        text_splitter = CharacterTextSplitter(
	            separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
	        )
	        texts = text_splitter.split_text(raw_text)[:MAX_CHUNKS]

	        if texts:
	            resources = _build_qa_resources(texts)
	            # Remember which upload these resources belong to.
	            resources["source_name"] = uploaded_file.name
	            st.session_state[QA_STATE_KEY] = resources
	            st.success(f"📚 {len(texts)} chunks embedded and stored in AstraDB.")
	        else:
	            # Nothing to embed (e.g. no page yielded any text).
	            st.session_state.pop(QA_STATE_KEY, None)
	            st.warning("No extractable text was found in this PDF.")

	    qa_resources = st.session_state.get(QA_STATE_KEY)

	    # Only offer Q&A for the PDF that was actually processed, not for a
	    # different file uploaded afterwards.
	    if qa_resources and qa_resources.get("source_name") == uploaded_file.name:
	        # === Ask Questions ===
	        st.header("🤖 Ask a question about your PDF")
	        user_question = st.text_input("💬 Type your question here")

	        if user_question:
	            with st.spinner("Thinking..."):
	                answer = qa_resources["index"].query(
	                    user_question, llm=qa_resources["llm"]
	                ).strip()
	                st.markdown(f"### 🧠 Answer:\n{answer}")

	                st.markdown("### 🔍 Top Relevant Chunks")
	                docs = qa_resources["vector_store"].similarity_search_with_score(
	                    user_question, k=4
	                )
	                for i, (doc, score) in enumerate(docs, 1):
	                    st.markdown(f"Chunk {i} — Relevance Score: `{score:.4f}`")
	                    st.code(doc.page_content[:500], language="markdown")