# NOTE(review): page-capture residue ("Spaces: Sleeping"), not part of the script.
"""Streamlit app: upload a PDF, index its text in AstraDB, and ask questions.

The script reads three settings from the environment (optionally via a
``.env`` file): ``ASTRA_DB_APPLICATION_TOKEN``, ``ASTRA_DB_ID`` and
``HUGGINGFACEHUB_API_TOKEN``.

Streamlit re-executes this file from top to bottom on every widget
interaction, and ``st.button`` is only truthy on the run triggered by the
click. Everything the question section needs is therefore kept in
``st.session_state`` so it survives the rerun caused by typing a question.
"""
import os

import cassio
import streamlit as st
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.cassandra import Cassandra
from PyPDF2 import PdfReader

load_dotenv()
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Upper bound on how many chunks of a single PDF are embedded and stored.
MAX_CHUNKS = 50
# st.session_state key under which the built pipeline is kept across reruns.
_QA_STATE_KEY = "qa_pipeline"

# NOTE(review): the emoji/dash glyphs inside the UI strings below look
# mis-encoded in this copy of the file; they are reproduced unchanged here and
# should be restored from the original source.


def _missing_settings():
    """Return the names of required environment settings that are unset or empty."""
    required = {
        "ASTRA_DB_APPLICATION_TOKEN": ASTRA_DB_APPLICATION_TOKEN,
        "ASTRA_DB_ID": ASTRA_DB_ID,
        "HUGGINGFACEHUB_API_TOKEN": HUGGINGFACEHUB_API_TOKEN,
    }
    return [name for name, value in required.items() if not value]


def _extract_pdf_text(pdf_file):
    """Return the text of every page of *pdf_file*, joined with newlines.

    Pages for which ``extract_text()`` yields nothing are skipped. Joining
    with a newline keeps the last word of one page from being glued to the
    first word of the next, and matches the splitter's ``"\\n"`` separator.
    """
    reader = PdfReader(pdf_file)
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text)


def _build_pipeline(chunks):
    """Embed *chunks* into the AstraDB table and return the query pipeline.

    Returns a dict with the ``vector_store``, the ``index`` wrapper around it
    and the ``llm`` used to answer questions.
    """
    cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
    )
    # session/keyspace are left as None; presumably the store then uses the
    # connection configured by cassio.init() above -- TODO confirm.
    vector_store = Cassandra(
        embedding=embedding,
        table_name="qa_mini_demo",
        session=None,
        keyspace=None,
    )
    vector_store.add_texts(chunks)
    return {
        "vector_store": vector_store,
        "index": VectorStoreIndexWrapper(vectorstore=vector_store),
        "llm": llm,
    }


# === Streamlit UI Setup ===
st.set_page_config(page_title="Query PDF with Free Hugging Face Models", layout="wide")
st.title("ππ¬ Query PDF using LangChain + AstraDB (Free Hugging Face Models)")

# === File Upload ===
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file:
    st.success("β PDF uploaded successfully!")

    if st.button("π Process PDF"):
        missing = _missing_settings()
        if missing:
            st.error("Missing required environment variables: " + ", ".join(missing))
            st.stop()

        # Read the PDF and split its text into overlapping chunks.
        raw_text = _extract_pdf_text(uploaded_file)
        text_splitter = CharacterTextSplitter(
            separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
        )
        texts = text_splitter.split_text(raw_text)
        if not texts:
            # e.g. a scanned PDF with no text layer: nothing to embed.
            st.error("No extractable text was found in this PDF.")
            st.stop()

        chunks = texts[:MAX_CHUNKS]
        if len(texts) > len(chunks):
            st.warning(
                f"Only the first {len(chunks)} of {len(texts)} chunks are indexed."
            )

        # Persist the pipeline so the question section below still has it on
        # the rerun that Streamlit triggers when the user types a question.
        st.session_state[_QA_STATE_KEY] = _build_pipeline(chunks)
        st.success(f"π {len(chunks)} chunks embedded and stored in AstraDB.")

    # === Ask Questions ===
    if _QA_STATE_KEY in st.session_state:
        pipeline = st.session_state[_QA_STATE_KEY]
        vector_store = pipeline["vector_store"]

        st.header("π€ Ask a question about your PDF")
        user_question = st.text_input("π¬ Type your question here")

        if user_question:
            with st.spinner("Thinking..."):
                answer = pipeline["index"].query(
                    user_question, llm=pipeline["llm"]
                ).strip()
            st.markdown(f"### π§ Answer:\n{answer}")

            st.markdown("### π Top Relevant Chunks")
            docs = vector_store.similarity_search_with_score(user_question, k=4)
            for i, (doc, score) in enumerate(docs, 1):
                st.markdown(f"**Chunk {i}** β Relevance Score: `{score:.4f}`")
                # Show only the first 500 characters of each chunk.
                st.code(doc.page_content[:500], language="markdown")