# Streamlit app: PDF AI Assistant — Document AI extraction + FAISS retrieval + Groq LLM.
import json
import os
import tempfile
import textwrap

import faiss
import numpy as np
import requests
import streamlit as st
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1
from sentence_transformers import SentenceTransformer
# ------------------- Secure Credential Loading for Hugging Face ------------------- #
# Materializes the GCP Service Account key (stored as a Space secret) onto disk
# so Application Default Credentials (ADC) can find it via GOOGLE_APPLICATION_CREDENTIALS.
# 1. Load the Service Account JSON string from the environment variable (secret)
gcp_credentials_json_str = os.getenv("GCP_CREDENTIALS_JSON")
project_id = "wise-env-461717-t5"  # Fallback project id; overwritten below from the credentials JSON
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # Bearer token for the Groq chat-completion calls
# 2. Check if the secret is present
if gcp_credentials_json_str:
    try:
        # Write to the /tmp/ directory, which is writable on Hugging Face Spaces
        credentials_file_path = "/tmp/gcp_service_account.json"
        # 3. Write the JSON string to the file in the temporary directory
        with open(credentials_file_path, "w") as f:
            f.write(gcp_credentials_json_str)
        # 4. Point ADC at the freshly written key file
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file_path
        # Extract project_id from the credentials for convenience
        creds_dict = json.loads(gcp_credentials_json_str)
        project_id = creds_dict.get("project_id")
    except Exception as e:
        # Covers malformed JSON and filesystem failures; app cannot proceed without credentials.
        st.error(f"π¨ Failed to process GCP credentials: {e}")
        st.stop()
else:
    st.error("π¨ GCP_CREDENTIALS_JSON secret not found! Please add it to your Hugging Face Space settings.")
    st.stop()
# ------------------- Configuration ------------------- #
# Project ID is dynamically loaded from the service account JSON above.
if not project_id:
    st.error("π¨ Project ID could not be found in the GCP credentials.")
    st.stop()
# You still need to provide your Processor ID and location
processor_id = "86a7eec52bbb9616"  # <-- REPLACE WITH YOUR PROCESSOR ID
location = "us"  # Document AI processor location, e.g. "us" or "eu"
# ------------------- Google Document AI Client (Uses ADC) ------------------- #
# The client automatically picks up the credentials file set via
# GOOGLE_APPLICATION_CREDENTIALS in the section above.
try:
    # The API endpoint must be regional and match the processor's location.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    docai_client = documentai_v1.DocumentProcessorServiceClient(client_options=opts)
    # Fully-qualified resource name: projects/{project}/locations/{location}/processors/{id}
    full_processor_name = docai_client.processor_path(project_id, location, processor_id)
except Exception as e:
    st.error(f"Error initializing Document AI client: {e}")
    st.stop()
@st.cache_resource
def load_embedding_model():
    """Load the sentence-embedding model once per server process.

    FIX: decorated with ``st.cache_resource`` — without it, Streamlit re-runs
    the whole script on every widget interaction and re-instantiated the model
    each time, which is slow and memory-hungry.

    Returns:
        SentenceTransformer: the ``all-MiniLM-L6-v2`` embedding model.
    """
    # Hugging Face Spaces only allows writes under /tmp, so redirect the HF cache there.
    cache_dir = "/tmp/hf_cache"
    os.makedirs(cache_dir, exist_ok=True)
    # Set Hugging Face cache environment variables (belt-and-braces alongside cache_folder).
    os.environ["TRANSFORMERS_CACHE"] = cache_dir
    os.environ["HF_HOME"] = cache_dir
    return SentenceTransformer("all-MiniLM-L6-v2", cache_folder=cache_dir)


embed_model = load_embedding_model()
# ------------------- Utility Functions ------------------- # | |
def chunk_text(text, max_chars=500):
    """Split *text* into word-wrapped chunks of at most *max_chars* characters each."""
    chunks = textwrap.wrap(text, max_chars)
    return chunks
def extract_text_with_documentai(file_path):
    """Send a local PDF through the configured Document AI processor and return its full text."""
    with open(file_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    # Inline (synchronous) processing of the raw PDF bytes.
    raw_doc = documentai_v1.RawDocument(content=pdf_bytes, mime_type="application/pdf")
    req = documentai_v1.ProcessRequest(name=full_processor_name, raw_document=raw_doc)
    outcome = docai_client.process_document(request=req)
    return outcome.document.text
def build_index(text):
    """Chunk *text*, embed each chunk, and build a FAISS L2 index over the embeddings.

    Returns:
        tuple: (faiss.IndexFlatL2, list[str]) — the searchable index and its chunks.
    """
    chunks = chunk_text(text)
    vectors = np.array(embed_model.encode(chunks))
    # Flat (exhaustive) L2 index — exact nearest-neighbour search, no training step.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, chunks
def retrieve_context(query, index, text_chunks, top_k=5):
    """Return the *top_k* chunks nearest to *query* in embedding space (L2 distance)."""
    query_vec = np.array(embed_model.encode([query]))
    _, neighbor_ids = index.search(query_vec, top_k)
    # search() returns one row of indices per query; we issued a single query.
    return [text_chunks[idx] for idx in neighbor_ids[0]]
# ------------------- Groq API Functions ------------------- #
def ask_groq_agent(query, context):
    """Answer *query* with Groq's chat API, grounded in the retrieved *context*.

    Args:
        query: the user's question.
        context: newline-joined retrieved chunks used to ground the answer.

    Returns:
        str: the model's answer text.

    Raises:
        requests.HTTPError: if the Groq API returns a non-2xx status.
        requests.Timeout: if the API does not respond within 60 seconds.
    """
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": "llama3-70b-8192",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.3,
        },
        timeout=60,  # FIX: without a timeout a stalled request hangs the Streamlit session forever
    )
    # FIX: surface API errors explicitly instead of an opaque KeyError on "choices".
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
def get_summary(text):
    """Ask the Groq API for a concise summary of *text*.

    Only the first 4000 characters are sent to keep the prompt within limits.

    Args:
        text: the full document text to summarize.

    Returns:
        str: the generated summary.

    Raises:
        requests.HTTPError: if the Groq API returns a non-2xx status.
        requests.Timeout: if the API does not respond within 60 seconds.
    """
    prompt = f"Please provide a concise summary of the following document:\n\n{text[:4000]}"
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": "llama3-70b-8192",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.3,
        },
        timeout=60,  # FIX: without a timeout a stalled request hangs the Streamlit session forever
    )
    # FIX: surface API errors explicitly instead of an opaque KeyError on "choices".
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
def _parse_flashcards(content):
    """Parse "Q: ..." / "A: ..." line pairs into [{"question", "answer"}, ...] dicts."""
    flashcards = []
    question = None
    for line in content.strip().splitlines():
        line = line.strip()
        if line.lower().startswith("q:"):
            question = line[2:].strip()
        elif line.lower().startswith("a:") and question:
            flashcards.append({"question": question, "answer": line[2:].strip()})
            question = None  # each question is consumed by its first answer
    return flashcards


def generate_flashcards(text_chunks):
    """Generate Q/A flashcards from the document chunks via the Groq API.

    Args:
        text_chunks: list of text chunks; joined and sent as the prompt body.

    Returns:
        list[dict]: flashcards as {"question": str, "answer": str} dicts.

    Raises:
        requests.HTTPError: if the Groq API returns a non-2xx status.
        requests.Timeout: if the API does not respond within 60 seconds.
    """
    joined_text = "\n".join(text_chunks)
    prompt = (
        "Generate 5 helpful flashcards from the following content. "
        "Use the format exactly like this:\n\n"
        "Q: What is ...?\nA: ...\n\nQ: How does ...?\nA: ...\n\n"
        "Text:\n" + joined_text
    )
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
        json={
            "model": "llama3-70b-8192",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.5,
        },
        timeout=60,  # FIX: without a timeout a stalled request hangs the Streamlit session forever
    )
    # FIX: surface API errors explicitly instead of an opaque KeyError on "choices".
    response.raise_for_status()
    content = response.json()["choices"][0]["message"]["content"]
    return _parse_flashcards(content)
st.title("π PDF AI Assistant (Groq + DocAI)")

# Seed per-session state exactly once (Streamlit re-runs this script on each interaction).
if "index" not in st.session_state:
    st.session_state["index"] = None
    st.session_state["text_chunks"] = []
    st.session_state["raw_text"] = ""
with st.sidebar:
    st.header("π€ Upload PDF")
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is not None:
        # FIX: initialize before the try so the finally-clause cannot hit a
        # NameError when NamedTemporaryFile creation itself fails.
        tmp_path = None
        try:
            # Persist the upload to disk — Document AI helper reads from a path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                tmp_file.write(uploaded_file.read())
                tmp_file.flush()
                tmp_path = tmp_file.name
            # DEBUG: File info
            st.write("Saved file at:", tmp_path)
            st.write("File size:", os.path.getsize(tmp_path), "bytes")
            st.write("File exists:", os.path.exists(tmp_path))
            with st.spinner("Extracting text using Document AI..."):
                raw_text = extract_text_with_documentai(tmp_path)
                index, text_chunks = build_index(raw_text)
                st.session_state.index = index
                st.session_state.text_chunks = text_chunks
                st.session_state.raw_text = raw_text
            st.success("β Document processed successfully.")
        except Exception as e:
            st.error(f"Error: {e}")
        finally:
            # Only unlink if the temp file was actually created.
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)
# ------------------- Q&A Interface ------------------- #
st.subheader("β Ask Questions")
if not st.session_state.index:
    st.info("Upload a PDF to start asking questions.")
else:
    question = st.text_input("Enter your question")
    if st.button("Ask"):
        # Ground the LLM answer in the nearest chunks from the FAISS index.
        hits = retrieve_context(question, st.session_state.index, st.session_state.text_chunks)
        context = "\n\n".join(hits)
        answer = ask_groq_agent(question, context)
        st.markdown(f"**Answer:** {answer}")
# ------------------- Summary Interface ------------------- #
st.subheader("π Document Summary")
if not st.session_state.text_chunks:
    st.info("Upload a PDF to get a summary.")
else:
    if st.button("Generate Summary"):
        with st.spinner("Generating summary..."):
            # Rejoin the chunks; get_summary truncates the prompt internally.
            summary = get_summary(" ".join(st.session_state.text_chunks))
            st.markdown(summary)
# ------------------- Flashcards ------------------- #
st.subheader("π§ Flashcards")
if not st.session_state.text_chunks:
    st.info("Upload a PDF to generate flashcards.")
else:
    if st.button("Generate Flashcards"):
        with st.spinner("Generating flashcards..."):
            cards = generate_flashcards(st.session_state.text_chunks)
            for card in cards:
                st.markdown(f"**Q: {card['question']}**\n\nA: {card['answer']}")