# ========================
# 📄 streamlit_app.py
# LangChain + Gemini 1.5 Flash without FAISS
# ========================
import streamlit as st
from PyPDF2 import PdfReader
from docx import Document as DocxDocument  # aliased so it does not clash with LangChain's Document below
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_core.documents import Document

# ========================
# 1️⃣ Configuration and Setup
# ========================
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

if not GOOGLE_API_KEY:
    st.error("Missing GOOGLE_API_KEY in environment variables.")
    st.stop()

# ========================
# 2️⃣ File Size Limits
# ========================
MAX_TOTAL_SIZE_MB = 5
MAX_FILE_SIZE_MB = 2


def validate_file_sizes(uploaded_files):
    """Reject uploads that exceed the per-file or total size limits."""
    total_size = 0
    for file in uploaded_files:
        size_mb = file.size / (1024 * 1024)
        if size_mb > MAX_FILE_SIZE_MB:
            st.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
            return False
        total_size += size_mb
    if total_size > MAX_TOTAL_SIZE_MB:
        st.warning(f"Total size of all files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB total.")
        return False
    return True


# ========================
# 3️⃣ Text Extraction
# ========================
def get_pdf_text(pdf_docs):
    """Concatenate the extractable text from every page of every uploaded PDF."""
    text = ""
    for pdf in pdf_docs:
        reader = PdfReader(pdf)
        for page in reader.pages:
            content = page.extract_text()
            if content:
                text += content
    return text


def get_docx_text(docx_file):
    """Return the paragraph text of a DOCX file, one paragraph per line."""
    doc = DocxDocument(docx_file)
    return "\n".join([para.text for para in doc.paragraphs])


def get_html_text(html_file):
    """Strip markup from an HTML file and return its visible text."""
    content = html_file.read()
    soup = BeautifulSoup(content, "html.parser")
    return soup.get_text()


# ========================
# 4️⃣ LangChain Q&A Chain
# ========================
def get_conversational_chain():
    """Build a 'stuff' QA chain that answers strictly from the provided context."""
    prompt_template = """
    Answer the question as detailed as possible from the provided context.
    If the answer is not available, say "answer is not available in the context."

    Context:
    {context}

    Question:
    {question}

    Answer:
    """
    model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,
        google_api_key=GOOGLE_API_KEY
    )
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain


# ========================
# 5️⃣ Streamlit App
# ========================
def main():
    st.set_page_config(page_title="Gemini Q&A Without FAISS")
    st.header("📄 Chat with Uploaded Documents (FAISS-Free Gemini Q&A)")

    # Upload and extract
    with st.sidebar:
        st.title("Upload Files")
        uploaded_files = st.file_uploader(
            "Upload PDF, DOCX, or HTML files (Max 2MB/file, 5MB total)",
            accept_multiple_files=True,
            type=['pdf', 'docx', 'html']
        )

        full_text = ""
        if st.button("Submit & Extract"):
            if not uploaded_files:
                st.warning("Please upload at least one file.")
                return
            if not validate_file_sizes(uploaded_files):
                return
            with st.spinner("Extracting file content..."):
                for file in uploaded_files:
                    if file.name.endswith(".pdf"):
                        full_text += get_pdf_text([file])
                    elif file.name.endswith(".docx"):
                        full_text += get_docx_text(file)
                    elif file.name.endswith(".html"):
                        full_text += get_html_text(file)
                    else:
                        st.warning(f"Unsupported file type: {file.name}")
                st.session_state["context_text"] = full_text[:3000]  # Limit for Gemini token safety
                st.success("Text extracted. You can now ask questions.")

    # Ask questions
    if "context_text" in st.session_state:
        user_question = st.text_input("Ask a question based on the uploaded document:")
        if user_question:
            with st.spinner("Thinking..."):
                try:
                    chain = get_conversational_chain()

                    # ✅ Wrap the extracted context text in a Document object
                    doc = Document(page_content=st.session_state["context_text"])

                    # ✅ Pass it using the correct input key
                    response = chain(
                        {
                            "input_documents": [doc],
                            "question": user_question
                        },
                        return_only_outputs=True
                    )
                    st.markdown(f"**Gemini says:**\n\n{response['output_text']}")
                except Exception as e:
                    st.error(f"Error from Gemini: {e}")


if __name__ == "__main__":
    main()
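
# ------------------------------------------------------------------
# Usage sketch (not part of the app): one way to install and run it.
# Assumes the script is saved as streamlit_app.py (per the header above)
# and that a .env file next to it defines GOOGLE_API_KEY.
# The package names below are the standard PyPI distributions for the
# imports used in this file; pin versions to suit your environment.
#
#   pip install streamlit PyPDF2 python-docx beautifulsoup4 python-dotenv \
#       langchain langchain-core langchain-google-genai
#   echo "GOOGLE_API_KEY=<your-key>" > .env
#   streamlit run streamlit_app.py
# ------------------------------------------------------------------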