Anshviradiya committed on
Commit
6a04848
·
verified ·
1 Parent(s): 500eed3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -223
app.py CHANGED
@@ -1,223 +1,190 @@
1
- import os
2
- os.environ["UNSTRUCTURED_DISABLE_INFERENCE"] = "true"
3
-
4
- import streamlit as st
5
- import re
6
- from dotenv import load_dotenv
7
-
8
- from unstructured.partition.auto import partition
9
-
10
- import google.generativeai as genai
11
- from langchain_google_genai import ChatGoogleGenerativeAI
12
- from langchain_ollama import ChatOllama
13
-
14
- from langchain_text_splitters import RecursiveCharacterTextSplitter
15
- from langchain_community.vectorstores import FAISS
16
- from langchain_core.prompts import PromptTemplate
17
- from langchain_community.embeddings import HuggingFaceEmbeddings
18
-
19
- import pytesseract
20
- from pdf2image import convert_from_path
21
-
22
- from dotenv import load_dotenv
23
- import unstructured
24
- from unstructured.partition.auto import partition
25
-
26
- load_dotenv()
27
-
28
-
29
- os.environ["UNSTRUCTURED_DISABLE_INFERENCE"] = "true"
30
-
31
-
32
-
33
-
34
- # ==================== GEMINI CONFIG ====================
35
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
36
-
37
-
38
- if os.name == "nt": # Only on Windows (local)
39
- pytesseract.pytesseract.tesseract_cmd = (
40
- r"C:\Program Files\Tesseract-OCR\tesseract.exe"
41
- )
42
-
43
-
44
- def split_questions(text):
45
- text = text.replace("\n", " ").strip()
46
- questions = re.split(r'(?<=[?.])\s+', text)
47
- return [q.strip() for q in questions if q.strip()]
48
-
49
-
50
-
51
- # ==================== PROMPT ====================
52
- PROMPT = PromptTemplate(
53
- template="""
54
- Answer the question using ONLY the given context.
55
- Respond in the SAME language as the question.
56
- If the answer is not present, say:
57
- "Answer is not available in the context."
58
-
59
- Context:
60
- {context}
61
-
62
- Question:
63
- {question}
64
-
65
- Answer:
66
- """,
67
- input_variables=["context", "question"]
68
- )
69
-
70
-
71
-
72
-
73
- def extract_text_unstructured(uploaded_files):
74
- full_text = ""
75
-
76
- for file in uploaded_files:
77
- with open(file.name, "wb") as f:
78
- f.write(file.getbuffer())
79
-
80
- elements = partition(
81
- filename=file.name,
82
- strategy="fast"
83
- )
84
-
85
- file_text = "\n".join(el.text for el in elements if el.text)
86
- full_text += f"\n\n--- Source: {file.name} ---\n\n{file_text}"
87
-
88
- os.remove(file.name)
89
-
90
- return full_text
91
-
92
-
93
-
94
-
95
- # ==================== CHUNKING ====================
96
- def get_text_chunks(text):
97
- splitter = RecursiveCharacterTextSplitter(
98
- chunk_size=1000,
99
- chunk_overlap=200
100
- )
101
- return splitter.split_text(text)
102
-
103
-
104
- # ==================== EMBEDDINGS ====================
105
- @st.cache_resource
106
- def load_embeddings():
107
- return HuggingFaceEmbeddings(
108
- model_name="paraphrase-multilingual-MiniLM-L12-v2"
109
- )
110
-
111
-
112
- # ==================== VECTOR STORE ====================
113
- def get_vector_store(text_chunks):
114
- embeddings = load_embeddings()
115
- db = FAISS.from_texts(text_chunks, embedding=embeddings)
116
- db.save_local("faiss_index")
117
-
118
-
119
-
120
- def ask_gemini(context, question):
121
- llm = ChatGoogleGenerativeAI(
122
- model="gemini-2.5-flash",
123
- temperature=0.3
124
- )
125
- response = llm.invoke(
126
- PROMPT.format(context=context, question=question)
127
- )
128
- return response.content
129
-
130
def ask_phi3(context, question):
    """Answer `question` from `context` using the local Ollama phi3 model.

    Serves as the offline path when the hosted Gemini API is unavailable.
    """
    local_llm = ChatOllama(
        model="phi3",
        temperature=0.3,
        timeout=120,
    )
    reply = local_llm.invoke(PROMPT.format(context=context, question=question))
    return reply.content
140
-
141
-
142
# ==================== HYBRID LOGIC ====================
def ask_llm_with_fallback(context, question):
    """Try Gemini first; on any failure, answer with local Phi-3 instead."""
    try:
        return ask_gemini(context, question)
    except Exception:
        # Any Gemini error (quota, network, auth) degrades to the local model.
        st.warning(" Gemini failed. Falling back to local Phi-3.")
        return ask_phi3(context, question)
149
-
150
-
151
- def clear_cache():
152
- st.cache_resource.clear()
153
- st.cache_data.clear()
154
-
155
-
156
-
157
- def user_input(user_question):
158
- if not os.path.exists("faiss_index"):
159
- st.warning("Please upload and process PDFs first.")
160
- return
161
-
162
- embeddings = load_embeddings()
163
-
164
- db = FAISS.load_local(
165
- "faiss_index",
166
- embeddings,
167
- allow_dangerous_deserialization=True
168
- )
169
-
170
- questions = split_questions(user_question)
171
-
172
- docs = db.similarity_search(user_question, k=3)
173
-
174
- if not docs:
175
- st.write("Answer is not available in the context.")
176
- return
177
-
178
- context = "\n\n".join(doc.page_content for doc in docs)
179
-
180
- with st.spinner("Thinking..."):
181
- answer = ask_llm_with_fallback(context, user_question)
182
-
183
- st.write("### Reply:")
184
- st.write(answer)
185
-
186
- # ==================== STREAMLIT UI ====================
187
- def main():
188
- st.set_page_config(page_title="Chat PDF")
189
- st.header(" Syllabus RAG System ")
190
-
191
- user_question = st.text_input("Ask a question from the PDF")
192
-
193
- if user_question:
194
- user_input(user_question)
195
-
196
- with st.sidebar:
197
- st.title("Menu")
198
- pdf_docs = st.file_uploader(
199
- "Upload PDF files",
200
- type=["pdf", "txt", "md", "docx", "html"],
201
- accept_multiple_files=True
202
- )
203
-
204
- if st.button("Submit & Process"):
205
- if not pdf_docs:
206
- st.warning("Please upload at least one PDF.")
207
- return
208
-
209
- with st.spinner("Processing Files..."):
210
- raw_text = extract_text_unstructured(pdf_docs)
211
- chunks = get_text_chunks(raw_text)
212
- get_vector_store(chunks)
213
- st.success(" Files processed successfully!")
214
-
215
- if st.button(" Clear Cache"):
216
- clear_cache()
217
- st.success("Cache cleared successfully!")
218
-
219
-
220
- if __name__ == "__main__":
221
- main()
222
-
223
-
 
1
import os

# Set BEFORE importing unstructured so the flag is visible however early the
# library reads it — the previous revision of this file set it first for that
# reason; setting it after the import (as this revision did) may be too late.
os.environ["UNSTRUCTURED_DISABLE_INFERENCE"] = "true"

import re

import streamlit as st
from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings

from unstructured.partition.auto import partition

# ==================== ENV SETUP ====================
load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

if not GOOGLE_API_KEY:
    st.error("❌ GOOGLE_API_KEY not found. Add it in Hugging Face Secrets.")
    st.stop()
25
+
26
+
27
# ==================== QUESTION SPLITTER ====================
def split_questions(text):
    """Break a free-form query into individual questions.

    Newlines are flattened to spaces, then the text is split after each
    '?' or '.' that is followed by whitespace. Empty fragments are dropped.
    """
    flattened = " ".join(text.split("\n")).strip()
    pieces = re.split(r'(?<=[?.])\s+', flattened)
    return [piece.strip() for piece in pieces if piece.strip()]
32
+
33
+
34
# ==================== PROMPT ====================
# Grounded-QA prompt: the model must answer from the supplied context only,
# mirroring the language of the question.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Answer the question using ONLY the given context.
Respond in the SAME language as the question.
If the answer is not present, say:
"Answer is not available in the context."

Context:
{context}

Question:
{question}

Answer:
""",
)
52
+
53
+
54
# ==================== DOCUMENT INGESTION ====================
def extract_text_unstructured(uploaded_files):
    """Extract plain text from Streamlit uploads via unstructured's partition().

    Each upload is spilled to a scratch file on disk (partition() needs a
    filename), parsed with the "fast" strategy, and deleted again — even if
    parsing raises.  Returns one string with a "--- Source: <name> ---"
    banner per file.
    """
    sections = []

    for file in uploaded_files:
        # partition() wants a path, so persist the in-memory upload briefly.
        # basename() keeps a user-supplied name from escaping the working dir.
        local_path = os.path.basename(file.name)
        with open(local_path, "wb") as f:
            f.write(file.getbuffer())

        try:
            elements = partition(
                filename=local_path,
                strategy="fast"
            )
            file_text = "\n".join(el.text for el in elements if el.text)
            sections.append(f"\n\n--- Source: {file.name} ---\n\n{file_text}")
        finally:
            # Previously os.remove ran only on success, leaking the scratch
            # copy whenever partition() raised.
            os.remove(local_path)

    # join() instead of repeated += avoids quadratic string building.
    return "".join(sections)
73
+
74
+
75
# ==================== CHUNKING ====================
def get_text_chunks(text):
    """Split raw text into overlapping chunks ready for embedding.

    1000-character windows with a 200-character overlap preserve sentence
    context across chunk boundaries.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ).split_text(text)
82
+
83
+
84
# ==================== EMBEDDINGS ====================
@st.cache_resource
def load_embeddings():
    """Build the multilingual sentence-embedding model.

    Wrapped in st.cache_resource so the HuggingFace model is loaded once
    per Streamlit process rather than on every rerun.
    """
    model = HuggingFaceEmbeddings(
        model_name="paraphrase-multilingual-MiniLM-L12-v2"
    )
    return model
90
+
91
+
92
# ==================== VECTOR STORE ====================
def get_vector_store(text_chunks):
    """Embed the chunks into a FAISS index and persist it to ./faiss_index.

    Also returns the in-memory store so callers can query it directly
    without reloading from disk; existing callers that ignore the return
    value are unaffected.
    """
    embeddings = load_embeddings()
    db = FAISS.from_texts(text_chunks, embedding=embeddings)
    db.save_local("faiss_index")
    return db
97
+
98
+
99
# ==================== GEMINI ====================
def ask_gemini(context, question):
    """Answer `question` from `context` with Gemini 2.5 Flash.

    A fresh chat model is constructed per call; temperature 0.3 keeps the
    answers mostly deterministic.
    """
    model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.3)
    prompt_text = PROMPT.format(context=context, question=question)
    return model.invoke(prompt_text).content
110
+
111
+
112
# ==================== USER QUERY ====================
def user_input(user_question):
    """Answer the user's query against the persisted FAISS index.

    The raw input is split into individual questions; each is answered
    independently with its own top-3 retrieved chunks.
    """
    if not os.path.exists("faiss_index"):
        st.warning("Please upload and process files first.")
        return

    embeddings = load_embeddings()
    # allow_dangerous_deserialization is needed for FAISS's pickled metadata;
    # acceptable here because the index is produced by this app itself.
    db = FAISS.load_local(
        "faiss_index",
        embeddings,
        allow_dangerous_deserialization=True
    )

    questions = split_questions(user_question)
    if not questions:
        # Whitespace-only input previously rendered nothing at all.
        st.warning("Please enter a question.")
        return

    for idx, question in enumerate(questions, start=1):
        st.markdown(f"### ❓ Question {idx}")
        st.write(question)

        docs = db.similarity_search(question, k=3)

        if not docs:
            st.write("Answer is not available in the context.")
            st.divider()
            continue

        context = "\n\n".join(doc.page_content for doc in docs)

        with st.spinner("Thinking..."):
            answer = ask_gemini(context, question)

        st.markdown("**✅ Reply:**")
        st.write(answer)
        st.divider()
146
+
147
+
148
# ==================== CACHE ====================
def clear_cache():
    """Drop every cached resource and cached data value in this session."""
    for cache in (st.cache_resource, st.cache_data):
        cache.clear()
152
+
153
+
154
# ==================== STREAMLIT UI ====================
def main():
    """Render the Streamlit page: question box plus sidebar upload controls."""
    st.set_page_config(page_title="Chat PDF")
    st.header("📘 Syllabus RAG System")

    question = st.text_input("Ask a question from the uploaded documents")
    if question:
        user_input(question)

    with st.sidebar:
        st.title("Menu")

        uploads = st.file_uploader(
            "Upload files",
            type=["pdf", "txt", "md", "docx", "html"],
            accept_multiple_files=True,
        )

        if st.button("Submit & Process"):
            if not uploads:
                st.warning("Please upload at least one file.")
                return

            with st.spinner("Processing files..."):
                text = extract_text_unstructured(uploads)
                get_vector_store(get_text_chunks(text))
                st.success("✅ Files processed successfully!")

        if st.button("Clear Cache"):
            clear_cache()
            st.success("Cache cleared successfully!")


if __name__ == "__main__":
    main()