Spaces:

Xindus
/

chatpdf-rafeeq

Sleeping

App Files Files Community

rafeeqxindus committed on Jun 25

Commit

3742e69

1 Parent(s): 3a69823

final

Browse files

Files changed (2) hide show

Dockerfile +2 -5
streamlit_app.py +10 -46

Dockerfile CHANGED Viewed

@@ -1,5 +1,5 @@
 # Use official lightweight Python image
-FROM python:3.12.4-slim
 # Set environment variables to disable usage stats collection (to prevent write errors)
 ENV STREAMLIT_BROWSER_GATHERUSAGESTATS=false
@@ -12,9 +12,6 @@ ENV HOME=/tmp
 # Set working directory
 WORKDIR /app
-# Create directory to store index with correct permissions
-RUN mkdir -p /app/index && chmod -R 777 /app/index
 # Copy requirements and install
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
@@ -23,4 +20,4 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 # Run the app
-CMD ["streamlit", "run", "streamlit_app.py", "--server.port=7860", "--server.enableXsrfProtection=false", "--server.enableCORS=false", "--server.address=0.0.0.0"]

 # Use official lightweight Python image
+FROM python:3.10-slim
 # Set environment variables to disable usage stats collection (to prevent write errors)
 ENV STREAMLIT_BROWSER_GATHERUSAGESTATS=false
 # Set working directory
 WORKDIR /app
 # Copy requirements and install
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 # Run the app
+CMD ["streamlit", "run", "streamlit_app.py", "--server.enableXsrfProtection=false", "--server.port=7860", "--server.address=0.0.0.0"]

streamlit_app.py CHANGED Viewed

@@ -10,16 +10,6 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.question_answering import load_qa_chain
 from langchain.prompts import PromptTemplate
 from dotenv import load_dotenv
-import logging
-# ========================
-# Logging Setup
-# ========================
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(message)s"
-)
-logger = logging.getLogger(__name__)
 # ========================
 # 1️⃣ Configuration
@@ -28,11 +18,9 @@ logger = logging.getLogger(__name__)
 load_dotenv()
 api_key = os.getenv("GOOGLE_API_KEY")
 if not api_key:
-    logger.error("GOOGLE_API_KEY not found. Please add it to your .env file.")
     st.error("GOOGLE_API_KEY not found. Please add it to your .env file.")
     st.stop()
-logger.info("GOOGLE_API_KEY loaded successfully.")
 genai.configure(api_key=api_key)
 # ========================
@@ -45,19 +33,15 @@ def validate_file_sizes(uploaded_files):
     total_size = 0
     for file in uploaded_files:
         size_mb = file.size / (1024 * 1024)
-        logger.info(f"Checking file: {file.name}, size: {size_mb:.2f} MB")
         if size_mb > MAX_FILE_SIZE_MB:
-            logger.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
             st.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
             return False
         total_size += size_mb
     if total_size > MAX_TOTAL_SIZE_MB:
-        logger.warning(f"Total size of uploaded files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
         st.warning(f"Total size of uploaded files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
         return False
-    logger.info("All file sizes are within limits.")
     return True
 # ========================
@@ -66,7 +50,6 @@ def validate_file_sizes(uploaded_files):
 def get_pdf_text(pdf_docs):
     text = ""
     for pdf in pdf_docs:
-        logger.info(f"Extracting text from PDF: {getattr(pdf, 'name', 'unknown')}")
         pdf_reader = PdfReader(pdf)
         for page in pdf_reader.pages:
             content = page.extract_text()
@@ -75,12 +58,10 @@ def get_pdf_text(pdf_docs):
     return text
 def get_docx_text(docx_file):
-    logger.info(f"Extracting text from DOCX: {getattr(docx_file, 'name', 'unknown')}")
     doc = Document(docx_file)
     return "\n".join([para.text for para in doc.paragraphs])
 def get_html_text(html_file):
-    logger.info(f"Extracting text from HTML: {getattr(html_file, 'name', 'unknown')}")
     content = html_file.read()
     soup = BeautifulSoup(content, "html.parser")
     return soup.get_text()
@@ -89,19 +70,13 @@ def get_html_text(html_file):
 # 4️⃣ Text Chunking and Vector Store
 # ========================
 def get_text_chunks(text):
-    logger.info(f"Splitting text into chunks. Text length: {len(text)}")
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
     return text_splitter.split_text(text)
 def get_vector_store(text_chunks):
-    logger.info(f"Creating vector store with {len(text_chunks)} chunks.")
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
     vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
-    try:
-        vector_store.save_local("/app/index/faiss_index")
-        logger.info("Vector store saved to /app/index/faiss_index")
-    except Exception as e:
-        logger.error(f"Failed to save vector store: {e}")
 # ========================
 # 5️⃣ Conversational Chain Setup
@@ -124,25 +99,15 @@ def get_conversational_chain():
     return chain
 def user_input(user_question):
-    logger.info(f"User question: {user_question}")
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-    try:
-        new_db = FAISS.load_local("/app/index/faiss_index", embeddings, allow_dangerous_deserialization=True)
-        docs = new_db.similarity_search(user_question)
-        logger.info(f"Found {len(docs)} similar documents.")
-    except Exception as e:
-        logger.error(f"Error loading vector store or searching: {e}")
-        st.error(f"Error loading vector store or searching: {e}")
-        return
     chain = get_conversational_chain()
-    try:
-        response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
-        st.write("Reply:", response["output_text"])
-        logger.info("Response generated successfully.")
-    except Exception as e:
-        logger.error(f"Error generating response: {e}")
-        st.error(f"Error generating response: {e}")
 # ========================
 # 6️⃣ Streamlit App Layout
@@ -160,14 +125,15 @@ def main():
         st.title("Upload & Process Files")
         uploaded_files = st.file_uploader("Upload PDF, DOCX, or HTML files", accept_multiple_files=True, type=['pdf', 'docx', 'html'])
         if st.button("Submit & Process"):
             if not uploaded_files:
-                logger.warning("No files uploaded.")
                 st.warning("Please upload at least one file.")
                 return
             if not validate_file_sizes(uploaded_files):
-                logger.warning("File size validation failed.")
                 return
             with st.spinner("Processing files..."):
@@ -180,13 +146,11 @@ def main():
                     elif file.name.endswith(".html"):
                         full_text += get_html_text(file)
                     else:
-                        logger.warning(f"Unsupported file type: {file.name}")
                         st.warning(f"Unsupported file type: {file.name}")
                 text_chunks = get_text_chunks(full_text)
                 get_vector_store(text_chunks)
                 st.success("Processing complete!")
-                logger.info("Processing complete!")
 if __name__ == "__main__":
     main()

 from langchain.chains.question_answering import load_qa_chain
 from langchain.prompts import PromptTemplate
 from dotenv import load_dotenv
 # ========================
 # 1️⃣ Configuration
 load_dotenv()
 api_key = os.getenv("GOOGLE_API_KEY")
 if not api_key:
     st.error("GOOGLE_API_KEY not found. Please add it to your .env file.")
     st.stop()
 genai.configure(api_key=api_key)
 # ========================
     total_size = 0
     for file in uploaded_files:
         size_mb = file.size / (1024 * 1024)
         if size_mb > MAX_FILE_SIZE_MB:
             st.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
             return False
         total_size += size_mb
     if total_size > MAX_TOTAL_SIZE_MB:
         st.warning(f"Total size of uploaded files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
         return False
     return True
 # ========================
 def get_pdf_text(pdf_docs):
     text = ""
     for pdf in pdf_docs:
         pdf_reader = PdfReader(pdf)
         for page in pdf_reader.pages:
             content = page.extract_text()
     return text
 def get_docx_text(docx_file):
     """Return the text of a DOCX file, one paragraph per line.

     Opens ``docx_file`` with ``Document`` and joins the ``text`` of each
     entry in its ``paragraphs`` with newline characters.
     """
     paragraphs = Document(docx_file).paragraphs
     return "\n".join(paragraph.text for paragraph in paragraphs)
 def get_html_text(html_file):
     """Return the text content of an HTML file with the markup stripped.

     Reads ``html_file`` in full, parses it with BeautifulSoup's
     ``html.parser`` backend, and returns the result of ``get_text()``.
     """
     return BeautifulSoup(html_file.read(), "html.parser").get_text()
 # 4️⃣ Text Chunking and Vector Store
 # ========================
 def get_text_chunks(text):
     """Split ``text`` into overlapping chunks.

     Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size
     of 2000 and an overlap of 200, and returns what ``split_text`` yields.
     """
     splitter = RecursiveCharacterTextSplitter(
         chunk_overlap=200,
         chunk_size=2000,
     )
     chunks = splitter.split_text(text)
     return chunks
 def get_vector_store(text_chunks):
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
     vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
+    vector_store.save_local("faiss_index")
 # ========================
 # 5️⃣ Conversational Chain Setup
     return chain
 def user_input(user_question):
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
+    docs = new_db.similarity_search(user_question)
     chain = get_conversational_chain()
+    response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
+    st.write("Reply:", response["output_text"])
 # ========================
 # 6️⃣ Streamlit App Layout
         st.title("Upload & Process Files")
         uploaded_files = st.file_uploader("Upload PDF, DOCX, or HTML files", accept_multiple_files=True, type=['pdf', 'docx', 'html'])
         if st.button("Submit & Process"):
             if not uploaded_files:
                 st.warning("Please upload at least one file.")
                 return
             if not validate_file_sizes(uploaded_files):
                 return
             with st.spinner("Processing files..."):
                     elif file.name.endswith(".html"):
                         full_text += get_html_text(file)
                     else:
                         st.warning(f"Unsupported file type: {file.name}")
                 text_chunks = get_text_chunks(full_text)
                 get_vector_store(text_chunks)
                 st.success("Processing complete!")
 if __name__ == "__main__":
     main()