rafeeqxindus committed on
Commit 3742e69 · 1 Parent(s): 3a69823
Files changed (2)
  1. Dockerfile +2 -5
  2. streamlit_app.py +10 -46
Dockerfile CHANGED
@@ -1,5 +1,5 @@
 # Use official lightweight Python image
-FROM python:3.12.4-slim
+FROM python:3.10-slim
 
 # Set environment variables to disable usage stats collection (to prevent write errors)
 ENV STREAMLIT_BROWSER_GATHERUSAGESTATS=false
@@ -12,9 +12,6 @@ ENV HOME=/tmp
 # Set working directory
 WORKDIR /app
 
-# Create directory to store index with correct permissions
-RUN mkdir -p /app/index && chmod -R 777 /app/index
-
 # Copy requirements and install
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
@@ -23,4 +20,4 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 
 # Run the app
-CMD ["streamlit", "run", "streamlit_app.py", "--server.port=7860", "--server.enableXsrfProtection=false", "--server.enableCORS=false", "--server.address=0.0.0.0"]
+CMD ["streamlit", "run", "streamlit_app.py", "--server.enableXsrfProtection=false", "--server.port=7860", "--server.address=0.0.0.0"]
streamlit_app.py CHANGED
@@ -10,16 +10,6 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.question_answering import load_qa_chain
 from langchain.prompts import PromptTemplate
 from dotenv import load_dotenv
-import logging
-
-# ========================
-# Logging Setup
-# ========================
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(message)s"
-)
-logger = logging.getLogger(__name__)
 
 # ========================
 # 1️⃣ Configuration
@@ -28,11 +18,9 @@ logger = logging.getLogger(__name__)
 load_dotenv()
 api_key = os.getenv("GOOGLE_API_KEY")
 if not api_key:
-    logger.error("GOOGLE_API_KEY not found. Please add it to your .env file.")
     st.error("GOOGLE_API_KEY not found. Please add it to your .env file.")
     st.stop()
 
-logger.info("GOOGLE_API_KEY loaded successfully.")
 genai.configure(api_key=api_key)
 
 # ========================
@@ -45,19 +33,15 @@ def validate_file_sizes(uploaded_files):
     total_size = 0
     for file in uploaded_files:
         size_mb = file.size / (1024 * 1024)
-        logger.info(f"Checking file: {file.name}, size: {size_mb:.2f} MB")
         if size_mb > MAX_FILE_SIZE_MB:
-            logger.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
             st.warning(f"{file.name} is too large ({size_mb:.2f} MB). Limit is {MAX_FILE_SIZE_MB} MB per file.")
             return False
         total_size += size_mb
 
     if total_size > MAX_TOTAL_SIZE_MB:
-        logger.warning(f"Total size of uploaded files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
         st.warning(f"Total size of uploaded files is {total_size:.2f} MB. Limit is {MAX_TOTAL_SIZE_MB} MB in total.")
         return False
 
-    logger.info("All file sizes are within limits.")
     return True
 
 # ========================
@@ -66,7 +50,6 @@ def validate_file_sizes(uploaded_files):
 def get_pdf_text(pdf_docs):
     text = ""
     for pdf in pdf_docs:
-        logger.info(f"Extracting text from PDF: {getattr(pdf, 'name', 'unknown')}")
         pdf_reader = PdfReader(pdf)
         for page in pdf_reader.pages:
             content = page.extract_text()
@@ -75,12 +58,10 @@ def get_pdf_text(pdf_docs):
     return text
 
 def get_docx_text(docx_file):
-    logger.info(f"Extracting text from DOCX: {getattr(docx_file, 'name', 'unknown')}")
     doc = Document(docx_file)
     return "\n".join([para.text for para in doc.paragraphs])
 
 def get_html_text(html_file):
-    logger.info(f"Extracting text from HTML: {getattr(html_file, 'name', 'unknown')}")
     content = html_file.read()
     soup = BeautifulSoup(content, "html.parser")
     return soup.get_text()
@@ -89,19 +70,13 @@ def get_html_text(html_file):
 # 4️⃣ Text Chunking and Vector Store
 # ========================
 def get_text_chunks(text):
-    logger.info(f"Splitting text into chunks. Text length: {len(text)}")
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
     return text_splitter.split_text(text)
 
 def get_vector_store(text_chunks):
-    logger.info(f"Creating vector store with {len(text_chunks)} chunks.")
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
     vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
-    try:
-        vector_store.save_local("/app/index/faiss_index")
-        logger.info("Vector store saved to /app/index/faiss_index")
-    except Exception as e:
-        logger.error(f"Failed to save vector store: {e}")
+    vector_store.save_local("faiss_index")
 
 # ========================
 # 5️⃣ Conversational Chain Setup
@@ -124,25 +99,15 @@ def get_conversational_chain():
     return chain
 
 def user_input(user_question):
-    logger.info(f"User question: {user_question}")
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-    try:
-        new_db = FAISS.load_local("/app/index/faiss_index", embeddings, allow_dangerous_deserialization=True)
-        docs = new_db.similarity_search(user_question)
-        logger.info(f"Found {len(docs)} similar documents.")
-    except Exception as e:
-        logger.error(f"Error loading vector store or searching: {e}")
-        st.error(f"Error loading vector store or searching: {e}")
-        return
+    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
+    docs = new_db.similarity_search(user_question)
+
+
 
     chain = get_conversational_chain()
-    try:
-        response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
-        st.write("Reply:", response["output_text"])
-        logger.info("Response generated successfully.")
-    except Exception as e:
-        logger.error(f"Error generating response: {e}")
-        st.error(f"Error generating response: {e}")
+    response = chain({"input_documents": docs, "question": user_question}, return_only_outputs=True)
+    st.write("Reply:", response["output_text"])
 
 # ========================
 # 6️⃣ Streamlit App Layout
@@ -160,14 +125,15 @@ def main():
     st.title("Upload & Process Files")
     uploaded_files = st.file_uploader("Upload PDF, DOCX, or HTML files", accept_multiple_files=True, type=['pdf', 'docx', 'html'])
 
+
+
+
     if st.button("Submit & Process"):
        if not uploaded_files:
-            logger.warning("No files uploaded.")
            st.warning("Please upload at least one file.")
            return
 
        if not validate_file_sizes(uploaded_files):
-            logger.warning("File size validation failed.")
            return
 
        with st.spinner("Processing files..."):
@@ -180,13 +146,11 @@ def main():
                elif file.name.endswith(".html"):
                    full_text += get_html_text(file)
                else:
-                    logger.warning(f"Unsupported file type: {file.name}")
                    st.warning(f"Unsupported file type: {file.name}")
 
            text_chunks = get_text_chunks(full_text)
            get_vector_store(text_chunks)
            st.success("Processing complete!")
-            logger.info("Processing complete!")
 
 if __name__ == "__main__":
     main()
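
With this change the FAISS index is written to the relative path "faiss_index", which resolves against the container's working directory (/app per the Dockerfile), so the previously created /app/index directory is no longer referenced. Below is a minimal sketch of the save/load round-trip the app now relies on; the import paths are assumptions based on recent langchain-google-genai and langchain-community packages (adjust to the project's actual imports), and it assumes GOOGLE_API_KEY is set in the environment.

# Sketch only: mirrors how get_vector_store() and user_input() persist and reload the index.
from langchain_google_genai import GoogleGenerativeAIEmbeddings  # assumed import path
from langchain_community.vectorstores import FAISS               # assumed import path

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Save: creates ./faiss_index/ containing index.faiss and index.pkl
store = FAISS.from_texts(["example chunk"], embedding=embeddings)
store.save_local("faiss_index")

# Load: the pickled docstore requires explicitly opting in to deserialization
db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
print(db.similarity_search("example", k=1))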