Anshviradiya committed on
Commit
6a04848
·
verified ·
1 Parent(s): 500eed3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -223
app.py CHANGED
@@ -1,223 +1,190 @@
1
- import os
2
- os.environ["UNSTRUCTURED_DISABLE_INFERENCE"] = "true"
3
-
4
- import streamlit as st
5
- import re
6
- from dotenv import load_dotenv
7
-
8
- from unstructured.partition.auto import partition
9
-
10
- import google.generativeai as genai
11
- from langchain_google_genai import ChatGoogleGenerativeAI
12
- from langchain_ollama import ChatOllama
13
-
14
- from langchain_text_splitters import RecursiveCharacterTextSplitter
15
- from langchain_community.vectorstores import FAISS
16
- from langchain_core.prompts import PromptTemplate
17
- from langchain_community.embeddings import HuggingFaceEmbeddings
18
-
19
- import pytesseract
20
- from pdf2image import convert_from_path
21
-
22
- from dotenv import load_dotenv
23
- import unstructured
24
- from unstructured.partition.auto import partition
25
-
26
- load_dotenv()
27
-
28
-
29
- os.environ["UNSTRUCTURED_DISABLE_INFERENCE"] = "true"
30
-
31
-
32
-
33
-
34
- # ==================== GEMINI CONFIG ====================
35
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
36
-
37
-
38
- if os.name == "nt": # Only on Windows (local)
39
- pytesseract.pytesseract.tesseract_cmd = (
40
- r"C:\Program Files\Tesseract-OCR\tesseract.exe"
41
- )
42
-
43
-
44
- def split_questions(text):
45
- text = text.replace("\n", " ").strip()
46
- questions = re.split(r'(?<=[?.])\s+', text)
47
- return [q.strip() for q in questions if q.strip()]
48
-
49
-
50
-
51
- # ==================== PROMPT ====================
52
- PROMPT = PromptTemplate(
53
- template="""
54
- Answer the question using ONLY the given context.
55
- Respond in the SAME language as the question.
56
- If the answer is not present, say:
57
- "Answer is not available in the context."
58
-
59
- Context:
60
- {context}
61
-
62
- Question:
63
- {question}
64
-
65
- Answer:
66
- """,
67
- input_variables=["context", "question"]
68
- )
69
-
70
-
71
-
72
-
73
- def extract_text_unstructured(uploaded_files):
74
- full_text = ""
75
-
76
- for file in uploaded_files:
77
- with open(file.name, "wb") as f:
78
- f.write(file.getbuffer())
79
-
80
- elements = partition(
81
- filename=file.name,
82
- strategy="fast"
83
- )
84
-
85
- file_text = "\n".join(el.text for el in elements if el.text)
86
- full_text += f"\n\n--- Source: {file.name} ---\n\n{file_text}"
87
-
88
- os.remove(file.name)
89
-
90
- return full_text
91
-
92
-
93
-
94
-
95
- # ==================== CHUNKING ====================
96
- def get_text_chunks(text):
97
- splitter = RecursiveCharacterTextSplitter(
98
- chunk_size=1000,
99
- chunk_overlap=200
100
- )
101
- return splitter.split_text(text)
102
-
103
-
104
- # ==================== EMBEDDINGS ====================
105
- @st.cache_resource
106
- def load_embeddings():
107
- return HuggingFaceEmbeddings(
108
- model_name="paraphrase-multilingual-MiniLM-L12-v2"
109
- )
110
-
111
-
112
- # ==================== VECTOR STORE ====================
113
- def get_vector_store(text_chunks):
114
- embeddings = load_embeddings()
115
- db = FAISS.from_texts(text_chunks, embedding=embeddings)
116
- db.save_local("faiss_index")
117
-
118
-
119
-
120
- def ask_gemini(context, question):
121
- llm = ChatGoogleGenerativeAI(
122
- model="gemini-2.5-flash",
123
- temperature=0.3
124
- )
125
- response = llm.invoke(
126
- PROMPT.format(context=context, question=question)
127
- )
128
- return response.content
129
-
130
def ask_phi3(context, question):
    """Answer `question` from `context` using the local Ollama phi3 model.

    Serves as the offline path when the hosted Gemini API is unavailable.
    """
    local_llm = ChatOllama(
        model="phi3",
        temperature=0.3,
        timeout=120,
    )
    reply = local_llm.invoke(PROMPT.format(context=context, question=question))
    return reply.content
140
-
141
-
142
# ==================== HYBRID LOGIC ====================
def ask_llm_with_fallback(context, question):
    """Try Gemini first; on any failure, answer with local Phi-3 instead."""
    try:
        return ask_gemini(context, question)
    except Exception:
        # Any Gemini error (quota, network, auth) degrades to the local model.
        st.warning(" Gemini failed. Falling back to local Phi-3.")
        return ask_phi3(context, question)
149
-
150
-
151
- def clear_cache():
152
- st.cache_resource.clear()
153
- st.cache_data.clear()
154
-
155
-
156
-
157
- def user_input(user_question):
158
- if not os.path.exists("faiss_index"):
159
- st.warning("Please upload and process PDFs first.")
160
- return
161
-
162
- embeddings = load_embeddings()
163
-
164
- db = FAISS.load_local(
165
- "faiss_index",
166
- embeddings,
167
- allow_dangerous_deserialization=True
168
- )
169
-
170
- questions = split_questions(user_question)
171
-
172
- docs = db.similarity_search(user_question, k=3)
173
-
174
- if not docs:
175
- st.write("Answer is not available in the context.")
176
- return
177
-
178
- context = "\n\n".join(doc.page_content for doc in docs)
179
-
180
- with st.spinner("Thinking..."):
181
- answer = ask_llm_with_fallback(context, user_question)
182
-
183
- st.write("### Reply:")
184
- st.write(answer)
185
-
186
- # ==================== STREAMLIT UI ====================
187
- def main():
188
- st.set_page_config(page_title="Chat PDF")
189
- st.header(" Syllabus RAG System ")
190
-
191
- user_question = st.text_input("Ask a question from the PDF")
192
-
193
- if user_question:
194
- user_input(user_question)
195
-
196
- with st.sidebar:
197
- st.title("Menu")
198
- pdf_docs = st.file_uploader(
199
- "Upload PDF files",
200
- type=["pdf", "txt", "md", "docx", "html"],
201
- accept_multiple_files=True
202
- )
203
-
204
- if st.button("Submit & Process"):
205
- if not pdf_docs:
206
- st.warning("Please upload at least one PDF.")
207
- return
208
-
209
- with st.spinner("Processing Files..."):
210
- raw_text = extract_text_unstructured(pdf_docs)
211
- chunks = get_text_chunks(raw_text)
212
- get_vector_store(chunks)
213
- st.success(" Files processed successfully!")
214
-
215
- if st.button(" Clear Cache"):
216
- clear_cache()
217
- st.success("Cache cleared successfully!")
218
-
219
-
220
- if __name__ == "__main__":
221
- main()
222
-
223
-
 
1
import os

# Set BEFORE importing unstructured so the flag is visible however early the
# library reads it — the previous revision of this file set it first for that
# reason; setting it after the import (as this revision did) may be too late.
os.environ["UNSTRUCTURED_DISABLE_INFERENCE"] = "true"

import re

import streamlit as st
from dotenv import load_dotenv

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings

from unstructured.partition.auto import partition

# ==================== ENV SETUP ====================
load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

if not GOOGLE_API_KEY:
    st.error("❌ GOOGLE_API_KEY not found. Add it in Hugging Face Secrets.")
    st.stop()
25
+
26
+
27
# ==================== QUESTION SPLITTER ====================
def split_questions(text):
    """Break a free-form query into individual questions.

    Newlines are flattened to spaces, then the text is split after each
    '?' or '.' that is followed by whitespace. Empty fragments are dropped.
    """
    flattened = " ".join(text.split("\n")).strip()
    pieces = re.split(r'(?<=[?.])\s+', flattened)
    return [piece.strip() for piece in pieces if piece.strip()]
32
+
33
+
34
# ==================== PROMPT ====================
# Grounded-QA prompt: the model must answer from the supplied context only,
# mirroring the language of the question.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Answer the question using ONLY the given context.
Respond in the SAME language as the question.
If the answer is not present, say:
"Answer is not available in the context."

Context:
{context}

Question:
{question}

Answer:
""",
)
52
+
53
+
54
# ==================== DOCUMENT INGESTION ====================
def extract_text_unstructured(uploaded_files):
    """Extract plain text from Streamlit uploads via unstructured's partition().

    Each upload is spilled to a scratch file on disk (partition() needs a
    filename), parsed with the "fast" strategy, and deleted again — even if
    parsing raises.  Returns one string with a "--- Source: <name> ---"
    banner per file.
    """
    sections = []

    for file in uploaded_files:
        # partition() wants a path, so persist the in-memory upload briefly.
        # basename() keeps a user-supplied name from escaping the working dir.
        local_path = os.path.basename(file.name)
        with open(local_path, "wb") as f:
            f.write(file.getbuffer())

        try:
            elements = partition(
                filename=local_path,
                strategy="fast"
            )
            file_text = "\n".join(el.text for el in elements if el.text)
            sections.append(f"\n\n--- Source: {file.name} ---\n\n{file_text}")
        finally:
            # Previously os.remove ran only on success, leaking the scratch
            # copy whenever partition() raised.
            os.remove(local_path)

    # join() instead of repeated += avoids quadratic string building.
    return "".join(sections)
73
+
74
+
75
# ==================== CHUNKING ====================
def get_text_chunks(text):
    """Split raw text into overlapping chunks ready for embedding.

    1000-character windows with a 200-character overlap preserve sentence
    context across chunk boundaries.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ).split_text(text)
82
+
83
+
84
# ==================== EMBEDDINGS ====================
@st.cache_resource
def load_embeddings():
    """Build the multilingual sentence-embedding model.

    Wrapped in st.cache_resource so the HuggingFace model is loaded once
    per Streamlit process rather than on every rerun.
    """
    model = HuggingFaceEmbeddings(
        model_name="paraphrase-multilingual-MiniLM-L12-v2"
    )
    return model
90
+
91
+
92
# ==================== VECTOR STORE ====================
def get_vector_store(text_chunks):
    """Embed the chunks into a FAISS index and persist it to ./faiss_index.

    Also returns the in-memory store so callers can query it directly
    without reloading from disk; existing callers that ignore the return
    value are unaffected.
    """
    embeddings = load_embeddings()
    db = FAISS.from_texts(text_chunks, embedding=embeddings)
    db.save_local("faiss_index")
    return db
97
+
98
+
99
# ==================== GEMINI ====================
def ask_gemini(context, question):
    """Answer `question` from `context` with Gemini 2.5 Flash.

    A fresh chat model is constructed per call; temperature 0.3 keeps the
    answers mostly deterministic.
    """
    model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.3)
    prompt_text = PROMPT.format(context=context, question=question)
    return model.invoke(prompt_text).content
110
+
111
+
112
# ==================== USER QUERY ====================
def user_input(user_question):
    """Answer the user's query against the persisted FAISS index.

    The raw input is split into individual questions; each is answered
    independently with its own top-3 retrieved chunks.
    """
    if not os.path.exists("faiss_index"):
        st.warning("Please upload and process files first.")
        return

    embeddings = load_embeddings()
    # allow_dangerous_deserialization is needed for FAISS's pickled metadata;
    # acceptable here because the index is produced by this app itself.
    db = FAISS.load_local(
        "faiss_index",
        embeddings,
        allow_dangerous_deserialization=True
    )

    questions = split_questions(user_question)
    if not questions:
        # Whitespace-only input previously rendered nothing at all.
        st.warning("Please enter a question.")
        return

    for idx, question in enumerate(questions, start=1):
        st.markdown(f"### ❓ Question {idx}")
        st.write(question)

        docs = db.similarity_search(question, k=3)

        if not docs:
            st.write("Answer is not available in the context.")
            st.divider()
            continue

        context = "\n\n".join(doc.page_content for doc in docs)

        with st.spinner("Thinking..."):
            answer = ask_gemini(context, question)

        st.markdown("**✅ Reply:**")
        st.write(answer)
        st.divider()
146
+
147
+
148
# ==================== CACHE ====================
def clear_cache():
    """Drop every cached resource and cached data value in this session."""
    for cache in (st.cache_resource, st.cache_data):
        cache.clear()
152
+
153
+
154
# ==================== STREAMLIT UI ====================
def main():
    """Render the Streamlit page: question box plus sidebar upload controls."""
    st.set_page_config(page_title="Chat PDF")
    st.header("📘 Syllabus RAG System")

    question = st.text_input("Ask a question from the uploaded documents")
    if question:
        user_input(question)

    with st.sidebar:
        st.title("Menu")

        uploads = st.file_uploader(
            "Upload files",
            type=["pdf", "txt", "md", "docx", "html"],
            accept_multiple_files=True,
        )

        if st.button("Submit & Process"):
            if not uploads:
                st.warning("Please upload at least one file.")
                return

            with st.spinner("Processing files..."):
                text = extract_text_unstructured(uploads)
                get_vector_store(get_text_chunks(text))
                st.success("✅ Files processed successfully!")

        if st.button("Clear Cache"):
            clear_cache()
            st.success("Cache cleared successfully!")


if __name__ == "__main__":
    main()