naveen07garg committed on
Commit 8b58fd4 · verified
1 Parent(s): 147a11f

Update app.py

Files changed (1)
  1. app.py +418 -211
app.py CHANGED
@@ -1,157 +1,37 @@
1
- import os
2
- import gradio as gr
3
- from huggingface_hub import snapshot_download
4
- from langchain.embeddings import SentenceTransformerEmbeddings
5
- from langchain_chroma import Chroma
6
- #from langchain_community.vectorstores import Chroma
7
- from transformers import pipeline
8
- from langchain_community.llms import HuggingFacePipeline
9
- from langchain.chains import LLMChain
10
- from langchain.prompts import PromptTemplate
11
- from transformers import AutoModelForCausalLM, AutoTokenizer
12
- from langchain_openai import ChatOpenAI
13
- from langchain_openai import OpenAIEmbeddings
14
- #from langchain_community.vectorstores import Chroma
15
-
16
- import spacy
17
  import json
18
- import os
19
- from dotenv import load_dotenv
20
-
21
- #--===============================
22
- # Set ENV Variable
23
- #--===============================
24
- load_dotenv()
25
- os.environ["OPENAI_API_KEY"] = "sk-***REDACTED***"
26
-
27
- #"sk-***REDACTED***"
28
-
29
- #--===============================
30
- # Define Prompt
31
- #--===============================
32
-
33
-
34
- # Define the system message for Flykite Airlines HR Policy Assistant
35
- # --- Refined System Prompt ---
36
- QNA_SYSTEM_MESSAGE = """
37
- You are the Flykite Airlines HR Policy Assistant.
38
-
39
- Your role is to answer employee questions based on official HR documents (handbooks, policy PDFs, etc.).
40
- Each user question will start with the token: ###Question.
41
-
42
- ### Response Rules
43
- - Be clear, factual, and professional.
44
- - Use bullet points (-) or numbered lists (1., 2., etc.) for clarity.
45
- - Begin with a **one-line summary**, then details.
46
- - Cite the specific policy references (Document → Section → Subsection → Sub-subsection) where
47
- the answer comes from.
48
- - If the answer is not in the source, reply with one generic line and append exactly: \n\n **"Could not find anything out from Flyline HR documentation around your query.\n\nPlease rephrase your query."**
49
- - Do **not** make assumptions or fabricate information.
50
-
51
- ### Ambiguity & Context
52
- - If a query could refer to multiple policies or depends on role/location/department, ask **one short clarifying question**.
53
- - If you assume a context, state it clearly (e.g., "Assuming HQ staff...").
54
- - When policies differ by role/location, list variations clearly.
55
-
56
- ### Personalization
57
- - Tailor responses to any role, location, or employment type provided.
58
- - Mention if rules vary and what those differences are.
59
-
60
- ### Format
61
- 1. One-line summary.
62
- 2. Key details, steps, or rules.
63
- 3. Specific policy references (Document → Section → Subsection → Sub-subsection) where
64
  the answer comes from.
65
- 4. Optional follow-up suggestion or clarifying question.
66
-
67
- ### Important
68
- - Never guess or invent policy content.
69
- - Maintain confidentiality and avoid personal data.
70
- - User questions always begin with `###Question`. Respond only to those.
71
  """
72
 
73
-
74
-
75
- # =========================================================
76
- # Step 1: Download Vectorstore from Hugging Face Dataset
77
- # =========================================================
78
- VECTOR_DIR = "naveen07garg/AirlineChatBot/vectorstore/" #=== application space location
79
- DATASET_REPO = "naveen07garg/AirlineChatBot-vectorstore" #== data store space flykite_handbook_chromadb
80
- #OPENAI_API_KEY="sk-***REDACTED***"
81
-
82
- if not os.path.exists(VECTOR_DIR):
83
- print("⬇️ Downloading vectorstore from Hugging Face dataset...")
84
- snapshot_download(
85
- repo_id=DATASET_REPO,
86
- repo_type="dataset",
87
- local_dir=VECTOR_DIR,
88
- ignore_patterns=[".gitattributes"],
89
- )
90
- print("✅ Vectorstore downloaded successfully!")
91
- for root, dirs, files in os.walk(VECTOR_DIR):
92
- for f in files:
93
- print(" ", os.path.join(root, f))
94
- else:
95
- print("📦 Vectorstore already present, skipping download.")
96
-
97
- # =============================
98
- # Step 2: Load Chroma Vectorstore
99
- # =============================
100
- #embedding_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
101
- embedding_fn = OpenAIEmbeddings(model="text-embedding-3-small")
102
- vectordb = Chroma(persist_directory=VECTOR_DIR, embedding_function=embedding_fn)
103
- print("Collections:", vectordb._client.list_collections())
104
-
105
- # Get underlying collection
106
- collection = vectordb._collection
107
- print("Chroma vectorstore collection :\n")
108
- print(collection)
109
-
110
- #res = vectordb._collection.get(ids=["chunk_6"], include=["metadatas", "documents"])
111
- #print("lets check with chunk_6 \n")
112
- #print(res["metadatas"][0])
113
-
114
- retriever = vectordb.as_retriever(
115
- search_type='similarity',
116
- search_kwargs={'k': 3}
117
- )
118
-
119
- query = "What is the leave policy?"
120
- results = retriever.get_relevant_documents(query)
121
-
122
- print("Chroma vectorstore loaded successfully! \n\nWith test results - ")
123
- print(results)
124
-
125
- for i, doc in enumerate(results):
126
- print(f"\nResult {i+1}")
127
- print("Document:", doc.page_content)
128
- print("Metadata:", doc.metadata)
129
-
130
-
131
- # =============================
132
- # Step 3: Load LLM
133
- # =============================
134
- #qa_model = pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct") #model="mistralai/Mistral-7B-Instruct-v0.2")
135
- #model_id = "meta-llama/Meta-Llama-3-70B"
136
- #tokenizer = AutoTokenizer.from_pretrained(model_id)
137
- #model = AutoModelForCausalLM.from_pretrained(model_id)
138
-
139
-
140
-
141
- # Low creativity (deterministic) LLM
142
- llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=1500, openai_api_key="sk-***REDACTED***")
143
-
144
- #qa_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
145
- #llm = HuggingFacePipeline(pipeline=qa_model)
146
-
147
- # =============================
148
- # Step 4: RAG Response Function
149
- # =============================
150
-
151
- # Load spaCy NER model
152
- nlp = spacy.load("en_core_web_sm")
153
-
154
- # --- User Prompt Template ---
155
  hr_user_message_template = """
156
  Consider the following ###Context and ###Question:
157
 
@@ -162,6 +42,159 @@ Consider the following ###Context and ###Question:
162
  {question}
163
  """
164
 
165
  # --- spaCy Extraction ---
166
  def extract_with_spacy(text):
167
  doc = nlp(text)
@@ -225,8 +258,13 @@ def extract_with_llm(text):
225
  """
226
 
227
  try:
228
- response = llm.invoke(prompt)
229
- content = response.content.strip()
230
 
231
  # Enforce safe parsing
232
  if content.startswith("{"):
@@ -235,21 +273,122 @@ def extract_with_llm(text):
235
  extracted = {"roles": [], "locations": [], "departments": []}
236
 
237
  except Exception:
 
238
  extracted = {"roles": [], "locations": [], "departments": []}
239
 
240
  return extracted
241
 
242
 
243
  # -----------------------
244
  # User Query Enrichment
245
  # -----------------------
246
  def extract_metadata_from_query(query: str):
247
  """Use spaCy + LLM to extract role/location/department from user query."""
248
  spacy_res = extract_with_spacy(query)
249
- print("spaCy results ## ==>%s", spacy_res)
250
  llm_res = extract_with_llm(query)
251
- print("LLM Extraction Results ## ==>%s", llm_res)
252
-
253
 
254
  return {
255
  "roles": list(set(spacy_res["roles"] + llm_res["roles"])),
@@ -257,9 +396,10 @@ def extract_metadata_from_query(query: str):
257
  "departments": list(set(spacy_res["departments"] + llm_res["departments"]))
258
  }
259
 
260
- # -------------------------------
 
261
  # Helper: Filter docs manually
262
- # -------------------------------
263
  def filter_docs_by_metadata(docs, metadata_filters):
264
  filtered = []
265
  for d in docs:
@@ -277,22 +417,10 @@ def filter_docs_by_metadata(docs, metadata_filters):
277
 
278
 
279
 
280
- def generate_rag_based_response(user_input, retriever, k=3, max_tokens=800, temperature=0, top_p=0.95):
281
- """
282
- Args:
283
- user_input: User query string
284
- retriever: LangChain retriever (from Chroma)
285
- k: number of top documents to retrieve
286
- Returns:
287
- The generated response based on user query + context with citations
288
- """
289
 
290
- # Step 1: Retrieve relevant chunks
291
- # relevant_docs = retriever.get_relevant_documents(user_input)
292
- # selected_docs = relevant_docs[:k]
293
  # relevant_docs = retriever.get_relevant_documents(user_input)[:k]
294
 
295
-
296
  # When user asks a query, we enrich it by extracting role, location, department using the same spaCy + LLM pipeline.
297
  # Pass those extracted values as filters to the retriever → only chunks with matching metadata are considered.
298
  # If nothing matches, fallback to plain semantic search (so we don’t block valid answers).
@@ -301,74 +429,153 @@ def generate_rag_based_response(user_input, retriever, k=3, max_tokens=800, temp
301
  query_metadata = extract_metadata_from_query(user_input)
302
 
303
  print("\n======================")
304
- print("User Query: %s", user_input)
305
- print("Extracted metadata from query: %s", query_metadata) # Investigatory log
 
306
 
307
  # 2. Retrieve top-k docs semantically
308
  retrieved_docs = retriever.get_relevant_documents(user_input, k=k)
309
- print("Retrieved %d docs before filtering", len(retrieved_docs))
310
 
311
  # 3. Apply metadata filtering
312
  filtered_docs = filter_docs_by_metadata(retrieved_docs, query_metadata)
313
  if filtered_docs:
314
  selected_docs = filtered_docs
315
- print("✅ %d docs kept after metadata filtering", len(selected_docs))
316
  else:
317
  selected_docs = retrieved_docs # fallback if no metadata match
318
  print("⚠️ No metadata match, falling back to semantic retrieval only")
319
 
320
 
321
-
322
- # Step 4: Log retrieved docs metadata
323
- print("✅ Retrieved %d docs", len(selected_docs))
324
  for i, d in enumerate(selected_docs, 1):
325
- print("\n--- Chunk %d ---", i)
326
- print("Text: %s...", d.page_content[:200]) # preview first 200 chars
327
- print("Metadata: %s", d.metadata)
328
 
329
 
 
 
330
 
331
- # Step 4: Build context with citations
332
- context_parts = []
333
- for d in selected_docs:
334
- meta = d.metadata
335
- citation = f"{meta.get('document')} → {meta.get('section')}"
336
- if meta.get("subsection"):
337
- citation += f" / {meta.get('subsection')}"
338
- if meta.get("subsubsection"):
339
- citation += f" / {meta.get('subsubsection')}"
340
- context_parts.append(f"Source: {citation}\n{d.page_content}")
341
 
342
- context_for_query = "\n\n---\n\n".join(context_parts)
343
 
344
- # Step 5: Construct prompt
345
- user_prompt = hr_user_message_template.format(
346
- context=context_for_query,
347
- question=user_input
348
  )
349
 
350
- messages = [
351
- {"role": "system", "content": QNA_SYSTEM_MESSAGE},
352
- {"role": "user", "content": user_prompt},
353
- ]
354
 
355
- # Step 6: Query the LLM
356
- llm = ChatOpenAI(model="gpt-4o-mini", temperature=temperature, max_tokens=max_tokens)
357
 
358
- try:
359
- response = llm.invoke(messages)
360
- prediction = response.content
361
- except Exception as e:
362
- prediction = f" Error: {e}"
 
363
 
364
- return prediction
365
 
366
  # =============================
367
  # Step 5: Chat Function
368
  # =============================
369
- def chat_fn(message, history):
370
- answer = generate_rag_based_response(message, retriever)
371
- return f"{answer}\n\n🧠 (Context retrieved from {DATASET_REPO})"
372
 
373
 
374
  # =============================
@@ -463,7 +670,7 @@ css = """
463
  # return f"BubbleBot says: {message}"
464
 
465
  gr.ChatInterface(
466
- fn=chat_fn,
467
  title="Flyline Chatbot ✈ ️",
468
  description="Ask Flyline HR",
469
  theme="soft",
 
1
+ # %%writefile backendFiles/app.py  # leftover notebook cell magic; must stay commented out for app.py to run as a script
2
  import json
3
+ import re
4
+ import os, requests
5
+ import fitz  # PyMuPDF: used to capture the document hierarchy (section → subsection → sub-subsection → content/bullets)
6
+ from collections import Counter
7
+ from fastapi import FastAPI
8
+ from pydantic import BaseModel
9
+ from typing import Optional
10
+
11
+ from langchain_community.vectorstores import Chroma
12
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
13
+
14
+ # --------------------------
15
+ # HR Assistant Prompt Templates
16
+ # --------------------------
17
+
18
+ hr_system_message = """
19
+ You are "Flykite HR Assistant", a helpful and professional AI bot for an airline company.
20
+ You specialize in answering employee questions about HR policies, benefits, and compliance.
21
+
22
+ Rules:
23
+ - Use only the information provided in the ###Context.
24
+ - If the user's role, location, or department is mentioned in the query or appears in the context,
25
+ personalize the answer accordingly. Acknowledge differences (e.g., policies for Field Staff vs Headquarters,
26
+ or India vs UK).
27
+ - Always cite the specific policy references (Document → Section → Subsection → Sub-subsection) where
28
  the answer comes from.
29
+ - If the answer cannot be derived from the context, respond only with: "I don't know".
30
+ - Keep your tone clear, supportive, and professional — like an HR representative for airline staff.
31
+ - If multiple relevant rules exist, summarize them and cite all applicable sources.
32
+ - Never invent or assume policies beyond what is provided.
 
 
33
  """
34
 
35
  hr_user_message_template = """
36
  Consider the following ###Context and ###Question:
37
 
 
42
  {question}
43
  """
44
 
45
+ # --------------------------
46
+ # PDF Parsing Utils
47
+ # --------------------------
48
+
49
+ def clean_text_hidden(s: str) -> str:
50
+ if not s:
51
+ return ""
52
+ s = re.sub(r"[\u200B-\u200F\u202A-\u202E\u00A0\u00AD]", " ", s)
53
+ s = re.sub(r"\s+", " ", s)
54
+ return s.strip()
55
+
56
+ def is_line_fully_bold(spans):
57
+ return all(
58
+ ("Bold" in s["font"] or s["flags"] & 16 != 0)  # bit 16 is the bold flag in PyMuPDF span flags (bit 2 is italic)
59
+ for s in spans if s.get("text", "").strip()
60
+ )
61
+
62
+ def detect_font_levels(pdf_path):
63
+ doc = fitz.open(pdf_path)
64
+ font_sizes = []
65
+ for page in doc:
66
+ blocks = page.get_text("dict")["blocks"]
67
+ for b in blocks:
68
+ for l in b.get("lines", []):
69
+ for s in l.get("spans", []):
70
+ font_sizes.append(round(s["size"], 1))
71
+ unique_sizes = sorted(set(font_sizes), reverse=True)
72
+ if len(unique_sizes) > 3:
73
+ candidate_sizes = unique_sizes[1:-1]
74
+ else:
75
+ candidate_sizes = unique_sizes
76
+ section_size = candidate_sizes[0] if candidate_sizes else unique_sizes[0]
77
+ subsubsection_size = candidate_sizes[1] if len(candidate_sizes) > 1 else section_size
78
+ return section_size, subsubsection_size
79
+
80
+ def most_common_size(sizes):
81
+ return Counter(sizes).most_common(1)[0][0] if sizes else None
82
+
83
+ def parse_flykite(pdf_path):
84
+ section_size, subsubsection_size = detect_font_levels(pdf_path)
85
+ doc = fitz.open(pdf_path)
86
+ sections = []
87
+ current_section, current_subsection, current_subsubsection = None, None, None
88
+
89
+ for page_num, page in enumerate(doc, start=1):
90
+ blocks = page.get_text("dict")["blocks"]
91
+ for b in blocks:
92
+ for l in b.get("lines", []):
93
+ spans = l.get("spans", [])
94
+ line_text = "".join(s.get("text", "") for s in spans).strip()
95
+ line_text = clean_text_hidden(line_text)
96
+ if not line_text:
97
+ continue
98
+ span_sizes = [round(s["size"], 1) for s in spans]
99
+ line_size = most_common_size(span_sizes)
100
+
101
+ # SECTION/SUBSECTION
102
+ if line_size == section_size:
103
+ if is_line_fully_bold(spans) and "policy" in line_text.lower():
104
+ current_subsection = {"subsection": line_text, "subsubsections": [], "content": []}
105
+ if current_section:
106
+ current_section["subsections"].append(current_subsection)
107
+ else:
108
+ current_section = {"section": line_text, "subsections": []}
109
+ sections.append(current_section)
110
+ current_subsection = None
111
+ current_subsubsection = None
112
+ continue
113
+
114
+ # SUB-SUBSECTION
115
+ if re.match(r"^\d+\s*\.\s+", line_text):
116
+ if line_size == subsubsection_size:
117
+ is_heading = False
118
+ if is_line_fully_bold(spans):
119
+ is_heading = True
120
+ else:
121
+ if len(spans) > 1:
122
+ first_span_text = clean_text_hidden(spans[0]["text"]).strip()
123
+ if re.match(r"^\d+\.?$", first_span_text):
124
+ rest_bold = all(
125
+ ("Bold" in s["font"] or s["flags"] & 16 != 0)  # bit 16 = bold in PyMuPDF span flags
126
+ for s in spans[1:] if s.get("text", "").strip()
127
+ )
128
+ if rest_bold:
129
+ is_heading = True
130
+ if is_heading:
131
+ current_subsubsection = {"title": line_text, "content": []}
132
+ if current_subsection:
133
+ current_subsection["subsubsections"].append(current_subsubsection)
134
+ elif current_section:
135
+ auto_sub = {"subsection": current_section["section"], "subsubsections": []}
136
+ current_section["subsections"].append(auto_sub)
137
+ current_subsection = auto_sub
138
+ current_subsection["subsubsections"].append(current_subsubsection)
139
+ continue
140
+ # otherwise treat as content
141
+ if current_subsubsection:
142
+ current_subsubsection["content"].append(line_text)
143
+ elif current_subsection:
144
+ current_subsection["content"].append(line_text)
145
+ elif current_section:
146
+ current_section.setdefault("content", []).append(line_text)
147
+ else:
148
+ if not sections:
149
+ sections.append({"intro": [line_text]})
150
+ else:
151
+ sections[0].setdefault("intro", []).append(line_text)
152
+ return sections
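+
+ # Illustrative (hypothetical) shape of one parsed section returned above — names and content are placeholders:
+ # {"section": "Leave Policy",
+ #  "subsections": [{"subsection": "Annual Leave Policy",
+ #                   "subsubsections": [{"title": "1. Eligibility", "content": ["..."]}],
+ #                   "content": []}]}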
153
+
154
+
155
+
156
+ # Minimal OpenAI chat client using direct REST calls (avoids the LangChain-OpenAI wrapper).
157
+ class SimpleChat:
158
+ def __init__(self, model="gpt-4o-mini"):
159
+ self.model = model
160
+ self.api_key = os.getenv("OPENAI_API_KEY")
161
+ self.base_url = "https://api.openai.com/v1/chat/completions"
162
+
163
+ def invoke(self, messages, temperature=0, max_tokens=1500):
164
+ resp = requests.post(
165
+ self.base_url,
166
+ headers={"Authorization": f"Bearer {self.api_key}"},
167
+ json={
168
+ "model": self.model,
169
+ "messages": messages,
170
+ "temperature": temperature,
171
+ "max_tokens": max_tokens
172
+ }
173
+ )
174
+ resp.raise_for_status()
175
+ return resp.json()["choices"][0]["message"]["content"].strip()
176
+
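+ # Minimal usage sketch for SimpleChat (illustrative; assumes OPENAI_API_KEY is set in the environment):
+ # chat = SimpleChat(model="gpt-4o-mini")
+ # reply = chat.invoke(
+ #     [{"role": "system", "content": "You are a helpful assistant."},
+ #      {"role": "user", "content": "Say hello."}],
+ #     temperature=0, max_tokens=50,
+ # )
+ # print(reply)
+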
177
+
178
+
179
+ # --------------------------
180
+ # Chunking + RAG
181
+ # --------------------------
182
+
183
+
184
+ # Each chunk embeds the parent section_title and subsection_title alongside the sub-subsection title,
185
+ # so that every chunk carries its parent-level context when it is embedded and can still be retrieved
186
+ # when an end user phrases a query at the section or subsection level.
187
+
188
+ # This also increases trust and compliance: each response can cite its sources
189
+ # (document name, section, subsection, and sub-subsection).
190
+
191
+ # --- Flatten JSON to chunks ---
192
+ import spacy
193
+ import json
194
+
195
+ # Load spaCy NER model
196
+ nlp = spacy.load("en_core_web_sm")
197
+
198
  # --- spaCy Extraction ---
199
  def extract_with_spacy(text):
200
  doc = nlp(text)
 
258
  """
259
 
260
  try:
261
+ # Call the OpenAI REST API directly via SimpleChat (avoids the LangChain-OpenAI client).
262
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
263
+ llm = SimpleChat(model="gpt-4o-mini")
264
+ messages = [
265
+ {"role": "user", "content": prompt}
266
+ ]
267
+ content = llm.invoke(messages, temperature=0, max_tokens=1500)
268
 
269
  # Enforce safe parsing
270
  if content.startswith("{"):
 
273
  extracted = {"roles": [], "locations": [], "departments": []}
274
 
275
  except Exception:
276
+ print("⚠️ LLM metadata-extraction call failed; falling back to empty metadata")
277
  extracted = {"roles": [], "locations": [], "departments": []}
278
 
279
  return extracted
280
 
281
 
282
+ # --- Merge spaCy + LLM ---
283
+ def enrich_metadata(text):
284
+ spacy_res = extract_with_spacy(text)
285
+ llm_res = extract_with_llm(text)
286
+ return {
287
+ "roles": list(set(spacy_res["roles"] + llm_res["roles"])),
288
+ "locations": list(set(spacy_res["locations"] + llm_res["locations"])),
289
+ "departments": list(set(spacy_res["departments"] + llm_res["departments"]))
290
+ }
291
+
292
+ # --- Ensure metadata is Chroma-compatible ---
293
+ def sanitize_metadata(meta: dict) -> dict:
294
+ safe_meta = {}
295
+ for k, v in meta.items():
296
+ if isinstance(v, (str, int, float, bool)) or v is None:
297
+ safe_meta[k] = v
298
+ elif isinstance(v, (list, tuple)):
299
+ safe_meta[k] = ", ".join(map(str, v)) # flatten lists
300
+ elif isinstance(v, dict):
301
+ safe_meta[k] = json.dumps(v, ensure_ascii=False) # dict → string
302
+ else:
303
+ safe_meta[k] = str(v) # fallback
304
+ return safe_meta
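+
+ # Illustrative example of the sanitization above (values are placeholders):
+ # sanitize_metadata({"roles": ["Pilot", "Cabin Crew"], "page": 3})
+ # -> {"roles": "Pilot, Cabin Crew", "page": 3}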
305
+
306
+
307
+
308
+ # --- Flatten JSON to chunks ---
309
+ def flatten_json_to_chunks(structured_json, document_name="Flykite HR Policy Handbook"):
310
+ chunks = []
311
+ for sec in structured_json:
312
+ section_title = sec.get("section")
313
+ for sub in sec.get("subsections", []):
314
+ subsection_title = sub.get("subsection")
315
+
316
+ # Sub-subsections
317
+ for subsub in sub.get("subsubsections", []):
318
+ content_text = " ".join(subsub.get("content", []))
319
+ if content_text.strip():
320
+ enriched_meta = enrich_metadata(content_text)
321
+ meta = sanitize_metadata({
322
+ "document": document_name,
323
+ "section": section_title,
324
+ "subsection": subsection_title,
325
+ "subsubsection": subsub.get("title"),
326
+ **enriched_meta
327
+ })
328
+ chunks.append({
329
+ "text": f"{section_title} | {subsection_title} | {subsub.get('title')}\n\n{content_text}",
330
+ "metadata": meta
331
+ })
332
+
333
+ # Fallback: orphaned content under subsection
334
+ if sub.get("content"):
335
+ content_text = " ".join(sub.get("content", []))
336
+ enriched_meta = enrich_metadata(content_text)
337
+ meta = sanitize_metadata({
338
+ "document": document_name,
339
+ "section": section_title,
340
+ "subsection": subsection_title,
341
+ "subsubsection": "",  # Chroma metadata values cannot be None; use empty strings (or drop the key)
342
+ **enriched_meta
343
+ })
344
+ chunks.append({
345
+ "text": f"{section_title} | {subsection_title}\n\n{content_text}",
346
+ "metadata": meta
347
+ })
348
+
349
+ # Fallback: orphaned content under section
350
+ if sec.get("content"):
351
+ content_text = " ".join(sec.get("content", []))
352
+ enriched_meta = enrich_metadata(content_text)
353
+ meta = sanitize_metadata({
354
+ "document": document_name,
355
+ "section": section_title,
356
+ "subsection": "",  # Chroma metadata values cannot be None; use empty strings (or drop the key)
357
+ "subsubsection": "",  # Chroma metadata values cannot be None; use empty strings (or drop the key)
358
+ **enriched_meta
359
+ })
360
+ chunks.append({
361
+ "text": f"{section_title}\n\n{content_text}",
362
+ "metadata": meta
363
+ })
364
+ return chunks
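+
+ # Illustrative (hypothetical) shape of one chunk returned above — titles and metadata values are placeholders:
+ # {"text": "<section_title> | <subsection_title> | <subsubsection_title>\n\n<content_text>",
+ #  "metadata": {"document": "Flykite HR Policy Handbook", "section": "<section_title>",
+ #               "subsection": "<subsection_title>", "subsubsection": "<subsubsection_title>",
+ #               "roles": "...", "locations": "...", "departments": "..."}}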
365
+
366
+
367
+
368
+
369
+ def build_context(docs):
370
+ context_parts = []
371
+ for d in docs:
372
+ meta = d.metadata
373
+ citation = f"{meta.get('document')} → {meta.get('section')}"
374
+ if meta.get("subsection"):
375
+ citation += f" / {meta.get('subsection')}"
376
+ if meta.get("subsubsection"):
377
+ citation += f" / {meta.get('subsubsection')}"
378
+ context_parts.append(f"Source: {citation}\n{d.page_content}")
379
+ return "\n\n---\n\n".join(context_parts)
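+
+ # Each context block produced above has the form (illustrative):
+ #   Source: Flykite HR Policy Handbook → <section> / <subsection> / <subsubsection>
+ #   <chunk text>
+ # Blocks are separated by "---".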
380
+
381
+
382
+
383
  # -----------------------
384
  # User Query Enrichment
385
  # -----------------------
386
  def extract_metadata_from_query(query: str):
387
  """Use spaCy + LLM to extract role/location/department from user query."""
388
  spacy_res = extract_with_spacy(query)
389
+ print("spaCy results ## ==>", spacy_res)
390
  llm_res = extract_with_llm(query)
391
+ print("LLM Extraction Results ## ==>", llm_res)
 
392
 
393
  return {
394
  "roles": list(set(spacy_res["roles"] + llm_res["roles"])),
 
396
  "departments": list(set(spacy_res["departments"] + llm_res["departments"]))
397
  }
398
 
399
+
400
+ # -----------------------
401
  # Helper: Filter docs manually
402
+ # -----------------------
403
  def filter_docs_by_metadata(docs, metadata_filters):
404
  filtered = []
405
  for d in docs:
 
417
 
418
 
419
 
420
+ def generate_rag_response(user_input, retriever, k=3, max_tokens=1500):
421
 
422
  # relevant_docs = retriever.get_relevant_documents(user_input)[:k]
423
 
 
424
  # When user asks a query, we enrich it by extracting role, location, department using the same spaCy + LLM pipeline.
425
  # Pass those extracted values as filters to the retriever → only chunks with matching metadata are considered.
426
  # If nothing matches, fallback to plain semantic search (so we don’t block valid answers).
 
429
  query_metadata = extract_metadata_from_query(user_input)
430
 
431
  print("\n======================")
432
+ print(" User Query:", user_input)
433
+ print(" Extracted metadata from query:", query_metadata) # Investigatory log
434
+
435
 
436
  # 2. Retrieve top-k docs semantically
437
  retrieved_docs = retriever.get_relevant_documents(user_input, k=k)
438
+ print(f" Retrieved {len(retrieved_docs)} docs before filtering")
439
 
440
  # 3. Apply metadata filtering
441
  filtered_docs = filter_docs_by_metadata(retrieved_docs, query_metadata)
442
  if filtered_docs:
443
  selected_docs = filtered_docs
444
+ print(f"✅ {len(selected_docs)} docs kept after metadata filtering")
445
  else:
446
  selected_docs = retrieved_docs # fallback if no metadata match
447
  print("⚠️ No metadata match, falling back to semantic retrieval only")
448
 
449
 
450
+ # Step 4: Log retrieved docs metadata
451
+ print(f"✅ Retrieved {len(selected_docs)} docs")
 
452
  for i, d in enumerate(selected_docs, 1):
453
+ print(f"\n--- Chunk {i} ---")
454
+ print("Text:", d.page_content[:200], "...") # preview first 200 chars
455
+ print("Metadata:", d.metadata)
456
 
457
 
458
+ context_for_query = build_context(selected_docs)
459
+ user_prompt = hr_user_message_template.format(context=context_for_query, question=user_input)
460
 
461
+ messages = [
462
+ {"role": "system", "content": hr_system_message},
463
+ {"role": "user", "content": user_prompt},
464
+ ]
465
 
466
+ #llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=max_tokens)
467
+ #response = llm.invoke(messages)
468
+ #return {"answer": response.content, "sources": [d.metadata for d in relevant_docs]}
469
+ # ChatOpenAI (from langchain-openai) was previously used here for answer generation.
470
+ # That is where the `proxies` keyword error surfaced: the container pulls in a version of
471
+ # langchain-openai (and possibly openai) that still passes `proxies` to the OpenAI client,
472
+ # while the client installed in this environment no longer accepts that argument.
473
+
474
+ # Workaround: call the OpenAI REST API directly via SimpleChat (no LangChain-OpenAI client).
475
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
476
+ llm = SimpleChat(model="gpt-4o-mini")
477
+ answer = llm.invoke(messages, temperature=0, max_tokens=max_tokens)
478
+ return {"answer": answer, "sources": [d.metadata for d in selected_docs]}
479
+
480
+
481
+
482
+ # --------------------------
483
+ # FastAPI App
484
+ # --------------------------
485
+
486
+ app = FastAPI()
487
+ persist_dir = "./flykite_chromadb"
488
+ retriever = None
489
+
490
+ class QueryRequest(BaseModel):
491
+ query: str
492
+ top_k: Optional[int] = 3
493
+
494
+ @app.on_event("startup")
495
+ def startup_event():
496
+ global retriever
497
+ pdf_path = "Dataset-FlykiteAirlines_HRP.pdf"  # the PDF must be present in the repo root
498
+
499
+ # Parse PDF → JSON
500
+ parsed_data = parse_flykite(pdf_path)
501
+ print(json.dumps(parsed_data[:1], indent=2, ensure_ascii=False))
502
+
503
+ if not parsed_data:
504
+ raise RuntimeError(" Parsed JSON is empty, cannot build chunks/vectorstore")
505
+
506
+ # Flatten chunks
507
+ chunks = flatten_json_to_chunks(parsed_data)
508
+ print(f" Loaded {len(chunks)} chunks from JSON")
509
+
510
+ # If no chunks, fail early
511
+ if not chunks:
512
+ raise RuntimeError("No chunks generated from structured JSON")
513
+
514
+
515
+ # Build Chroma vectorstore
516
+ # Define SimpleEmbeddings inline
517
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
518
+ class SimpleEmbeddings:
519
+ def __init__(self, model="text-embedding-3-small"):
520
+ self.model = model
521
+ self.api_key = os.getenv("OPENAI_API_KEY")
522
+ self.base_url = "https://api.openai.com/v1/embeddings"
523
+
524
+ def embed_documents(self, texts):
525
+ embeddings = []
526
+ for text in texts:
527
+ resp = requests.post(
528
+ self.base_url,
529
+ headers={"Authorization": f"Bearer {self.api_key}"},
530
+ json={"model": self.model, "input": text}
531
+ )
532
+ resp.raise_for_status()
533
+ embeddings.append(resp.json()["data"][0]["embedding"])
534
+ return embeddings
535
+
536
+ def embed_query(self, query):
537
+ resp = requests.post(
538
+ self.base_url,
539
+ headers={"Authorization": f"Bearer {self.api_key}"},
540
+ json={"model": self.model, "input": query}
541
+ )
542
+ resp.raise_for_status()
543
+ return resp.json()["data"][0]["embedding"]
544
+
545
+
546
+ # Use SimpleEmbeddings instead of OpenAIEmbeddings
547
+ embedding = SimpleEmbeddings(model="text-embedding-3-small")
548
+
549
+ texts = [c["text"] for c in chunks]
550
+ metadatas = [c["metadata"] for c in chunks]
551
+
552
+ vectorstore = Chroma.from_texts(
553
+ texts=texts,
554
+ embedding=embedding,
555
+ metadatas=metadatas,
556
+ persist_directory=persist_dir,
557
+ ids=[f"chunk_{i}" for i in range(len(chunks))]
558
 
 
 
 
 
559
  )
560
 
561
+ vectorstore.persist() # ensure data is saved to disk
562
 
563
+ print("💾 Chroma vectorstore saved !!")
 
564
 
565
+ retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
566
+ print(" PDF parsed, chunks embedded, retriever initialized.")
567
+
568
+ @app.post("/query")
569
+ def query_endpoint(req: QueryRequest):
570
+ return generate_rag_response(req.query, retriever, k=req.top_k)
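+
+ # Example request against the endpoint above, assuming the app is served locally with
+ # `uvicorn app:app --port 8000` (illustrative values):
+ # import requests
+ # resp = requests.post("http://localhost:8000/query",
+ #                      json={"query": "What is the annual leave policy for field staff in India?", "top_k": 3})
+ # print(resp.json()["answer"])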
571
 
 
572
 
573
  # =============================
574
  # Step 5: Chat Function
575
  # =============================
576
+ #def chat_fn(message, history):
577
+ # answer = generate_rag_based_response(message, retriever)
578
+ # return f"{answer}\n\n🧠 (Context retrieved from {DATASET_REPO})"
579
 
580
 
581
  # =============================
 
670
  # return f"BubbleBot says: {message}"
671
 
672
  gr.ChatInterface(
673
+ fn=query_endpoint,
674
  title="Flyline Chatbot ✈ ️",
675
  description="Ask Flyline HR",
676
  theme="soft",
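+ # Note: gr.ChatInterface calls fn(message, history), while query_endpoint expects a QueryRequest,
+ # and `import gradio as gr` was removed above. A minimal adapter sketch (assumption, not part of this commit):
+ # import gradio as gr
+ # def chat_adapter(message, history):
+ #     result = generate_rag_response(message, retriever, k=3)
+ #     return result["answer"]
+ # ...then pass fn=chat_adapter to gr.ChatInterface.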