naveen07garg committed on
Commit 8b58fd4 · verified
1 Parent(s): 147a11f

Update app.py

Files changed (1)
  1. app.py +418 -211
app.py CHANGED
@@ -1,157 +1,37 @@
1
- import os
2
- import gradio as gr
3
- from huggingface_hub import snapshot_download
4
- from langchain.embeddings import SentenceTransformerEmbeddings
5
- from langchain_chroma import Chroma
6
- #from langchain_community.vectorstores import Chroma
7
- from transformers import pipeline
8
- from langchain_community.llms import HuggingFacePipeline
9
- from langchain.chains import LLMChain
10
- from langchain.prompts import PromptTemplate
11
- from transformers import AutoModelForCausalLM, AutoTokenizer
12
- from langchain_openai import ChatOpenAI
13
- from langchain_openai import OpenAIEmbeddings
14
- #from langchain_community.vectorstores import Chroma
15
-
16
- import spacy
17
  import json
18
- import os
19
- from dotenv import load_dotenv
20
-
21
- #--===============================
22
- # Set ENV Variable
23
- #--===============================
24
- load_dotenv()
25
- os.environ["OPENAI_API_KEY"] = "sk-***REDACTED***"
26
-
27
- #"sk-***REDACTED***"
28
-
29
- #--===============================
30
- # Define Prompt
31
- #--===============================
32
-
33
-
34
- # Define the system message for Flykite Airlines HR Policy Assistant
35
- # --- Refined System Prompt ---
36
- QNA_SYSTEM_MESSAGE = """
37
- You are the Flykite Airlines HR Policy Assistant.
38
-
39
- Your role is to answer employee questions based on official HR documents (handbooks, policy PDFs, etc.).
40
- Each user question will start with the token: ###Question.
41
-
42
- ### Response Rules
43
- - Be clear, factual, and professional.
44
- - Use bullet points (-) or numbered lists (1., 2., etc.) for clarity.
45
- - Begin with a **one-line summary**, then details.
46
- - Cite the specific policy references (Document → Section → Subsection → Sub-subsection) where
47
- the answer comes from.
48
- - If the answer is not in the source, reply with one generic line and append exactly: \n\n **"Could not find anything out from Flyline HR documentation around your query.\n\nPlease rephrase your query."**
49
- - Do **not** make assumptions or fabricate information.
50
-
51
- ### Ambiguity & Context
52
- - If a query could refer to multiple policies or depends on role/location/department, ask **one short clarifying question**.
53
- - If you assume a context, state it clearly (e.g., "Assuming HQ staff...").
54
- - When policies differ by role/location, list variations clearly.
55
-
56
- ### Personalization
57
- - Tailor responses to any role, location, or employment type provided.
58
- - Mention if rules vary and what those differences are.
59
-
60
- ### Format
61
- 1. One-line summary.
62
- 2. Key details, steps, or rules.
63
- 3. Specific policy references (Document → Section → Subsection → Sub-subsection) where
64
  the answer comes from.
65
- 4. Optional follow-up suggestion or clarifying question.
66
-
67
- ### Important
68
- - Never guess or invent policy content.
69
- - Maintain confidentiality and avoid personal data.
70
- - User questions always begin with `###Question`. Respond only to those.
71
  """
72
 
73
-
74
-
75
- # =========================================================
76
- # Step 1: Download Vectorstore from Hugging Face Dataset
77
- # =========================================================
78
- VECTOR_DIR = "naveen07garg/AirlineChatBot/vectorstore/" #=== application space location
79
- DATASET_REPO = "naveen07garg/AirlineChatBot-vectorstore" #== data store space flykite_handbook_chromadb
80
- #OPENAI_API_KEY="sk-***REDACTED***"
81
-
82
- if not os.path.exists(VECTOR_DIR):
83
- print("⬇️ Downloading vectorstore from Hugging Face dataset...")
84
- snapshot_download(
85
- repo_id=DATASET_REPO,
86
- repo_type="dataset",
87
- local_dir=VECTOR_DIR,
88
- ignore_patterns=[".gitattributes"],
89
- )
90
- print("✅ Vectorstore downloaded successfully!")
91
- for root, dirs, files in os.walk(VECTOR_DIR):
92
- for f in files:
93
- print(" ", os.path.join(root, f))
94
- else:
95
- print("📦 Vectorstore already present, skipping download.")
96
-
97
- # =============================
98
- # Step 2: Load Chroma Vectorstore
99
- # =============================
100
- #embedding_fn = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
101
- embedding_fn = OpenAIEmbeddings(model="text-embedding-3-small")
102
- vectordb = Chroma(persist_directory=VECTOR_DIR, embedding_function=embedding_fn)
103
- print("Collections:", vectordb._client.list_collections())
104
-
105
- # Get underlying collection
106
- collection = vectordb._collection
107
- print("Chroma vectorstore collection :\n")
108
- print(collection)
109
-
110
- #res = vectordb._collection.get(ids=["chunk_6"], include=["metadatas", "documents"])
111
- #print("lets check with chunk_6 \n")
112
- #print(res["metadatas"][0])
113
-
114
- retriever = vectordb.as_retriever(
115
- search_type='similarity',
116
- search_kwargs={'k': 3}
117
- )
118
-
119
- query = "What is the leave policy?"
120
- results = retriever.get_relevant_documents(query)
121
-
122
- print("Chroma vectorstore loaded successfully! \n\nWith test results - ")
123
- print(results)
124
-
125
- for i, doc in enumerate(results):
126
- print(f"\nResult {i+1}")
127
- print("Document:", doc.page_content)
128
- print("Metadata:", doc.metadata)
129
-
130
-
131
- # =============================
132
- # Step 3: Load LLM
133
- # =============================
134
- #qa_model = pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct") #model="mistralai/Mistral-7B-Instruct-v0.2")
135
- #model_id = "meta-llama/Meta-Llama-3-70B"
136
- #tokenizer = AutoTokenizer.from_pretrained(model_id)
137
- #model = AutoModelForCausalLM.from_pretrained(model_id)
138
-
139
-
140
-
141
- # Low creativity (deterministic) LLM
142
- llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=1500, openai_api_key="sk-***REDACTED***")
143
-
144
- #qa_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
145
- #llm = HuggingFacePipeline(pipeline=qa_model)
146
-
147
- # =============================
148
- # Step 4: RAG Response Function
149
- # =============================
150
-
151
- # Load spaCy NER model
152
- nlp = spacy.load("en_core_web_sm")
153
-
154
- # --- User Prompt Template ---
155
  hr_user_message_template = """
156
  Consider the following ###Context and ###Question:
157
 
@@ -162,6 +42,159 @@ Consider the following ###Context and ###Question:
162
  {question}
163
  """
164
 
165
  # --- spaCy Extraction ---
166
  def extract_with_spacy(text):
167
  doc = nlp(text)
@@ -225,8 +258,13 @@ def extract_with_llm(text):
225
  """
226
 
227
  try:
228
- response = llm.invoke(prompt)
229
- content = response.content.strip()
230
 
231
  # Enforce safe parsing
232
  if content.startswith("{"):
@@ -235,21 +273,122 @@ def extract_with_llm(text):
235
  extracted = {"roles": [], "locations": [], "departments": []}
236
 
237
  except Exception:
 
238
  extracted = {"roles": [], "locations": [], "departments": []}
239
 
240
  return extracted
241
 
242
 
243
  # -----------------------
244
  # User Query Enrichment
245
  # -----------------------
246
  def extract_metadata_from_query(query: str):
247
  """Use spaCy + LLM to extract role/location/department from user query."""
248
  spacy_res = extract_with_spacy(query)
249
- print("spaCy results ## ==>%s", spacy_res)
250
  llm_res = extract_with_llm(query)
251
- print("LLM Extraction Results ## ==>%s", llm_res)
252
-
253
 
254
  return {
255
  "roles": list(set(spacy_res["roles"] + llm_res["roles"])),
@@ -257,9 +396,10 @@ def extract_metadata_from_query(query: str):
257
  "departments": list(set(spacy_res["departments"] + llm_res["departments"]))
258
  }
259
 
260
- # -------------------------------
 
261
  # Helper: Filter docs manually
262
- # -------------------------------
263
  def filter_docs_by_metadata(docs, metadata_filters):
264
  filtered = []
265
  for d in docs:
@@ -277,22 +417,10 @@ def filter_docs_by_metadata(docs, metadata_filters):
277
 
278
 
279
 
280
- def generate_rag_based_response(user_input, retriever, k=3, max_tokens=800, temperature=0, top_p=0.95):
281
- """
282
- Args:
283
- user_input: User query string
284
- retriever: LangChain retriever (from Chroma)
285
- k: number of top documents to retrieve
286
- Returns:
287
- The generated response based on user query + context with citations
288
- """
289
 
290
- # Step 1: Retrieve relevant chunks
291
- # relevant_docs = retriever.get_relevant_documents(user_input)
292
- # selected_docs = relevant_docs[:k]
293
  # relevant_docs = retriever.get_relevant_documents(user_input)[:k]
294
 
295
-
296
  # When user asks a query, we enrich it by extracting role, location, department using the same spaCy + LLM pipeline.
297
  # Pass those extracted values as filters to the retriever → only chunks with matching metadata are considered.
298
  # If nothing matches, fallback to plain semantic search (so we don’t block valid answers).
@@ -301,74 +429,153 @@ def generate_rag_based_response(user_input, retriever, k=3, max_tokens=800, temp
301
  query_metadata = extract_metadata_from_query(user_input)
302
 
303
  print("\n======================")
304
- print("User Query: %s", user_input)
305
- print("Extracted metadata from query: %s", query_metadata) # Investigatory log
 
306
 
307
  # 2. Retrieve top-k docs semantically
308
  retrieved_docs = retriever.get_relevant_documents(user_input, k=k)
309
- print("Retrieved %d docs before filtering", len(retrieved_docs))
310
 
311
  # 3. Apply metadata filtering
312
  filtered_docs = filter_docs_by_metadata(retrieved_docs, query_metadata)
313
  if filtered_docs:
314
  selected_docs = filtered_docs
315
- print("✅ %d docs kept after metadata filtering", len(selected_docs))
316
  else:
317
  selected_docs = retrieved_docs # fallback if no metadata match
318
  print("⚠️ No metadata match, falling back to semantic retrieval only")
319
 
320
 
321
-
322
- # Step 4: Log retrieved docs metadata
323
- print("✅ Retrieved %d docs", len(selected_docs))
324
  for i, d in enumerate(selected_docs, 1):
325
- print("\n--- Chunk %d ---", i)
326
- print("Text: %s...", d.page_content[:200]) # preview first 200 chars
327
- print("Metadata: %s", d.metadata)
328
 
329
 
 
 
330
 
331
- # Step 4: Build context with citations
332
- context_parts = []
333
- for d in selected_docs:
334
- meta = d.metadata
335
- citation = f"{meta.get('document')} → {meta.get('section')}"
336
- if meta.get("subsection"):
337
- citation += f" / {meta.get('subsection')}"
338
- if meta.get("subsubsection"):
339
- citation += f" / {meta.get('subsubsection')}"
340
- context_parts.append(f"Source: {citation}\n{d.page_content}")
341
 
342
- context_for_query = "\n\n---\n\n".join(context_parts)
343
 
344
- # Step 5: Construct prompt
345
- user_prompt = hr_user_message_template.format(
346
- context=context_for_query,
347
- question=user_input
348
  )
349
 
350
- messages = [
351
- {"role": "system", "content": QNA_SYSTEM_MESSAGE},
352
- {"role": "user", "content": user_prompt},
353
- ]
354
 
355
- # Step 6: Query the LLM
356
- llm = ChatOpenAI(model="gpt-4o-mini", temperature=temperature, max_tokens=max_tokens)
357
 
358
- try:
359
- response = llm.invoke(messages)
360
- prediction = response.content
361
- except Exception as e:
362
- prediction = f" Error: {e}"
 
363
 
364
- return prediction
365
 
366
  # =============================
367
  # Step 5: Chat Function
368
  # =============================
369
- def chat_fn(message, history):
370
- answer = generate_rag_based_response(message, retriever)
371
- return f"{answer}\n\n🧠 (Context retrieved from {DATASET_REPO})"
372
 
373
 
374
  # =============================
@@ -463,7 +670,7 @@ css = """
463
  # return f"BubbleBot says: {message}"
464
 
465
  gr.ChatInterface(
466
- fn=chat_fn,
467
  title="Flyline Chatbot ✈ ️",
468
  description="Ask Flyline HR",
469
  theme="soft",
 
1
+ # %%writefile backendFiles/app.py  # leftover notebook cell magic; must stay commented out for app.py to run as a script
2
  import json
3
+ import re
4
+ import os, requests
5
+ import fitz  # PyMuPDF: used to capture the document hierarchy (section → subsection → sub-subsection → content/bullets)
6
+ from collections import Counter
7
+ from fastapi import FastAPI
8
+ from pydantic import BaseModel
9
+ from typing import Optional
10
+
11
+ from langchain_community.vectorstores import Chroma
12
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
13
+
14
+ # --------------------------
15
+ # HR Assistant Prompt Templates
16
+ # --------------------------
17
+
18
+ hr_system_message = """
19
+ You are "Flykite HR Assistant", a helpful and professional AI bot for an airline company.
20
+ You specialize in answering employee questions about HR policies, benefits, and compliance.
21
+
22
+ Rules:
23
+ - Use only the information provided in the ###Context.
24
+ - If the user's role, location, or department is mentioned in the query or appears in the context,
25
+ personalize the answer accordingly. Acknowledge differences (e.g., policies for Field Staff vs Headquarters,
26
+ or India vs UK).
27
+ - Always cite the specific policy references (Document → Section → Subsection → Sub-subsection) where
28
  the answer comes from.
29
+ - If the answer cannot be derived from the context, respond only with: "I don't know".
30
+ - Keep your tone clear, supportive, and professional — like an HR representative for airline staff.
31
+ - If multiple relevant rules exist, summarize them and cite all applicable sources.
32
+ - Never invent or assume policies beyond what is provided.
 
 
33
  """
34
 
35
  hr_user_message_template = """
36
  Consider the following ###Context and ###Question:
37
 
 
42
  {question}
43
  """
44
 
45
+ # --------------------------
46
+ # PDF Parsing Utils
47
+ # --------------------------
48
+
49
+ def clean_text_hidden(s: str) -> str:
50
+ if not s:
51
+ return ""
52
+ s = re.sub(r"[\u200B-\u200F\u202A-\u202E\u00A0\u00AD]", " ", s)
53
+ s = re.sub(r"\s+", " ", s)
54
+ return s.strip()
55
+
56
+ def is_line_fully_bold(spans):
57
+ return all(
58
+ ("Bold" in s["font"] or s["flags"] & 16 != 0)  # bit 16 is the bold flag in PyMuPDF span flags (bit 2 is italic)
59
+ for s in spans if s.get("text", "").strip()
60
+ )
61
+
62
+ def detect_font_levels(pdf_path):
63
+ doc = fitz.open(pdf_path)
64
+ font_sizes = []
65
+ for page in doc:
66
+ blocks = page.get_text("dict")["blocks"]
67
+ for b in blocks:
68
+ for l in b.get("lines", []):
69
+ for s in l.get("spans", []):
70
+ font_sizes.append(round(s["size"], 1))
71
+ unique_sizes = sorted(set(font_sizes), reverse=True)
72
+ if len(unique_sizes) > 3:
73
+ candidate_sizes = unique_sizes[1:-1]
74
+ else:
75
+ candidate_sizes = unique_sizes
76
+ section_size = candidate_sizes[0] if candidate_sizes else unique_sizes[0]
77
+ subsubsection_size = candidate_sizes[1] if len(candidate_sizes) > 1 else section_size
78
+ return section_size, subsubsection_size
79
+
80
+ def most_common_size(sizes):
81
+ return Counter(sizes).most_common(1)[0][0] if sizes else None
82
+
83
+ def parse_flykite(pdf_path):
84
+ section_size, subsubsection_size = detect_font_levels(pdf_path)
85
+ doc = fitz.open(pdf_path)
86
+ sections = []
87
+ current_section, current_subsection, current_subsubsection = None, None, None
88
+
89
+ for page_num, page in enumerate(doc, start=1):
90
+ blocks = page.get_text("dict")["blocks"]
91
+ for b in blocks:
92
+ for l in b.get("lines", []):
93
+ spans = l.get("spans", [])
94
+ line_text = "".join(s.get("text", "") for s in spans).strip()
95
+ line_text = clean_text_hidden(line_text)
96
+ if not line_text:
97
+ continue
98
+ span_sizes = [round(s["size"], 1) for s in spans]
99
+ line_size = most_common_size(span_sizes)
100
+
101
+ # SECTION/SUBSECTION
102
+ if line_size == section_size:
103
+ if is_line_fully_bold(spans) and "policy" in line_text.lower():
104
+ current_subsection = {"subsection": line_text, "subsubsections": [], "content": []}
105
+ if current_section:
106
+ current_section["subsections"].append(current_subsection)
107
+ else:
108
+ current_section = {"section": line_text, "subsections": []}
109
+ sections.append(current_section)
110
+ current_subsection = None
111
+ current_subsubsection = None
112
+ continue
113
+
114
+ # SUB-SUBSECTION
115
+ if re.match(r"^\d+\s*\.\s+", line_text):
116
+ if line_size == subsubsection_size:
117
+ is_heading = False
118
+ if is_line_fully_bold(spans):
119
+ is_heading = True
120
+ else:
121
+ if len(spans) > 1:
122
+ first_span_text = clean_text_hidden(spans[0]["text"]).strip()
123
+ if re.match(r"^\d+\.?$", first_span_text):
124
+ rest_bold = all(
125
+ ("Bold" in s["font"] or s["flags"] & 16 != 0)  # bit 16 = bold in PyMuPDF span flags
126
+ for s in spans[1:] if s.get("text", "").strip()
127
+ )
128
+ if rest_bold:
129
+ is_heading = True
130
+ if is_heading:
131
+ current_subsubsection = {"title": line_text, "content": []}
132
+ if current_subsection:
133
+ current_subsection["subsubsections"].append(current_subsubsection)
134
+ elif current_section:
135
+ auto_sub = {"subsection": current_section["section"], "subsubsections": []}
136
+ current_section["subsections"].append(auto_sub)
137
+ current_subsection = auto_sub
138
+ current_subsection["subsubsections"].append(current_subsubsection)
139
+ continue
140
+ # otherwise treat as content
141
+ if current_subsubsection:
142
+ current_subsubsection["content"].append(line_text)
143
+ elif current_subsection:
144
+ current_subsection["content"].append(line_text)
145
+ elif current_section:
146
+ current_section.setdefault("content", []).append(line_text)
147
+ else:
148
+ if not sections:
149
+ sections.append({"intro": [line_text]})
150
+ else:
151
+ sections[0].setdefault("intro", []).append(line_text)
152
+ return sections
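+
+ # Illustrative (hypothetical) shape of one parsed section returned above — names and content are placeholders:
+ # {"section": "Leave Policy",
+ #  "subsections": [{"subsection": "Annual Leave Policy",
+ #                   "subsubsections": [{"title": "1. Eligibility", "content": ["..."]}],
+ #                   "content": []}]}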
153
+
154
+
155
+
156
+ # Minimal OpenAI chat client using direct REST calls (avoids the LangChain-OpenAI wrapper).
157
+ class SimpleChat:
158
+ def __init__(self, model="gpt-4o-mini"):
159
+ self.model = model
160
+ self.api_key = os.getenv("OPENAI_API_KEY")
161
+ self.base_url = "https://api.openai.com/v1/chat/completions"
162
+
163
+ def invoke(self, messages, temperature=0, max_tokens=1500):
164
+ resp = requests.post(
165
+ self.base_url,
166
+ headers={"Authorization": f"Bearer {self.api_key}"},
167
+ json={
168
+ "model": self.model,
169
+ "messages": messages,
170
+ "temperature": temperature,
171
+ "max_tokens": max_tokens
172
+ }
173
+ )
174
+ resp.raise_for_status()
175
+ return resp.json()["choices"][0]["message"]["content"].strip()
176
+
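+ # Minimal usage sketch for SimpleChat (illustrative; assumes OPENAI_API_KEY is set in the environment):
+ # chat = SimpleChat(model="gpt-4o-mini")
+ # reply = chat.invoke(
+ #     [{"role": "system", "content": "You are a helpful assistant."},
+ #      {"role": "user", "content": "Say hello."}],
+ #     temperature=0, max_tokens=50,
+ # )
+ # print(reply)
+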
177
+
178
+
179
+ # --------------------------
180
+ # Chunking + RAG
181
+ # --------------------------
182
+
183
+
184
+ # Each chunk embeds the parent section_title and subsection_title alongside the sub-subsection title,
185
+ # so that every chunk carries its parent-level context when it is embedded and can still be retrieved
186
+ # when an end user phrases a query at the section or subsection level.
187
+
188
+ # This also increases trust and compliance: each response can cite its sources
189
+ # (document name, section, subsection, and sub-subsection).
190
+
191
+ # --- Flatten JSON to chunks ---
192
+ import spacy
193
+ import json
194
+
195
+ # Load spaCy NER model
196
+ nlp = spacy.load("en_core_web_sm")
197
+
198
  # --- spaCy Extraction ---
199
  def extract_with_spacy(text):
200
  doc = nlp(text)
 
258
  """
259
 
260
  try:
261
+ # Call the OpenAI REST API directly via SimpleChat (avoids the LangChain-OpenAI client).
262
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
263
+ llm = SimpleChat(model="gpt-4o-mini")
264
+ messages = [
265
+ {"role": "user", "content": prompt}
266
+ ]
267
+ content = llm.invoke(messages, temperature=0, max_tokens=1500)
268
 
269
  # Enforce safe parsing
270
  if content.startswith("{"):
 
273
  extracted = {"roles": [], "locations": [], "departments": []}
274
 
275
  except Exception:
276
+ print("⚠️ LLM metadata-extraction call failed; falling back to empty metadata")
277
  extracted = {"roles": [], "locations": [], "departments": []}
278
 
279
  return extracted
280
 
281
 
282
+ # --- Merge spaCy + LLM ---
283
+ def enrich_metadata(text):
284
+ spacy_res = extract_with_spacy(text)
285
+ llm_res = extract_with_llm(text)
286
+ return {
287
+ "roles": list(set(spacy_res["roles"] + llm_res["roles"])),
288
+ "locations": list(set(spacy_res["locations"] + llm_res["locations"])),
289
+ "departments": list(set(spacy_res["departments"] + llm_res["departments"]))
290
+ }
291
+
292
+ # --- Ensure metadata is Chroma-compatible ---
293
+ def sanitize_metadata(meta: dict) -> dict:
294
+ safe_meta = {}
295
+ for k, v in meta.items():
296
+ if isinstance(v, (str, int, float, bool)) or v is None:
297
+ safe_meta[k] = v
298
+ elif isinstance(v, (list, tuple)):
299
+ safe_meta[k] = ", ".join(map(str, v)) # flatten lists
300
+ elif isinstance(v, dict):
301
+ safe_meta[k] = json.dumps(v, ensure_ascii=False) # dict → string
302
+ else:
303
+ safe_meta[k] = str(v) # fallback
304
+ return safe_meta
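+
+ # Illustrative example of the sanitization above (values are placeholders):
+ # sanitize_metadata({"roles": ["Pilot", "Cabin Crew"], "page": 3})
+ # -> {"roles": "Pilot, Cabin Crew", "page": 3}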
305
+
306
+
307
+
308
+ # --- Flatten JSON to chunks ---
309
+ def flatten_json_to_chunks(structured_json, document_name="Flykite HR Policy Handbook"):
310
+ chunks = []
311
+ for sec in structured_json:
312
+ section_title = sec.get("section")
313
+ for sub in sec.get("subsections", []):
314
+ subsection_title = sub.get("subsection")
315
+
316
+ # Sub-subsections
317
+ for subsub in sub.get("subsubsections", []):
318
+ content_text = " ".join(subsub.get("content", []))
319
+ if content_text.strip():
320
+ enriched_meta = enrich_metadata(content_text)
321
+ meta = sanitize_metadata({
322
+ "document": document_name,
323
+ "section": section_title,
324
+ "subsection": subsection_title,
325
+ "subsubsection": subsub.get("title"),
326
+ **enriched_meta
327
+ })
328
+ chunks.append({
329
+ "text": f"{section_title} | {subsection_title} | {subsub.get('title')}\n\n{content_text}",
330
+ "metadata": meta
331
+ })
332
+
333
+ # Fallback: orphaned content under subsection
334
+ if sub.get("content"):
335
+ content_text = " ".join(sub.get("content", []))
336
+ enriched_meta = enrich_metadata(content_text)
337
+ meta = sanitize_metadata({
338
+ "document": document_name,
339
+ "section": section_title,
340
+ "subsection": subsection_title,
341
+ "subsubsection": "",  # Chroma metadata values cannot be None; use empty strings (or drop the key)
342
+ **enriched_meta
343
+ })
344
+ chunks.append({
345
+ "text": f"{section_title} | {subsection_title}\n\n{content_text}",
346
+ "metadata": meta
347
+ })
348
+
349
+ # Fallback: orphaned content under section
350
+ if sec.get("content"):
351
+ content_text = " ".join(sec.get("content", []))
352
+ enriched_meta = enrich_metadata(content_text)
353
+ meta = sanitize_metadata({
354
+ "document": document_name,
355
+ "section": section_title,
356
+ "subsection": "",  # Chroma metadata values cannot be None; use empty strings (or drop the key)
357
+ "subsubsection": "",  # Chroma metadata values cannot be None; use empty strings (or drop the key)
358
+ **enriched_meta
359
+ })
360
+ chunks.append({
361
+ "text": f"{section_title}\n\n{content_text}",
362
+ "metadata": meta
363
+ })
364
+ return chunks
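+
+ # Illustrative (hypothetical) shape of one chunk returned above — titles and metadata values are placeholders:
+ # {"text": "<section_title> | <subsection_title> | <subsubsection_title>\n\n<content_text>",
+ #  "metadata": {"document": "Flykite HR Policy Handbook", "section": "<section_title>",
+ #               "subsection": "<subsection_title>", "subsubsection": "<subsubsection_title>",
+ #               "roles": "...", "locations": "...", "departments": "..."}}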
365
+
366
+
367
+
368
+
369
+ def build_context(docs):
370
+ context_parts = []
371
+ for d in docs:
372
+ meta = d.metadata
373
+ citation = f"{meta.get('document')} → {meta.get('section')}"
374
+ if meta.get("subsection"):
375
+ citation += f" / {meta.get('subsection')}"
376
+ if meta.get("subsubsection"):
377
+ citation += f" / {meta.get('subsubsection')}"
378
+ context_parts.append(f"Source: {citation}\n{d.page_content}")
379
+ return "\n\n---\n\n".join(context_parts)
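+
+ # Each context block produced above has the form (illustrative):
+ #   Source: Flykite HR Policy Handbook → <section> / <subsection> / <subsubsection>
+ #   <chunk text>
+ # Blocks are separated by "---".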
380
+
381
+
382
+
383
  # -----------------------
384
  # User Query Enrichment
385
  # -----------------------
386
  def extract_metadata_from_query(query: str):
387
  """Use spaCy + LLM to extract role/location/department from user query."""
388
  spacy_res = extract_with_spacy(query)
389
+ print("spaCy results ## ==>", spacy_res)
390
  llm_res = extract_with_llm(query)
391
+ print("LLM Extraction Results ## ==>", llm_res)
 
392
 
393
  return {
394
  "roles": list(set(spacy_res["roles"] + llm_res["roles"])),
 
396
  "departments": list(set(spacy_res["departments"] + llm_res["departments"]))
397
  }
398
 
399
+
400
+ # -----------------------
401
  # Helper: Filter docs manually
402
+ # -----------------------
403
  def filter_docs_by_metadata(docs, metadata_filters):
404
  filtered = []
405
  for d in docs:
 
417
 
418
 
419
 
420
+ def generate_rag_response(user_input, retriever, k=3, max_tokens=1500):
421
 
422
  # relevant_docs = retriever.get_relevant_documents(user_input)[:k]
423
 
 
424
  # When user asks a query, we enrich it by extracting role, location, department using the same spaCy + LLM pipeline.
425
  # Pass those extracted values as filters to the retriever → only chunks with matching metadata are considered.
426
  # If nothing matches, fallback to plain semantic search (so we don’t block valid answers).
 
429
  query_metadata = extract_metadata_from_query(user_input)
430
 
431
  print("\n======================")
432
+ print(" User Query:", user_input)
433
+ print(" Extracted metadata from query:", query_metadata) # Investigatory log
434
+
435
 
436
  # 2. Retrieve top-k docs semantically
437
  retrieved_docs = retriever.get_relevant_documents(user_input, k=k)
438
+ print(f" Retrieved {len(retrieved_docs)} docs before filtering")
439
 
440
  # 3. Apply metadata filtering
441
  filtered_docs = filter_docs_by_metadata(retrieved_docs, query_metadata)
442
  if filtered_docs:
443
  selected_docs = filtered_docs
444
+ print(f"✅ {len(selected_docs)} docs kept after metadata filtering")
445
  else:
446
  selected_docs = retrieved_docs # fallback if no metadata match
447
  print("⚠️ No metadata match, falling back to semantic retrieval only")
448
 
449
 
450
+ # Step 4: Log retrieved docs metadata
451
+ print(f"✅ Retrieved {len(selected_docs)} docs")
 
452
  for i, d in enumerate(selected_docs, 1):
453
+ print(f"\n--- Chunk {i} ---")
454
+ print("Text:", d.page_content[:200], "...") # preview first 200 chars
455
+ print("Metadata:", d.metadata)
456
 
457
 
458
+ context_for_query = build_context(selected_docs)
459
+ user_prompt = hr_user_message_template.format(context=context_for_query, question=user_input)
460
 
461
+ messages = [
462
+ {"role": "system", "content": hr_system_message},
463
+ {"role": "user", "content": user_prompt},
464
+ ]
465
 
466
+ #llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=max_tokens)
467
+ #response = llm.invoke(messages)
468
+ #return {"answer": response.content, "sources": [d.metadata for d in relevant_docs]}
469
+ # ChatOpenAI (from langchain-openai) was previously used here for answer generation.
470
+ # That is where the `proxies` keyword error surfaced: the container pulls in a version of
471
+ # langchain-openai (and possibly openai) that still passes `proxies` to the OpenAI client,
472
+ # while the client installed in this environment no longer accepts that argument.
473
+
474
+ # Workaround: call the OpenAI REST API directly via SimpleChat (no LangChain-OpenAI client).
475
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
476
+ llm = SimpleChat(model="gpt-4o-mini")
477
+ answer = llm.invoke(messages, temperature=0, max_tokens=max_tokens)
478
+ return {"answer": answer, "sources": [d.metadata for d in selected_docs]}
479
+
480
+
481
+
482
+ # --------------------------
483
+ # FastAPI App
484
+ # --------------------------
485
+
486
+ app = FastAPI()
487
+ persist_dir = "./flykite_chromadb"
488
+ retriever = None
489
+
490
+ class QueryRequest(BaseModel):
491
+ query: str
492
+ top_k: Optional[int] = 3
493
+
494
+ @app.on_event("startup")
495
+ def startup_event():
496
+ global retriever
497
+ pdf_path = "Dataset-FlykiteAirlines_HRP.pdf"  # the PDF must be present in the repo root
498
+
499
+ # Parse PDF → JSON
500
+ parsed_data = parse_flykite(pdf_path)
501
+ print(json.dumps(parsed_data[:1], indent=2, ensure_ascii=False))
502
+
503
+ if not parsed_data:
504
+ raise RuntimeError(" Parsed JSON is empty, cannot build chunks/vectorstore")
505
+
506
+ # Flatten chunks
507
+ chunks = flatten_json_to_chunks(parsed_data)
508
+ print(f" Loaded {len(chunks)} chunks from JSON")
509
+
510
+ # If no chunks, fail early
511
+ if not chunks:
512
+ raise RuntimeError("No chunks generated from structured JSON")
513
+
514
+
515
+ # Build Chroma vectorstore
516
+ # Define SimpleEmbeddings inline
517
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
518
+ class SimpleEmbeddings:
519
+ def __init__(self, model="text-embedding-3-small"):
520
+ self.model = model
521
+ self.api_key = os.getenv("OPENAI_API_KEY")
522
+ self.base_url = "https://api.openai.com/v1/embeddings"
523
+
524
+ def embed_documents(self, texts):
525
+ embeddings = []
526
+ for text in texts:
527
+ resp = requests.post(
528
+ self.base_url,
529
+ headers={"Authorization": f"Bearer {self.api_key}"},
530
+ json={"model": self.model, "input": text}
531
+ )
532
+ resp.raise_for_status()
533
+ embeddings.append(resp.json()["data"][0]["embedding"])
534
+ return embeddings
535
+
536
+ def embed_query(self, query):
537
+ resp = requests.post(
538
+ self.base_url,
539
+ headers={"Authorization": f"Bearer {self.api_key}"},
540
+ json={"model": self.model, "input": query}
541
+ )
542
+ resp.raise_for_status()
543
+ return resp.json()["data"][0]["embedding"]
544
+
545
+
546
+ # Use SimpleEmbeddings instead of OpenAIEmbeddings
547
+ embedding = SimpleEmbeddings(model="text-embedding-3-small")
548
+
549
+ texts = [c["text"] for c in chunks]
550
+ metadatas = [c["metadata"] for c in chunks]
551
+
552
+ vectorstore = Chroma.from_texts(
553
+ texts=texts,
554
+ embedding=embedding,
555
+ metadatas=metadatas,
556
+ persist_directory=persist_dir,
557
+ ids=[f"chunk_{i}" for i in range(len(chunks))]
558
 
 
 
 
 
559
  )
560
 
561
+ vectorstore.persist() # ensure data is saved to disk
562
 
563
+ print("💾 Chroma vectorstore saved !!")
 
564
 
565
+ retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
566
+ print(" PDF parsed, chunks embedded, retriever initialized.")
567
+
568
+ @app.post("/query")
569
+ def query_endpoint(req: QueryRequest):
570
+ return generate_rag_response(req.query, retriever, k=req.top_k)
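+
+ # Example request against the endpoint above, assuming the app is served locally with
+ # `uvicorn app:app --port 8000` (illustrative values):
+ # import requests
+ # resp = requests.post("http://localhost:8000/query",
+ #                      json={"query": "What is the annual leave policy for field staff in India?", "top_k": 3})
+ # print(resp.json()["answer"])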
571
 
 
572
 
573
  # =============================
574
  # Step 5: Chat Function
575
  # =============================
576
+ #def chat_fn(message, history):
577
+ # answer = generate_rag_based_response(message, retriever)
578
+ # return f"{answer}\n\n🧠 (Context retrieved from {DATASET_REPO})"
579
 
580
 
581
  # =============================
 
670
  # return f"BubbleBot says: {message}"
671
 
672
  gr.ChatInterface(
673
+ fn=query_endpoint,
674
  title="Flyline Chatbot ✈ ️",
675
  description="Ask Flyline HR",
676
  theme="soft",
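+ # Note: gr.ChatInterface calls fn(message, history), while query_endpoint expects a QueryRequest,
+ # and `import gradio as gr` was removed above. A minimal adapter sketch (assumption, not part of this commit):
+ # import gradio as gr
+ # def chat_adapter(message, history):
+ #     result = generate_rag_response(message, retriever, k=3)
+ #     return result["answer"]
+ # ...then pass fn=chat_adapter to gr.ChatInterface.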