Spaces:
Sleeping
Sleeping
Commit
·
0ec978f
1
Parent(s):
fedd57c
Initial commit with Database and PDF
Browse files- .env.example +1 -0
- .github/workflows/sync_to_hub.yml +20 -0
- .gitignore +207 -0
- Dockerfile +13 -0
- LICENSE +21 -0
- README.md +121 -0
- api/__init__.py +0 -0
- api/main.py +82 -0
- api/schemas.py +16 -0
- chroma_db/6a26d0e5-feba-4f8b-a92a-f8d1220144dc/data_level0.bin +3 -0
- chroma_db/6a26d0e5-feba-4f8b-a92a-f8d1220144dc/header.bin +3 -0
- chroma_db/6a26d0e5-feba-4f8b-a92a-f8d1220144dc/length.bin +3 -0
- chroma_db/6a26d0e5-feba-4f8b-a92a-f8d1220144dc/link_lists.bin +0 -0
- chroma_db/chroma.sqlite3 +3 -0
- config/__init.py +0 -0
- config/settings.py +21 -0
- core/__init.py +0 -0
- core/prompts.py +37 -0
- core/rag_pipeline.py +72 -0
- data/employee_handbook_-_2024.pdf +3 -0
- ingest.py +56 -0
- requirements.txt +14 -0
- services/__init.py +0 -0
- services/document_processor.py +51 -0
- services/llm_client.py +26 -0
- services/vector_store.py +49 -0
- utils/__init.py +0 -0
- utils/vision_helper.py +0 -0
- web_ui/index.html +54 -0
- web_ui/script.js +189 -0
- web_ui/style.css +139 -0
.env.example
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
GROQ_API_KEY=your_groq_api_key_here
|
.github/workflows/sync_to_hub.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face hub
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches: [main]
|
| 5 |
+
|
| 6 |
+
workflow_dispatch:
|
| 7 |
+
|
| 8 |
+
jobs:
|
| 9 |
+
sync-to-hub:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
steps:
|
| 12 |
+
- uses: actions/checkout@v3
|
| 13 |
+
with:
|
| 14 |
+
fetch-depth: 0
|
| 15 |
+
lfs: true
|
| 16 |
+
- name: Push to hub
|
| 17 |
+
env:
|
| 18 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 19 |
+
run: |
|
| 20 |
+
git push https://YousefMohtady1:$HF_TOKEN@huggingface.co/spaces/YousefMohtady1/CorpGuideAI main
|
.gitignore
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
# .vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
|
| 4 |
+
COPY requirements.txt .
|
| 5 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 6 |
+
|
| 7 |
+
COPY . .
|
| 8 |
+
|
| 9 |
+
RUN chmod -R 777 .
|
| 10 |
+
|
| 11 |
+
EXPOSE 7860
|
| 12 |
+
|
| 13 |
+
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Yousef Mohtady
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CorpGuideAI - HR Policy Assistant
|
| 2 |
+
|
| 3 |
+
**CorpGuideAI** is an advanced AI-powered assistant designed to help employees navigate and understand internal HR policies. Leveraging **Retrieval-Augmented Generation (RAG)**, it provides accurate, context-aware answers based on your organization's PDF documents.
|
| 4 |
+
|
| 5 |
+
## 🚀 Features
|
| 6 |
+
|
| 7 |
+
- **RAG Architecture**: Combines vector search with generative AI for precise answers.
|
| 8 |
+
- **LLM Integration**: Powered by **Groq** (using Llama models) for fast and efficient inference.
|
| 9 |
+
- **Vector Database**: Uses **ChromaDB** for efficient document storage and retrieval.
|
| 10 |
+
- **Smart Ingestion**:
|
| 11 |
+
- Extracts text from PDFs.
|
| 12 |
+
- Improves retrieval speed and accuracy with semantic chunking.
|
| 13 |
+
- Uses `Alibaba-NLP/gte-multilingual-base` for robust multilingual support.
|
| 14 |
+
- **Interactive Web UI**: A modern, clean chat interface to interact with the assistant.
|
| 15 |
+
- **Chat History Management**: Maintains context across the conversation session (Reset supported).
|
| 16 |
+
- **FastAPI Backend**: A high-performance API to serve requests.
|
| 17 |
+
- **Docker Support**: Containerized for easy deployment.
|
| 18 |
+
|
| 19 |
+
## 🛠️ Tech Stack
|
| 20 |
+
|
| 21 |
+
- **Language**: Python 3.10+
|
| 22 |
+
- **Frontend**: HTML5, CSS3, Vanilla JS
|
| 23 |
+
- **Backend**: FastAPI, Uvicorn
|
| 24 |
+
- **AI/ML**: LangChain, HuggingFace, ChromaDB, Groq
|
| 25 |
+
- **Tools**: `pypdf`, `sentence-transformers`, Docker
|
| 26 |
+
|
| 27 |
+
## 📂 Project Structure
|
| 28 |
+
|
| 29 |
+
```bash
|
| 30 |
+
CorpGuideAI-HR-Policy-Assistant/
|
| 31 |
+
├── api/
|
| 32 |
+
│ ├── main.py # FastAPI application & entry point
|
| 33 |
+
│ ├── schemas.py # Pydantic models
|
| 34 |
+
├── config/
|
| 35 |
+
│ ├── settings.py # Configuration settings
|
| 36 |
+
├── core/
|
| 37 |
+
│ ├── rag_pipeline.py # Core RAG logic & Chat History
|
| 38 |
+
│ ├── prompts.py # Prompt templates
|
| 39 |
+
├── data/ # PDF documents storage
|
| 40 |
+
├── services/
|
| 41 |
+
│ ├── document_processor.py
|
| 42 |
+
│ ├── vector_store.py
|
| 43 |
+
│ ├── llm_client.py
|
| 44 |
+
├── web_ui/ # Frontend Application
|
| 45 |
+
│ ├── index.html
|
| 46 |
+
│ ├── script.js
|
| 47 |
+
│ ├── style.css
|
| 48 |
+
├── ingest.py # Document ingestion script
|
| 49 |
+
├── Dockerfile # Docker container configuration
|
| 50 |
+
├── requirements.txt # Dependencies
|
| 51 |
+
└── README.md # Documentation
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## ⚡ Prerequisites
|
| 55 |
+
|
| 56 |
+
- **Python 3.10+**
|
| 57 |
+
- **Groq API Key**: Get it from [Groq Console](https://console.groq.com/).
|
| 58 |
+
|
| 59 |
+
## 📦 Installation & Usage
|
| 60 |
+
|
| 61 |
+
### Option 1: Local Installation
|
| 62 |
+
|
| 63 |
+
1. **Clone & Setup**:
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
git clone <repository-url>
|
| 67 |
+
cd CorpGuideAI-HR-Policy-Assistant
|
| 68 |
+
python -m venv venv
|
| 69 |
+
|
| 70 |
+
# Windows
|
| 71 |
+
venv\Scripts\activate
|
| 72 |
+
# macOS/Linux
|
| 73 |
+
source venv/bin/activate
|
| 74 |
+
|
| 75 |
+
pip install -r requirements.txt
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
2. **Environment Variables**:
|
| 79 |
+
Create a `.env` file:
|
| 80 |
+
|
| 81 |
+
```env
|
| 82 |
+
GROQ_API_KEY=your_groq_api_key_here
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
3. **Ingest Documents**:
|
| 86 |
+
Place PDFs in `data/` and run:
|
| 87 |
+
|
| 88 |
+
```bash
|
| 89 |
+
python ingest.py
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
4. **Run Server**:
|
| 93 |
+
```bash
|
| 94 |
+
uvicorn api.main:app --reload
|
| 95 |
+
```
|
| 96 |
+
Access the Web UI at: `http://localhost:8000`
|
| 97 |
+
|
| 98 |
+
### Option 2: Docker
|
| 99 |
+
|
| 100 |
+
1. **Build Image**:
|
| 101 |
+
|
| 102 |
+
```bash
|
| 103 |
+
docker build -t corpguide-ai .
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
2. **Run Container**:
|
| 107 |
+
```bash
|
| 108 |
+
docker run -p 7860:7860 --env-file .env corpguide-ai
|
| 109 |
+
```
|
| 110 |
+
Access at: `http://localhost:7860`
|
| 111 |
+
|
| 112 |
+
## 🔗 API Endpoints
|
| 113 |
+
|
| 114 |
+
- `GET /`: Serves the Web UI.
|
| 115 |
+
- `POST /chat`: Chat endpoint.
|
| 116 |
+
- Body: `{ "question": "..." }` (History managed internally)
|
| 117 |
+
- `POST /reset`: Clears current chat history.
|
| 118 |
+
|
| 119 |
+
## 📄 License
|
| 120 |
+
|
| 121 |
+
MIT License - see [LICENSE](LICENSE) file.
|
api/__init__.py
ADDED
|
File without changes
|
api/main.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from api.schemas import ChatRequest, ChatResponse, UploadResponse
from core.rag_pipeline import RagPipeline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Long-lived objects created at startup; currently just the RAG pipeline.
pipeline_resources = {}

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the RAG pipeline on startup; release resources on shutdown."""
    logger.info("Starting CorpGuide AI API...")
    try:
        pipeline_resources["rag"] = RagPipeline()
        logger.info("CorpGuide AI API ready for queries")
    except Exception:
        # logger.exception records the full traceback, not just str(e).
        logger.exception("Failed to initialize CorpGuide AI API")
        raise

    yield

    pipeline_resources.clear()
    logger.info("API shut down.")

app = FastAPI(
    title="CorpGuide AI",
    description="AI-powered HR policy assistant",
    version="1.0.0",
    lifespan=lifespan
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.mount("/static", StaticFiles(directory="web_ui"), name="static")

@app.get("/")
async def read_index():
    """Serve the chat web UI."""
    return FileResponse("web_ui/index.html")

@app.post("/reset")
async def reset_chat():
    """Clear the server-side chat history kept by the RAG pipeline."""
    try:
        if 'rag' in pipeline_resources and hasattr(pipeline_resources['rag'], 'clear_history'):
            pipeline_resources['rag'].clear_history()
        return {"message": "Chat history has been reset."}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Answer a question via the RAG pipeline.

    History is managed server-side; ``request.chat_history`` is forwarded
    for API compatibility but the pipeline keeps its own session history.
    """
    try:
        if 'rag' not in pipeline_resources:
            raise HTTPException(status_code=503, detail="System is initializing, try again later")

        rag = pipeline_resources['rag']

        result = rag.process_query(
            question=request.question,
            chat_history=request.chat_history
        )

        return ChatResponse(
            answer=result["answer"],
            sources=result["sources"],
            latency=result["latency"]
        )

    except HTTPException:
        # BUG FIX: the broad handler below used to catch the deliberate
        # 503 raised above and re-wrap it as a 500. Let intentional HTTP
        # errors propagate with their original status code.
        raise
    except Exception as e:
        logger.error(f"Error in chat: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
api/schemas.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
from typing import List, Optional, Tuple


class ChatRequest(BaseModel):
    """Payload for POST /chat."""
    question: str
    # (human, assistant) turn pairs. History is managed server-side, so
    # clients may leave this empty.
    chat_history: List[Tuple[str, str]] = []


class ChatResponse(BaseModel):
    """Answer returned by POST /chat."""
    answer: str
    sources: List[str]
    latency: Optional[float] = None


class UploadResponse(BaseModel):
    """Result of a document upload/ingestion request."""
    filename: str
    chunks_count: int
    message: str
|
chroma_db/6a26d0e5-feba-4f8b-a92a-f8d1220144dc/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7e2a5c66a30e0d9228b85d06681048e2d25425ad5b7f8f10b672c87ac37e001
|
| 3 |
+
size 321200
|
chroma_db/6a26d0e5-feba-4f8b-a92a-f8d1220144dc/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03cb3ac86f3e5bcb15e88b9bf99f760ec6b33e31d64a699e129b49868db6d733
|
| 3 |
+
size 100
|
chroma_db/6a26d0e5-feba-4f8b-a92a-f8d1220144dc/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff6c03ed65fe1386e0c057caca3ac3771497dc92063f9713f93abdf9b6399b2a
|
| 3 |
+
size 400
|
chroma_db/6a26d0e5-feba-4f8b-a92a-f8d1220144dc/link_lists.bin
ADDED
|
File without changes
|
chroma_db/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d49857011870b48f6c378045d8d713fd1dac413de0a216d3003022472d80ea7f
|
| 3 |
+
size 1146880
|
config/__init.py
ADDED
|
File without changes
|
config/settings.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment.
load_dotenv()


class Settings:
    """Central application configuration, evaluated once at import time."""

    # --- API key (fail fast if missing) ---
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY is not set in the environment variables")

    # --- Models ---
    LLM_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
    EMBEDDING_MODEL = "Alibaba-NLP/gte-multilingual-base"

    # --- Vector DB ---
    CHROMA_PERSIST_DIR = "chroma_db"
    COLLECTION_NAME = "policy_docs"
    DATA_DIR = "data"


settings = Settings()
|
core/__init.py
ADDED
|
File without changes
|
core/prompts.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

SYSTEM_TEMPLATE = """
You are CorpGuide AI, an expert HR and Policy Assistant.
Your task is to answer the user's question based ONLY on the provided context below.

<context>
{context}
</context>

Guidelines:
1. If the answer is not in the context, strictly reply: "I'm sorry, I cannot find this information in the company policy documents."
2. Do not make up or hallucinate information.
3. Keep your answer professional, concise, and helpful.
4. If the question is in Arabic, answer in Arabic. If in English, answer in English.
5. Provide specific details (numbers, days, penalties) if available in the context.
"""

CONTEXTUALIZE_Q_SYSTEM_PROMPT = """
Given a chat history and the latest user question which might reference context in the chat history,
formulate a standalone question which can be understood without the chat history.
Do NOT answer the question, just reformulate it if needed and otherwise return it as is.
"""


def get_chat_prompt():
    """Answering prompt: system rules + conversation history + user turn."""
    messages = [
        ("system", SYSTEM_TEMPLATE),
        MessagesPlaceholder("chat_history"),
        ("user", "{input}"),
    ]
    return ChatPromptTemplate.from_messages(messages)


def get_contextualize_prompt():
    """Query-rewriting prompt used by the history-aware retriever."""
    messages = [
        ("system", CONTEXTUALIZE_Q_SYSTEM_PROMPT),
        MessagesPlaceholder("chat_history"),
        ("user", "{input}"),
    ]
    return ChatPromptTemplate.from_messages(messages)
|
core/rag_pipeline.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
import time
from langchain_core.messages import HumanMessage, AIMessage
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from config.settings import Settings
from services.llm_client import LLMClient
from services.vector_store import VectorStore
from core.prompts import get_chat_prompt, get_contextualize_prompt

logger = logging.getLogger(__name__)


class RagPipeline:
    """History-aware RAG chain over the policy-document vector store."""

    def __init__(self):
        try:
            self.llm = LLMClient().get_llm()
            self.vector_store = VectorStore()
            self.retriever = self.vector_store.get_retriever(k=5)
            # Build the answering prompt once and reuse it (the original
            # called get_chat_prompt() twice and never used the first).
            self.prompt = get_chat_prompt()
            self.history_aware_retriever = create_history_aware_retriever(
                self.llm,
                self.retriever,
                get_contextualize_prompt()
            )
            self.question_answer_chain = create_stuff_documents_chain(
                self.llm,
                self.prompt
            )
            self.rag_chain = create_retrieval_chain(
                self.history_aware_retriever,
                self.question_answer_chain
            )
            self.chat_history = []
            logger.info("RAG pipeline initialized successfully")

        except Exception:
            # logger.exception keeps the traceback for diagnosis.
            logger.exception("Error initializing RAG pipeline")
            raise

    def clear_history(self):
        """Forget the server-side conversation history."""
        self.chat_history = []
        logger.info("Chat history cleared")

    def process_query(self, question: str, chat_history: list = None):
        """Run one question through the RAG chain.

        The pipeline keeps its own ``self.chat_history``; the
        ``chat_history`` argument is accepted for API compatibility but
        is not used (history is managed internally).

        BUG FIX: the default was a mutable ``[]`` shared across all
        calls — a classic Python pitfall — replaced with ``None``.

        Returns a dict with keys ``answer``, ``sources`` (deduplicated
        source filenames), and ``latency`` (seconds).
        """
        start_time = time.time()
        try:
            logger.info(f"Processing query: {question}")
            response = self.rag_chain.invoke({
                "input": question,
                "chat_history": self.chat_history
            })

            self.chat_history.extend([
                HumanMessage(content=question),
                AIMessage(content=response["answer"])
            ])

            latency = time.time() - start_time

            # Deduplicate source filenames across the retrieved chunks.
            source_files = list({
                doc.metadata.get("source", "Unknown") for doc in response["context"]
            })

            return {
                "answer": response["answer"],
                "sources": source_files,
                "latency": latency
            }

        except Exception:
            logger.exception("Error processing query")
            raise
|
data/employee_handbook_-_2024.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60d7587505554f258c5758378243a21c4e0fa2484e730eee4c0e58aa245fde8c
|
| 3 |
+
size 314218
|
ingest.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import logging
import shutil
from services.document_processor import DocumentProcessor
from services.vector_store import VectorStore
from config.settings import Settings

logging.basicConfig(level=logging.INFO, format='%(asctime)s- %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def main():
    """Rebuild the Chroma index from every PDF in ``Settings.DATA_DIR``.

    Destroys any existing index first, then processes each PDF into
    chunks tagged with their source filename and stores them.
    """
    logger.info("Starting document ingestion process")

    # Start from a clean slate: drop any previous index.
    if os.path.exists(Settings.CHROMA_PERSIST_DIR):
        logger.warning(f"Removing existing database at {Settings.CHROMA_PERSIST_DIR}")
        shutil.rmtree(Settings.CHROMA_PERSIST_DIR)

    processor = DocumentProcessor()
    vector_store = VectorStore()

    if not os.path.exists(Settings.DATA_DIR):
        os.makedirs(Settings.DATA_DIR)
        logger.info(f"Data directory '{Settings.DATA_DIR}' not found. Created it. Please add PDFs there.")
        return

    # BUG FIX: extension match was case-sensitive and skipped '.PDF'/'.Pdf'.
    pdf_files = [f for f in os.listdir(Settings.DATA_DIR) if f.lower().endswith('.pdf')]

    if not pdf_files:
        logger.warning("No PDF files found in the data directory. Please add PDFs there.")
        return

    total_chunks = 0

    for pdf_file in pdf_files:
        file_path = os.path.join(Settings.DATA_DIR, pdf_file)
        logger.info(f"Processing: {pdf_file}...")

        try:
            chunks = processor.process_pdf(file_path)

            # Tag every chunk with its originating file for source attribution.
            for chunk in chunks:
                chunk.metadata['source'] = pdf_file

            vector_store.add_documents(chunks)
            total_chunks += len(chunks)
            logger.info(f"Processed {len(chunks)} chunks from {pdf_file}")

        except Exception as e:
            # Best-effort: a bad PDF should not abort the whole ingestion.
            logger.error(f"Failed to process {pdf_file}: {str(e)}")
            continue

    logger.info(f"Ingestion Completed. Total chunks stored: {total_chunks}")


if __name__ == "__main__":
    main()
|
| 56 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.124.4
|
| 2 |
+
uvicorn==0.38.0
|
| 3 |
+
python-multipart==0.0.20
|
| 4 |
+
python-dotenv==1.2.1
|
| 5 |
+
langchain==0.3.0
|
| 6 |
+
langchain-groq==0.2.0
|
| 7 |
+
langchain-huggingface==0.1.0
|
| 8 |
+
langchain-chroma==1.1.0
|
| 9 |
+
langchain-community==0.3.0
|
| 10 |
+
langchain-experimental==0.4.1
|
| 11 |
+
chromadb==1.3.7
|
| 12 |
+
pypdf==6.4.1
|
| 13 |
+
sentence-transformers==5.2.0
|
| 14 |
+
aiofiles==24.1.0
|
services/__init.py
ADDED
|
File without changes
|
services/document_processor.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import List
|
| 3 |
+
from langchain_core.documents import Document
|
| 4 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 5 |
+
from langchain_experimental.text_splitter import SemanticChunker
|
| 6 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 7 |
+
from config.settings import Settings
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
logging.basicConfig(level=logging.INFO)
|
| 11 |
+
|
| 12 |
+
class DocumentProcessor:
    """Loads PDF files and splits them into semantically coherent chunks.

    A HuggingFace embedding model drives LangChain's ``SemanticChunker``,
    so chunk boundaries follow meaning rather than fixed character counts.
    """

    def __init__(self):
        """Initialize the embedding model and the semantic text splitter.

        Raises:
            Exception: re-raised if the embedding model or the splitter
                fails to load.
        """
        try:
            logger.info(f"Loading embeddings model: {Settings.EMBEDDING_MODEL}")

            # Embeddings back the semantic splitter; trust_remote_code is
            # required for Hub models that ship custom modelling code.
            self.embeddings = HuggingFaceEmbeddings(
                model_name=Settings.EMBEDDING_MODEL,
                model_kwargs={"trust_remote_code": True},
            )

            # Split on semantic breakpoints (percentile of embedding
            # distance) instead of fixed-size windows.
            self.text_splitter = SemanticChunker(
                embeddings=self.embeddings,
                breakpoint_threshold_type="percentile",
            )
            logger.info("DocumentProcessor initialized successfully")

        except Exception:
            # Original message dropped the actual error; log with the
            # traceback and re-raise without rebinding (bare raise keeps
            # the original exception context intact).
            logger.exception("Error initializing DocumentProcessor")
            raise

    def process_pdf(self, file_path: str) -> List[Document]:
        """Load a PDF and return its content as semantic chunks.

        Args:
            file_path: Path to the PDF file on disk.

        Returns:
            List of LangChain ``Document`` chunks.

        Raises:
            Exception: re-raised if loading or splitting fails.
        """
        try:
            logger.info(f"Processing file: {file_path}")

            # PyPDFLoader yields one Document per page.
            loader = PyPDFLoader(file_path)
            raw_documents = loader.load()
            logger.info(f"Loaded {len(raw_documents)} pages from PDF.")

            chunks = self.text_splitter.split_documents(raw_documents)
            logger.info(f"Created {len(chunks)} semantic chunks.")

            return chunks

        except Exception:
            logger.exception(f"Error processing PDF: {file_path}")
            raise
|
services/llm_client.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from langchain_groq import ChatGroq
|
| 3 |
+
from config.settings import Settings
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
class LLMClient:
    """Thin wrapper around the Groq chat model used by the RAG pipeline."""

    def __init__(self):
        """Create the ChatGroq client from application settings.

        Raises:
            Exception: re-raised if the client cannot be constructed
                (e.g. missing or invalid API key).
        """
        try:
            logger.info(f"Initializing Groq LLM with model: {Settings.LLM_MODEL}")

            # temperature=0 keeps answers deterministic for policy Q&A;
            # streaming=True lets the API layer forward tokens as they arrive.
            self.llm = ChatGroq(
                api_key=Settings.GROQ_API_KEY,
                model_name=Settings.LLM_MODEL,
                temperature=0.0,
                max_retries=2,
                streaming=True,
            )
            logger.info("LLM initialized successfully")

        except Exception:
            # logger.exception records the traceback; bare raise preserves
            # the original exception instead of rebinding it.
            logger.exception("Failed to initialize LLM Client")
            raise

    def get_llm(self):
        """Return the underlying ChatGroq instance."""
        return self.llm
|
services/vector_store.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import shutil
|
| 4 |
+
from typing import List
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
from langchain_chroma import Chroma
|
| 7 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 8 |
+
from config.settings import Settings
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
class VectorStore:
    """Persistent, ChromaDB-backed vector store for document chunks.

    Wraps a Chroma collection plus the embedding model used to vectorize
    both stored documents and incoming queries.
    """

    def __init__(self):
        """Connect to (or create) the persistent Chroma collection.

        Raises:
            Exception: re-raised if the embedding model or the Chroma
                collection cannot be initialized.
        """
        try:
            # Must match the model used at ingestion time, otherwise query
            # vectors and stored vectors live in incompatible spaces.
            self.embeddings = HuggingFaceEmbeddings(
                model_name=Settings.EMBEDDING_MODEL,
                model_kwargs={"trust_remote_code": True},
            )

            self.vector_db = Chroma(
                persist_directory=Settings.CHROMA_PERSIST_DIR,
                embedding_function=self.embeddings,
                collection_name=Settings.COLLECTION_NAME,
            )
            logger.info(f"VectorStore connected to {Settings.CHROMA_PERSIST_DIR}")

        except Exception:
            # Log with traceback; bare raise keeps the original context.
            logger.exception("Failed to initialize VectorStore")
            raise

    def add_documents(self, documents: List[Document]) -> None:
        """Embed and persist a batch of documents.

        Args:
            documents: Chunks to store; an empty list is a logged no-op.

        Raises:
            Exception: re-raised if the Chroma insert fails.
        """
        # Guard clause outside the try: an empty batch is not an error.
        if not documents:
            logger.warning("No documents provided to add to the VectorStore")
            return

        try:
            logger.info(f"Adding {len(documents)} documents to ChromaDB...")
            self.vector_db.add_documents(documents)
            logger.info("Documents added to ChromaDB successfully")

        except Exception:
            logger.exception("Error adding documents")
            raise

    def get_retriever(self, k: int = 5):
        """Return a similarity-search retriever over the collection.

        Args:
            k: Number of nearest chunks to return per query.
        """
        return self.vector_db.as_retriever(
            search_type="similarity",
            search_kwargs={"k": k},
        )
|
utils/__init.py
ADDED
|
File without changes
|
utils/vision_helper.py
ADDED
|
File without changes
|
web_ui/index.html
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>CorpGuide AI Assistant</title>
    <!-- App styles and Font Awesome icons; /static is served by the backend -->
    <link rel="stylesheet" href="/static/style.css" />
    <link
      rel="stylesheet"
      href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css"
    />
  </head>
  <body>
    <!-- Single-column chat widget: header / scrolling message list / input row -->
    <div class="chat-container">
      <div class="chat-header">
        <div class="logo">
          <i class="fa-solid fa-robot"></i>
          <span>CorpGuide AI</span>
        </div>
        <!-- Clears the on-screen conversation and the client-side history
             (startNewChat is defined in script.js) -->
        <button
          onclick="startNewChat()"
          class="new-chat-btn"
          title="Start New Chat"
        >
          <i class="fa-solid fa-arrows-rotate"></i>
        </button>
        <div class="status-dot"></div>
      </div>

      <!-- Message list; script.js appends .message elements here -->
      <div class="chat-box" id="chat-box">
        <div class="message bot-message">
          <div class="msg-content">
            Hi, I am CorpGuide AI. Ask me any question about your company's
            policies.
          </div>
        </div>
      </div>

      <div class="input-area">
        <input
          type="text"
          id="user-input"
          placeholder="Ask me any question about your company's policies."
          autocomplete="off"
        />
        <!-- NOTE(review): this inline onclick AND an addEventListener in
             script.js both bind sendMessage to this button — confirm only
             one binding is intended -->
        <button onclick="sendMessage()" id="send-btn">
          <i class="fa-solid fa-paper-plane"></i>
        </button>
      </div>
    </div>

    <script src="/static/script.js"></script>
  </body>
</html>
|
web_ui/script.js
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// ==========================================
|
| 2 |
+
// Connection and Memory Settings
|
| 3 |
+
// ==========================================
|
| 4 |
+
|
| 5 |
+
// Relative URL to work both locally and on server
|
| 6 |
+
const API_URL = "/chat";
|
| 7 |
+
const RESET_URL = "/reset";
|
| 8 |
+
|
| 9 |
+
// Array to store chat history (so the model remembers context)
|
| 10 |
+
let chatHistory = [];
|
| 11 |
+
|
| 12 |
+
// Define page elements
|
| 13 |
+
const chatBox = document.getElementById('chat-box');
|
| 14 |
+
const userInput = document.getElementById('user-input');
|
| 15 |
+
const sendBtn = document.getElementById('send-btn');
|
| 16 |
+
|
| 17 |
+
// ==========================================
|
| 18 |
+
// Core Functions
|
| 19 |
+
// ==========================================
|
| 20 |
+
|
| 21 |
+
/**
|
| 22 |
+
* Function to send message and handle API interaction
|
| 23 |
+
*/
|
| 24 |
+
async function sendMessage() {
|
| 25 |
+
const question = userInput.value.trim();
|
| 26 |
+
|
| 27 |
+
// If no text, do nothing
|
| 28 |
+
if (!question) return;
|
| 29 |
+
|
| 30 |
+
// 1. Display user message in chat immediately
|
| 31 |
+
appendMessage(question, 'user');
|
| 32 |
+
userInput.value = ''; // Clear input
|
| 33 |
+
sendBtn.disabled = true; // Disable button to prevent double sending
|
| 34 |
+
|
| 35 |
+
// 2. Display "typing..." indicator
|
| 36 |
+
const loadingId = appendLoading();
|
| 37 |
+
|
| 38 |
+
try {
|
| 39 |
+
// 3. Prepare payload (question + old history)
|
| 40 |
+
const payload = {
|
| 41 |
+
question: question,
|
| 42 |
+
chat_history: chatHistory
|
| 43 |
+
};
|
| 44 |
+
|
| 45 |
+
// 4. Connect to server
|
| 46 |
+
const response = await fetch(API_URL, {
|
| 47 |
+
method: 'POST',
|
| 48 |
+
headers: { 'Content-Type': 'application/json' },
|
| 49 |
+
body: JSON.stringify(payload)
|
| 50 |
+
});
|
| 51 |
+
|
| 52 |
+
if (!response.ok) {
|
| 53 |
+
throw new Error(`Server Error: ${response.status}`);
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
const data = await response.json();
|
| 57 |
+
|
| 58 |
+
// 5. Remove loading indicator and show AI response
|
| 59 |
+
removeLoading(loadingId);
|
| 60 |
+
appendMessage(data.answer, 'bot', data.sources);
|
| 61 |
+
|
| 62 |
+
// 6. Update memory (for next question)
|
| 63 |
+
// Add question and answer to the list
|
| 64 |
+
chatHistory.push(["human", question]);
|
| 65 |
+
chatHistory.push(["ai", data.answer]);
|
| 66 |
+
|
| 67 |
+
} catch (error) {
|
| 68 |
+
console.error("Error:", error);
|
| 69 |
+
removeLoading(loadingId);
|
| 70 |
+
appendMessage("Sorry, an error occurred connecting to the server. Please ensure the backend is running. 😔", 'bot');
|
| 71 |
+
} finally {
|
| 72 |
+
// Re-enable button and focus input
|
| 73 |
+
sendBtn.disabled = false;
|
| 74 |
+
userInput.focus();
|
| 75 |
+
}
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
/**
 * Reset the conversation: clear local history, restore the welcome
 * message, and ask the backend to drop its state as well.
 */
async function startNewChat() {
    // Wipe the browser-side memory first — the next /chat request will
    // carry an empty history regardless of what the backend does.
    chatHistory = [];

    chatBox.innerHTML = `
        <div class="message bot-message">
            <div class="msg-content">
                Welcome back! 👋<br>Memory cleared, you can start a new topic.
            </div>
        </div>
    `;

    // Best-effort server reset; a stateless backend may not expose this.
    try {
        await fetch(RESET_URL, { method: 'POST' });
        console.log("Backend history reset.");
    } catch (e) {
        console.warn("Backend reset failed (might be stateless):", e);
    }
}
|
| 102 |
+
|
| 103 |
+
// ==========================================
|
| 104 |
+
// UI Helpers
|
| 105 |
+
// ==========================================
|
| 106 |
+
|
| 107 |
+
/**
 * Add a message bubble to the chat.
 * @param {string} text - Message text (may contain \n and **bold** markers)
 * @param {string} sender - Sender ('user' or 'bot')
 * @param {Array} sources - List of sources (optional, display disabled)
 */
function appendMessage(text, sender, sources = []) {
    const msgDiv = document.createElement('div');
    msgDiv.classList.add('message', sender === 'user' ? 'user-message' : 'bot-message');

    // SECURITY: this string is assigned to innerHTML below, and both the
    // user's input and the model's output are untrusted — escape HTML
    // metacharacters first so markup in a message cannot execute.
    const escaped = text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    // Minimal formatting: newlines -> <br>; **text** -> bold.
    // A library like 'marked' would give full Markdown, but this is simpler.
    let formattedText = escaped.replace(/\n/g, '<br>');
    formattedText = formattedText.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>');

    let html = `<div class="msg-content">${formattedText}</div>`;

    /*
    // If there are sources, add them in a small box below - DISABLED BY REQUEST
    if (sources && sources.length > 0) {
        // Remove duplicates from filenames
        const uniqueSources = [...new Set(sources)];
        html += `<div class="sources-box">📚 Sources: ${uniqueSources.join(', ')}</div>`;
    }
    */

    msgDiv.innerHTML = html;
    chatBox.appendChild(msgDiv);
    scrollToBottom();
}
|
| 139 |
+
|
| 140 |
+
/**
 * Insert a temporary "typing" bubble (three animated dots).
 * @returns {string} DOM id of the placeholder, for later removal.
 */
function appendLoading() {
    // Timestamp-based id is unique enough for one placeholder at a time.
    const id = 'loading-' + Date.now();

    const bubble = document.createElement('div');
    bubble.classList.add('message', 'bot-message');
    bubble.id = id;
    bubble.innerHTML = `
        <div class="msg-content">
            <div class="typing-indicator">
                <span></span><span></span><span></span>
            </div>
        </div>`;

    chatBox.appendChild(bubble);
    scrollToBottom();
    return id;
}
|
| 158 |
+
|
| 159 |
+
/**
 * Remove the "typing" placeholder bubble, if it is still present.
 * @param {string} id - DOM id returned by appendLoading().
 */
function removeLoading(id) {
    // Optional chaining: a missing element is simply a no-op.
    document.getElementById(id)?.remove();
}
|
| 166 |
+
|
| 167 |
+
/**
 * Keep the newest message in view by scrolling the chat box to its end.
 */
function scrollToBottom() {
    chatBox.scrollTo({ top: chatBox.scrollHeight });
}
|
| 173 |
+
|
| 174 |
+
// ==========================================
// Event Listeners
// ==========================================

// NOTE(review): index.html also wires this button via an inline
// onclick="sendMessage()", so a click currently invokes the handler twice;
// the second call bails out only because the input is already empty.
// Confirm which binding should be kept and remove the other.
sendBtn.addEventListener('click', sendMessage);

// Send on Enter. 'keydown' replaces the deprecated 'keypress' event,
// which modern browsers may stop delivering.
userInput.addEventListener('keydown', (e) => {
    if (e.key === 'Enter') {
        sendMessage();
    }
});

// Focus the input box when the page loads.
window.onload = () => userInput.focus();
|
web_ui/style.css
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Page shell: center the chat card in the viewport. */
body {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    background-color: #f0f2f5;
    margin: 0;
    display: flex;
    justify-content: center;
    align-items: center;
    height: 100vh;
}

/* Card layout: header on top, scrolling chat box, input row pinned below. */
.chat-container {
    width: 100%;
    max-width: 500px; /* Suitable size for mobile and desktop */
    background: white;
    height: 90vh;
    border-radius: 15px;
    box-shadow: 0 10px 25px rgba(0,0,0,0.1);
    display: flex;
    flex-direction: column;
    overflow: hidden;
}

.chat-header {
    background: #2c3e50;
    color: white;
    padding: 15px;
    display: flex;
    align-items: center;
    justify-content: space-between;
}

/* Circular "new chat" reset button in the header. */
.new-chat-btn {
    background: transparent;
    border: 1px solid rgba(255, 255, 255, 0.3);
    color: white;
    width: 35px;
    height: 35px;
    border-radius: 50%;
    cursor: pointer;
    display: flex;
    align-items: center;
    justify-content: center;
    transition: all 0.3s ease;
}

.new-chat-btn:hover {
    background: rgba(255, 255, 255, 0.2);
    transform: rotate(180deg); /* Cool rotation effect on hover */
}

.logo { font-size: 1.2rem; font-weight: bold; }
/* NOTE(review): with dir="ltr" and the icon placed before the label,
   margin-left adds space on the outer side of the icon — margin-right
   may have been intended; confirm visually. */
.logo i { margin-left: 10px; color: #3498db; }
.status-dot { width: 10px; height: 10px; background: #2ecc71; border-radius: 50%; }

/* Scrollable message list; script.js appends .message elements here. */
.chat-box {
    flex: 1;
    padding: 20px;
    overflow-y: auto;
    background: #f9f9f9;
    display: flex;
    flex-direction: column;
    gap: 15px;
}

/* Bot bubbles hug the left edge, user bubbles the right. */
.message { display: flex; flex-direction: column; max-width: 80%; }
.bot-message { align-self: flex-start; }
.user-message { align-self: flex-end; }

.msg-content {
    padding: 12px 16px;
    border-radius: 15px;
    font-size: 0.95rem;
    line-height: 1.5;
    position: relative;
}

.bot-message .msg-content {
    background: #e9ecef;
    color: #333;
    border-bottom-right-radius: 2px;
}

.user-message .msg-content {
    background: #3498db;
    color: white;
    border-bottom-left-radius: 2px;
}

/* Source-citation box under a bot message (feature currently disabled in JS). */
.sources-box {
    font-size: 0.8rem;
    color: #666;
    margin-top: 5px;
    background: #fff;
    padding: 5px 10px;
    border-radius: 5px;
    border: 1px solid #ddd;
}

/* Input row: text field plus circular send button. */
.input-area {
    padding: 15px;
    background: white;
    border-top: 1px solid #eee;
    display: flex;
    gap: 10px;
}

input {
    flex: 1;
    padding: 12px;
    border: 1px solid #ddd;
    border-radius: 25px;
    outline: none;
    font-family: inherit;
}

button {
    background: #2c3e50;
    color: white;
    border: none;
    width: 45px;
    height: 45px;
    border-radius: 50%;
    cursor: pointer;
    transition: 0.2s;
}

button:hover { background: #34495e; }
button:disabled { background: #ccc; cursor: not-allowed; }

/* Loading Animation */
.typing-indicator span {
    display: inline-block;
    width: 6px; height: 6px;
    background-color: #555;
    border-radius: 50%;
    animation: typing 1s infinite;
    margin: 0 2px;
}
@keyframes typing { 0% {opacity: 0.3} 50% {opacity: 1} 100% {opacity: 0.3} }
|