Spaces:
Running
Running
Asish Karthikeya Gogineni
commited on
Commit
·
a3bdcf1
1
Parent(s):
5059b8f
Refactor: Code Structure Update & UI Redesign
Browse files- ARCHITECTURE_WALKTHROUGH.md +879 -0
- CODE_OF_CONDUCT.md +0 -128
- api/routes/index.py +6 -6
- app.py +6 -6
- architecture_viz.jsx +625 -0
- code_chatbot/{agent_workflow.py → agents/agent_workflow.py} +2 -2
- code_chatbot/{crews → agents/crews}/__init__.py +0 -0
- code_chatbot/{tools.py → agents/tools.py} +0 -0
- code_chatbot/analysis/__init__.py +0 -0
- code_chatbot/{ast_analysis.py → analysis/ast_analysis.py} +0 -0
- code_chatbot/{code_symbols.py → analysis/code_symbols.py} +1 -1
- code_chatbot/core/__init__.py +0 -0
- code_chatbot/{config.py → core/config.py} +0 -0
- code_chatbot/{db_connection.py → core/db_connection.py} +0 -0
- code_chatbot/{path_obfuscator.py → core/path_obfuscator.py} +0 -0
- code_chatbot/{prompts.py → core/prompts.py} +0 -0
- code_chatbot/{rate_limiter.py → core/rate_limiter.py} +0 -0
- code_chatbot/ingestion/__init__.py +0 -0
- code_chatbot/{chunker.py → ingestion/chunker.py} +0 -0
- code_chatbot/{incremental_indexing.py → ingestion/incremental_indexing.py} +3 -3
- code_chatbot/{indexer.py → ingestion/indexer.py} +6 -6
- code_chatbot/{indexing_progress.py → ingestion/indexing_progress.py} +7 -7
- code_chatbot/{merkle_tree.py → ingestion/merkle_tree.py} +0 -0
- code_chatbot/{universal_ingestor.py → ingestion/universal_ingestor.py} +44 -4
- code_chatbot/mcp/__init__.py +0 -0
- code_chatbot/{mcp_client.py → mcp/mcp_client.py} +1 -1
- code_chatbot/{mcp_server.py → mcp/mcp_server.py} +0 -0
- code_chatbot/retrieval/__init__.py +0 -0
- code_chatbot/{graph_rag.py → retrieval/graph_rag.py} +0 -0
- code_chatbot/{llm_retriever.py → retrieval/llm_retriever.py} +0 -0
- code_chatbot/{rag.py → retrieval/rag.py} +25 -16
- code_chatbot/{reranker.py → retrieval/reranker.py} +0 -0
- code_chatbot/{retriever_wrapper.py → retrieval/retriever_wrapper.py} +1 -1
- components/file_explorer.py +1 -1
- components/multi_mode.py +3 -3
- components/sidebar.py +1 -1
- pages/1_⚡_Code_Studio.py +72 -63
- pages/1_⚡_Code_Studio.py.bak +118 -0
- tests/test_merkle_tree_simple.py +1 -1
ARCHITECTURE_WALKTHROUGH.md
ADDED
|
@@ -0,0 +1,879 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🕷️ Code Crawler - Complete Architecture Walkthrough
|
| 2 |
+
|
| 3 |
+
## Table of Contents
|
| 4 |
+
1. [Project Overview](#project-overview)
|
| 5 |
+
2. [System Architecture](#system-architecture)
|
| 6 |
+
3. [Data Flow Pipeline](#data-flow-pipeline)
|
| 7 |
+
4. [RAG Implementation](#rag-implementation)
|
| 8 |
+
5. [AST Analysis & Graph Creation](#ast-analysis--graph-creation)
|
| 9 |
+
6. [Code Chunking Strategy](#code-chunking-strategy)
|
| 10 |
+
7. [Retrieval System](#retrieval-system)
|
| 11 |
+
8. [Agentic Workflow](#agentic-workflow)
|
| 12 |
+
9. [Frontend & API](#frontend--api)
|
| 13 |
+
10. [Component Deep Dives](#component-deep-dives)
|
| 14 |
+
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
## Project Overview
|
| 18 |
+
|
| 19 |
+
**Code Crawler** is an AI-powered codebase assistant that combines multiple advanced techniques:
|
| 20 |
+
|
| 21 |
+
- **RAG (Retrieval-Augmented Generation)**: Vector-based semantic search over code
|
| 22 |
+
- **AST Analysis**: Abstract Syntax Tree parsing for understanding code structure
|
| 23 |
+
- **Graph RAG**: Knowledge graph enhancement for relationship-aware retrieval
|
| 24 |
+
- **Agentic Workflows**: Multi-step reasoning with tool use (LangGraph)
|
| 25 |
+
- **Multi-LLM Support**: Gemini, Groq (Llama 3.3)
|
| 26 |
+
|
| 27 |
+
### Key Features
|
| 28 |
+
| Feature | Description |
|
| 29 |
+
|---------|-------------|
|
| 30 |
+
| 💬 Chat Mode | Natural language Q&A about codebase |
|
| 31 |
+
| 🔍 Search Mode | Regex pattern search across files |
|
| 32 |
+
| 🔧 Refactor Mode | AI-assisted code refactoring |
|
| 33 |
+
| ✨ Generate Mode | Spec generation (PO-friendly, Dev Specs, User Stories) |
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## System Architecture
|
| 38 |
+
|
| 39 |
+
```
|
| 40 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 41 |
+
β CODE CRAWLER SYSTEM β
|
| 42 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€
|
| 43 |
+
β β
|
| 44 |
+
β βββββββββββββββββββ βββββββββββββββββββ βββββββββββββββββββ β
|
| 45 |
+
β β DATA INGEST ββββββΆβ PROCESSING ββββββΆβ STORAGE β β
|
| 46 |
+
β β β β β β β β
|
| 47 |
+
β β β’ ZIP Files β β β’ AST Parsing β β β’ Vector DB β β
|
| 48 |
+
β β β’ GitHub URLs β β β’ Chunking β β (Chroma/FAISS)β β
|
| 49 |
+
β β β’ Local Dirs β β β’ Embeddings β β β’ AST Graph β β
|
| 50 |
+
β β β’ Web Docs β β β’ Graph Build β β (GraphML) β β
|
| 51 |
+
β βββββββββββββββββββ βββββββββββββββββββ ββββββββββ¬βββββββββ β
|
| 52 |
+
β β β
|
| 53 |
+
β βΌ β
|
| 54 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 55 |
+
β β RETRIEVAL LAYER β β
|
| 56 |
+
β β βββββββββββββββ βββββββββββββββ βββββββββββββββ βββββββββββββββ β β
|
| 57 |
+
β β β Vector β β LLM β β Graph β β Reranker β β β
|
| 58 |
+
β β β Retriever ββββ Retriever ββββ Enhanced ββββ (Cross- β β β
|
| 59 |
+
β β β β β β β Retriever β β Encoder) β β β
|
| 60 |
+
β β βββββββββββββββ βββββββββββββββ βββββββββββββββ βββββββββββββββ β β
|
| 61 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 62 |
+
β β β
|
| 63 |
+
β βΌ β
|
| 64 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 65 |
+
β β CHAT ENGINE β β
|
| 66 |
+
β β β β
|
| 67 |
+
β β ββββββββββββββββββββ ββββββββββββββββββββββββββββ β β
|
| 68 |
+
β β β Linear RAG β OR β Agentic Workflow β β β
|
| 69 |
+
β β β (Simple) β β (LangGraph) β β β
|
| 70 |
+
β β β β β β β β
|
| 71 |
+
β β β Query β Retrieveβ β Agent β Tool β Agent β β β
|
| 72 |
+
β β β β Answer β β β β β β
|
| 73 |
+
β β β β β search_codebase β β β
|
| 74 |
+
β β β β β read_file β β β
|
| 75 |
+
β β β β β list_files β β β
|
| 76 |
+
β β β β β find_callers β β β
|
| 77 |
+
β β ββββββββββββββββββββ ββββββββββββββββββββββββββββ β β
|
| 78 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 79 |
+
β β β
|
| 80 |
+
β βΌ β
|
| 81 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 82 |
+
β β FRONTEND LAYER β β
|
| 83 |
+
β β β β
|
| 84 |
+
β β Streamlit App FastAPI (REST) Next.js (React) β β
|
| 85 |
+
β β βββ app.py βββ /api/index βββ /chat β β
|
| 86 |
+
β β βββ Code_Studio.py βββ /api/chat βββ /generate β β
|
| 87 |
+
β β βββ /api/health βββ /search β β
|
| 88 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 89 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
---
|
| 93 |
+
|
| 94 |
+
## Data Flow Pipeline
|
| 95 |
+
|
| 96 |
+
### 1. Ingestion Flow
|
| 97 |
+
|
| 98 |
+
```
|
| 99 |
+
User Input (ZIP/GitHub/Local)
|
| 100 |
+
β
|
| 101 |
+
βΌ
|
| 102 |
+
βββββββββββββββββββββββββββββββββββββββββββ
|
| 103 |
+
β UniversalIngestor β
|
| 104 |
+
β (universal_ingestor.py) β
|
| 105 |
+
β β
|
| 106 |
+
β βββββββββββββββ βββββββββββββββββββ β
|
| 107 |
+
β β _detect_ β β Handler Classes β β
|
| 108 |
+
β β handler() ββββΆβ β β
|
| 109 |
+
β βββββββββββββββ β β’ ZIPFileManagerβ β
|
| 110 |
+
β β β’ GitHubRepoMgr β β
|
| 111 |
+
β β β’ LocalDirMgr β β
|
| 112 |
+
β β β’ WebDocManager β β
|
| 113 |
+
β βββββββββββββββββββ β
|
| 114 |
+
ββββββββββββββββββββββ¬βββββββββββββββββββββ
|
| 115 |
+
β
|
| 116 |
+
βΌ
|
| 117 |
+
List[Document] + local_path
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
**Example: GitHub Repository Processing**
|
| 121 |
+
|
| 122 |
+
```python
|
| 123 |
+
# 1. User provides: "https://github.com/owner/repo"
|
| 124 |
+
|
| 125 |
+
# 2. UniversalIngestor detects GitHub URL
|
| 126 |
+
ingestor = UniversalIngestor(source)
|
| 127 |
+
# delegate = GitHubRepoManager
|
| 128 |
+
|
| 129 |
+
# 3. Download (clone or ZIP fallback)
|
| 130 |
+
ingestor.download()
|
| 131 |
+
# Clones to: /tmp/code_chatbot/owner_repo/
|
| 132 |
+
|
| 133 |
+
# 4. Walk files
|
| 134 |
+
for content, metadata in ingestor.walk():
|
| 135 |
+
# content = "def hello(): ..."
|
| 136 |
+
# metadata = {"file_path": "/tmp/.../main.py", "source": "main.py"}
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
### 2. Indexing Flow
|
| 140 |
+
|
| 141 |
+
```
|
| 142 |
+
Documents
|
| 143 |
+
β
|
| 144 |
+
βΌ
|
| 145 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 146 |
+
β Indexer β
|
| 147 |
+
β (indexer.py) β
|
| 148 |
+
β β
|
| 149 |
+
β βββββββββββββββββββ βββββββββββββββββββ βββββββββββββββββ β
|
| 150 |
+
β β StructuralChunkerββββΆβ Embedding Model ββββΆβ Vector Store β β
|
| 151 |
+
β β β β (Gemini/HF) β β (Chroma/FAISS)β β
|
| 152 |
+
β βββββββββββββββββββ βββββββββββββββββββ βββββββββββββββββ β
|
| 153 |
+
β β
|
| 154 |
+
β Additionally: β
|
| 155 |
+
β βββββββββββββββββββ βββββββββββββββββββ β
|
| 156 |
+
β β ASTGraphBuilder ββββΆβ GraphML File β β
|
| 157 |
+
β βββββββββββββββββββ βββββββββββββββββββ β
|
| 158 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
---
|
| 162 |
+
|
| 163 |
+
## RAG Implementation
|
| 164 |
+
|
| 165 |
+
The RAG system in this project is implemented in `code_chatbot/retrieval/rag.py` with these key components:
|
| 166 |
+
|
| 167 |
+
### ChatEngine Class
|
| 168 |
+
|
| 169 |
+
```python
|
| 170 |
+
class ChatEngine:
|
| 171 |
+
def __init__(self, retriever, model_name, provider, ...):
|
| 172 |
+
# 1. Base retriever (from vector store)
|
| 173 |
+
self.base_retriever = retriever
|
| 174 |
+
|
| 175 |
+
# 2. Enhanced retriever with reranking
|
| 176 |
+
self.vector_retriever = build_enhanced_retriever(
|
| 177 |
+
base_retriever=retriever,
|
| 178 |
+
use_multi_query=use_multi_query,
|
| 179 |
+
use_reranking=True # Uses Cross-Encoder
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
# 3. LLM Retriever (file-aware)
|
| 183 |
+
self.llm_retriever = LLMRetriever(llm, repo_files)
|
| 184 |
+
|
| 185 |
+
# 4. Ensemble Retriever (combines both)
|
| 186 |
+
self.retriever = EnsembleRetriever(
|
| 187 |
+
retrievers=[self.vector_retriever, self.llm_retriever],
|
| 188 |
+
weights=[0.6, 0.4] # 60% vector, 40% LLM
|
| 189 |
+
)
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
### RAG Flow Example
|
| 193 |
+
|
| 194 |
+
```
|
| 195 |
+
User Query: "How does the authentication work?"
|
| 196 |
+
β
|
| 197 |
+
βΌ
|
| 198 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 199 |
+
β 1. RETRIEVAL β
|
| 200 |
+
β ββββββββββββββββββββ ββββββββββββββββββββ β
|
| 201 |
+
β β Vector Retriever β β LLM Retriever β β
|
| 202 |
+
β β β β β β
|
| 203 |
+
β β Semantic search β β LLM picks files β β
|
| 204 |
+
β β in Chroma DB β β from structure β β
|
| 205 |
+
β ββββββββββ¬ββββββββββ ββββββββββ¬ββββββββββ β
|
| 206 |
+
β β β β
|
| 207 |
+
β ββββββββββββββ¬βββββββββββββ β
|
| 208 |
+
β βΌ β
|
| 209 |
+
β βββββββββββββββββββββββ β
|
| 210 |
+
β β EnsembleRetriever β β
|
| 211 |
+
β β (60% + 40% weighted)β β
|
| 212 |
+
β βββββββββββ¬ββββββββββββ β
|
| 213 |
+
β β β
|
| 214 |
+
β βΌ β
|
| 215 |
+
β βββββββββββββββββββββββ β
|
| 216 |
+
β β Reranker β β
|
| 217 |
+
β β (Cross-Encoder) β β
|
| 218 |
+
β β ms-marco-MiniLM β β
|
| 219 |
+
β βββββββββββ¬ββββββββββββ β
|
| 220 |
+
β β β
|
| 221 |
+
β βΌ β
|
| 222 |
+
β Top 5 Most Relevant Docs β
|
| 223 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 224 |
+
β
|
| 225 |
+
βΌ
|
| 226 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 227 |
+
β 2. GENERATION β
|
| 228 |
+
β β
|
| 229 |
+
β System Prompt + Context + History + Question β
|
| 230 |
+
β β β
|
| 231 |
+
β βΌ β
|
| 232 |
+
β βββββββββββββββββββββββ β
|
| 233 |
+
β β LLM (Gemini/Groq) β β
|
| 234 |
+
β βββββββββββ¬ββββββββββββ β
|
| 235 |
+
β β β
|
| 236 |
+
β βΌ β
|
| 237 |
+
β Answer + Sources β
|
| 238 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
---
|
| 242 |
+
|
| 243 |
+
## AST Analysis & Graph Creation
|
| 244 |
+
|
| 245 |
+
The AST analysis is implemented in `code_chatbot/analysis/ast_analysis.py` using **tree-sitter** for multi-language parsing.
|
| 246 |
+
|
| 247 |
+
### How AST Parsing Works
|
| 248 |
+
|
| 249 |
+
```python
|
| 250 |
+
# Example: Parsing a Python file
|
| 251 |
+
|
| 252 |
+
# Source code:
|
| 253 |
+
"""
|
| 254 |
+
from typing import List
|
| 255 |
+
|
| 256 |
+
class UserService:
|
| 257 |
+
def __init__(self, db):
|
| 258 |
+
self.db = db
|
| 259 |
+
|
| 260 |
+
def get_user(self, user_id: int) -> User:
|
| 261 |
+
return self.db.find(user_id)
|
| 262 |
+
|
| 263 |
+
def create_user(self, name: str) -> User:
|
| 264 |
+
user = User(name=name)
|
| 265 |
+
self.db.save(user)
|
| 266 |
+
return user
|
| 267 |
+
"""
|
| 268 |
+
|
| 269 |
+
# tree-sitter parses this into an AST:
|
| 270 |
+
"""
|
| 271 |
+
module
|
| 272 |
+
βββ import_from_statement
|
| 273 |
+
β βββ module: "typing"
|
| 274 |
+
β βββ names: ["List"]
|
| 275 |
+
βββ class_definition
|
| 276 |
+
β βββ name: "UserService"
|
| 277 |
+
β βββ block
|
| 278 |
+
β βββ function_definition (name: "__init__")
|
| 279 |
+
β βββ function_definition (name: "get_user")
|
| 280 |
+
β β βββ call (function: "self.db.find")
|
| 281 |
+
β βββ function_definition (name: "create_user")
|
| 282 |
+
β βββ call (function: "User")
|
| 283 |
+
β βββ call (function: "self.db.save")
|
| 284 |
+
"""
|
| 285 |
+
```
|
| 286 |
+
|
| 287 |
+
### EnhancedCodeAnalyzer
|
| 288 |
+
|
| 289 |
+
```python
|
| 290 |
+
class EnhancedCodeAnalyzer:
|
| 291 |
+
"""Builds a knowledge graph from code"""
|
| 292 |
+
|
| 293 |
+
def __init__(self):
|
| 294 |
+
self.graph = nx.DiGraph() # NetworkX directed graph
|
| 295 |
+
self.functions = {} # node_id -> FunctionInfo
|
| 296 |
+
self.classes = {} # node_id -> ClassInfo
|
| 297 |
+
self.imports = {} # file_path -> [ImportInfo]
|
| 298 |
+
self.definitions = {} # name -> [node_ids]
|
| 299 |
+
```
|
| 300 |
+
|
| 301 |
+
### Graph Structure Example
|
| 302 |
+
|
| 303 |
+
```
|
| 304 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 305 |
+
β AST KNOWLEDGE GRAPH β
|
| 306 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€
|
| 307 |
+
β β
|
| 308 |
+
β Nodes: β
|
| 309 |
+
β ββββββββββββββββββββ β
|
| 310 |
+
β β Type: "file" β β
|
| 311 |
+
β β Name: "api.py" β β
|
| 312 |
+
β ββββββββββ¬ββββββββββ β
|
| 313 |
+
β β defines β
|
| 314 |
+
β βΌ β
|
| 315 |
+
β ββββββββββββββββββββ ββββββββββββββββββββ β
|
| 316 |
+
β β Type: "class" β β Type: "function" β β
|
| 317 |
+
β β Name: "UserAPI" β β Name: "main" β β
|
| 318 |
+
β ββββββββββ¬ββββββββββ ββββββββββββββββββββ β
|
| 319 |
+
β β has_method β
|
| 320 |
+
β βΌ β
|
| 321 |
+
β ββββββββββββββββββββ β
|
| 322 |
+
β β Type: "method" ββββcallsββββΆ UserService.get_user β
|
| 323 |
+
β β Name: "get" β β
|
| 324 |
+
β ββββββββββββββββββββ β
|
| 325 |
+
β β
|
| 326 |
+
β Edges: β
|
| 327 |
+
β β’ defines: file -> class/function β
|
| 328 |
+
β β’ has_method: class -> method β
|
| 329 |
+
β β’ calls: function -> function β
|
| 330 |
+
β β’ imports: file -> module β
|
| 331 |
+
β β’ inherits_from: class -> class β
|
| 332 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 333 |
+
```
|
| 334 |
+
|
| 335 |
+
### Call Graph Resolution
|
| 336 |
+
|
| 337 |
+
```python
|
| 338 |
+
def resolve_call_graph(self):
|
| 339 |
+
"""
|
| 340 |
+
After parsing all files, resolve function calls to their definitions.
|
| 341 |
+
|
| 342 |
+
Example:
|
| 343 |
+
- File A has: service.get_user(id)
|
| 344 |
+
- File B has: def get_user(self, id): ...
|
| 345 |
+
|
| 346 |
+
Resolution:
|
| 347 |
+
- Finds that "get_user" is defined in File B
|
| 348 |
+
- Creates edge: A::caller_func --calls--> B::UserService.get_user
|
| 349 |
+
"""
|
| 350 |
+
for caller_id, callee_name, line in self.unresolved_calls:
|
| 351 |
+
# Try direct match
|
| 352 |
+
if callee_name in self.definitions:
|
| 353 |
+
for target_id in self.definitions[callee_name]:
|
| 354 |
+
self.graph.add_edge(caller_id, target_id, relation="calls")
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
---
|
| 358 |
+
|
| 359 |
+
## Code Chunking Strategy
|
| 360 |
+
|
| 361 |
+
The chunking system in `code_chatbot/ingestion/chunker.py` uses **structural chunking** based on AST boundaries.
|
| 362 |
+
|
| 363 |
+
### Chunking Philosophy
|
| 364 |
+
|
| 365 |
+
```
|
| 366 |
+
Traditional Text Chunking:
|
| 367 |
+
βββββββββββββββββββββββββββββββββββββββββββ
|
| 368 |
+
β def process_data(): β CHUNK 1 β
|
| 369 |
+
β data = load() β β
|
| 370 |
+
β # Some processing β β
|
| 371 |
+
β ββββββββββββββββββββββββββββΌβββββββββββββ
|
| 372 |
+
β result = transform() β CHUNK 2 β β Breaks mid-function!
|
| 373 |
+
β return result β β
|
| 374 |
+
βββββββββββββββββββββββββββββββββββββββββββ
|
| 375 |
+
|
| 376 |
+
Structural Chunking (This Project):
|
| 377 |
+
βββββββββββββββββββββββββββββββββββββββββββ
|
| 378 |
+
β def process_data(): β β
|
| 379 |
+
β data = load() β CHUNK 1 β β Complete function
|
| 380 |
+
β result = transform() β β
|
| 381 |
+
β return result β β
|
| 382 |
+
βββββββββββββββββββββββββββββββββββββββββββ€
|
| 383 |
+
β def another_function(): β β
|
| 384 |
+
β ... β CHUNK 2 β β Complete function
|
| 385 |
+
βββββββββββββββββββββββββββββββββββββββββββ
|
| 386 |
+
```
|
| 387 |
+
|
| 388 |
+
### StructuralChunker Implementation
|
| 389 |
+
|
| 390 |
+
```python
|
| 391 |
+
class StructuralChunker:
|
| 392 |
+
"""Uses tree-sitter to chunk code at semantic boundaries"""
|
| 393 |
+
|
| 394 |
+
def __init__(self, max_tokens: int = 800):
|
| 395 |
+
self.max_tokens = max_tokens
|
| 396 |
+
self._init_parsers() # Python, JS, TS parsers
|
| 397 |
+
|
| 398 |
+
def _chunk_node(self, node, file_content, file_metadata):
|
| 399 |
+
"""
|
| 400 |
+
Recursive chunking algorithm:
|
| 401 |
+
|
| 402 |
+
1. If node fits in max_tokens → return as single chunk
|
| 403 |
+
2. If node is too large → recurse into children
|
| 404 |
+
3. Merge neighboring small chunks
|
| 405 |
+
"""
|
| 406 |
+
chunk = FileChunk(file_content, file_metadata,
|
| 407 |
+
node.start_byte, node.end_byte)
|
| 408 |
+
|
| 409 |
+
# Fits? Return it
|
| 410 |
+
if chunk.num_tokens <= self.max_tokens:
|
| 411 |
+
return [chunk]
|
| 412 |
+
|
| 413 |
+
# Too large? Recurse
|
| 414 |
+
child_chunks = []
|
| 415 |
+
for child in node.children:
|
| 416 |
+
child_chunks.extend(self._chunk_node(child, ...))
|
| 417 |
+
|
| 418 |
+
# Merge small neighbors
|
| 419 |
+
return self._merge_small_chunks(child_chunks)
|
| 420 |
+
```
|
| 421 |
+
|
| 422 |
+
### Chunk Metadata (Rich Context)
|
| 423 |
+
|
| 424 |
+
Each chunk carries rich metadata:
|
| 425 |
+
|
| 426 |
+
```python
|
| 427 |
+
@dataclass
|
| 428 |
+
class FileChunk:
|
| 429 |
+
file_content: str
|
| 430 |
+
file_metadata: Dict
|
| 431 |
+
start_byte: int
|
| 432 |
+
end_byte: int
|
| 433 |
+
|
| 434 |
+
# Enhanced metadata
|
| 435 |
+
symbols_defined: List[str] # ["UserService", "UserService.get_user"]
|
| 436 |
+
imports_used: List[str] # ["from typing import List"]
|
| 437 |
+
complexity_score: int # Cyclomatic complexity
|
| 438 |
+
parent_context: str # "UserService" (parent class)
|
| 439 |
+
```
|
| 440 |
+
|
| 441 |
+
This metadata is stored in the vector DB and used for filtering/ranking.
|
| 442 |
+
|
| 443 |
+
---
|
| 444 |
+
|
| 445 |
+
## Retrieval System
|
| 446 |
+
|
| 447 |
+
### Multi-Stage Retrieval Pipeline
|
| 448 |
+
|
| 449 |
+
```
|
| 450 |
+
Query: "How does user authentication work?"
|
| 451 |
+
β
|
| 452 |
+
βΌ
|
| 453 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 454 |
+
β STAGE 1: Initial Retrieval (k=10) β
|
| 455 |
+
β β
|
| 456 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 457 |
+
β β Vector Store (Chroma) β β
|
| 458 |
+
β β β β
|
| 459 |
+
β β Query Embedding ββsimilarityβββΆ Document Embeddings β β
|
| 460 |
+
β β β β
|
| 461 |
+
β β Returns: 10 candidate documents β β
|
| 462 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 463 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 464 |
+
β
|
| 465 |
+
βΌ
|
| 466 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 467 |
+
β STAGE 2: LLM-Based File Selection β
|
| 468 |
+
β β
|
| 469 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 470 |
+
β β LLMRetriever β β
|
| 471 |
+
β β β β
|
| 472 |
+
β β File Tree: β β
|
| 473 |
+
β β βββ src/ β β
|
| 474 |
+
β β β βββ auth/ β β
|
| 475 |
+
β β β β βββ login.py βββ LLM selects this β β
|
| 476 |
+
β β β β βββ middleware.py βββ And this β β
|
| 477 |
+
β β β βββ api/ β β
|
| 478 |
+
β β βββ tests/ β β
|
| 479 |
+
β β β β
|
| 480 |
+
β β LLM Prompt: "Select top 5 relevant files for: ..." β β
|
| 481 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 482 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 483 |
+
β
|
| 484 |
+
βΌ
|
| 485 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 486 |
+
β STAGE 3: Ensemble Combination β
|
| 487 |
+
β β
|
| 488 |
+
β Vector Results (weight: 0.6) + LLM Results (weight: 0.4) β
|
| 489 |
+
β β
|
| 490 |
+
β Combined: 12-15 unique documents β
|
| 491 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 492 |
+
β
|
| 493 |
+
βΌ
|
| 494 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 495 |
+
β STAGE 4: Graph Enhancement β
|
| 496 |
+
β β
|
| 497 |
+
β For each retrieved document: β
|
| 498 |
+
β 1. Find its node in AST graph β
|
| 499 |
+
β 2. Get neighboring nodes (related files) β
|
| 500 |
+
β 3. Add related files to context β
|
| 501 |
+
β β
|
| 502 |
+
β Example: login.py found β adds auth_utils.py (imports it) β
|
| 503 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 504 |
+
β
|
| 505 |
+
βΌ
|
| 506 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 507 |
+
β STAGE 5: Reranking β
|
| 508 |
+
β β
|
| 509 |
+
│ ┌─────────────────────────────────────────────────────────┐ │
|
| 510 |
+
β β Cross-Encoder Reranker β β
|
| 511 |
+
β β (ms-marco-MiniLM-L-6-v2) β β
|
| 512 |
+
β β β β
|
| 513 |
+
β β For each (query, document) pair: β β
|
| 514 |
+
β β score = cross_encoder.predict([query, doc.content]) β β
|
| 515 |
+
β β β β
|
| 516 |
+
β β Sort by score, return top 5 β β
|
| 517 |
+
β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β
|
| 518 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 519 |
+
β
|
| 520 |
+
βΌ
|
| 521 |
+
Final: Top 5 Documents
|
| 522 |
+
```
|
| 523 |
+
|
| 524 |
+
### Reranker (Cross-Encoder)
|
| 525 |
+
|
| 526 |
+
```python
|
| 527 |
+
class Reranker:
|
| 528 |
+
"""
|
| 529 |
+
Uses a Cross-Encoder for precise relevance scoring.
|
| 530 |
+
|
| 531 |
+
Unlike bi-encoders (used for initial retrieval), cross-encoders
|
| 532 |
+
process query AND document together, giving more accurate scores.
|
| 533 |
+
"""
|
| 534 |
+
|
| 535 |
+
def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
|
| 536 |
+
self.model = CrossEncoder(model_name)
|
| 537 |
+
|
| 538 |
+
def rerank(self, query: str, documents: List[Document], top_k=5):
|
| 539 |
+
# Score each document against the query
|
| 540 |
+
pairs = [[query, doc.page_content] for doc in documents]
|
| 541 |
+
scores = self.model.predict(pairs)
|
| 542 |
+
|
| 543 |
+
# Sort by score
|
| 544 |
+
scored = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
|
| 545 |
+
return [doc for doc, _ in scored[:top_k]]
|
| 546 |
+
```
|
| 547 |
+
|
| 548 |
+
---
|
| 549 |
+
|
| 550 |
+
## Agentic Workflow
|
| 551 |
+
|
| 552 |
+
The agentic workflow uses **LangGraph** to enable multi-step reasoning with tool use.
|
| 553 |
+
|
| 554 |
+
### Agent Graph Structure
|
| 555 |
+
|
| 556 |
+
```
|
| 557 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 558 |
+
β LANGGRAPH AGENT β
|
| 559 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€
|
| 560 |
+
β β
|
| 561 |
+
β βββββββββββββββ β
|
| 562 |
+
β βββββββββββ START βββββββββββ β
|
| 563 |
+
β β βββββββββββββββ β β
|
| 564 |
+
β βΌ β β
|
| 565 |
+
β βββββββββββββββββββββββββββββββββββββββ β β
|
| 566 |
+
β β AGENT NODE β β β
|
| 567 |
+
β β β β β
|
| 568 |
+
β β 1. Process messages β β β
|
| 569 |
+
β β 2. Call LLM with tools bound β β β
|
| 570 |
+
β β 3. LLM decides: β β β
|
| 571 |
+
β β - Call a tool? β go to TOOLS β β β
|
| 572 |
+
β β - Final answer? β go to END β β β
|
| 573 |
+
β ββββββββββββββββ¬βββββββββββββββββββββββ β β
|
| 574 |
+
β β β β
|
| 575 |
+
β has_tool_call? β β
|
| 576 |
+
β β β β β
|
| 577 |
+
β Yes β β No β β
|
| 578 |
+
β β β β β
|
| 579 |
+
β βΌ ββββββββββββββββββββββββββββΆβ€ β
|
| 580 |
+
β βββββββββββββββββββββββββββββββββββββββ β β
|
| 581 |
+
β β TOOLS NODE β β β
|
| 582 |
+
β β β β β
|
| 583 |
+
β β Execute tool calls: β β β
|
| 584 |
+
β β β’ search_codebase(query) β β β
|
| 585 |
+
β β β’ read_file(path) β β β
|
| 586 |
+
β β β’ list_files(dir) β β β
|
| 587 |
+
β β β’ find_callers(func) β β β
|
| 588 |
+
β β β’ find_callees(func) β β β
|
| 589 |
+
β β β’ find_call_chain(a, b) β β β
|
| 590 |
+
β β β β β
|
| 591 |
+
β β Add tool results to messages β β β
|
| 592 |
+
β ββββββββββββββββ¬βββββββββββββββββββββββ β β
|
| 593 |
+
β β β β
|
| 594 |
+
β βββββββββββββββββββββββββββ β
|
| 595 |
+
β β
|
| 596 |
+
β βΌ β
|
| 597 |
+
β βββββββββββββββ β
|
| 598 |
+
β β END β β
|
| 599 |
+
β βββββββββββββββ β
|
| 600 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 601 |
+
```
|
| 602 |
+
|
| 603 |
+
### Available Tools
|
| 604 |
+
|
| 605 |
+
```python
|
| 606 |
+
# 1. search_codebase - Semantic search in vector store
|
| 607 |
+
@tool("search_codebase")
|
| 608 |
+
def search_codebase(query: str):
|
| 609 |
+
"""Search the codebase for relevant code snippets."""
|
| 610 |
+
docs = retriever.invoke(query)
|
| 611 |
+
return format_results(docs[:5])
|
| 612 |
+
|
| 613 |
+
# 2. read_file - Read complete file content
|
| 614 |
+
@tool("read_file")
|
| 615 |
+
def read_file(file_path: str):
|
| 616 |
+
"""Read the content of a specific file."""
|
| 617 |
+
with open(full_path, "r") as f:
|
| 618 |
+
return f.read()
|
| 619 |
+
|
| 620 |
+
# 3. list_files - Directory listing
|
| 621 |
+
@tool("list_files")
|
| 622 |
+
def list_files(path: str = "."):
|
| 623 |
+
"""List files in a directory."""
|
| 624 |
+
return "\n".join(os.listdir(target_path))
|
| 625 |
+
|
| 626 |
+
# 4. find_callers - Call graph: who calls this?
|
| 627 |
+
@tool("find_callers")
|
| 628 |
+
def find_callers(function_name: str):
|
| 629 |
+
"""Find all functions that call this function."""
|
| 630 |
+
return analyzer.get_callers(function_name)
|
| 631 |
+
|
| 632 |
+
# 5. find_callees - Call graph: what does this call?
|
| 633 |
+
@tool("find_callees")
|
| 634 |
+
def find_callees(function_name: str):
|
| 635 |
+
"""Find all functions called by this function."""
|
| 636 |
+
return analyzer.get_callees(function_name)
|
| 637 |
+
|
| 638 |
+
# 6. find_call_chain - Call graph: trace execution path
|
| 639 |
+
@tool("find_call_chain")
|
| 640 |
+
def find_call_chain(start: str, end: str):
|
| 641 |
+
"""Find the call path from one function to another."""
|
| 642 |
+
return analyzer.get_call_chain(start, end)
|
| 643 |
+
```
|
| 644 |
+
|
| 645 |
+
### Agent Execution Example
|
| 646 |
+
|
| 647 |
+
```
|
| 648 |
+
User: "How does the login function handle invalid passwords?"
|
| 649 |
+
|
| 650 |
+
Agent Thought: I need to find the login function first.
|
| 651 |
+
Agent Action: search_codebase("login function authentication")
|
| 652 |
+
Tool Result: [login.py snippet showing authenticate() function]
|
| 653 |
+
|
| 654 |
+
Agent Thought: I found authenticate(). Let me see the full implementation.
|
| 655 |
+
Agent Action: read_file("src/auth/login.py")
|
| 656 |
+
Tool Result: [Full file content]
|
| 657 |
+
|
| 658 |
+
Agent Thought: I see it calls verify_password(). Let me check that.
|
| 659 |
+
Agent Action: find_callees("authenticate")
|
| 660 |
+
Tool Result: verify_password, get_user, create_session
|
| 661 |
+
|
| 662 |
+
Agent Action: search_codebase("verify_password invalid password")
|
| 663 |
+
Tool Result: [password_utils.py with error handling]
|
| 664 |
+
|
| 665 |
+
Agent Final Answer: The login function handles invalid passwords by...
|
| 666 |
+
```
|
| 667 |
+
|
| 668 |
+
---
|
| 669 |
+
|
| 670 |
+
## Frontend & API
|
| 671 |
+
|
| 672 |
+
### Streamlit App Structure
|
| 673 |
+
|
| 674 |
+
```
|
| 675 |
+
app.py (Main Entry)
|
| 676 |
+
β
|
| 677 |
+
βββ Ingestion Screen
|
| 678 |
+
β βββ Source Type Selection (ZIP/GitHub/Web)
|
| 679 |
+
β βββ File Upload / URL Input
|
| 680 |
+
β βββ "Process & Index" Button
|
| 681 |
+
β
|
| 682 |
+
└── Redirects to → pages/1_⚡_Code_Studio.py
|
| 683 |
+
|
| 684 |
+
Code_Studio.py
|
| 685 |
+
β
|
| 686 |
+
βββ Left Panel (Tabs)
|
| 687 |
+
β βββ π Explorer - File tree navigation
|
| 688 |
+
β βββ π Search - Regex pattern search
|
| 689 |
+
│   ├── 💬 Chat - RAG conversation
|
| 690 |
+
│   └── ✨ Generate - Spec generation
|
| 691 |
+
β
|
| 692 |
+
βββ Right Panel
|
| 693 |
+
βββ Code Viewer - Syntax highlighted file view
|
| 694 |
+
```
|
| 695 |
+
|
| 696 |
+
### FastAPI REST API
|
| 697 |
+
|
| 698 |
+
```
|
| 699 |
+
/api
|
| 700 |
+
βββ /health GET - Health check
|
| 701 |
+
β
|
| 702 |
+
βββ /index POST - Index a codebase
|
| 703 |
+
β Body: {
|
| 704 |
+
β source: "https://github.com/...",
|
| 705 |
+
β provider: "gemini",
|
| 706 |
+
β use_agent: true
|
| 707 |
+
β }
|
| 708 |
+
β
|
| 709 |
+
βββ /chat POST - Ask questions
|
| 710 |
+
Body: {
|
| 711 |
+
question: "How does auth work?",
|
| 712 |
+
provider: "gemini",
|
| 713 |
+
use_agent: true
|
| 714 |
+
}
|
| 715 |
+
Response: {
|
| 716 |
+
answer: "...",
|
| 717 |
+
sources: [...],
|
| 718 |
+
mode: "agent",
|
| 719 |
+
processing_time: 2.5
|
| 720 |
+
}
|
| 721 |
+
```
|
| 722 |
+
|
| 723 |
+
---
|
| 724 |
+
|
| 725 |
+
## Component Deep Dives
|
| 726 |
+
|
| 727 |
+
### Merkle Tree (Incremental Indexing)
|
| 728 |
+
|
| 729 |
+
```python
|
| 730 |
+
class MerkleTree:
|
| 731 |
+
"""
|
| 732 |
+
Enables incremental indexing by detecting file changes.
|
| 733 |
+
|
| 734 |
+
How it works:
|
| 735 |
+
1. Build a hash tree mirroring directory structure
|
| 736 |
+
2. Each file node has SHA-256 hash of content
|
| 737 |
+
3. Each directory node has hash of children hashes
|
| 738 |
+
4. Compare old vs new tree to find changes
|
| 739 |
+
"""
|
| 740 |
+
|
| 741 |
+
def compare_trees(self, old, new) -> ChangeSet:
|
| 742 |
+
# Returns: added, modified, deleted, unchanged files
|
| 743 |
+
```
|
| 744 |
+
|
| 745 |
+
**Example:**
|
| 746 |
+
|
| 747 |
+
```
|
| 748 |
+
First Index:
|
| 749 |
+
project/
|
| 750 |
+
├── main.py (hash: abc123)
|
| 751 |
+
└── utils.py (hash: def456)
|
| 752 |
+
|
| 753 |
+
Root hash: sha256(abc123 + def456) = xyz789
|
| 754 |
+
|
| 755 |
+
Second Index (utils.py changed):
|
| 756 |
+
project/
|
| 757 |
+
├── main.py (hash: abc123) ← unchanged
|
| 758 |
+
└── utils.py (hash: ghi012) ← NEW HASH!
|
| 759 |
+
|
| 760 |
+
Root hash changed! → Only re-index utils.py
|
| 761 |
+
```
|
| 762 |
+
|
| 763 |
+
### Path Obfuscation (Privacy)
|
| 764 |
+
|
| 765 |
+
```python
|
| 766 |
+
class PathObfuscator:
|
| 767 |
+
"""
|
| 768 |
+
Obfuscates file paths for sensitive codebases.
|
| 769 |
+
|
| 770 |
+
Original: /home/user/secret-project/src/auth/login.py
|
| 771 |
+
Obfuscated: /f8a3b2c1/d4e5f6a7/89012345.py
|
| 772 |
+
|
| 773 |
+
Mapping stored securely, reversible only with key.
|
| 774 |
+
"""
|
| 775 |
+
```
|
| 776 |
+
|
| 777 |
+
### Rate Limiter (API Management)
|
| 778 |
+
|
| 779 |
+
```python
|
| 780 |
+
class AdaptiveRateLimiter:
|
| 781 |
+
"""
|
| 782 |
+
Handles rate limits for free-tier APIs.
|
| 783 |
+
|
| 784 |
+
Gemini Free Tier: 15 RPM, 32K TPM, 1500 RPD
|
| 785 |
+
|
| 786 |
+
Strategies:
|
| 787 |
+
1. Track usage in rolling window
|
| 788 |
+
2. Adaptive delay based on remaining quota
|
| 789 |
+
3. Exponential backoff on 429 errors
|
| 790 |
+
4. Model fallback chain (flash → pro → legacy)
|
| 791 |
+
"""
|
| 792 |
+
```
|
| 793 |
+
|
| 794 |
+
---
|
| 795 |
+
|
| 796 |
+
## Configuration System
|
| 797 |
+
|
| 798 |
+
```python
|
| 799 |
+
@dataclass
|
| 800 |
+
class RAGConfig:
|
| 801 |
+
"""Central configuration for entire pipeline"""
|
| 802 |
+
|
| 803 |
+
# Chunking
|
| 804 |
+
chunking: ChunkingConfig
|
| 805 |
+
max_chunk_tokens: int = 800
|
| 806 |
+
min_chunk_tokens: int = 100
|
| 807 |
+
preserve_imports: bool = True
|
| 808 |
+
calculate_complexity: bool = True
|
| 809 |
+
|
| 810 |
+
# Privacy
|
| 811 |
+
privacy: PrivacyConfig
|
| 812 |
+
enable_path_obfuscation: bool = False
|
| 813 |
+
|
| 814 |
+
# Indexing
|
| 815 |
+
indexing: IndexingConfig
|
| 816 |
+
enable_incremental_indexing: bool = True
|
| 817 |
+
batch_size: int = 100
|
| 818 |
+
ignore_patterns: List[str] = [...]
|
| 819 |
+
|
| 820 |
+
# Retrieval
|
| 821 |
+
retrieval: RetrievalConfig
|
| 822 |
+
enable_reranking: bool = True
|
| 823 |
+
retrieval_k: int = 10
|
| 824 |
+
rerank_top_k: int = 5
|
| 825 |
+
similarity_threshold: float = 0.5
|
| 826 |
+
```
|
| 827 |
+
|
| 828 |
+
---
|
| 829 |
+
|
| 830 |
+
## File Dependency Map
|
| 831 |
+
|
| 832 |
+
```
|
| 833 |
+
app.py
|
| 834 |
+
├── code_chatbot/ingestion/universal_ingestor.py
|
| 835 |
+
├── code_chatbot/ingestion/indexer.py
|
| 836 |
+
│   ├── code_chatbot/ingestion/chunker.py (StructuralChunker)
|
| 837 |
+
│   ├── code_chatbot/ingestion/merkle_tree.py (MerkleTree)
|
| 838 |
+
│   ├── code_chatbot/core/config.py (RAGConfig)
|
| 839 |
+
│   └── code_chatbot/core/db_connection.py (Chroma client)
|
| 840 |
+
├── code_chatbot/retrieval/rag.py (ChatEngine)
|
| 841 |
+
│   ├── code_chatbot/retrieval/retriever_wrapper.py
|
| 842 |
+
│   │   └── code_chatbot/retrieval/reranker.py (Reranker)
|
| 843 |
+
│   ├── code_chatbot/retrieval/llm_retriever.py (LLMRetriever)
|
| 844 |
+
│   ├── code_chatbot/agents/agent_workflow.py
|
| 845 |
+
│   │   └── code_chatbot/agents/tools.py
|
| 846 |
+
│   └── code_chatbot/core/prompts.py
|
| 847 |
+
├── code_chatbot/analysis/ast_analysis.py (EnhancedCodeAnalyzer)
|
| 848 |
+
└── code_chatbot/retrieval/graph_rag.py (GraphEnhancedRetriever)
|
| 849 |
+
|
| 850 |
+
pages/1_⚡_Code_Studio.py
|
| 851 |
+
├── components/file_explorer.py
|
| 852 |
+
├── components/code_viewer.py
|
| 853 |
+
├── components/panels.py
|
| 854 |
+
└── components/style.py
|
| 855 |
+
|
| 856 |
+
api/main.py
|
| 857 |
+
├── api/routes/chat.py
|
| 858 |
+
├── api/routes/index.py
|
| 859 |
+
├── api/routes/health.py
|
| 860 |
+
├── api/schemas.py
|
| 861 |
+
└── api/state.py
|
| 862 |
+
```
|
| 863 |
+
|
| 864 |
+
---
|
| 865 |
+
|
| 866 |
+
## Summary
|
| 867 |
+
|
| 868 |
+
This project implements a sophisticated code understanding system with:
|
| 869 |
+
|
| 870 |
+
1. **Multi-Source Ingestion**: ZIP, GitHub, Local, Web
|
| 871 |
+
2. **Structural Chunking**: AST-aware code splitting
|
| 872 |
+
3. **Hybrid Retrieval**: Vector + LLM + Graph-enhanced
|
| 873 |
+
4. **Cross-Encoder Reranking**: Precision at the top
|
| 874 |
+
5. **Agentic Workflow**: Multi-step reasoning with tools
|
| 875 |
+
6. **Call Graph Analysis**: Function relationship tracking
|
| 876 |
+
7. **Incremental Indexing**: Merkle tree change detection
|
| 877 |
+
8. **Multi-LLM Support**: Gemini, Groq with fallbacks
|
| 878 |
+
|
| 879 |
+
The architecture is designed for scalability, accuracy, and developer experience.
|
CODE_OF_CONDUCT.md
DELETED
|
@@ -1,128 +0,0 @@
|
|
| 1 |
-
# Contributor Covenant Code of Conduct
|
| 2 |
-
|
| 3 |
-
## Our Pledge
|
| 4 |
-
|
| 5 |
-
We as members, contributors, and leaders pledge to make participation in our
|
| 6 |
-
community a harassment-free experience for everyone, regardless of age, body
|
| 7 |
-
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
| 8 |
-
identity and expression, level of experience, education, socio-economic status,
|
| 9 |
-
nationality, personal appearance, race, religion, or sexual identity
|
| 10 |
-
and orientation.
|
| 11 |
-
|
| 12 |
-
We pledge to act and interact in ways that contribute to an open, welcoming,
|
| 13 |
-
diverse, inclusive, and healthy community.
|
| 14 |
-
|
| 15 |
-
## Our Standards
|
| 16 |
-
|
| 17 |
-
Examples of behavior that contributes to a positive environment for our
|
| 18 |
-
community include:
|
| 19 |
-
|
| 20 |
-
* Demonstrating empathy and kindness toward other people
|
| 21 |
-
* Being respectful of differing opinions, viewpoints, and experiences
|
| 22 |
-
* Giving and gracefully accepting constructive feedback
|
| 23 |
-
* Accepting responsibility and apologizing to those affected by our mistakes,
|
| 24 |
-
and learning from the experience
|
| 25 |
-
* Focusing on what is best not just for us as individuals, but for the
|
| 26 |
-
overall community
|
| 27 |
-
|
| 28 |
-
Examples of unacceptable behavior include:
|
| 29 |
-
|
| 30 |
-
* The use of sexualized language or imagery, and sexual attention or
|
| 31 |
-
advances of any kind
|
| 32 |
-
* Trolling, insulting or derogatory comments, and personal or political attacks
|
| 33 |
-
* Public or private harassment
|
| 34 |
-
* Publishing others' private information, such as a physical or email
|
| 35 |
-
address, without their explicit permission
|
| 36 |
-
* Other conduct which could reasonably be considered inappropriate in a
|
| 37 |
-
professional setting
|
| 38 |
-
|
| 39 |
-
## Enforcement Responsibilities
|
| 40 |
-
|
| 41 |
-
Community leaders are responsible for clarifying and enforcing our standards of
|
| 42 |
-
acceptable behavior and will take appropriate and fair corrective action in
|
| 43 |
-
response to any behavior that they deem inappropriate, threatening, offensive,
|
| 44 |
-
or harmful.
|
| 45 |
-
|
| 46 |
-
Community leaders have the right and responsibility to remove, edit, or reject
|
| 47 |
-
comments, commits, code, wiki edits, issues, and other contributions that are
|
| 48 |
-
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
| 49 |
-
decisions when appropriate.
|
| 50 |
-
|
| 51 |
-
## Scope
|
| 52 |
-
|
| 53 |
-
This Code of Conduct applies within all community spaces, and also applies when
|
| 54 |
-
an individual is officially representing the community in public spaces.
|
| 55 |
-
Examples of representing our community include using an official e-mail address,
|
| 56 |
-
posting via an official social media account, or acting as an appointed
|
| 57 |
-
representative at an online or offline event.
|
| 58 |
-
|
| 59 |
-
## Enforcement
|
| 60 |
-
|
| 61 |
-
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
| 62 |
-
reported to the community leaders responsible for enforcement at
|
| 63 |
-
reported to the community leaders responsible for enforcement.
|
| 64 |
-
All complaints will be reviewed and investigated promptly and fairly.
|
| 65 |
-
|
| 66 |
-
All community leaders are obligated to respect the privacy and security of the
|
| 67 |
-
reporter of any incident.
|
| 68 |
-
|
| 69 |
-
## Enforcement Guidelines
|
| 70 |
-
|
| 71 |
-
Community leaders will follow these Community Impact Guidelines in determining
|
| 72 |
-
the consequences for any action they deem in violation of this Code of Conduct:
|
| 73 |
-
|
| 74 |
-
### 1. Correction
|
| 75 |
-
|
| 76 |
-
**Community Impact**: Use of inappropriate language or other behavior deemed
|
| 77 |
-
unprofessional or unwelcome in the community.
|
| 78 |
-
|
| 79 |
-
**Consequence**: A private, written warning from community leaders, providing
|
| 80 |
-
clarity around the nature of the violation and an explanation of why the
|
| 81 |
-
behavior was inappropriate. A public apology may be requested.
|
| 82 |
-
|
| 83 |
-
### 2. Warning
|
| 84 |
-
|
| 85 |
-
**Community Impact**: A violation through a single incident or series
|
| 86 |
-
of actions.
|
| 87 |
-
|
| 88 |
-
**Consequence**: A warning with consequences for continued behavior. No
|
| 89 |
-
interaction with the people involved, including unsolicited interaction with
|
| 90 |
-
those enforcing the Code of Conduct, for a specified period of time. This
|
| 91 |
-
includes avoiding interactions in community spaces as well as external channels
|
| 92 |
-
like social media. Violating these terms may lead to a temporary or
|
| 93 |
-
permanent ban.
|
| 94 |
-
|
| 95 |
-
### 3. Temporary Ban
|
| 96 |
-
|
| 97 |
-
**Community Impact**: A serious violation of community standards, including
|
| 98 |
-
sustained inappropriate behavior.
|
| 99 |
-
|
| 100 |
-
**Consequence**: A temporary ban from any sort of interaction or public
|
| 101 |
-
communication with the community for a specified period of time. No public or
|
| 102 |
-
private interaction with the people involved, including unsolicited interaction
|
| 103 |
-
with those enforcing the Code of Conduct, is allowed during this period.
|
| 104 |
-
Violating these terms may lead to a permanent ban.
|
| 105 |
-
|
| 106 |
-
### 4. Permanent Ban
|
| 107 |
-
|
| 108 |
-
**Community Impact**: Demonstrating a pattern of violation of community
|
| 109 |
-
standards, including sustained inappropriate behavior, harassment of an
|
| 110 |
-
individual, or aggression toward or disparagement of classes of individuals.
|
| 111 |
-
|
| 112 |
-
**Consequence**: A permanent ban from any sort of public interaction within
|
| 113 |
-
the community.
|
| 114 |
-
|
| 115 |
-
## Attribution
|
| 116 |
-
|
| 117 |
-
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
| 118 |
-
version 2.0, available at
|
| 119 |
-
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
|
| 120 |
-
|
| 121 |
-
Community Impact Guidelines were inspired by [Mozilla's code of conduct
|
| 122 |
-
enforcement ladder](https://github.com/mozilla/diversity).
|
| 123 |
-
|
| 124 |
-
[homepage]: https://www.contributor-covenant.org
|
| 125 |
-
|
| 126 |
-
For answers to common questions about this code of conduct, see the FAQ at
|
| 127 |
-
https://www.contributor-covenant.org/faq. Translations are available at
|
| 128 |
-
https://www.contributor-covenant.org/translations.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api/routes/index.py
CHANGED
|
@@ -24,12 +24,12 @@ async def index_codebase(request: IndexRequest):
|
|
| 24 |
|
| 25 |
try:
|
| 26 |
# Import required modules
|
| 27 |
-
from code_chatbot.universal_ingestor import process_source
|
| 28 |
-
from code_chatbot.ast_analysis import ASTGraphBuilder
|
| 29 |
-
from code_chatbot.indexer import Indexer
|
| 30 |
-
from code_chatbot.graph_rag import GraphEnhancedRetriever
|
| 31 |
-
from code_chatbot.rag import ChatEngine
|
| 32 |
-
from code_chatbot.chunker import StructuralChunker
|
| 33 |
from langchain_community.vectorstores import Chroma, FAISS
|
| 34 |
from langchain_community.vectorstores.utils import filter_complex_metadata
|
| 35 |
|
|
|
|
| 24 |
|
| 25 |
try:
|
| 26 |
# Import required modules
|
| 27 |
+
from code_chatbot.ingestion.universal_ingestor import process_source
|
| 28 |
+
from code_chatbot.analysis.ast_analysis import ASTGraphBuilder
|
| 29 |
+
from code_chatbot.ingestion.indexer import Indexer
|
| 30 |
+
from code_chatbot.retrieval.graph_rag import GraphEnhancedRetriever
|
| 31 |
+
from code_chatbot.retrieval.rag import ChatEngine
|
| 32 |
+
from code_chatbot.ingestion.chunker import StructuralChunker
|
| 33 |
from langchain_community.vectorstores import Chroma, FAISS
|
| 34 |
from langchain_community.vectorstores.utils import filter_complex_metadata
|
| 35 |
|
app.py
CHANGED
|
@@ -2,11 +2,11 @@ import streamlit as st
|
|
| 2 |
import os
|
| 3 |
import shutil
|
| 4 |
import time
|
| 5 |
-
from code_chatbot.universal_ingestor import process_source
|
| 6 |
-
from code_chatbot.indexer import Indexer
|
| 7 |
-
from code_chatbot.rag import ChatEngine
|
| 8 |
-
from code_chatbot.ast_analysis import ASTGraphBuilder
|
| 9 |
-
from code_chatbot.graph_rag import GraphEnhancedRetriever
|
| 10 |
import logging
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
|
|
@@ -83,7 +83,7 @@ if not st.session_state.processed_files:
|
|
| 83 |
st.error(f"Please configure {embedding_provider} API Key for embeddings in the sidebar.")
|
| 84 |
else:
|
| 85 |
# Use the new progress-tracked indexer
|
| 86 |
-
from code_chatbot.indexing_progress import index_with_progress
|
| 87 |
|
| 88 |
chat_engine, success, repo_files, workspace_root = index_with_progress(
|
| 89 |
source_input=source_input,
|
|
|
|
| 2 |
import os
|
| 3 |
import shutil
|
| 4 |
import time
|
| 5 |
+
from code_chatbot.ingestion.universal_ingestor import process_source
|
| 6 |
+
from code_chatbot.ingestion.indexer import Indexer
|
| 7 |
+
from code_chatbot.retrieval.rag import ChatEngine
|
| 8 |
+
from code_chatbot.analysis.ast_analysis import ASTGraphBuilder
|
| 9 |
+
from code_chatbot.retrieval.graph_rag import GraphEnhancedRetriever
|
| 10 |
import logging
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
|
|
|
|
| 83 |
st.error(f"Please configure {embedding_provider} API Key for embeddings in the sidebar.")
|
| 84 |
else:
|
| 85 |
# Use the new progress-tracked indexer
|
| 86 |
+
from code_chatbot.ingestion.indexing_progress import index_with_progress
|
| 87 |
|
| 88 |
chat_engine, success, repo_files, workspace_root = index_with_progress(
|
| 89 |
source_input=source_input,
|
architecture_viz.jsx
ADDED
|
@@ -0,0 +1,625 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import React, { useState } from 'react';
|
| 2 |
+
import { ChevronRight, ChevronDown, Database, Code, Brain, Search, FileText, GitBranch, Layers, Workflow, Server, Cpu, ArrowRight, Zap } from 'lucide-react';
|
| 3 |
+
|
| 4 |
+
const ArchitectureViz = () => {
|
| 5 |
+
const [activeTab, setActiveTab] = useState('overview');
|
| 6 |
+
const [expandedSections, setExpandedSections] = useState({});
|
| 7 |
+
|
| 8 |
+
const toggleSection = (section) => {
|
| 9 |
+
setExpandedSections(prev => ({
|
| 10 |
+
...prev,
|
| 11 |
+
[section]: !prev[section]
|
| 12 |
+
}));
|
| 13 |
+
};
|
| 14 |
+
|
| 15 |
+
const tabs = [
|
| 16 |
+
{ id: 'overview', label: 'System Overview', icon: Layers },
|
| 17 |
+
{ id: 'rag', label: 'RAG Pipeline', icon: Search },
|
| 18 |
+
{ id: 'ast', label: 'AST & Graphs', icon: GitBranch },
|
| 19 |
+
{ id: 'chunking', label: 'Code Chunking', icon: Code },
|
| 20 |
+
{ id: 'agent', label: 'Agentic Workflow', icon: Brain },
|
| 21 |
+
{ id: 'retrieval', label: 'Retrieval System', icon: Database },
|
| 22 |
+
];
|
| 23 |
+
|
| 24 |
+
const ComponentCard = ({ title, description, icon: Icon, color, children }) => (
|
| 25 |
+
<div className={`bg-slate-800 rounded-lg p-4 border-l-4 ${color} hover:bg-slate-750 transition-all`}>
|
| 26 |
+
<div className="flex items-center gap-2 mb-2">
|
| 27 |
+
<Icon className="w-5 h-5 text-slate-300" />
|
| 28 |
+
<h3 className="font-semibold text-white">{title}</h3>
|
| 29 |
+
</div>
|
| 30 |
+
<p className="text-slate-400 text-sm mb-2">{description}</p>
|
| 31 |
+
{children}
|
| 32 |
+
</div>
|
| 33 |
+
);
|
| 34 |
+
|
| 35 |
+
const FlowArrow = () => (
|
| 36 |
+
<div className="flex justify-center py-2">
|
| 37 |
+
<ArrowRight className="w-6 h-6 text-slate-500" />
|
| 38 |
+
</div>
|
| 39 |
+
);
|
| 40 |
+
|
| 41 |
+
const renderOverview = () => (
|
| 42 |
+
<div className="space-y-6">
|
| 43 |
+
<div className="bg-gradient-to-r from-purple-900/50 to-blue-900/50 rounded-xl p-6 border border-purple-500/30">
|
| 44 |
+
<h2 className="text-2xl font-bold text-white mb-2 flex items-center gap-2">
|
| 45 |
+
<Zap className="w-6 h-6 text-yellow-400" />
|
| 46 |
+
Code Crawler Architecture
|
| 47 |
+
</h2>
|
| 48 |
+
<p className="text-slate-300">
|
| 49 |
+
An AI-powered codebase assistant combining RAG, AST analysis, Graph databases, and Agentic workflows.
|
| 50 |
+
</p>
|
| 51 |
+
</div>
|
| 52 |
+
|
| 53 |
+
<div className="grid grid-cols-1 md:grid-cols-3 gap-4">
|
| 54 |
+
<ComponentCard
|
| 55 |
+
title="Data Ingestion"
|
| 56 |
+
description="Universal ingestor supporting ZIP, GitHub, Local, Web"
|
| 57 |
+
icon={FileText}
|
| 58 |
+
color="border-green-500"
|
| 59 |
+
>
|
| 60 |
+
<div className="mt-2 space-y-1">
|
| 61 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">ZIPFileManager</div>
|
| 62 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">GitHubRepoManager</div>
|
| 63 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">LocalDirectoryManager</div>
|
| 64 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">WebDocManager</div>
|
| 65 |
+
</div>
|
| 66 |
+
</ComponentCard>
|
| 67 |
+
|
| 68 |
+
<ComponentCard
|
| 69 |
+
title="Processing"
|
| 70 |
+
description="AST parsing, chunking, embeddings, graph building"
|
| 71 |
+
icon={Cpu}
|
| 72 |
+
color="border-blue-500"
|
| 73 |
+
>
|
| 74 |
+
<div className="mt-2 space-y-1">
|
| 75 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">StructuralChunker (tree-sitter)</div>
|
| 76 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">EnhancedCodeAnalyzer</div>
|
| 77 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">Gemini/HuggingFace Embeddings</div>
|
| 78 |
+
</div>
|
| 79 |
+
</ComponentCard>
|
| 80 |
+
|
| 81 |
+
<ComponentCard
|
| 82 |
+
title="Storage"
|
| 83 |
+
description="Vector DB and AST knowledge graph"
|
| 84 |
+
icon={Database}
|
| 85 |
+
color="border-purple-500"
|
| 86 |
+
>
|
| 87 |
+
<div className="mt-2 space-y-1">
|
| 88 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">Chroma / FAISS / Qdrant</div>
|
| 89 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">GraphML (NetworkX)</div>
|
| 90 |
+
<div className="text-xs bg-slate-700 rounded px-2 py-1">Merkle Tree Snapshots</div>
|
| 91 |
+
</div>
|
| 92 |
+
</ComponentCard>
|
| 93 |
+
</div>
|
| 94 |
+
|
| 95 |
+
<div className="bg-slate-800 rounded-lg p-4">
|
| 96 |
+
<h3 className="font-semibold text-white mb-3 flex items-center gap-2">
|
| 97 |
+
<Workflow className="w-5 h-5" />
|
| 98 |
+
Data Flow
|
| 99 |
+
</h3>
|
| 100 |
+
<div className="flex flex-wrap items-center justify-center gap-2 text-sm">
|
| 101 |
+
<span className="bg-green-600/30 text-green-300 px-3 py-1 rounded-full">Input Source</span>
|
| 102 |
+
<ArrowRight className="w-4 h-4 text-slate-500" />
|
| 103 |
+
<span className="bg-blue-600/30 text-blue-300 px-3 py-1 rounded-full">Ingestor</span>
|
| 104 |
+
<ArrowRight className="w-4 h-4 text-slate-500" />
|
| 105 |
+
<span className="bg-purple-600/30 text-purple-300 px-3 py-1 rounded-full">Chunker</span>
|
| 106 |
+
<ArrowRight className="w-4 h-4 text-slate-500" />
|
| 107 |
+
<span className="bg-pink-600/30 text-pink-300 px-3 py-1 rounded-full">Embeddings</span>
|
| 108 |
+
<ArrowRight className="w-4 h-4 text-slate-500" />
|
| 109 |
+
<span className="bg-orange-600/30 text-orange-300 px-3 py-1 rounded-full">Vector DB</span>
|
| 110 |
+
</div>
|
| 111 |
+
</div>
|
| 112 |
+
|
| 113 |
+
<div className="grid grid-cols-1 md:grid-cols-2 gap-4">
|
| 114 |
+
<ComponentCard
|
| 115 |
+
title="Retrieval Layer"
|
| 116 |
+
description="Multi-stage retrieval with reranking"
|
| 117 |
+
icon={Search}
|
| 118 |
+
color="border-yellow-500"
|
| 119 |
+
>
|
| 120 |
+
<div className="mt-2 text-xs space-y-1">
|
| 121 |
+
<div className="flex items-center gap-2">
|
| 122 |
+
<span className="w-2 h-2 bg-yellow-500 rounded-full"></span>
|
| 123 |
+
<span className="text-slate-300">Vector Retriever (60%)</span>
|
| 124 |
+
</div>
|
| 125 |
+
<div className="flex items-center gap-2">
|
| 126 |
+
<span className="w-2 h-2 bg-yellow-500 rounded-full"></span>
|
| 127 |
+
<span className="text-slate-300">LLM Retriever (40%)</span>
|
| 128 |
+
</div>
|
| 129 |
+
<div className="flex items-center gap-2">
|
| 130 |
+
<span className="w-2 h-2 bg-yellow-500 rounded-full"></span>
|
| 131 |
+
<span className="text-slate-300">Graph Enhancement</span>
|
| 132 |
+
</div>
|
| 133 |
+
<div className="flex items-center gap-2">
|
| 134 |
+
<span className="w-2 h-2 bg-yellow-500 rounded-full"></span>
|
| 135 |
+
<span className="text-slate-300">Cross-Encoder Reranker</span>
|
| 136 |
+
</div>
|
| 137 |
+
</div>
|
| 138 |
+
</ComponentCard>
|
| 139 |
+
|
| 140 |
+
<ComponentCard
|
| 141 |
+
title="Chat Engine"
|
| 142 |
+
description="Dual-mode: Linear RAG or Agentic"
|
| 143 |
+
icon={Brain}
|
| 144 |
+
color="border-red-500"
|
| 145 |
+
>
|
| 146 |
+
<div className="mt-2 text-xs space-y-1">
|
| 147 |
+
<div className="flex items-center gap-2">
|
| 148 |
+
<span className="w-2 h-2 bg-red-500 rounded-full"></span>
|
| 149 |
+
<span className="text-slate-300">Linear RAG (simple Q&A)</span>
|
| 150 |
+
</div>
|
| 151 |
+
<div className="flex items-center gap-2">
|
| 152 |
+
<span className="w-2 h-2 bg-red-500 rounded-full"></span>
|
| 153 |
+
<span className="text-slate-300">Agentic Workflow (LangGraph)</span>
|
| 154 |
+
</div>
|
| 155 |
+
<div className="flex items-center gap-2">
|
| 156 |
+
<span className="w-2 h-2 bg-red-500 rounded-full"></span>
|
| 157 |
+
<span className="text-slate-300">Tools: search, read, list, call_graph</span>
|
| 158 |
+
</div>
|
| 159 |
+
</div>
|
| 160 |
+
</ComponentCard>
|
| 161 |
+
</div>
|
| 162 |
+
</div>
|
| 163 |
+
);
|
| 164 |
+
|
| 165 |
+
const renderRAG = () => (
|
| 166 |
+
<div className="space-y-6">
|
| 167 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 168 |
+
<h2 className="text-xl font-bold text-white mb-4">RAG Pipeline Implementation</h2>
|
| 169 |
+
<p className="text-slate-400 mb-4">
|
| 170 |
+
The RAG (Retrieval-Augmented Generation) system combines vector search with LLM-based file selection
|
| 171 |
+
and cross-encoder reranking for high-precision code retrieval.
|
| 172 |
+
</p>
|
| 173 |
+
|
| 174 |
+
<div className="space-y-4">
|
| 175 |
+
<div className="bg-slate-700/50 rounded-lg p-4">
|
| 176 |
+
<h3 className="font-semibold text-green-400 mb-2">1. Query Processing</h3>
|
| 177 |
+
<code className="text-sm text-slate-300 block bg-slate-900 p-3 rounded">
|
| 178 |
+
{`query = "How does authentication work?"
|
| 179 |
+
# Optionally expand with multi-query
|
| 180 |
+
expanded_queries = multi_query_expander(query)`}
|
| 181 |
+
</code>
|
| 182 |
+
</div>
|
| 183 |
+
|
| 184 |
+
<div className="bg-slate-700/50 rounded-lg p-4">
|
| 185 |
+
<h3 className="font-semibold text-blue-400 mb-2">2. Hybrid Retrieval</h3>
|
| 186 |
+
<code className="text-sm text-slate-300 block bg-slate-900 p-3 rounded">
|
| 187 |
+
{`# Vector similarity search (60% weight)
|
| 188 |
+
vector_docs = chroma_db.similarity_search(query, k=10)
|
| 189 |
+
|
| 190 |
+
# LLM-based file selection (40% weight)
|
| 191 |
+
llm_docs = llm_retriever.select_files(query, file_tree)
|
| 192 |
+
|
| 193 |
+
# Combine with EnsembleRetriever
|
| 194 |
+
combined = ensemble([vector_docs, llm_docs], weights=[0.6, 0.4])`}
|
| 195 |
+
</code>
|
| 196 |
+
</div>
|
| 197 |
+
|
| 198 |
+
<div className="bg-slate-700/50 rounded-lg p-4">
|
| 199 |
+
<h3 className="font-semibold text-purple-400 mb-2">3. Graph Enhancement</h3>
|
| 200 |
+
<code className="text-sm text-slate-300 block bg-slate-900 p-3 rounded">
|
| 201 |
+
{`# For each retrieved doc, find related files via AST graph
|
| 202 |
+
for doc in combined:
|
| 203 |
+
neighbors = ast_graph.neighbors(doc.file_path)
|
| 204 |
+
for neighbor in neighbors:
|
| 205 |
+
if relation == "imports" or relation == "calls":
|
| 206 |
+
augmented_docs.append(read_file(neighbor))`}
|
| 207 |
+
</code>
|
| 208 |
+
</div>
|
| 209 |
+
|
| 210 |
+
<div className="bg-slate-700/50 rounded-lg p-4">
|
| 211 |
+
<h3 className="font-semibold text-yellow-400 mb-2">4. Cross-Encoder Reranking</h3>
|
| 212 |
+
<code className="text-sm text-slate-300 block bg-slate-900 p-3 rounded">
|
| 213 |
+
{`# Score each (query, document) pair with cross-encoder
|
| 214 |
+
pairs = [[query, doc.content] for doc in augmented_docs]
|
| 215 |
+
scores = cross_encoder.predict(pairs)
|
| 216 |
+
|
| 217 |
+
# Return top 5 by score
|
| 218 |
+
final_docs = sorted(zip(docs, scores), by=score)[:5]`}
|
| 219 |
+
</code>
|
| 220 |
+
</div>
|
| 221 |
+
|
| 222 |
+
<div className="bg-slate-700/50 rounded-lg p-4">
|
| 223 |
+
<h3 className="font-semibold text-red-400 mb-2">5. Generation</h3>
|
| 224 |
+
<code className="text-sm text-slate-300 block bg-slate-900 p-3 rounded">
|
| 225 |
+
{`# Build context from retrieved docs
|
| 226 |
+
context = format_docs(final_docs)
|
| 227 |
+
|
| 228 |
+
# Generate answer with LLM
|
| 229 |
+
prompt = system_prompt.format(context=context)
|
| 230 |
+
answer = llm.invoke([SystemMessage(prompt), HumanMessage(query)])`}
|
| 231 |
+
</code>
|
| 232 |
+
</div>
|
| 233 |
+
</div>
|
| 234 |
+
</div>
|
| 235 |
+
|
| 236 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 237 |
+
<h3 className="font-semibold text-white mb-3">Key Files</h3>
|
| 238 |
+
<div className="grid grid-cols-2 gap-2 text-sm">
|
| 239 |
+
<div className="bg-slate-700 rounded p-2">
|
| 240 |
+
<span className="text-blue-400">code_chatbot/rag.py</span>
|
| 241 |
+
<p className="text-slate-400 text-xs">ChatEngine class</p>
|
| 242 |
+
</div>
|
| 243 |
+
<div className="bg-slate-700 rounded p-2">
|
| 244 |
+
<span className="text-blue-400">code_chatbot/retriever_wrapper.py</span>
|
| 245 |
+
<p className="text-slate-400 text-xs">RerankingRetriever</p>
|
| 246 |
+
</div>
|
| 247 |
+
<div className="bg-slate-700 rounded p-2">
|
| 248 |
+
<span className="text-blue-400">code_chatbot/llm_retriever.py</span>
|
| 249 |
+
<p className="text-slate-400 text-xs">LLM-based file selection</p>
|
| 250 |
+
</div>
|
| 251 |
+
<div className="bg-slate-700 rounded p-2">
|
| 252 |
+
<span className="text-blue-400">code_chatbot/reranker.py</span>
|
| 253 |
+
<p className="text-slate-400 text-xs">Cross-encoder reranking</p>
|
| 254 |
+
</div>
|
| 255 |
+
</div>
|
| 256 |
+
</div>
|
| 257 |
+
</div>
|
| 258 |
+
);
|
| 259 |
+
|
| 260 |
+
const renderAST = () => (
|
| 261 |
+
<div className="space-y-6">
|
| 262 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 263 |
+
<h2 className="text-xl font-bold text-white mb-4">AST Analysis & Knowledge Graph</h2>
|
| 264 |
+
<p className="text-slate-400 mb-4">
|
| 265 |
+
Uses <span className="text-green-400">tree-sitter</span> to parse code into Abstract Syntax Trees,
|
| 266 |
+
then builds a <span className="text-blue-400">NetworkX</span> directed graph capturing code relationships.
|
| 267 |
+
</p>
|
| 268 |
+
|
| 269 |
+
<div className="grid grid-cols-1 md:grid-cols-2 gap-4 mb-6">
|
| 270 |
+
<div className="bg-slate-700/50 rounded-lg p-4">
|
| 271 |
+
<h3 className="font-semibold text-purple-400 mb-2">Node Types</h3>
|
| 272 |
+
<ul className="text-sm text-slate-300 space-y-1">
|
| 273 |
+
<li className="flex items-center gap-2">
|
| 274 |
+
<span className="w-3 h-3 bg-green-500 rounded"></span> file
|
| 275 |
+
</li>
|
| 276 |
+
<li className="flex items-center gap-2">
|
| 277 |
+
<span className="w-3 h-3 bg-blue-500 rounded"></span> class
|
| 278 |
+
</li>
|
| 279 |
+
<li className="flex items-center gap-2">
|
| 280 |
+
<span className="w-3 h-3 bg-purple-500 rounded"></span> function
|
| 281 |
+
</li>
|
| 282 |
+
<li className="flex items-center gap-2">
|
| 283 |
+
<span className="w-3 h-3 bg-yellow-500 rounded"></span> method
|
| 284 |
+
</li>
|
| 285 |
+
</ul>
|
| 286 |
+
</div>
|
| 287 |
+
|
| 288 |
+
<div className="bg-slate-700/50 rounded-lg p-4">
|
| 289 |
+
<h3 className="font-semibold text-purple-400 mb-2">Edge Types (Relations)</h3>
|
| 290 |
+
<ul className="text-sm text-slate-300 space-y-1">
|
| 291 |
+
<li><span className="text-green-400">defines</span> - file → class/function</li>
|
| 292 |
+
<li><span className="text-blue-400">has_method</span> - class → method</li>
|
| 293 |
+
<li><span className="text-purple-400">calls</span> - function → function</li>
|
| 294 |
+
<li><span className="text-yellow-400">imports</span> - file → module</li>
|
| 295 |
+
<li><span className="text-red-400">inherits_from</span> - class → class</li>
|
| 296 |
+
</ul>
|
| 297 |
+
</div>
|
| 298 |
+
</div>
|
| 299 |
+
|
| 300 |
+
<div className="bg-slate-900 rounded-lg p-4 overflow-x-auto">
|
| 301 |
+
<h3 className="font-semibold text-white mb-2">Example: Parsing Python Code</h3>
|
| 302 |
+
<pre className="text-sm text-slate-300">
|
| 303 |
+
{`# Source Code
|
| 304 |
+
class UserService:
|
| 305 |
+
def get_user(self, user_id):
|
| 306 |
+
return self.db.find(user_id) # calls db.find
|
| 307 |
+
|
| 308 |
+
# Generated Graph
|
| 309 |
+
(file: user_service.py)
|
| 310 |
+
│
|
| 311 |
+
└──defines──▶ (class: UserService)
|
| 312 |
+
│
|
| 313 |
+
└──has_method──▶ (method: get_user)
|
| 314 |
+
│
|
| 315 |
+
└──calls──▶ (function: db.find)`}
|
| 316 |
+
</pre>
|
| 317 |
+
</div>
|
| 318 |
+
</div>
|
| 319 |
+
|
| 320 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 321 |
+
<h3 className="font-semibold text-white mb-3">Call Graph Tools</h3>
|
| 322 |
+
<div className="space-y-3">
|
| 323 |
+
<div className="bg-slate-700 rounded p-3">
|
| 324 |
+
<code className="text-green-400">find_callers("authenticate")</code>
|
| 325 |
+
<p className="text-slate-400 text-sm mt-1">→ Returns all functions that call authenticate()</p>
|
| 326 |
+
</div>
|
| 327 |
+
<div className="bg-slate-700 rounded p-3">
|
| 328 |
+
<code className="text-blue-400">find_callees("process_request")</code>
|
| 329 |
+
<p className="text-slate-400 text-sm mt-1">→ Returns all functions called by process_request()</p>
|
| 330 |
+
</div>
|
| 331 |
+
<div className="bg-slate-700 rounded p-3">
|
| 332 |
+
<code className="text-purple-400">find_call_chain("main", "save_to_db")</code>
|
| 333 |
+
<p className="text-slate-400 text-sm mt-1">→ Returns execution paths from main() to save_to_db()</p>
|
| 334 |
+
</div>
|
| 335 |
+
</div>
|
| 336 |
+
</div>
|
| 337 |
+
</div>
|
| 338 |
+
);
|
| 339 |
+
|
| 340 |
+
const renderChunking = () => (
|
| 341 |
+
<div className="space-y-6">
|
| 342 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 343 |
+
<h2 className="text-xl font-bold text-white mb-4">Structural Code Chunking</h2>
|
| 344 |
+
<p className="text-slate-400 mb-4">
|
| 345 |
+
Unlike naive text splitting, this system uses <span className="text-green-400">tree-sitter</span> to
|
| 346 |
+
chunk code at semantic boundaries (functions, classes) while respecting token limits.
|
| 347 |
+
</p>
|
| 348 |
+
|
| 349 |
+
<div className="grid grid-cols-1 md:grid-cols-2 gap-4 mb-6">
|
| 350 |
+
<div className="bg-red-900/30 border border-red-500/30 rounded-lg p-4">
|
| 351 |
+
<h3 className="font-semibold text-red-400 mb-2">❌ Naive Text Chunking</h3>
|
| 352 |
+
<pre className="text-xs text-slate-300 bg-slate-900 p-2 rounded">
|
| 353 |
+
{`def process_data():
|
| 354 |
+
data = load()
|
| 355 |
+
# ──────────────── CHUNK BREAK ────
|
| 356 |
+
result = transform(data)
|
| 357 |
+
return result # Broken mid-function!`}
|
| 358 |
+
</pre>
|
| 359 |
+
</div>
|
| 360 |
+
|
| 361 |
+
<div className="bg-green-900/30 border border-green-500/30 rounded-lg p-4">
|
| 362 |
+
<h3 className="font-semibold text-green-400 mb-2">✅ Structural Chunking</h3>
|
| 363 |
+
<pre className="text-xs text-slate-300 bg-slate-900 p-2 rounded">
|
| 364 |
+
{`# CHUNK 1 - Complete function
|
| 365 |
+
def process_data():
|
| 366 |
+
data = load()
|
| 367 |
+
result = transform(data)
|
| 368 |
+
return result
|
| 369 |
+
|
| 370 |
+
# CHUNK 2 - Complete function
|
| 371 |
+
def another_func():
|
| 372 |
+
...`}
|
| 373 |
+
</pre>
|
| 374 |
+
</div>
|
| 375 |
+
</div>
|
| 376 |
+
|
| 377 |
+
<div className="bg-slate-700/50 rounded-lg p-4">
|
| 378 |
+
<h3 className="font-semibold text-blue-400 mb-2">Chunking Algorithm</h3>
|
| 379 |
+
<ol className="text-sm text-slate-300 space-y-2">
|
| 380 |
+
<li className="flex items-start gap-2">
|
| 381 |
+
<span className="bg-blue-500 text-white w-5 h-5 rounded-full flex items-center justify-center text-xs">1</span>
|
| 382 |
+
<span>Parse file into AST using tree-sitter</span>
|
| 383 |
+
</li>
|
| 384 |
+
<li className="flex items-start gap-2">
|
| 385 |
+
<span className="bg-blue-500 text-white w-5 h-5 rounded-full flex items-center justify-center text-xs">2</span>
|
| 386 |
+
<span>Recursively visit nodes (functions, classes, etc.)</span>
|
| 387 |
+
</li>
|
| 388 |
+
<li className="flex items-start gap-2">
|
| 389 |
+
<span className="bg-blue-500 text-white w-5 h-5 rounded-full flex items-center justify-center text-xs">3</span>
|
| 390 |
+
<span>If node fits in max_tokens (800) → return as chunk</span>
|
| 391 |
+
</li>
|
| 392 |
+
<li className="flex items-start gap-2">
|
| 393 |
+
<span className="bg-blue-500 text-white w-5 h-5 rounded-full flex items-center justify-center text-xs">4</span>
|
| 394 |
+
<span>If too large → split into children, recurse</span>
|
| 395 |
+
</li>
|
| 396 |
+
<li className="flex items-start gap-2">
|
| 397 |
+
<span className="bg-blue-500 text-white w-5 h-5 rounded-full flex items-center justify-center text-xs">5</span>
|
| 398 |
+
<span>Merge neighboring small chunks to avoid fragments</span>
|
| 399 |
+
</li>
|
| 400 |
+
</ol>
|
| 401 |
+
</div>
|
| 402 |
+
</div>
|
| 403 |
+
|
| 404 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 405 |
+
<h3 className="font-semibold text-white mb-3">Rich Chunk Metadata</h3>
|
| 406 |
+
<div className="bg-slate-900 rounded-lg p-4">
|
| 407 |
+
<pre className="text-sm text-slate-300">
|
| 408 |
+
{`FileChunk {
|
| 409 |
+
file_path: "src/auth/login.py",
|
| 410 |
+
start_byte: 245,
|
| 411 |
+
end_byte: 892,
|
| 412 |
+
line_range: "L12-L45",
|
| 413 |
+
language: "python",
|
| 414 |
+
chunk_type: "function_definition",
|
| 415 |
+
name: "authenticate",
|
| 416 |
+
|
| 417 |
+
// Enhanced metadata
|
| 418 |
+
symbols_defined: ["authenticate", "verify_token"],
|
| 419 |
+
imports_used: ["from jwt import decode"],
|
| 420 |
+
complexity_score: 7, // Cyclomatic complexity
|
| 421 |
+
parent_context: "AuthService" // Parent class
|
| 422 |
+
}`}
|
| 423 |
+
</pre>
|
| 424 |
+
</div>
|
| 425 |
+
</div>
|
| 426 |
+
</div>
|
| 427 |
+
);
|
| 428 |
+
|
| 429 |
+
const renderAgent = () => (
|
| 430 |
+
<div className="space-y-6">
|
| 431 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 432 |
+
<h2 className="text-xl font-bold text-white mb-4">Agentic Workflow (LangGraph)</h2>
|
| 433 |
+
<p className="text-slate-400 mb-4">
|
| 434 |
+
The agent can perform multi-step reasoning using tools, enabling complex analysis that
|
| 435 |
+
simple RAG cannot handle.
|
| 436 |
+
</p>
|
| 437 |
+
|
| 438 |
+
<div className="bg-slate-900 rounded-lg p-4 mb-6">
|
| 439 |
+
<h3 className="font-semibold text-purple-400 mb-3">Agent State Machine</h3>
|
| 440 |
+
<div className="flex flex-col items-center space-y-2">
|
| 441 |
+
<div className="bg-green-600/30 text-green-300 px-4 py-2 rounded-lg">START</div>
|
| 442 |
+
<ArrowRight className="w-4 h-4 text-slate-500 rotate-90" />
|
| 443 |
+
<div className="bg-blue-600/30 text-blue-300 px-6 py-3 rounded-lg text-center">
|
| 444 |
+
<div className="font-semibold">AGENT NODE</div>
|
| 445 |
+
<div className="text-xs mt-1">Process messages → Call LLM → Decide action</div>
|
| 446 |
+
</div>
|
| 447 |
+
<div className="flex items-center gap-4">
|
| 448 |
+
<div className="flex flex-col items-center">
|
| 449 |
+
<span className="text-xs text-slate-400">tool_call?</span>
|
| 450 |
+
<ArrowRight className="w-4 h-4 text-slate-500 rotate-90" />
|
| 451 |
+
<div className="bg-yellow-600/30 text-yellow-300 px-4 py-2 rounded-lg text-center">
|
| 452 |
+
<div className="font-semibold">TOOLS NODE</div>
|
| 453 |
+
<div className="text-xs">Execute tools</div>
|
| 454 |
+
</div>
|
| 455 |
+
</div>
|
| 456 |
+
<div className="flex flex-col items-center">
|
| 457 |
+
<span className="text-xs text-slate-400">final answer?</span>
|
| 458 |
+
<ArrowRight className="w-4 h-4 text-slate-500 rotate-90" />
|
| 459 |
+
<div className="bg-red-600/30 text-red-300 px-4 py-2 rounded-lg">END</div>
|
| 460 |
+
</div>
|
| 461 |
+
</div>
|
| 462 |
+
</div>
|
| 463 |
+
</div>
|
| 464 |
+
|
| 465 |
+
<div className="grid grid-cols-2 md:grid-cols-3 gap-3">
|
| 466 |
+
<div className="bg-slate-700 rounded-lg p-3">
|
| 467 |
+
<code className="text-green-400 text-sm">search_codebase</code>
|
| 468 |
+
<p className="text-xs text-slate-400 mt-1">Vector search in codebase</p>
|
| 469 |
+
</div>
|
| 470 |
+
<div className="bg-slate-700 rounded-lg p-3">
|
| 471 |
+
<code className="text-blue-400 text-sm">read_file</code>
|
| 472 |
+
<p className="text-xs text-slate-400 mt-1">Read complete file content</p>
|
| 473 |
+
</div>
|
| 474 |
+
<div className="bg-slate-700 rounded-lg p-3">
|
| 475 |
+
<code className="text-purple-400 text-sm">list_files</code>
|
| 476 |
+
<p className="text-xs text-slate-400 mt-1">Directory listing</p>
|
| 477 |
+
</div>
|
| 478 |
+
<div className="bg-slate-700 rounded-lg p-3">
|
| 479 |
+
<code className="text-yellow-400 text-sm">find_callers</code>
|
| 480 |
+
<p className="text-xs text-slate-400 mt-1">Who calls this function?</p>
|
| 481 |
+
</div>
|
| 482 |
+
<div className="bg-slate-700 rounded-lg p-3">
|
| 483 |
+
<code className="text-red-400 text-sm">find_callees</code>
|
| 484 |
+
<p className="text-xs text-slate-400 mt-1">What does this call?</p>
|
| 485 |
+
</div>
|
| 486 |
+
<div className="bg-slate-700 rounded-lg p-3">
|
| 487 |
+
<code className="text-pink-400 text-sm">find_call_chain</code>
|
| 488 |
+
<p className="text-xs text-slate-400 mt-1">Trace execution path</p>
|
| 489 |
+
</div>
|
| 490 |
+
</div>
|
| 491 |
+
</div>
|
| 492 |
+
|
| 493 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 494 |
+
<h3 className="font-semibold text-white mb-3">Example Agent Execution</h3>
|
| 495 |
+
<div className="space-y-3 text-sm">
|
| 496 |
+
<div className="bg-slate-700 rounded p-3">
|
| 497 |
+
<span className="text-blue-400">User:</span>
|
| 498 |
+
<span className="text-slate-300 ml-2">"How does login handle invalid passwords?"</span>
|
| 499 |
+
</div>
|
| 500 |
+
<div className="bg-slate-700 rounded p-3">
|
| 501 |
+
<span className="text-purple-400">Agent Thought:</span>
|
| 502 |
+
<span className="text-slate-300 ml-2">I need to find the login function first.</span>
|
| 503 |
+
<div className="mt-1">
|
| 504 |
+
<span className="text-yellow-400">→ Tool Call:</span>
|
| 505 |
+
<code className="text-green-300 ml-2">search_codebase("login authentication")</code>
|
| 506 |
+
</div>
|
| 507 |
+
</div>
|
| 508 |
+
<div className="bg-slate-700 rounded p-3">
|
| 509 |
+
<span className="text-purple-400">Agent Thought:</span>
|
| 510 |
+
<span className="text-slate-300 ml-2">Found authenticate(). Let me see full implementation.</span>
|
| 511 |
+
<div className="mt-1">
|
| 512 |
+
<span className="text-yellow-400">→ Tool Call:</span>
|
| 513 |
+
<code className="text-green-300 ml-2">read_file("src/auth/login.py")</code>
|
| 514 |
+
</div>
|
| 515 |
+
</div>
|
| 516 |
+
<div className="bg-slate-700 rounded p-3">
|
| 517 |
+
<span className="text-purple-400">Agent Thought:</span>
|
| 518 |
+
<span className="text-slate-300 ml-2">It calls verify_password(). Let me check that.</span>
|
| 519 |
+
<div className="mt-1">
|
| 520 |
+
<span className="text-yellow-400">→ Tool Call:</span>
|
| 521 |
+
<code className="text-green-300 ml-2">find_callees("authenticate")</code>
|
| 522 |
+
</div>
|
| 523 |
+
</div>
|
| 524 |
+
<div className="bg-green-700/50 rounded p-3">
|
| 525 |
+
<span className="text-green-400">Final Answer:</span>
|
| 526 |
+
<span className="text-slate-300 ml-2">The login handles invalid passwords by...</span>
|
| 527 |
+
</div>
|
| 528 |
+
</div>
|
| 529 |
+
</div>
|
| 530 |
+
</div>
|
| 531 |
+
);
|
| 532 |
+
|
| 533 |
+
const renderRetrieval = () => (
|
| 534 |
+
<div className="space-y-6">
|
| 535 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 536 |
+
<h2 className="text-xl font-bold text-white mb-4">Multi-Stage Retrieval System</h2>
|
| 537 |
+
|
| 538 |
+
<div className="space-y-4">
|
| 539 |
+
<div className="bg-green-900/30 border-l-4 border-green-500 rounded-r-lg p-4">
|
| 540 |
+
<h3 className="font-semibold text-green-400">Stage 1: Vector Retrieval (k=10)</h3>
|
| 541 |
+
<p className="text-slate-300 text-sm">Semantic similarity search in Chroma/FAISS using embeddings</p>
|
| 542 |
+
</div>
|
| 543 |
+
|
| 544 |
+
<div className="bg-blue-900/30 border-l-4 border-blue-500 rounded-r-lg p-4">
|
| 545 |
+
<h3 className="font-semibold text-blue-400">Stage 2: LLM File Selection</h3>
|
| 546 |
+
<p className="text-slate-300 text-sm">LLM analyzes file tree structure and selects relevant files</p>
|
| 547 |
+
</div>
|
| 548 |
+
|
| 549 |
+
<div className="bg-purple-900/30 border-l-4 border-purple-500 rounded-r-lg p-4">
|
| 550 |
+
<h3 className="font-semibold text-purple-400">Stage 3: Ensemble Combination</h3>
|
| 551 |
+
<p className="text-slate-300 text-sm">Weighted merge: 60% vector + 40% LLM selection</p>
|
| 552 |
+
</div>
|
| 553 |
+
|
| 554 |
+
<div className="bg-yellow-900/30 border-l-4 border-yellow-500 rounded-r-lg p-4">
|
| 555 |
+
<h3 className="font-semibold text-yellow-400">Stage 4: Graph Enhancement</h3>
|
| 556 |
+
<p className="text-slate-300 text-sm">Add related files from AST graph (imports, calls)</p>
|
| 557 |
+
</div>
|
| 558 |
+
|
| 559 |
+
<div className="bg-red-900/30 border-l-4 border-red-500 rounded-r-lg p-4">
|
| 560 |
+
<h3 className="font-semibold text-red-400">Stage 5: Cross-Encoder Reranking</h3>
|
| 561 |
+
<p className="text-slate-300 text-sm">Score each (query, doc) pair, return top 5</p>
|
| 562 |
+
</div>
|
| 563 |
+
</div>
|
| 564 |
+
</div>
|
| 565 |
+
|
| 566 |
+
<div className="bg-slate-800 rounded-xl p-6">
|
| 567 |
+
<h3 className="font-semibold text-white mb-3">Vector DB Support</h3>
|
| 568 |
+
<div className="grid grid-cols-3 gap-4">
|
| 569 |
+
<div className="bg-slate-700 rounded-lg p-4 text-center">
|
| 570 |
+
<Database className="w-8 h-8 text-green-400 mx-auto mb-2" />
|
| 571 |
+
<div className="font-semibold text-white">Chroma</div>
|
| 572 |
+
<div className="text-xs text-slate-400">Default, local</div>
|
| 573 |
+
</div>
|
| 574 |
+
<div className="bg-slate-700 rounded-lg p-4 text-center">
|
| 575 |
+
<Database className="w-8 h-8 text-blue-400 mx-auto mb-2" />
|
| 576 |
+
<div className="font-semibold text-white">FAISS</div>
|
| 577 |
+
<div className="text-xs text-slate-400">Fallback, fast</div>
|
| 578 |
+
</div>
|
| 579 |
+
<div className="bg-slate-700 rounded-lg p-4 text-center">
|
| 580 |
+
<Database className="w-8 h-8 text-purple-400 mx-auto mb-2" />
|
| 581 |
+
<div className="font-semibold text-white">Qdrant</div>
|
| 582 |
+
<div className="text-xs text-slate-400">Cloud option</div>
|
| 583 |
+
</div>
|
| 584 |
+
</div>
|
| 585 |
+
</div>
|
| 586 |
+
</div>
|
| 587 |
+
);
|
| 588 |
+
|
| 589 |
+
return (
|
| 590 |
+
<div className="min-h-screen bg-slate-900 text-white p-6">
|
| 591 |
+
<div className="max-w-6xl mx-auto">
|
| 592 |
+
<h1 className="text-3xl font-bold text-center mb-2">🕷️ Code Crawler Architecture</h1>
|
| 593 |
+
<p className="text-slate-400 text-center mb-6">Interactive System Documentation</p>
|
| 594 |
+
|
| 595 |
+
<div className="flex flex-wrap gap-2 mb-6 justify-center">
|
| 596 |
+
{tabs.map(tab => (
|
| 597 |
+
<button
|
| 598 |
+
key={tab.id}
|
| 599 |
+
onClick={() => setActiveTab(tab.id)}
|
| 600 |
+
className={`flex items-center gap-2 px-4 py-2 rounded-lg transition-all ${
|
| 601 |
+
activeTab === tab.id
|
| 602 |
+
? 'bg-purple-600 text-white'
|
| 603 |
+
: 'bg-slate-800 text-slate-400 hover:bg-slate-700'
|
| 604 |
+
}`}
|
| 605 |
+
>
|
| 606 |
+
<tab.icon className="w-4 h-4" />
|
| 607 |
+
{tab.label}
|
| 608 |
+
</button>
|
| 609 |
+
))}
|
| 610 |
+
</div>
|
| 611 |
+
|
| 612 |
+
<div className="transition-all">
|
| 613 |
+
{activeTab === 'overview' && renderOverview()}
|
| 614 |
+
{activeTab === 'rag' && renderRAG()}
|
| 615 |
+
{activeTab === 'ast' && renderAST()}
|
| 616 |
+
{activeTab === 'chunking' && renderChunking()}
|
| 617 |
+
{activeTab === 'agent' && renderAgent()}
|
| 618 |
+
{activeTab === 'retrieval' && renderRetrieval()}
|
| 619 |
+
</div>
|
| 620 |
+
</div>
|
| 621 |
+
</div>
|
| 622 |
+
);
|
| 623 |
+
};
|
| 624 |
+
|
| 625 |
+
export default ArchitectureViz;
|
code_chatbot/{agent_workflow.py → agents/agent_workflow.py}
RENAMED
|
@@ -5,7 +5,7 @@ from langchain_core.messages import BaseMessage
|
|
| 5 |
from langchain_core.tools import tool
|
| 6 |
from langgraph.graph import StateGraph, END
|
| 7 |
from langgraph.prebuilt import ToolNode
|
| 8 |
-
from code_chatbot.rate_limiter import get_rate_limiter
|
| 9 |
|
| 10 |
# Define State
|
| 11 |
class AgentState(TypedDict):
|
|
@@ -49,7 +49,7 @@ def create_agent_graph(llm, retriever, repo_name: str = "Codebase", repo_dir: st
|
|
| 49 |
return result
|
| 50 |
|
| 51 |
# 2. Import File System Tools
|
| 52 |
-
from code_chatbot.tools import get_filesystem_tools, get_call_graph_tools
|
| 53 |
|
| 54 |
# 3. Combine Tools
|
| 55 |
fs_tools = get_filesystem_tools(repo_dir)
|
|
|
|
| 5 |
from langchain_core.tools import tool
|
| 6 |
from langgraph.graph import StateGraph, END
|
| 7 |
from langgraph.prebuilt import ToolNode
|
| 8 |
+
from code_chatbot.core.rate_limiter import get_rate_limiter
|
| 9 |
|
| 10 |
# Define State
|
| 11 |
class AgentState(TypedDict):
|
|
|
|
| 49 |
return result
|
| 50 |
|
| 51 |
# 2. Import File System Tools
|
| 52 |
+
from code_chatbot.agents.tools import get_filesystem_tools, get_call_graph_tools
|
| 53 |
|
| 54 |
# 3. Combine Tools
|
| 55 |
fs_tools = get_filesystem_tools(repo_dir)
|
code_chatbot/{crews → agents/crews}/__init__.py
RENAMED
|
File without changes
|
code_chatbot/{tools.py → agents/tools.py}
RENAMED
|
File without changes
|
code_chatbot/analysis/__init__.py
ADDED
|
File without changes
|
code_chatbot/{ast_analysis.py → analysis/ast_analysis.py}
RENAMED
|
File without changes
|
code_chatbot/{code_symbols.py → analysis/code_symbols.py}
RENAMED
|
@@ -4,7 +4,7 @@ import logging
|
|
| 4 |
from typing import List, Tuple, Optional
|
| 5 |
from tree_sitter import Node
|
| 6 |
|
| 7 |
-
from code_chatbot.chunker import StructuralChunker
|
| 8 |
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
|
|
|
| 4 |
from typing import List, Tuple, Optional
|
| 5 |
from tree_sitter import Node
|
| 6 |
|
| 7 |
+
from code_chatbot.ingestion.chunker import StructuralChunker
|
| 8 |
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
code_chatbot/core/__init__.py
ADDED
|
File without changes
|
code_chatbot/{config.py → core/config.py}
RENAMED
|
File without changes
|
code_chatbot/{db_connection.py → core/db_connection.py}
RENAMED
|
File without changes
|
code_chatbot/{path_obfuscator.py → core/path_obfuscator.py}
RENAMED
|
File without changes
|
code_chatbot/{prompts.py → core/prompts.py}
RENAMED
|
File without changes
|
code_chatbot/{rate_limiter.py → core/rate_limiter.py}
RENAMED
|
File without changes
|
code_chatbot/ingestion/__init__.py
ADDED
|
File without changes
|
code_chatbot/{chunker.py → ingestion/chunker.py}
RENAMED
|
File without changes
|
code_chatbot/{incremental_indexing.py → ingestion/incremental_indexing.py}
RENAMED
|
@@ -43,7 +43,7 @@ def add_incremental_indexing_methods(indexer_class):
|
|
| 43 |
if not self.config.indexing.enable_incremental_indexing:
|
| 44 |
logger.info("Incremental indexing disabled, performing full index")
|
| 45 |
# Fall back to full indexing
|
| 46 |
-
from code_chatbot.universal_ingestor import UniversalIngestor
|
| 47 |
ingestor = UniversalIngestor(source_path)
|
| 48 |
ingestor.download()
|
| 49 |
|
|
@@ -138,7 +138,7 @@ def add_incremental_indexing_methods(indexer_class):
|
|
| 138 |
collection_name: Name of the collection
|
| 139 |
vector_db_type: Type of vector database
|
| 140 |
"""
|
| 141 |
-
from code_chatbot.
|
| 142 |
|
| 143 |
try:
|
| 144 |
if vector_db_type == "chroma":
|
|
@@ -185,7 +185,7 @@ def add_incremental_indexing_methods(indexer_class):
|
|
| 185 |
Returns:
|
| 186 |
Dictionary with stats (total_chunks, unique_files, etc.)
|
| 187 |
"""
|
| 188 |
-
from code_chatbot.
|
| 189 |
|
| 190 |
try:
|
| 191 |
chroma_client = get_chroma_client(self.persist_directory)
|
|
|
|
| 43 |
if not self.config.indexing.enable_incremental_indexing:
|
| 44 |
logger.info("Incremental indexing disabled, performing full index")
|
| 45 |
# Fall back to full indexing
|
| 46 |
+
from code_chatbot.ingestion.universal_ingestor import UniversalIngestor
|
| 47 |
ingestor = UniversalIngestor(source_path)
|
| 48 |
ingestor.download()
|
| 49 |
|
|
|
|
| 138 |
collection_name: Name of the collection
|
| 139 |
vector_db_type: Type of vector database
|
| 140 |
"""
|
| 141 |
+
from code_chatbot.core.db_connection import get_chroma_client
|
| 142 |
|
| 143 |
try:
|
| 144 |
if vector_db_type == "chroma":
|
|
|
|
| 185 |
Returns:
|
| 186 |
Dictionary with stats (total_chunks, unique_files, etc.)
|
| 187 |
"""
|
| 188 |
+
from code_chatbot.core.db_connection import get_chroma_client
|
| 189 |
|
| 190 |
try:
|
| 191 |
chroma_client = get_chroma_client(self.persist_directory)
|
code_chatbot/{indexer.py → ingestion/indexer.py}
RENAMED
|
@@ -4,16 +4,16 @@ from pathlib import Path
|
|
| 4 |
from langchain_core.documents import Document
|
| 5 |
from langchain_community.vectorstores import Chroma
|
| 6 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 7 |
-
from code_chatbot.chunker import StructuralChunker
|
| 8 |
-
from code_chatbot.merkle_tree import MerkleTree, ChangeSet
|
| 9 |
-
from code_chatbot.path_obfuscator import PathObfuscator
|
| 10 |
-
from code_chatbot.config import get_config
|
| 11 |
import shutil
|
| 12 |
import logging
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
-
from code_chatbot.db_connection import (
|
| 17 |
get_chroma_client,
|
| 18 |
reset_chroma_clients,
|
| 19 |
set_active_vector_db,
|
|
@@ -421,5 +421,5 @@ class Indexer:
|
|
| 421 |
raise
|
| 422 |
|
| 423 |
# Add incremental indexing methods to the Indexer class
|
| 424 |
-
from code_chatbot.incremental_indexing import add_incremental_indexing_methods
|
| 425 |
Indexer = add_incremental_indexing_methods(Indexer)
|
|
|
|
| 4 |
from langchain_core.documents import Document
|
| 5 |
from langchain_community.vectorstores import Chroma
|
| 6 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 7 |
+
from code_chatbot.ingestion.chunker import StructuralChunker
|
| 8 |
+
from code_chatbot.ingestion.merkle_tree import MerkleTree, ChangeSet
|
| 9 |
+
from code_chatbot.core.path_obfuscator import PathObfuscator
|
| 10 |
+
from code_chatbot.core.config import get_config
|
| 11 |
import shutil
|
| 12 |
import logging
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
+
from code_chatbot.core.db_connection import (
|
| 17 |
get_chroma_client,
|
| 18 |
reset_chroma_clients,
|
| 19 |
set_active_vector_db,
|
|
|
|
| 421 |
raise
|
| 422 |
|
| 423 |
# Add incremental indexing methods to the Indexer class
|
| 424 |
+
from code_chatbot.ingestion.incremental_indexing import add_incremental_indexing_methods
|
| 425 |
Indexer = add_incremental_indexing_methods(Indexer)
|
code_chatbot/{indexing_progress.py → ingestion/indexing_progress.py}
RENAMED
|
@@ -27,12 +27,12 @@ def index_with_progress(
|
|
| 27 |
Index a codebase with detailed progress tracking.
|
| 28 |
Returns (chat_engine, success)
|
| 29 |
"""
|
| 30 |
-
from code_chatbot.universal_ingestor import process_source
|
| 31 |
-
from code_chatbot.ast_analysis import ASTGraphBuilder
|
| 32 |
-
from code_chatbot.indexer import Indexer
|
| 33 |
-
from code_chatbot.graph_rag import GraphEnhancedRetriever
|
| 34 |
-
from code_chatbot.rag import ChatEngine
|
| 35 |
-
from code_chatbot.chunker import StructuralChunker
|
| 36 |
from langchain_community.vectorstores import Chroma, FAISS
|
| 37 |
from langchain_community.vectorstores.utils import filter_complex_metadata
|
| 38 |
|
|
@@ -147,7 +147,7 @@ def index_with_progress(
|
|
| 147 |
progress_bar.progress(1.0)
|
| 148 |
|
| 149 |
else: # Chroma
|
| 150 |
-
from code_chatbot.
|
| 151 |
|
| 152 |
# Reset client cache to avoid stale/corrupt connections
|
| 153 |
reset_chroma_clients()
|
|
|
|
| 27 |
Index a codebase with detailed progress tracking.
|
| 28 |
Returns (chat_engine, success)
|
| 29 |
"""
|
| 30 |
+
from code_chatbot.ingestion.universal_ingestor import process_source
|
| 31 |
+
from code_chatbot.analysis.ast_analysis import ASTGraphBuilder
|
| 32 |
+
from code_chatbot.ingestion.indexer import Indexer
|
| 33 |
+
from code_chatbot.retrieval.graph_rag import GraphEnhancedRetriever
|
| 34 |
+
from code_chatbot.retrieval.rag import ChatEngine
|
| 35 |
+
from code_chatbot.ingestion.chunker import StructuralChunker
|
| 36 |
from langchain_community.vectorstores import Chroma, FAISS
|
| 37 |
from langchain_community.vectorstores.utils import filter_complex_metadata
|
| 38 |
|
|
|
|
| 147 |
progress_bar.progress(1.0)
|
| 148 |
|
| 149 |
else: # Chroma
|
| 150 |
+
from code_chatbot.core.db_connection import get_chroma_client, reset_chroma_clients
|
| 151 |
|
| 152 |
# Reset client cache to avoid stale/corrupt connections
|
| 153 |
reset_chroma_clients()
|
code_chatbot/{merkle_tree.py → ingestion/merkle_tree.py}
RENAMED
|
File without changes
|
code_chatbot/{universal_ingestor.py → ingestion/universal_ingestor.py}
RENAMED
|
@@ -135,8 +135,46 @@ class UniversalIngestor(DataManager):
|
|
| 135 |
|
| 136 |
def download(self) -> bool:
|
| 137 |
"""Downloads/prepares the data."""
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
def walk(self, get_content: bool = True) -> Generator[Tuple[Any, Dict], None, None]:
|
| 141 |
"""Yields (content, metadata) tuples."""
|
| 142 |
yield from self.delegate.walk(get_content)
|
|
@@ -177,7 +215,8 @@ class ZIPFileManager(DataManager):
|
|
| 177 |
IGNORE_EXTENSIONS = {
|
| 178 |
'.pyc', '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.mp4', '.mov',
|
| 179 |
'.zip', '.tar', '.gz', '.pdf', '.exe', '.bin', '.pkl', '.npy', '.pt', '.pth',
|
| 180 |
-
'.lock', '.log', '.sqlite3', '.db', '.min.js', '.min.css', '.map'
|
|
|
|
| 181 |
}
|
| 182 |
# Files to ignore by exact name (lock files, etc.)
|
| 183 |
IGNORE_FILES = {
|
|
@@ -235,7 +274,8 @@ class LocalDirectoryManager(DataManager):
|
|
| 235 |
IGNORE_EXTENSIONS = {
|
| 236 |
'.pyc', '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.mp4', '.mov',
|
| 237 |
'.zip', '.tar', '.gz', '.pdf', '.exe', '.bin', '.pkl', '.npy', '.pt', '.pth',
|
| 238 |
-
'.lock', '.log', '.sqlite3', '.db', '.min.js', '.min.css', '.map'
|
|
|
|
| 239 |
}
|
| 240 |
# Files to ignore by exact name (lock files, etc.)
|
| 241 |
IGNORE_FILES = {
|
|
|
|
| 135 |
|
| 136 |
def download(self) -> bool:
|
| 137 |
"""Downloads/prepares the data."""
|
| 138 |
+
success = self.delegate.download()
|
| 139 |
+
if success:
|
| 140 |
+
self._clean_extracted_files()
|
| 141 |
+
return success
|
| 142 |
+
|
| 143 |
+
def _clean_extracted_files(self):
|
| 144 |
+
"""Removes unnecessary files/directories from the extracted data."""
|
| 145 |
+
path = self.local_path
|
| 146 |
+
if not os.path.exists(path):
|
| 147 |
+
return
|
| 148 |
+
|
| 149 |
+
logger.info(f"Cleaning execution artifacts from {path}")
|
| 150 |
+
|
| 151 |
+
# Directories to remove completely
|
| 152 |
+
DIRS_TO_REMOVE = {'.git', '__pycache__', 'node_modules', '.ipynb_checkpoints', '.pytest_cache', '.dart_tool'}
|
| 153 |
+
|
| 154 |
+
# Files to remove
|
| 155 |
+
FILES_TO_REMOVE = {'.DS_Store', 'Thumbs.db', '.gitignore', '.gitattributes'}
|
| 156 |
+
|
| 157 |
+
for root, dirs, files in os.walk(path, topdown=False):
|
| 158 |
+
# Remove directories
|
| 159 |
+
for name in dirs:
|
| 160 |
+
if name in DIRS_TO_REMOVE:
|
| 161 |
+
dir_path = os.path.join(root, name)
|
| 162 |
+
try:
|
| 163 |
+
shutil.rmtree(dir_path)
|
| 164 |
+
logger.info(f"Removed directory: {dir_path}")
|
| 165 |
+
except Exception as e:
|
| 166 |
+
logger.warning(f"Failed to remove {dir_path}: {e}")
|
| 167 |
+
|
| 168 |
+
# Remove files
|
| 169 |
+
for name in files:
|
| 170 |
+
if name in FILES_TO_REMOVE:
|
| 171 |
+
file_path = os.path.join(root, name)
|
| 172 |
+
try:
|
| 173 |
+
os.remove(file_path)
|
| 174 |
+
logger.info(f"Removed file: {file_path}")
|
| 175 |
+
except Exception as e:
|
| 176 |
+
logger.warning(f"Failed to remove {file_path}: {e}")
|
| 177 |
+
|
| 178 |
def walk(self, get_content: bool = True) -> Generator[Tuple[Any, Dict], None, None]:
|
| 179 |
"""Yields (content, metadata) tuples."""
|
| 180 |
yield from self.delegate.walk(get_content)
|
|
|
|
| 215 |
IGNORE_EXTENSIONS = {
|
| 216 |
'.pyc', '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.mp4', '.mov',
|
| 217 |
'.zip', '.tar', '.gz', '.pdf', '.exe', '.bin', '.pkl', '.npy', '.pt', '.pth',
|
| 218 |
+
'.lock', '.log', '.sqlite3', '.db', '.min.js', '.min.css', '.map',
|
| 219 |
+
'.graphml', '.xml', '.toml'
|
| 220 |
}
|
| 221 |
# Files to ignore by exact name (lock files, etc.)
|
| 222 |
IGNORE_FILES = {
|
|
|
|
| 274 |
IGNORE_EXTENSIONS = {
|
| 275 |
'.pyc', '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.mp4', '.mov',
|
| 276 |
'.zip', '.tar', '.gz', '.pdf', '.exe', '.bin', '.pkl', '.npy', '.pt', '.pth',
|
| 277 |
+
'.lock', '.log', '.sqlite3', '.db', '.min.js', '.min.css', '.map',
|
| 278 |
+
'.graphml', '.xml', '.toml'
|
| 279 |
}
|
| 280 |
# Files to ignore by exact name (lock files, etc.)
|
| 281 |
IGNORE_FILES = {
|
code_chatbot/mcp/__init__.py
ADDED
|
File without changes
|
code_chatbot/{mcp_client.py → mcp/mcp_client.py}
RENAMED
|
@@ -6,7 +6,7 @@ Provides async methods to call MCP tools from other parts of the application.
|
|
| 6 |
|
| 7 |
import logging
|
| 8 |
from typing import List, Dict, Optional
|
| 9 |
-
from code_chatbot.mcp_server import RefactorMCPServer, SearchResult, RefactorResult, RefactorSuggestion
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
|
|
|
| 6 |
|
| 7 |
import logging
|
| 8 |
from typing import List, Dict, Optional
|
| 9 |
+
from code_chatbot.mcp.mcp_server import RefactorMCPServer, SearchResult, RefactorResult, RefactorSuggestion
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
code_chatbot/{mcp_server.py → mcp/mcp_server.py}
RENAMED
|
File without changes
|
code_chatbot/retrieval/__init__.py
ADDED
|
File without changes
|
code_chatbot/{graph_rag.py → retrieval/graph_rag.py}
RENAMED
|
File without changes
|
code_chatbot/{llm_retriever.py → retrieval/llm_retriever.py}
RENAMED
|
File without changes
|
code_chatbot/{rag.py → retrieval/rag.py}
RENAMED
|
@@ -7,8 +7,8 @@ from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
|
|
| 7 |
from langchain_core.retrievers import BaseRetriever
|
| 8 |
# Simplified implementation that works with current langchain version
|
| 9 |
# We'll implement history-aware retrieval manually
|
| 10 |
-
from code_chatbot.reranker import Reranker
|
| 11 |
-
from code_chatbot.retriever_wrapper import build_enhanced_retriever
|
| 12 |
import os
|
| 13 |
|
| 14 |
# Configure logging
|
|
@@ -77,7 +77,7 @@ class ChatEngine:
|
|
| 77 |
self.llm_retriever = None
|
| 78 |
if self.repo_files:
|
| 79 |
try:
|
| 80 |
-
from code_chatbot.llm_retriever import LLMRetriever
|
| 81 |
from langchain.retrievers import EnsembleRetriever
|
| 82 |
|
| 83 |
logger.info(f"Initializing LLMRetriever with {len(self.repo_files)} files.")
|
|
@@ -103,8 +103,8 @@ class ChatEngine:
|
|
| 103 |
self.code_analyzer = None
|
| 104 |
if self.use_agent:
|
| 105 |
try:
|
| 106 |
-
from code_chatbot.agent_workflow import create_agent_graph
|
| 107 |
-
from code_chatbot.ast_analysis import EnhancedCodeAnalyzer
|
| 108 |
import os
|
| 109 |
|
| 110 |
logger.info(f"Building Agentic Workflow Graph for {self.repo_dir}...")
|
|
@@ -239,7 +239,7 @@ class ChatEngine:
|
|
| 239 |
# Rebuild agent if using agents
|
| 240 |
if self.use_agent:
|
| 241 |
try:
|
| 242 |
-
from code_chatbot.agent_workflow import create_agent_graph
|
| 243 |
self.agent_executor = create_agent_graph(
|
| 244 |
llm=self.llm,
|
| 245 |
retriever=self.vector_retriever,
|
|
@@ -288,7 +288,7 @@ class ChatEngine:
|
|
| 288 |
|
| 289 |
# Contextualize with history
|
| 290 |
# Use comprehensive system prompt for high-quality answers
|
| 291 |
-
from code_chatbot.prompts import get_prompt_for_provider
|
| 292 |
sys_content = get_prompt_for_provider("system_agent", self.provider).format(repo_name=self.repo_name)
|
| 293 |
system_msg = SystemMessage(content=sys_content)
|
| 294 |
|
|
@@ -320,13 +320,7 @@ class ChatEngine:
|
|
| 320 |
answer = raw_content
|
| 321 |
|
| 322 |
# CLEANING: Remove hallucinated source chips
|
| 323 |
-
|
| 324 |
-
# Remove the specific div block structure
|
| 325 |
-
answer = re.sub(r'<div class="source-chip">.*?</div>\s*</div>', '', answer, flags=re.DOTALL)
|
| 326 |
-
# Remove standalone chips if any remain
|
| 327 |
-
answer = re.sub(r'<div class="source-chip">.*?</div>', '', answer, flags=re.DOTALL)
|
| 328 |
-
# Clean up leading whitespace/newlines left behind
|
| 329 |
-
answer = answer.strip()
|
| 330 |
|
| 331 |
# Update history
|
| 332 |
self.chat_history.append(HumanMessage(content=question))
|
|
@@ -371,6 +365,21 @@ class ChatEngine:
|
|
| 371 |
logger.error(f"Error during chat: {e}", exc_info=True)
|
| 372 |
return f"Error: {str(e)}", []
|
| 373 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
def _linear_chat(self, question: str) -> Tuple[str, List[dict]]:
|
| 375 |
"""Linear RAG fallback."""
|
| 376 |
messages, sources, _ = self._prepare_chat_context(question)
|
|
@@ -381,7 +390,7 @@ class ChatEngine:
|
|
| 381 |
# Get response from LLM
|
| 382 |
try:
|
| 383 |
response_msg = self.llm.invoke(messages)
|
| 384 |
-
answer = response_msg.content
|
| 385 |
except Exception as e:
|
| 386 |
# Check for Rate Limit in Linear Chat
|
| 387 |
error_str = str(e)
|
|
@@ -460,7 +469,7 @@ class ChatEngine:
|
|
| 460 |
})
|
| 461 |
|
| 462 |
# Build prompt with history - use provider-specific prompt
|
| 463 |
-
from code_chatbot.prompts import get_prompt_for_provider
|
| 464 |
base_prompt = get_prompt_for_provider("linear_rag", self.provider)
|
| 465 |
qa_system_prompt = base_prompt.format(
|
| 466 |
repo_name=self.repo_name,
|
|
|
|
| 7 |
from langchain_core.retrievers import BaseRetriever
|
| 8 |
# Simplified implementation that works with current langchain version
|
| 9 |
# We'll implement history-aware retrieval manually
|
| 10 |
+
from code_chatbot.retrieval.reranker import Reranker
|
| 11 |
+
from code_chatbot.retrieval.retriever_wrapper import build_enhanced_retriever
|
| 12 |
import os
|
| 13 |
|
| 14 |
# Configure logging
|
|
|
|
| 77 |
self.llm_retriever = None
|
| 78 |
if self.repo_files:
|
| 79 |
try:
|
| 80 |
+
from code_chatbot.retrieval.llm_retriever import LLMRetriever
|
| 81 |
from langchain.retrievers import EnsembleRetriever
|
| 82 |
|
| 83 |
logger.info(f"Initializing LLMRetriever with {len(self.repo_files)} files.")
|
|
|
|
| 103 |
self.code_analyzer = None
|
| 104 |
if self.use_agent:
|
| 105 |
try:
|
| 106 |
+
from code_chatbot.agents.agent_workflow import create_agent_graph
|
| 107 |
+
from code_chatbot.analysis.ast_analysis import EnhancedCodeAnalyzer
|
| 108 |
import os
|
| 109 |
|
| 110 |
logger.info(f"Building Agentic Workflow Graph for {self.repo_dir}...")
|
|
|
|
| 239 |
# Rebuild agent if using agents
|
| 240 |
if self.use_agent:
|
| 241 |
try:
|
| 242 |
+
from code_chatbot.agents.agent_workflow import create_agent_graph
|
| 243 |
self.agent_executor = create_agent_graph(
|
| 244 |
llm=self.llm,
|
| 245 |
retriever=self.vector_retriever,
|
|
|
|
| 288 |
|
| 289 |
# Contextualize with history
|
| 290 |
# Use comprehensive system prompt for high-quality answers
|
| 291 |
+
from code_chatbot.core.prompts import get_prompt_for_provider
|
| 292 |
sys_content = get_prompt_for_provider("system_agent", self.provider).format(repo_name=self.repo_name)
|
| 293 |
system_msg = SystemMessage(content=sys_content)
|
| 294 |
|
|
|
|
| 320 |
answer = raw_content
|
| 321 |
|
| 322 |
# CLEANING: Remove hallucinated source chips
|
| 323 |
+
answer = self._clean_response(answer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
# Update history
|
| 326 |
self.chat_history.append(HumanMessage(content=question))
|
|
|
|
| 365 |
logger.error(f"Error during chat: {e}", exc_info=True)
|
| 366 |
return f"Error: {str(e)}", []
|
| 367 |
|
| 368 |
+
def _clean_response(self, text: str) -> str:
|
| 369 |
+
"""Clean response from hallucinated HTML/CSS artifacts."""
|
| 370 |
+
if not text:
|
| 371 |
+
return ""
|
| 372 |
+
|
| 373 |
+
import re
|
| 374 |
+
# Remove the specific div block structure for source chips
|
| 375 |
+
clean_text = re.sub(r'<div class="source-chip">.*?</div>\s*</div>', '', text, flags=re.DOTALL)
|
| 376 |
+
# Remove standalone chips if any remain
|
| 377 |
+
clean_text = re.sub(r'<div class="source-chip">.*?</div>', '', clean_text, flags=re.DOTALL)
|
| 378 |
+
# Remove source-container divs
|
| 379 |
+
clean_text = re.sub(r'<div class="source-container">.*?</div>', '', clean_text, flags=re.DOTALL)
|
| 380 |
+
|
| 381 |
+
return clean_text.strip()
|
| 382 |
+
|
| 383 |
def _linear_chat(self, question: str) -> Tuple[str, List[dict]]:
|
| 384 |
"""Linear RAG fallback."""
|
| 385 |
messages, sources, _ = self._prepare_chat_context(question)
|
|
|
|
| 390 |
# Get response from LLM
|
| 391 |
try:
|
| 392 |
response_msg = self.llm.invoke(messages)
|
| 393 |
+
answer = self._clean_response(response_msg.content)
|
| 394 |
except Exception as e:
|
| 395 |
# Check for Rate Limit in Linear Chat
|
| 396 |
error_str = str(e)
|
|
|
|
| 469 |
})
|
| 470 |
|
| 471 |
# Build prompt with history - use provider-specific prompt
|
| 472 |
+
from code_chatbot.core.prompts import get_prompt_for_provider
|
| 473 |
base_prompt = get_prompt_for_provider("linear_rag", self.provider)
|
| 474 |
qa_system_prompt = base_prompt.format(
|
| 475 |
repo_name=self.repo_name,
|
code_chatbot/{reranker.py → retrieval/reranker.py}
RENAMED
|
File without changes
|
code_chatbot/{retriever_wrapper.py → retrieval/retriever_wrapper.py}
RENAMED
|
@@ -5,7 +5,7 @@ from typing import List, Optional, Any
|
|
| 5 |
from langchain_core.retrievers import BaseRetriever
|
| 6 |
from langchain_core.documents import Document
|
| 7 |
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
| 8 |
-
from code_chatbot.reranker import Reranker
|
| 9 |
|
| 10 |
# Try to import MultiQueryRetriever - may not be available in all versions
|
| 11 |
try:
|
|
|
|
| 5 |
from langchain_core.retrievers import BaseRetriever
|
| 6 |
from langchain_core.documents import Document
|
| 7 |
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
| 8 |
+
from code_chatbot.retrieval.reranker import Reranker
|
| 9 |
|
| 10 |
# Try to import MultiQueryRetriever - may not be available in all versions
|
| 11 |
try:
|
components/file_explorer.py
CHANGED
|
@@ -80,7 +80,7 @@ def render_tree_items(tree: Dict, depth: int):
|
|
| 80 |
|
| 81 |
for name, node in sorted_items:
|
| 82 |
is_file = node.get("_type") == "file"
|
| 83 |
-
indent = "β
|
| 84 |
|
| 85 |
if is_file:
|
| 86 |
# File item
|
|
|
|
| 80 |
|
| 81 |
for name, node in sorted_items:
|
| 82 |
is_file = node.get("_type") == "file"
|
| 83 |
+
indent = "β " * depth # Compact indent for sidebar
|
| 84 |
|
| 85 |
if is_file:
|
| 86 |
# File item
|
components/multi_mode.py
CHANGED
|
@@ -135,7 +135,7 @@ def render_search_mode():
|
|
| 135 |
|
| 136 |
with st.spinner("Searching codebase..."):
|
| 137 |
try:
|
| 138 |
-
from code_chatbot.mcp_client import MCPClient
|
| 139 |
|
| 140 |
client = MCPClient(workspace_root=workspace)
|
| 141 |
results = client.search_code(
|
|
@@ -222,7 +222,7 @@ def render_refactor_mode():
|
|
| 222 |
|
| 223 |
with st.spinner("Processing refactoring..."):
|
| 224 |
try:
|
| 225 |
-
from code_chatbot.mcp_client import MCPClient
|
| 226 |
|
| 227 |
client = MCPClient(workspace_root=workspace)
|
| 228 |
result = client.refactor_code(
|
|
@@ -298,7 +298,7 @@ def render_refactor_mode():
|
|
| 298 |
if st.button("Apply Refactoring", type="primary", use_container_width=True):
|
| 299 |
with st.spinner("Processing..."):
|
| 300 |
try:
|
| 301 |
-
from code_chatbot.mcp_client import MCPClient
|
| 302 |
|
| 303 |
client = MCPClient(workspace_root=workspace)
|
| 304 |
result = client.refactor_code(
|
|
|
|
| 135 |
|
| 136 |
with st.spinner("Searching codebase..."):
|
| 137 |
try:
|
| 138 |
+
from code_chatbot.mcp.mcp_client import MCPClient
|
| 139 |
|
| 140 |
client = MCPClient(workspace_root=workspace)
|
| 141 |
results = client.search_code(
|
|
|
|
| 222 |
|
| 223 |
with st.spinner("Processing refactoring..."):
|
| 224 |
try:
|
| 225 |
+
from code_chatbot.mcp.mcp_client import MCPClient
|
| 226 |
|
| 227 |
client = MCPClient(workspace_root=workspace)
|
| 228 |
result = client.refactor_code(
|
|
|
|
| 298 |
if st.button("Apply Refactoring", type="primary", use_container_width=True):
|
| 299 |
with st.spinner("Processing..."):
|
| 300 |
try:
|
| 301 |
+
from code_chatbot.mcp.mcp_client import MCPClient
|
| 302 |
|
| 303 |
client = MCPClient(workspace_root=workspace)
|
| 304 |
result = client.refactor_code(
|
components/sidebar.py
CHANGED
|
@@ -117,7 +117,7 @@ def render_sidebar():
|
|
| 117 |
# Show usage statistics if available
|
| 118 |
if st.session_state.chat_engine:
|
| 119 |
try:
|
| 120 |
-
from code_chatbot.rate_limiter import get_rate_limiter
|
| 121 |
limiter = get_rate_limiter(provider)
|
| 122 |
stats = limiter.get_usage_stats()
|
| 123 |
|
|
|
|
| 117 |
# Show usage statistics if available
|
| 118 |
if st.session_state.chat_engine:
|
| 119 |
try:
|
| 120 |
+
from code_chatbot.core.rate_limiter import get_rate_limiter
|
| 121 |
limiter = get_rate_limiter(provider)
|
| 122 |
stats = limiter.get_usage_stats()
|
| 123 |
|
pages/1_⚡_Code_Studio.py
CHANGED
|
@@ -12,92 +12,101 @@ st.set_page_config(
|
|
| 12 |
page_title="Code Studio",
|
| 13 |
page_icon="⚡",
|
| 14 |
layout="wide",
|
| 15 |
-
initial_sidebar_state="
|
| 16 |
)
|
| 17 |
|
| 18 |
apply_custom_css()
|
| 19 |
|
| 20 |
# --- State Management ---
|
| 21 |
-
if "active_tab" not in st.session_state:
|
| 22 |
-
st.session_state.active_tab = "explorer"
|
| 23 |
-
|
| 24 |
if "processed_files" not in st.session_state or not st.session_state.processed_files:
|
| 25 |
-
# If accessed directly without processing, redirect home
|
| 26 |
st.warning("⚠️ Please index a codebase first.")
|
| 27 |
if st.button("Go Home"):
|
| 28 |
st.switch_page("app.py")
|
| 29 |
st.stop()
|
| 30 |
|
| 31 |
-
# ---
|
| 32 |
-
|
| 33 |
-
#
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
# --- Side Panel (Tabs) ---
|
| 37 |
-
with col_panel:
|
| 38 |
-
# Use native Streamlit tabs for horizontal navigation
|
| 39 |
-
tab_explorer, tab_search, tab_chat, tab_generate = st.tabs(["📁 Explorer", "🔍 Search", "💬 Chat", "✨ Generate"])
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
)
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
|
| 58 |
-
render_search_panel(st.session_state.get("indexed_files", []))
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
with tab_chat:
|
| 61 |
chat_engine = st.session_state.get("chat_engine")
|
| 62 |
if chat_engine:
|
| 63 |
render_chat_panel(chat_engine)
|
| 64 |
else:
|
| 65 |
-
st.error("Chat engine unavailable.
|
| 66 |
-
|
| 67 |
-
with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
chat_engine = st.session_state.get("chat_engine")
|
| 69 |
if chat_engine:
|
| 70 |
render_generate_panel(chat_engine, st.session_state.get("indexed_files", []))
|
| 71 |
-
|
| 72 |
-
|
|
|
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
render_code_viewer_simple(selected_file)
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
# Empty State
|
| 94 |
-
st.markdown(
|
| 95 |
-
"""
|
| 96 |
-
<div style="display: flex; flex-direction: column; align-items: center; justify-content: center; height: 60vh; opacity: 0.5;">
|
| 97 |
-
<h1>β‘ Code Studio</h1>
|
| 98 |
-
<p>Select a file from the explorer to view context.</p>
|
| 99 |
-
<p>Use the tabs on the left to switch between tools.</p>
|
| 100 |
-
</div>
|
| 101 |
-
""",
|
| 102 |
-
unsafe_allow_html=True
|
| 103 |
-
)
|
|
|
|
| 12 |
page_title="Code Studio",
|
| 13 |
page_icon="⚡",
|
| 14 |
layout="wide",
|
| 15 |
+
initial_sidebar_state="expanded"
|
| 16 |
)
|
| 17 |
|
| 18 |
apply_custom_css()
|
| 19 |
|
| 20 |
# --- State Management ---
|
|
|
|
|
|
|
|
|
|
| 21 |
if "processed_files" not in st.session_state or not st.session_state.processed_files:
|
|
|
|
| 22 |
st.warning("⚠️ Please index a codebase first.")
|
| 23 |
if st.button("Go Home"):
|
| 24 |
st.switch_page("app.py")
|
| 25 |
st.stop()
|
| 26 |
|
| 27 |
+
# --- Sidebar: Navigation & Explorer ---
|
| 28 |
+
with st.sidebar:
|
| 29 |
+
# 1. View Settings
|
| 30 |
+
st.header("⚙️ View")
|
| 31 |
+
layout_mode = st.radio("Layout Mode", ["Tabs (Full Width)", "Split View"], horizontal=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
+
st.divider()
|
| 34 |
+
|
| 35 |
+
# 2. File Explorer
|
| 36 |
+
render_file_tree(
|
| 37 |
+
st.session_state.get("indexed_files", []),
|
| 38 |
+
st.session_state.get("workspace_root", "")
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
st.divider()
|
| 42 |
+
|
| 43 |
+
# 3. Actions
|
| 44 |
+
if st.button("π New Codebase", use_container_width=True):
|
| 45 |
+
st.session_state.processed_files = False
|
| 46 |
+
st.session_state.chat_engine = None
|
| 47 |
+
st.session_state.indexed_files = None
|
| 48 |
+
st.session_state.workspace_root = None
|
| 49 |
+
st.session_state.selected_file = None
|
| 50 |
+
st.switch_page("app.py")
|
| 51 |
|
| 52 |
+
# --- Main Workspace ---
|
|
|
|
| 53 |
|
| 54 |
+
if layout_mode == "Tabs (Full Width)":
|
| 55 |
+
# TABBED LAYOUT (Default)
|
| 56 |
+
tab_chat, tab_code, tab_agent, tab_search = st.tabs(["💬 Chat", "📝 Code Editor", "✨ Agent", "🔍 Search"])
|
| 57 |
+
|
| 58 |
with tab_chat:
|
| 59 |
chat_engine = st.session_state.get("chat_engine")
|
| 60 |
if chat_engine:
|
| 61 |
render_chat_panel(chat_engine)
|
| 62 |
else:
|
| 63 |
+
st.error("Chat engine unavailable.")
|
| 64 |
+
|
| 65 |
+
with tab_code:
|
| 66 |
+
selected_file = st.session_state.get("selected_file")
|
| 67 |
+
if selected_file:
|
| 68 |
+
filename = os.path.basename(selected_file)
|
| 69 |
+
st.caption(f"Editing: {filename}")
|
| 70 |
+
render_code_viewer_simple(selected_file)
|
| 71 |
+
else:
|
| 72 |
+
st.info("π Select a file from the sidebar to view code.")
|
| 73 |
+
|
| 74 |
+
with tab_agent:
|
| 75 |
chat_engine = st.session_state.get("chat_engine")
|
| 76 |
if chat_engine:
|
| 77 |
render_generate_panel(chat_engine, st.session_state.get("indexed_files", []))
|
| 78 |
+
|
| 79 |
+
with tab_search:
|
| 80 |
+
render_search_panel(st.session_state.get("indexed_files", []))
|
| 81 |
|
| 82 |
+
else:
|
| 83 |
+
# SPLIT VIEW (Legacy)
|
| 84 |
+
split_ratio = st.slider("Panel Width (%)", 20, 80, 40, step=5)
|
| 85 |
+
panel_width = split_ratio / 100.0
|
| 86 |
+
editor_width = 1.0 - panel_width
|
| 87 |
|
| 88 |
+
col_panel, col_editor = st.columns([panel_width, editor_width])
|
| 89 |
+
|
| 90 |
+
with col_panel:
|
| 91 |
+
tab_sub_chat, tab_sub_search, tab_sub_agent = st.tabs(["💬 Chat", "🔍 Search", "✨ Agent"])
|
| 92 |
+
|
| 93 |
+
with tab_sub_chat:
|
| 94 |
+
chat_engine = st.session_state.get("chat_engine")
|
| 95 |
+
if chat_engine:
|
| 96 |
+
render_chat_panel(chat_engine)
|
| 97 |
+
|
| 98 |
+
with tab_sub_search:
|
| 99 |
+
render_search_panel(st.session_state.get("indexed_files", []))
|
| 100 |
|
| 101 |
+
with tab_sub_agent:
|
| 102 |
+
chat_engine = st.session_state.get("chat_engine")
|
| 103 |
+
if chat_engine:
|
| 104 |
+
render_generate_panel(chat_engine, st.session_state.get("indexed_files", []))
|
| 105 |
+
|
| 106 |
+
with col_editor:
|
| 107 |
+
selected_file = st.session_state.get("selected_file")
|
| 108 |
+
if selected_file:
|
| 109 |
+
st.caption(f"Editing: {os.path.basename(selected_file)}")
|
| 110 |
render_code_viewer_simple(selected_file)
|
| 111 |
+
else:
|
| 112 |
+
st.info("π Select a file from the sidebar.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pages/1_⚡_Code_Studio.py.bak
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
⚡ Code Studio - The Main IDE Interface
|
| 3 |
+
"""
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import os
|
| 6 |
+
from components.style import apply_custom_css
|
| 7 |
+
from components.file_explorer import render_file_tree
|
| 8 |
+
from components.code_viewer import render_code_viewer_simple
|
| 9 |
+
from components.panels import render_chat_panel, render_search_panel, render_generate_panel
|
| 10 |
+
|
| 11 |
+
st.set_page_config(
|
| 12 |
+
page_title="Code Studio",
|
| 13 |
+
page_icon="⚡",
|
| 14 |
+
layout="wide",
|
| 15 |
+
initial_sidebar_state="collapsed" # Hide standard sidebar
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
apply_custom_css()
|
| 19 |
+
|
| 20 |
+
# --- State Management ---
|
| 21 |
+
if "active_tab" not in st.session_state:
|
| 22 |
+
st.session_state.active_tab = "explorer"
|
| 23 |
+
|
| 24 |
+
if "processed_files" not in st.session_state or not st.session_state.processed_files:
|
| 25 |
+
# If accessed directly without processing, redirect home
|
| 26 |
+
st.warning("⚠️ Please index a codebase first.")
|
| 27 |
+
if st.button("Go Home"):
|
| 28 |
+
st.switch_page("app.py")
|
| 29 |
+
st.stop()
|
| 30 |
+
|
| 31 |
+
# --- Layout Configuration ---
|
| 32 |
+
# Allow user to resize the split
|
| 33 |
+
with st.sidebar:
|
| 34 |
+
st.header("⚙️ Layout Settings")
|
| 35 |
+
split_ratio = st.slider(
|
| 36 |
+
"Panel Width (%)",
|
| 37 |
+
min_value=20,
|
| 38 |
+
max_value=80,
|
| 39 |
+
value=30,
|
| 40 |
+
step=5,
|
| 41 |
+
help="Adjust the width of the left panel (Chat/Explorer)."
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# Calculate column ratios based on percentage
|
| 45 |
+
panel_width = split_ratio / 100.0
|
| 46 |
+
editor_width = 1.0 - panel_width
|
| 47 |
+
|
| 48 |
+
# Main Layout
|
| 49 |
+
col_panel, col_editor = st.columns([panel_width, editor_width])
|
| 50 |
+
|
| 51 |
+
# --- Side Panel (Tabs) ---
|
| 52 |
+
with col_panel:
|
| 53 |
+
# Use native Streamlit tabs for horizontal navigation
|
| 54 |
+
tab_explorer, tab_search, tab_chat, tab_generate = st.tabs(["📁 Explorer", "🔍 Search", "💬 Chat", "✨ Generate"])
|
| 55 |
+
|
| 56 |
+
with tab_explorer:
|
| 57 |
+
st.markdown("### π Project Files")
|
| 58 |
+
render_file_tree(
|
| 59 |
+
st.session_state.get("indexed_files", []),
|
| 60 |
+
st.session_state.get("workspace_root", "")
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
st.divider()
|
| 64 |
+
if st.button("π Index New Codebase", use_container_width=True):
|
| 65 |
+
st.session_state.processed_files = False
|
| 66 |
+
st.session_state.chat_engine = None
|
| 67 |
+
st.session_state.indexed_files = None
|
| 68 |
+
st.session_state.workspace_root = None
|
| 69 |
+
st.session_state.selected_file = None
|
| 70 |
+
st.switch_page("app.py")
|
| 71 |
+
|
| 72 |
+
with tab_search:
|
| 73 |
+
render_search_panel(st.session_state.get("indexed_files", []))
|
| 74 |
+
|
| 75 |
+
with tab_chat:
|
| 76 |
+
chat_engine = st.session_state.get("chat_engine")
|
| 77 |
+
if chat_engine:
|
| 78 |
+
render_chat_panel(chat_engine)
|
| 79 |
+
else:
|
| 80 |
+
st.error("Chat engine unavailable. Please index a codebase first.")
|
| 81 |
+
|
| 82 |
+
with tab_generate:
|
| 83 |
+
chat_engine = st.session_state.get("chat_engine")
|
| 84 |
+
if chat_engine:
|
| 85 |
+
render_generate_panel(chat_engine, st.session_state.get("indexed_files", []))
|
| 86 |
+
else:
|
| 87 |
+
st.error("Chat engine unavailable.")
|
| 88 |
+
|
| 89 |
+
# --- Main Editor ---
|
| 90 |
+
with col_editor:
|
| 91 |
+
# If a file is selected, show it. Otherwise show welcome/empty state.
|
| 92 |
+
selected_file = st.session_state.get("selected_file")
|
| 93 |
+
|
| 94 |
+
if selected_file:
|
| 95 |
+
# We use a container to ensure height consistency
|
| 96 |
+
with st.container():
|
| 97 |
+
# Alignment Spacer: Matches the height of st.tabs headers (~50px)
|
| 98 |
+
st.markdown("<div style='height: 50px;'></div>", unsafe_allow_html=True)
|
| 99 |
+
|
| 100 |
+
# Breadcrumbs / File Header
|
| 101 |
+
filename = os.path.basename(selected_file)
|
| 102 |
+
st.caption(f"Editing: {filename}")
|
| 103 |
+
|
| 104 |
+
# Code Viewer
|
| 105 |
+
render_code_viewer_simple(selected_file)
|
| 106 |
+
|
| 107 |
+
else:
|
| 108 |
+
# Empty State
|
| 109 |
+
st.markdown(
|
| 110 |
+
"""
|
| 111 |
+
<div style="display: flex; flex-direction: column; align-items: center; justify-content: center; height: 60vh; opacity: 0.5;">
|
| 112 |
+
<h1>β‘ Code Studio</h1>
|
| 113 |
+
<p>Select a file from the explorer to view context.</p>
|
| 114 |
+
<p>Use the tabs on the left to switch between tools.</p>
|
| 115 |
+
</div>
|
| 116 |
+
""",
|
| 117 |
+
unsafe_allow_html=True
|
| 118 |
+
)
|
tests/test_merkle_tree_simple.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
Test script for Merkle tree change detection.
|
| 3 |
"""
|
| 4 |
|
| 5 |
-
from code_chatbot.merkle_tree import MerkleTree
|
| 6 |
from pathlib import Path
|
| 7 |
import tempfile
|
| 8 |
import shutil
|
|
|
|
| 2 |
Test script for Merkle tree change detection.
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
from code_chatbot.ingestion.merkle_tree import MerkleTree
|
| 6 |
from pathlib import Path
|
| 7 |
import tempfile
|
| 8 |
import shutil
|