CodeMode Agent committed
Commit 463fc7e · 1 Parent(s): 17cc505

Deploy CodeMode via Agent
- README.md +20 -5
- app.py +430 -0
- requirements.txt +9 -0
- scripts/__init__.py +0 -0
- scripts/__pycache__/__init__.cpython-311.pyc +0 -0
- scripts/aggregate_datasets.py +77 -0
- scripts/core/README.md +37 -0
- scripts/core/__init__.py +0 -0
- scripts/core/__pycache__/__init__.cpython-311.pyc +0 -0
- scripts/core/ingestion/__init__.py +0 -0
- scripts/core/ingestion/__pycache__/__init__.cpython-311.pyc +0 -0
- scripts/core/ingestion/__pycache__/ast_chunker.cpython-311.pyc +0 -0
- scripts/core/ingestion/__pycache__/chunk.cpython-311.pyc +0 -0
- scripts/core/ingestion/__pycache__/chunk_schema.cpython-311.pyc +0 -0
- scripts/core/ingestion/__pycache__/doc_chunker.cpython-311.pyc +0 -0
- scripts/core/ingestion/__pycache__/hierarchical_chunker.cpython-311.pyc +0 -0
- scripts/core/ingestion/__pycache__/ingest.cpython-311.pyc +0 -0
- scripts/core/ingestion/__pycache__/repo_metadata.cpython-311.pyc +0 -0
- scripts/core/ingestion/__pycache__/ts_chunker.cpython-311.pyc +0 -0
- scripts/core/ingestion/ast_chunker.py +390 -0
- scripts/core/ingestion/chunk.py +497 -0
- scripts/core/ingestion/chunk_schema.py +112 -0
- scripts/core/ingestion/doc_chunker.py +446 -0
- scripts/core/ingestion/generate_data.py +658 -0
- scripts/core/ingestion/hierarchical_chunker.py +182 -0
- scripts/core/ingestion/ingest.py +380 -0
- scripts/core/ingestion/repo_metadata.py +408 -0
- scripts/core/ingestion/ts_chunker.py +155 -0
- scripts/core/training/__init__.py +0 -0
- scripts/core/training/model.py +47 -0
- scripts/core/training/test_model.py +64 -0
- scripts/core/training/train.py +145 -0
- scripts/core/training/trainer.py +118 -0
- scripts/core/utils/__init__.py +0 -0
- scripts/core/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- scripts/core/utils/__pycache__/id_utils.cpython-311.pyc +0 -0
- scripts/core/utils/id_utils.py +91 -0
- scripts/generate_all_frameworks.py +228 -0
- scripts/run_pairs_triplets_pipeline.py +120 -0
- scripts/run_python_pipeline.py +131 -0
- scripts/run_repo_pipeline.py +289 -0
- scripts/triplets_synthesis.py +259 -0
README.md CHANGED
@@ -1,12 +1,27 @@
 ---
 title: CodeMode
-emoji:
-colorFrom:
-colorTo:
+emoji: 🚀
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: 4.19.2
 app_file: app.py
 pinned: false
+license: mit
 ---
 
-
+# CodeMode: Agentic RAG Engine
+
+This is the official demo for CodeMode, an advanced RAG engine for codebases.
+
+## Features
+- **Ingest**: Clone and index any public GitHub repository.
+- **Semantic Search**: Find relevant code using natural language.
+- **Code-to-Code**: Find similar functions using code snippets.
+- **MLOps**: Analyze embedding quality and diversity.
+
+## Local Setup
+```bash
+pip install -r requirements.txt
+python app.py
+```
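For orientation, the demo persists its index in a local ChromaDB collection (`codemode_rag` under `data/chroma_db`, see `app.py` below) and embeds text by mean-pooling and L2-normalising the fine-tuned CodeBERT checkpoint. A minimal sketch of querying that store outside the UI, assuming a repository has already been ingested through the Ingest tab:

```python
# Sketch only: assumes the Ingest tab has already populated data/chroma_db.
import chromadb
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "shubharuidas/codebert-base-code-embed-mrl-langchain-langgraph"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).eval()

def embed(texts):
    # Same recipe as app.py: mean-pooled last hidden state, L2-normalised
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        emb = model(**inputs).last_hidden_state.mean(dim=1)
    return F.normalize(emb, p=2, dim=1).tolist()

client = chromadb.PersistentClient(path="data/chroma_db")
collection = client.get_or_create_collection("codemode_rag", metadata={"hnsw:space": "cosine"})

results = collection.query(query_embeddings=embed(["how to create a state graph"]), n_results=5)
for doc, dist in zip(results["documents"][0], results["distances"][0]):
    print(f"{1 - dist:.3f}  {doc[:80]!r}")  # cosine distance -> similarity, as in the app
```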
app.py ADDED
@@ -0,0 +1,430 @@

import gradio as gr
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import sys
import os
import shutil
from pathlib import Path
import chromadb
from chromadb.config import Settings
import uuid

# --- Add scripts to path so we can import ingestion modules ---
sys.path.append(os.path.dirname(__file__))
from scripts.core.ingestion.ingest import GitCrawler
from scripts.core.ingestion.chunk import RepoChunker

# --- Configuration ---
MODEL_NAME = "shubharuidas/codebert-base-code-embed-mrl-langchain-langgraph"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DB_DIR = Path("data/chroma_db")
DB_DIR.mkdir(parents=True, exist_ok=True)

print(f"Loading model: {MODEL_NAME} on {DEVICE}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.to(DEVICE)
model.eval()
print("Model loaded!")

# --- Vector Database Setup ---
# Initialize ChromaDB Client (Persistent)
chroma_client = chromadb.PersistentClient(path=str(DB_DIR))

# Create or Get Collection
# We use cosine similarity space
collection = chroma_client.get_or_create_collection(name="codemode_rag", metadata={"hnsw:space": "cosine"})


# --- Helper Functions ---
def compute_embeddings(text_list):
    """Batch compute embeddings"""
    if not text_list:
        return None
    # Truncate to 512 tokens to avoid errors
    inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        out = model(**inputs)
    emb = out.last_hidden_state.mean(dim=1)
    return F.normalize(emb, p=2, dim=1)


def reset_db():
    """Clear database"""
    try:
        chroma_client.delete_collection("codemode_rag")
        chroma_client.get_or_create_collection(name="codemode_rag", metadata={"hnsw:space": "cosine"})
        return "Database reset (All embeddings deleted)."
    except Exception as e:
        return f"Error resetting DB: {e}"


def search_codebase(query, top_k=5):
    """Semantic Search using ChromaDB"""
    if collection.count() == 0:
        return []

    query_emb = compute_embeddings([query])
    if query_emb is None:
        return []

    # Convert tensor to list for Chroma
    query_vec = query_emb.cpu().numpy().tolist()[0]

    results = collection.query(
        query_embeddings=[query_vec],
        n_results=min(top_k, collection.count()),
        include=["metadatas", "documents", "distances"]
    )

    # Parse items
    output = []
    if results['ids']:
        for i in range(len(results['ids'][0])):
            meta = results['metadatas'][0][i]
            code = results['documents'][0][i]
            dist = results['distances'][0][i]
            score = 1 - dist  # Cosine distance to similarity

            link_icon = "[Link]" if score > 0.7 else ""
            output.append([meta.get("file_name", "unknown"), f"{score:.4f} {link_icon}", code[:300] + "..."])

    return output


def fn_ingest(repo_url):
    """
    1. Clone Repo
    2. Chunk Files
    3. Compute Embeddings (Batched)
    4. Store in ChromaDB
    """
    if not repo_url.startswith("http"):
        # yield (not return) so the status box actually shows the message
        yield "Invalid URL"
        return

    DATA_DIR = Path(os.path.abspath("data/raw_ingest"))
    import stat

    def remove_readonly(func, path, _):
        os.chmod(path, stat.S_IWRITE)
        func(path)

    try:
        # Clean up old raw data
        if DATA_DIR.exists():
            shutil.rmtree(DATA_DIR, onerror=remove_readonly)

        # 1. Clone
        yield f"Cloning {repo_url}..."
        crawler = GitCrawler(cache_dir=DATA_DIR)
        repo_path = crawler.clone_repository(repo_url)

        if not repo_path:
            yield "Failed to clone repository."
            return

        # 2. Chunk
        yield "Listing files..."
        files = crawler.list_files(repo_path, extensions={'.py', '.md', '.json', '.js', '.ts', '.java', '.cpp'})
        if isinstance(files, tuple):
            files = [f.path for f in files[0]]

        total_files = len(files)
        yield f"Found {total_files} files. Chunking..."

        chunker = RepoChunker()
        all_chunks = []

        for i, file_path in enumerate(files):
            yield f"Chunking: {i+1}/{total_files} ({file_path.name})"
            try:
                meta = {"file_name": file_path.name, "url": repo_url}
                file_chunks = chunker.chunk_file(file_path, repo_metadata=meta)
                all_chunks.extend(file_chunks)
            except Exception as e:
                print(f"Skipping {file_path}: {e}")

        if not all_chunks:
            yield "No valid chunks found."
            return

        # 3. Indexing Loop (Batched)
        total_chunks = len(all_chunks)
        yield f"Generated {total_chunks} chunks. Embedding & Indexing into ChromaDB..."

        batch_size = 64
        for i in range(0, total_chunks, batch_size):
            batch = all_chunks[i:i+batch_size]

            # Prepare data
            texts = [c.code for c in batch]
            ids = [str(uuid.uuid4()) for _ in batch]
            metadatas = [{"file_name": Path(c.file_path).name, "url": repo_url} for c in batch]

            # Compute Embeddings
            embeddings = compute_embeddings(texts)
            if embeddings is not None:
                # Add to Chroma
                collection.add(
                    ids=ids,
                    embeddings=embeddings.cpu().numpy().tolist(),
                    metadatas=metadatas,
                    documents=texts
                )

            progress = int((i / total_chunks) * 100)
            yield f"Indexed {min(i+batch_size, total_chunks)}/{total_chunks} ({progress}%)"

        count = collection.count()
        yield f"Success! Database now has {count} code chunks. Ready for search."

    except Exception as e:
        import traceback
        traceback.print_exc()
        yield f"Error: {str(e)}"


# --- Analysis Functions ---
def fn_analyze_embeddings():
    count = collection.count()
    if count < 5:
        return "Not enough data (Need > 5 chunks).", None

    try:
        # Fetch all embeddings (Limit to 2000 for visualization speed)
        limit = min(count, 2000)
        data = collection.get(limit=limit, include=["embeddings", "metadatas"])

        X = torch.tensor(data['embeddings'])

        # PCA
        X_mean = torch.mean(X, 0)
        X_centered = X - X_mean
        U, S, V = torch.pca_lowrank(X_centered, q=2)
        projected = torch.matmul(X_centered, V[:, :2]).numpy()

        # Diversity
        indices = torch.randint(0, len(X), (min(100, len(X)),))
        sample = X[indices]
        sim_matrix = torch.mm(sample, sample.t())
        mask = ~torch.eye(len(sample), dtype=bool)
        avg_sim = sim_matrix[mask].mean().item()
        diversity_score = 1.0 - avg_sim

        metrics = (
            f"Total Chunks: {count}\n"
            f"Analyzed: {len(X)} (Sampled)\n"
            f"Diversity Score: {diversity_score:.4f}\n"
            f"Est. Avg Similarity: {avg_sim:.4f}"
        )

        plot_df = pd.DataFrame({
            "x": projected[:, 0],
            "y": projected[:, 1],
            "topic": [m.get("file_name", "unknown") for m in data['metadatas']]
        })

        return metrics, gr.ScatterPlot(value=plot_df, x="x", y="y", color="topic", title="Semantic Space", tooltip="topic")

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Analysis Error: {e}", None


def fn_evaluate_retrieval(sample_limit):
    count = collection.count()
    if count < 10:
        # yield so the generator-based callback surfaces the message in the UI
        yield "Not enough data for evaluation (Need > 10 chunks)."
        return

    try:
        # Sample random chunks
        # Chroma doesn't support random sample easily, so we get a larger batch and pick random
        fetch_limit = min(count, 2000)  # Fetch up to 2k to sample from
        # ids are always returned by get(); "ids" is not a valid include value
        data = collection.get(limit=fetch_limit, include=["documents"])

        import random
        actual_sample_size = min(sample_limit, len(data['ids']))
        sample_indices = random.sample(range(len(data['ids'])), actual_sample_size)

        hits_at_1 = 0
        hits_at_5 = 0
        mrr_sum = 0

        # Generator for progress updates
        yield f"Running evaluation on {actual_sample_size} chunks..."

        for i, idx in enumerate(sample_indices):
            target_id = data['ids'][idx]
            code = data['documents'][idx]

            # Synthetic Query
            query = "\n".join(code.split("\n")[:3])
            query_emb = compute_embeddings([query]).cpu().numpy().tolist()[0]

            # Query DB
            results = collection.query(query_embeddings=[query_emb], n_results=10)

            # Check results
            found_ids = results['ids'][0]
            if target_id in found_ids:
                rank = found_ids.index(target_id) + 1
                mrr_sum += 1.0 / rank
                if rank == 1:
                    hits_at_1 += 1
                if rank <= 5:
                    hits_at_5 += 1

            if i % 10 == 0:
                yield f"Evaluated {i}/{actual_sample_size}..."

        recall_1 = hits_at_1 / actual_sample_size
        recall_5 = hits_at_5 / actual_sample_size
        mrr = mrr_sum / actual_sample_size

        report = (
            f"Evaluation on {actual_sample_size} random chunks:\n"
            f"--------------------------------------------\n"
            f"Recall@1: {recall_1:.4f}\n"
            f"Recall@5: {recall_5:.4f}\n"
            f"MRR: {mrr:.4f}\n"
            f"\n(Note: Using ChromaDB for retrieval)"
        )
        yield report
    except Exception as e:
        import traceback
        traceback.print_exc()
        yield f"Eval Error: {e}"


# --- UI Layout ---
theme = gr.themes.Soft(
    primary_hue="slate",
    neutral_hue="slate",
    spacing_size="sm",
    radius_size="md"
).set(
    body_background_fill="*neutral_50",
    block_background_fill="white",
    block_border_width="1px",
    block_title_text_weight="600"
)

css = """
h1 {
    text-align: center;
    font-family: 'Inter', sans-serif;
    margin-bottom: 1rem;
    color: #1e293b;
}
.gradio-container {
    max-width: 1200px !important;
    margin: auto;
}
"""

with gr.Blocks(theme=theme, css=css, title="CodeMode") as demo:
    gr.Markdown("# CodeMode")

    with gr.Tabs():
        # --- TAB 1: INGEST ---
        with gr.Tab("1. Ingest GitHub Repo"):
            gr.Markdown("### Connect a Repository")
            with gr.Row():
                repo_input = gr.Textbox(label="GitHub URL", placeholder="https://github.com/fastapi/fastapi", value="https://github.com/langchain-ai/langgraph")
                ingest_btn = gr.Button("Ingest & Index", variant="primary")

            with gr.Row():
                reset_btn = gr.Button("Reset Database", variant="stop")
                ingest_status = gr.Textbox(label="Status")

            with gr.Accordion("Database Inspector", open=False):
                list_files_btn = gr.Button("Refresh File List")
                files_df = gr.Dataframe(
                    headers=["File Name", "Chunks", "Source URL"],
                    datatype=["str", "number", "str"],
                    interactive=False
                )

            def fn_list_files():
                count = collection.count()
                if count == 0:
                    return [["Database Empty", 0, "-"]]

                try:
                    # Fetch all metadata (limit to 10k to prevent UI freeze)
                    limit = min(count, 10000)
                    data = collection.get(limit=limit, include=["metadatas"])

                    if not data or 'metadatas' not in data or data['metadatas'] is None:
                        return [["Error: No metadata found", 0, "-"]]

                    # Aggregate stats
                    file_counts = {}  # filename -> count
                    file_urls = {}    # filename -> url

                    for meta in data['metadatas']:
                        if meta is None:
                            continue  # Skip None entries
                        fname = meta.get("file_name", "unknown")
                        url = meta.get("url", "-")
                        file_counts[fname] = file_counts.get(fname, 0) + 1
                        file_urls[fname] = url

                    # Convert to list
                    output = []
                    for fname, chunk_count in file_counts.items():
                        output.append([fname, chunk_count, file_urls[fname]])

                    if not output:
                        return [["No files found in metadata", 0, "-"]]

                    # Sort by chunk count (descending)
                    output.sort(key=lambda x: x[1], reverse=True)
                    return output
                except Exception as e:
                    import traceback
                    traceback.print_exc()
                    return [[f"Error: {str(e)}", 0, "-"]]

            ingest_btn.click(fn_ingest, inputs=repo_input, outputs=[ingest_status])
            reset_btn.click(fn=reset_db, inputs=[], outputs=[ingest_status])
            list_files_btn.click(fn_list_files, inputs=[], outputs=[files_df])

        # --- TAB 2: SEARCH ---
        with gr.Tab("2. Semantic Search"):
            gr.Markdown("### Search the Ingested Code")
            with gr.Row():
                search_box = gr.Textbox(label="Search Query", placeholder="e.g., 'how to create a state graph'")
                search_btn = gr.Button("Search", variant="primary")

            results_df = gr.Dataframe(
                headers=["File Name", "Score", "Code Snippet"],
                datatype=["str", "str", "str"],
                interactive=False,
                wrap=True
            )
            search_btn.click(fn=search_codebase, inputs=search_box, outputs=results_df)

        # --- TAB 3: CODE SEARCH ---
        with gr.Tab("3. Find Similar Code"):
            gr.Markdown("### Code-to-Code Retrieval")
            with gr.Row():
                code_input = gr.Code(label="Reference Code", language="python")
                code_search_btn = gr.Button("Find Matches", variant="primary")

            code_results_df = gr.Dataframe(
                headers=["File Name", "Score", "Matched Code"],
                datatype=["str", "str", "str"],
                interactive=False,
                wrap=True
            )
            code_search_btn.click(fn=search_codebase, inputs=code_input, outputs=code_results_df)

        # --- TAB 4: MLOps MONITORING ---
        with gr.Tab("4. Deployment Monitoring"):
            gr.Markdown("### Embedding Quality Analysis")
            analyze_btn = gr.Button("Analyze Embeddings", variant="secondary")

            with gr.Row():
                quality_metrics = gr.Textbox(label="Quality Metrics")
                plot_output = gr.ScatterPlot(label="Semantic Space (PCA)")

            analyze_btn.click(fn_analyze_embeddings, inputs=[], outputs=[quality_metrics, plot_output])

            gr.Markdown("### Extrinsic Evaluation (Retrieval Performance)")
            with gr.Row():
                eval_size = gr.Slider(minimum=10, maximum=1000, value=50, step=10, label="Sample Size (Chunks)")
                eval_btn = gr.Button("Run Retrieval Evaluation", variant="primary")

            eval_output = gr.Textbox(label="Evaluation Report")

            eval_btn.click(fn_evaluate_retrieval, inputs=[eval_size], outputs=eval_output)

if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=False)
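The `fn_evaluate_retrieval` routine above builds a synthetic query from the first three lines of each sampled chunk and checks at which rank the chunk's own ID comes back. A small self-contained sketch of the same metric arithmetic (Recall@1, Recall@5, MRR), using hypothetical ranks to make the formulas concrete:

```python
# Toy illustration of the metrics computed in fn_evaluate_retrieval above.
# "ranks" are hypothetical 1-based positions of the target chunk in the
# retrieved top-10 list; None means the target was not retrieved at all.
ranks = [1, 3, None, 2, 7]

hits_at_1 = sum(1 for r in ranks if r == 1)
hits_at_5 = sum(1 for r in ranks if r is not None and r <= 5)
mrr_sum = sum(1.0 / r for r in ranks if r is not None)

n = len(ranks)
print(f"Recall@1: {hits_at_1 / n:.4f}")  # 0.2000
print(f"Recall@5: {hits_at_5 / n:.4f}")  # 0.6000
print(f"MRR:      {mrr_sum / n:.4f}")    # (1 + 1/3 + 1/2 + 1/7) / 5 ≈ 0.3952
```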
requirements.txt ADDED
@@ -0,0 +1,9 @@

gradio>=4.0.0
chromadb>=0.4.0
torch
transformers
pandas
scikit-learn
tree-sitter==0.21.3
tree-sitter-languages
gitpython
scripts/__init__.py ADDED
File without changes

scripts/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes).
scripts/aggregate_datasets.py ADDED
@@ -0,0 +1,77 @@

'''
Aggregate synthetic datasets from multiple runs into a single combined dataset generated using triplets_synthesis.py.
'''

import json
from pathlib import Path
from datetime import datetime
from typing import List, Dict

BASE_SYNTHETIC_DIR = Path("data/synthetic")
OUTPUT_DIR = BASE_SYNTHETIC_DIR / "combined"


def load_jsonl(path: Path) -> List[Dict]:
    with path.open("r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


def save_jsonl(path: Path, records: List[Dict]):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


def save_json(path: Path, records: List[Dict]):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)


def aggregate():
    positive_pairs_all = []
    triplets_all = []
    included_runs = []

    for run_dir in BASE_SYNTHETIC_DIR.iterdir():
        if not run_dir.is_dir():
            continue
        if run_dir.name == "combined":
            continue

        pos_path = run_dir / "positive_pairs.jsonl"
        tri_path = run_dir / "triplets.jsonl"

        if pos_path.exists() and tri_path.exists():
            positive_pairs_all.extend(load_jsonl(pos_path))
            triplets_all.extend(load_jsonl(tri_path))
            included_runs.append(run_dir.name)

    # Save JSONL (training)
    save_jsonl(OUTPUT_DIR / "positive_pairs.jsonl", positive_pairs_all)
    save_jsonl(OUTPUT_DIR / "triplets.jsonl", triplets_all)

    # Save JSON (inspection / upload)
    save_json(OUTPUT_DIR / "positive_pairs.json", positive_pairs_all)
    save_json(OUTPUT_DIR / "triplets.json", triplets_all)

    # Metadata
    metadata = {
        "type": "combined_dataset",
        "included_runs": included_runs,
        "total_positive_pairs": len(positive_pairs_all),
        "total_triplets": len(triplets_all),
        "created_at": datetime.utcnow().isoformat(),
    }

    with (OUTPUT_DIR / "metadata.json").open("w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print("✅ Combined dataset created at:", OUTPUT_DIR)


if __name__ == "__main__":
    aggregate()
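As a usage sketch, assuming per-run outputs from `triplets_synthesis.py` already sit under `data/synthetic/<run_name>/`, the aggregator can be run as a script or imported:

```python
# Expected layout (assumed), produced by triplets_synthesis.py:
#   data/synthetic/run_a/positive_pairs.jsonl
#   data/synthetic/run_a/triplets.jsonl
#   data/synthetic/run_b/...
# Aggregation merges every run into data/synthetic/combined/.
from scripts.aggregate_datasets import aggregate

aggregate()  # equivalent to: python scripts/aggregate_datasets.py
```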
scripts/core/README.md ADDED
@@ -0,0 +1,37 @@

# CodeMode Core Scripts 🚀

This directory contains the **modular core logic** for the CodeMode pipeline. It is designed to be cleaner and more production-ready than the experimental notebooks.

## Structure

### 1. Ingestion (`scripts/core/ingestion`)
Handles data collection and processing.
- `ingest.py`: The Git Crawler (formerly `git_crawler.py`).
- `chunk.py`: The Universal Chunker (formerly `repo_chunker.py`).
- `generate_data.py`: Creates training triplets (formerly `pairs_triplets_generator.py`).

**Usage:**
```bash
# Example: Ingest a repo
python -m scripts.core.ingestion.ingest --url https://github.com/crewAIInc/crewAI

# Example: Generate Triplets
python -m scripts.core.ingestion.generate_data --chunks data/processed/chunks.jsonl --output data/training
```

### 2. Training (`scripts/core/training`)
Handles model training and embedding generation.
- `train.py`: Main training entry point.
- `model.py`: The CodeEmbedder model architecture.
- `trainer.py`: The training loop logic.

**Usage:**
```bash
# Example: Train the model
python -m scripts.core.training.train --data_path data/training/triplets.jsonl --epochs 3
```

## Why this structure?
- **Separation of Concerns:** Training logic doesn't depend on web scraping libraries.
- **Reusability:** You can import `CodeEmbedder` or `RepoChunker` in other projects easily.
- **Production Ready:** Direct Python scripts instead of notebooks.
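To illustrate the reusability point, a minimal sketch (target path and metadata values are illustrative) that chunks a single file with `RepoChunker`, mirroring the usage shown in `chunk.py` further below:

```python
from pathlib import Path

from scripts.core.ingestion.chunk import RepoChunker

chunker = RepoChunker()  # routes .py files to the hierarchical chunker
chunks = chunker.chunk_file(
    Path("scripts/core/ingestion/ast_chunker.py"),          # illustrative target
    repo_metadata={"file_name": "ast_chunker.py", "url": "local"},  # illustrative metadata
)
for c in chunks[:5]:
    print(c.chunk_type, c.ast.name, c.span.start_line, c.span.end_line)
```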
scripts/core/__init__.py ADDED
File without changes

scripts/core/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (177 Bytes).

scripts/core/ingestion/__init__.py ADDED
File without changes

scripts/core/ingestion/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (187 Bytes).

scripts/core/ingestion/__pycache__/ast_chunker.cpython-311.pyc ADDED
Binary file (14.9 kB).

scripts/core/ingestion/__pycache__/chunk.cpython-311.pyc ADDED
Binary file (20.4 kB).

scripts/core/ingestion/__pycache__/chunk_schema.cpython-311.pyc ADDED
Binary file (4.74 kB).

scripts/core/ingestion/__pycache__/doc_chunker.cpython-311.pyc ADDED
Binary file (14.8 kB).

scripts/core/ingestion/__pycache__/hierarchical_chunker.cpython-311.pyc ADDED
Binary file (8.04 kB).

scripts/core/ingestion/__pycache__/ingest.cpython-311.pyc ADDED
Binary file (18 kB).

scripts/core/ingestion/__pycache__/repo_metadata.cpython-311.pyc ADDED
Binary file (21.7 kB).

scripts/core/ingestion/__pycache__/ts_chunker.cpython-311.pyc ADDED
Binary file (5.77 kB).
scripts/core/ingestion/ast_chunker.py ADDED
@@ -0,0 +1,390 @@

"""
AST-based semantic code chunker - Primary source of truth for code structure.

This module implements the core AST-based chunking strategy that forms the
authority layer of our hybrid chunking pipeline. It uses Python's built-in
AST parser to extract semantic chunks (modules, classes, functions, methods)
while preserving hierarchical relationships.

ARCHITECTURE POSITION:
- Authority Layer: Source of truth for semantic structure
- Primary Chunker: Generates all primary chunks
- Hierarchy Builder: Establishes parent-child relationships

KEY FEATURES:
1. AST-first parsing for semantic accuracy
2. Hierarchical chunk generation with depth tracking
3. Byte-level span calculation for precise positioning
4. Import and decorator extraction per node
5. Deterministic chunk ID generation

FLOW:
File → Python AST → ASTChunker visitor → Semantic chunks with hierarchy

USAGE:
    from ast_chunker import extract_ast_chunks
    chunks = extract_ast_chunks(Path("file.py"))
"""

import ast
from pathlib import Path
from typing import List, Optional, Union, Dict, Tuple
import hashlib

from ..utils.id_utils import deterministic_chunk_id
from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ASTSymbolType, ChunkType

DocNode = Union[
    ast.Module,
    ast.ClassDef,
    ast.FunctionDef,
    ast.AsyncFunctionDef,
]


class ASTChunker(ast.NodeVisitor):
    def __init__(self, source: str, file_path: str):
        self.source = source
        self.file_path = file_path
        self.source_bytes = source.encode('utf-8')
        self.chunks: List[CodeChunk] = []
        self.tree = ast.parse(source)

        # Track hierarchy
        self.current_class: Optional[str] = None
        self.imports_list: List[str] = []

        # For hierarchy tracking
        self.parent_stack: List[CodeChunk] = []
        self.sibling_counters: Dict[str, int] = {}

        # Attach parents to nodes
        for node in ast.walk(self.tree):
            for child in ast.iter_child_nodes(node):
                setattr(child, "parent", node)

    # ---------------- utilities ----------------

    def _get_code(self, node: ast.AST) -> str:
        code = ast.get_source_segment(self.source, node)
        return code.strip() if code else ""

    def _get_byte_span(self, start_line: int, end_line: int) -> Tuple[int, int]:
        """Convert line numbers to byte positions"""
        lines = self.source.split('\n')

        # Calculate start byte
        start_byte = sum(len(line.encode()) + 1 for line in lines[:start_line-1])

        # Calculate end byte (up to end_line)
        end_byte = sum(len(line.encode()) + 1 for line in lines[:end_line])

        return start_byte, end_byte

    def _extract_node_imports(self, node: ast.AST) -> List[str]:
        """Extract imports specific to this node (not all module imports)"""
        imports: List[str] = []

        # Walk through this node's body
        for child in ast.walk(node):
            if isinstance(child, (ast.Import, ast.ImportFrom)):
                try:
                    imports.append(ast.unparse(child))
                except Exception:
                    imports.append(str(child))
        return imports

    def _extract_decorators(self, node: ast.AST) -> List[str]:
        decorators: List[str] = []
        if hasattr(node, "decorator_list"):
            for d in node.decorator_list:  # type: ignore[attr-defined]
                try:
                    decorators.append(ast.unparse(d))
                except Exception:
                    decorators.append(str(d))
        return decorators

    # ---------------- chunk creation ----------------

    def _create_chunk(
        self,
        node: DocNode,
        chunk_type: ChunkType,
        name: str,
        parent: Optional[str] = None,
        parent_chunk: Optional[CodeChunk] = None,
    ) -> CodeChunk:
        code = self._get_code(node)

        # Get line numbers
        start_line = getattr(node, "lineno", None)
        end_line = getattr(node, "end_lineno", None)

        # Calculate byte span
        start_byte, end_byte = None, None
        if start_line and end_line:
            start_byte, end_byte = self._get_byte_span(start_line, end_line)

        # Determine parent if not provided
        if parent is None and chunk_type == "method":
            parent = self.current_class

        decorators: List[str] = []
        if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
            decorators = self._extract_decorators(node)

        # Get imports specific to this node (not all module imports)
        node_imports = self._extract_node_imports(node)

        # Get docstring only for nodes that can have one
        docstring: Optional[str] = None
        if hasattr(node, 'body'):
            docstring = ast.get_docstring(node)

        # Determine hierarchy depth
        depth = 0
        lineage: List[str] = []
        sibling_index = 0

        if parent_chunk:
            depth = parent_chunk.hierarchy.depth + 1
            lineage = parent_chunk.hierarchy.lineage.copy()
            lineage.append(parent_chunk.chunk_id)

            # Update sibling counter
            parent_key = parent_chunk.chunk_id
            self.sibling_counters[parent_key] = self.sibling_counters.get(parent_key, 0) + 1
            sibling_index = self.sibling_counters[parent_key] - 1

        ast_info = ChunkAST(
            symbol_type=chunk_type,
            name=name,
            parent=parent,
            docstring=docstring,
            decorators=decorators,
            imports=node_imports,
        )

        span = ChunkSpan(
            start_byte=start_byte,
            end_byte=end_byte,
            start_line=start_line,
            end_line=end_line,
        )

        # Generate chunk ID
        chunk_id = deterministic_chunk_id(
            file_path=self.file_path,
            chunk_type=chunk_type,
            name=name,
            parent=parent,
            start_line=start_line,
            end_line=end_line,
            code=code,
        )

        chunk = CodeChunk(
            chunk_id=chunk_id,
            file_path=self.file_path,
            language="python",
            chunk_type=chunk_type,
            code=code,
            ast=ast_info,
            span=span,
            hierarchy=ChunkHierarchy(
                parent_id=parent_chunk.chunk_id if parent_chunk else None,
                children_ids=[],
                depth=depth,
                is_primary=True,
                is_extracted=False,
                lineage=lineage,
                sibling_index=sibling_index,
            ),
        )

        # Add to parent's children if parent exists
        if parent_chunk:
            parent_chunk.hierarchy.children_ids.append(chunk_id)

        self.chunks.append(chunk)
        return chunk

    def _create_module_chunk(self) -> CodeChunk:
        """Create module chunk with all imports"""
        module_name = Path(self.file_path).stem
        start_line = 1
        end_line = len(self.source.split('\n'))
        start_byte, end_byte = self._get_byte_span(start_line, end_line)

        # Module code - entire file
        module_code = self.source

        # Extract ALL imports for module
        module_imports: List[str] = []
        for node in ast.walk(self.tree):
            if isinstance(node, (ast.Import, ast.ImportFrom)):
                try:
                    module_imports.append(ast.unparse(node))
                except Exception:
                    pass

        chunk_id = deterministic_chunk_id(
            file_path=self.file_path,
            chunk_type="module",
            name=module_name,
            parent=None,
            start_line=start_line,
            end_line=end_line,
            code=module_code,
        )

        ast_info = ChunkAST(
            symbol_type="module",
            name=module_name,
            parent=None,
            docstring=ast.get_docstring(self.tree),
            decorators=[],
            imports=module_imports,  # ALL imports in module
        )

        span = ChunkSpan(
            start_byte=start_byte,
            end_byte=end_byte,
            start_line=start_line,
            end_line=end_line,
        )

        chunk = CodeChunk(
            chunk_id=chunk_id,
            file_path=self.file_path,
            language="python",
            chunk_type="module",
            code=module_code,
            ast=ast_info,
            span=span,
            hierarchy=ChunkHierarchy(
                parent_id=None,
                children_ids=[],
                depth=0,
                is_primary=True,
                is_extracted=False,
                lineage=[],
                sibling_index=0,
            ),
        )

        self.chunks.append(chunk)
        return chunk

    # ---------------- visitors ----------------

    def visit_Import(self, node: ast.Import) -> None:
        try:
            self.imports_list.append(ast.unparse(node))
        except Exception:
            pass
        self.generic_visit(node)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        try:
            self.imports_list.append(ast.unparse(node))
        except Exception:
            pass
        self.generic_visit(node)

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        # Create class chunk
        class_chunk = self._create_chunk(
            node,
            "class",
            node.name,
            parent="module",
            parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
        )

        # Save current class context
        previous_class = self.current_class
        self.current_class = node.name

        # Push class to stack
        self.parent_stack.append(class_chunk)

        # Visit class body
        self.generic_visit(node)

        # Restore previous context
        self.current_class = previous_class
        self.parent_stack.pop()

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        parent = getattr(node, "parent", None)

        if isinstance(parent, ast.Module):
            # Top-level function
            self._create_chunk(
                node,
                "function",
                node.name,
                parent="module",
                parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
            )
        elif isinstance(parent, ast.ClassDef):
            # Method inside class
            self._create_chunk(
                node,
                "method",
                node.name,
                parent=parent.name,
                parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
            )

        self.generic_visit(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        parent = getattr(node, "parent", None)

        if isinstance(parent, ast.Module):
            # Top-level async function
            self._create_chunk(
                node,
                "function",
                node.name,
                parent="module",
                parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
            )
        elif isinstance(parent, ast.ClassDef):
            # Async method inside class
            self._create_chunk(
                node,
                "method",
                node.name,
                parent=parent.name,
                parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
            )

        self.generic_visit(node)

    def visit_Module(self, node: ast.Module) -> None:
        # Create module chunk first (root)
        module_chunk = self._create_module_chunk()

        # Push module to stack
        self.parent_stack.append(module_chunk)

        # Visit children to create classes and functions
        self.generic_visit(node)

        # Pop module from stack
        self.parent_stack.pop()


# ---------------- public API ----------------

def extract_ast_chunks(file_path: Path) -> List[CodeChunk]:
    source = file_path.read_text(encoding="utf-8")
    chunker = ASTChunker(source, str(file_path))

    # Visit the tree (creates all chunks with relationships)
    chunker.visit(chunker.tree)

    return chunker.chunks
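A small sketch of the public API above, printing the hierarchy that `extract_ast_chunks` produces (module → class → method/function, with depth tracked per chunk); the target file is illustrative:

```python
from pathlib import Path

from scripts.core.ingestion.ast_chunker import extract_ast_chunks

# Any Python file works; this path is just an example target.
chunks = extract_ast_chunks(Path("scripts/core/ingestion/chunk.py"))
for chunk in chunks:
    indent = "  " * chunk.hierarchy.depth
    print(f"{indent}{chunk.chunk_type}: {chunk.ast.name} "
          f"(lines {chunk.span.start_line}-{chunk.span.end_line})")
```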
scripts/core/ingestion/chunk.py ADDED
@@ -0,0 +1,497 @@

"""
Repository File Type Chunker - Universal chunker for all file types.

This module provides file-type-aware chunking for repositories, handling
everything from Python code to configuration files, documentation, and
special files. It's the universal interface that delegates to specialized
chunkers based on file type.

ARCHITECTURE POSITION:
- File Type Dispatcher: Routes files to appropriate chunkers
- Universal Interface: Single entry point for all file types
- Metadata Enricher: Adds repository context to all chunks

KEY FEATURES:
1. File type detection and intelligent routing
2. Hierarchical chunking for Python files
3. Documentation chunking for markdown/RST
4. Configuration file handling (JSON/YAML/TOML)
5. Special file handling (README, requirements.txt, Dockerfile)
6. Binary file detection and skipping

FILE TYPE SUPPORT:
- .py: HierarchicalChunker (AST + Tree-sitter)
- .md/.mdx/.rst: Documentation chunker
- .json/.yaml/.toml: Configuration chunker
- requirements.txt/Dockerfile: Special chunker
- .txt/.cfg/.ini: Text chunker
- README/LICENSE: Documentation chunker
- Others: Text chunker with binary detection

DATA FLOW:
File → Type detection → Route to specialized chunker →
Add repo metadata → Return chunks

USAGE:
    chunker = RepoChunker()
    chunks = chunker.chunk_file(Path("file.py"), repo_metadata)
"""

from pathlib import Path
from typing import List, Dict, Optional, cast
import json
import yaml
import re
import hashlib
from .hierarchical_chunker import HierarchicalChunker
from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType, ASTSymbolType
from .doc_chunker import chunk_document as chunk_markdown_file


class RepoChunker:
    """
    Repository chunker that handles ALL file types with proper structure
    """

    def __init__(self, use_hierarchical: bool = True):
        if use_hierarchical:
            self.hierarchical_chunker = HierarchicalChunker()
        self.use_hierarchical = use_hierarchical

    def _generate_stable_id(self, content: str, prefix: str = "stable") -> str:
        """
        Generate deterministic chunk ID using SHA256.

        IMPORTANT: This ensures IDs are stable across runs, processes,
        and Python versions - crucial for RAG reproducibility.

        Args:
            content: The text content to hash
            prefix: ID prefix (config, doc, text, etc.)

        Returns:
            Deterministic ID like "config_8a3b2c1d"
        """
        # Use SHA256 for consistency with id_utils.py
        hash_digest = hashlib.sha256(content.encode("utf-8")).hexdigest()[:8]
        return f"{prefix}_{hash_digest}"

    def chunk_file(self, file_path: Path, repo_metadata: Optional[Dict] = None) -> List[CodeChunk]:
        """
        Chunk ANY file type with repository context

        Args:
            file_path: Path to the file
            repo_metadata: Optional dict with repo metadata
        """
        suffix = file_path.suffix.lower()

        # Python files - use your advanced hierarchical chunker
        if suffix == '.py':
            return self._chunk_python_file(file_path, repo_metadata)

        # Markdown/RST documentation
        elif suffix in ['.md', '.mdx', '.rst']:
            return self._chunk_markdown_file_wrapper(file_path, repo_metadata)

        # JSON config files
        elif suffix == '.json':
            return self._chunk_json_file(file_path, repo_metadata)

        # YAML/TOML config files
        elif suffix in ['.yaml', '.yml', '.toml']:
            return self._chunk_config_file(file_path, repo_metadata)

        # Requirements/Docker files
        elif file_path.name.lower() in ['requirements.txt', 'dockerfile', 'docker-compose.yml']:
            return self._chunk_special_file(file_path, repo_metadata)

        # Text files
        elif suffix in ['.txt', '.cfg', '.ini', '.conf']:
            return self._chunk_text_file(file_path, repo_metadata)

        # README/LICENSE files
        elif file_path.name.lower() in ['readme', 'readme.md', 'license', 'license.txt', 'license.md']:
            return self._chunk_readme_file(file_path, repo_metadata)

        # All other files
        else:
            return self._chunk_other_file(file_path, repo_metadata)

    def _chunk_python_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
        """Use our hierarchical chunker for Python files"""
        try:
            if self.use_hierarchical:
                chunks = self.hierarchical_chunker.chunk_file(file_path)
            else:
                # Fallback to basic text chunking instead of hybrid
                return self._chunk_text_file(file_path, repo_metadata)

            # Add repository metadata
            if repo_metadata:
                for chunk in chunks:
                    if "repo_info" not in chunk.metadata:
                        chunk.metadata["repo_info"] = {}
                    chunk.metadata["repo_info"].update(repo_metadata)

            return chunks

        except Exception as e:
            print(f"[ERROR] Error chunking Python file {file_path}: {e}")
            return self._chunk_text_file(file_path, repo_metadata)

    def _chunk_markdown_file_wrapper(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
        """Chunk markdown files using our doc_chunker"""
        try:
            content = file_path.read_text(encoding='utf-8', errors='ignore')

            # Use your existing doc_chunker
            doc_chunks = chunk_markdown_file(
                content,
                source_name=file_path.name,
                source_url=f"file://{file_path}"
            )

            # Convert to CodeChunk schema
            code_chunks = []
            for doc_chunk in doc_chunks:
                code_chunk = CodeChunk(
                    chunk_id=doc_chunk["chunk_id"],  # Already uses SHA1 from doc_chunker.py
                    file_path=str(file_path),
                    language=doc_chunk.get("language", "markdown"),
                    chunk_type="documentation",
                    code=doc_chunk["content"],
                    ast=ChunkAST(
                        symbol_type="documentation",
                        name=file_path.name,
                        parent=None,
                        docstring=None
                    ),
                    span=ChunkSpan(
                        start_line=doc_chunk.get("metadata", {}).get("line_start", 1),
                        end_line=doc_chunk.get("metadata", {}).get("line_end", 1)
                    ),
                    metadata={
                        "doc_chunk_type": doc_chunk.get("chunk_type", "text"),
                        "repo_info": repo_metadata or {},
                        **doc_chunk.get("metadata", {})
                    },
                    hierarchy=ChunkHierarchy(
                        is_primary=True,
                        is_extracted=False,
                        depth=0
                    )
                )
                code_chunks.append(code_chunk)

            return code_chunks

        except Exception as e:
            print(f"[ERROR] Error chunking markdown file {file_path}: {e}")
            return self._chunk_text_file(file_path, repo_metadata)

    def _chunk_json_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
        """Chunk JSON config files"""
        try:
            content = file_path.read_text(encoding='utf-8', errors='ignore')
            data = json.loads(content)

            pretty_content = json.dumps(data, indent=2)

            # FIXED: Use deterministic SHA256 instead of hash()
            chunk = CodeChunk(
                chunk_id=self._generate_stable_id(pretty_content, "config"),
                file_path=str(file_path),
                language="json",
                chunk_type="configuration",
                code=pretty_content,
                ast=ChunkAST(
                    symbol_type="configuration",
                    name=file_path.name,
                    parent=None,
                    docstring=None
                ),
                span=ChunkSpan(
                    start_line=1,
                    end_line=len(pretty_content.split('\n'))
                ),
                metadata={
                    "file_type": "json_config",
                    "config_keys": list(data.keys()) if isinstance(data, dict) else [],
                    "repo_info": repo_metadata or {}
                },
                hierarchy=ChunkHierarchy(
                    is_primary=True,
                    is_extracted=False,
                    depth=0
                )
            )

            return [chunk]

        except Exception as e:
            print(f"[ERROR] Error chunking JSON file {file_path}: {e}")
            return self._chunk_text_file(file_path, repo_metadata)

    def _chunk_config_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
        """Chunk YAML/TOML config files"""
        try:
            content = file_path.read_text(encoding='utf-8', errors='ignore')
            suffix = file_path.suffix.lower()

            language = "yaml" if suffix in ['.yaml', '.yml'] else "toml"

            # FIXED: Use deterministic SHA256 instead of hash()
            chunk = CodeChunk(
                chunk_id=self._generate_stable_id(content, "config"),
                file_path=str(file_path),
                language=language,
                chunk_type="configuration",
                code=content,
                ast=ChunkAST(
                    symbol_type="configuration",
                    name=file_path.name,
                    parent=None,
                    docstring=None
                ),
                span=ChunkSpan(
                    start_line=1,
                    end_line=len(content.split('\n'))
                ),
                metadata={
                    "file_type": f"{language}_config",
                    "repo_info": repo_metadata or {}
                },
                hierarchy=ChunkHierarchy(
                    is_primary=True,
                    is_extracted=False,
                    depth=0
                )
            )

            return [chunk]

        except Exception as e:
            print(f"[ERROR] Error chunking config file {file_path}: {e}")
|
| 276 |
+
return self._chunk_text_file(file_path, repo_metadata)
|
| 277 |
+
|
| 278 |
+
def _chunk_special_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
|
| 279 |
+
"""Chunk special files (requirements.txt, Dockerfile, etc.)"""
|
| 280 |
+
try:
|
| 281 |
+
content = file_path.read_text(encoding='utf-8', errors='ignore')
|
| 282 |
+
file_name = file_path.name.lower()
|
| 283 |
+
|
| 284 |
+
if 'requirements' in file_name:
|
| 285 |
+
language = "requirements"
|
| 286 |
+
chunk_type = "configuration"
|
| 287 |
+
prefix = "config"
|
| 288 |
+
elif 'docker' in file_name:
|
| 289 |
+
language = "dockerfile"
|
| 290 |
+
chunk_type = "script"
|
| 291 |
+
prefix = "script"
|
| 292 |
+
else:
|
| 293 |
+
language = "text"
|
| 294 |
+
chunk_type = "text"
|
| 295 |
+
prefix = "text"
|
| 296 |
+
|
| 297 |
+
# FIXED: Use deterministic SHA256 instead of hash()
|
| 298 |
+
chunk = CodeChunk(
|
| 299 |
+
chunk_id=self._generate_stable_id(content, prefix),
|
| 300 |
+
file_path=str(file_path),
|
| 301 |
+
language=language,
|
| 302 |
+
chunk_type=chunk_type,
|
| 303 |
+
code=content,
|
| 304 |
+
ast=ChunkAST(
|
| 305 |
+
symbol_type=chunk_type,
|
| 306 |
+
name=file_path.name,
|
| 307 |
+
parent=None,
|
| 308 |
+
docstring=None
|
| 309 |
+
),
|
| 310 |
+
span=ChunkSpan(
|
| 311 |
+
start_line=1,
|
| 312 |
+
end_line=len(content.split('\n'))
|
| 313 |
+
),
|
| 314 |
+
metadata={
|
| 315 |
+
"file_type": file_name,
|
| 316 |
+
"repo_info": repo_metadata or {},
|
| 317 |
+
"dependencies": self._extract_dependencies(content) if "requirements" in file_name else []
|
| 318 |
+
},
|
| 319 |
+
hierarchy=ChunkHierarchy(
|
| 320 |
+
is_primary=True,
|
| 321 |
+
is_extracted=False,
|
| 322 |
+
depth=0
|
| 323 |
+
)
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
return [chunk]
|
| 327 |
+
|
| 328 |
+
except Exception as e:
|
| 329 |
+
print(f"[ERROR] Error chunking special file {file_path}: {e}")
|
| 330 |
+
return self._chunk_text_file(file_path, repo_metadata)
|
| 331 |
+
|
| 332 |
+
def _chunk_text_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
|
| 333 |
+
"""Chunk plain text files"""
|
| 334 |
+
try:
|
| 335 |
+
content = file_path.read_text(encoding='utf-8', errors='ignore')
|
| 336 |
+
|
| 337 |
+
# Create a single chunk for small files, multiple for large ones
|
| 338 |
+
if len(content.split('\n')) <= 200:
|
| 339 |
+
chunks = [self._create_text_chunk(content, file_path, repo_metadata)]
|
| 340 |
+
else:
|
| 341 |
+
# Split large text files into reasonable chunks
|
| 342 |
+
chunks = []
|
| 343 |
+
lines = content.split('\n')
|
| 344 |
+
chunk_size = 100
|
| 345 |
+
|
| 346 |
+
for i in range(0, len(lines), chunk_size):
|
| 347 |
+
chunk_lines = lines[i:i + chunk_size]
|
| 348 |
+
chunk_content = '\n'.join(chunk_lines)
|
| 349 |
+
|
| 350 |
+
chunk = self._create_text_chunk(
|
| 351 |
+
chunk_content,
|
| 352 |
+
file_path,
|
| 353 |
+
repo_metadata,
|
| 354 |
+
chunk_index=i // chunk_size
|
| 355 |
+
)
|
| 356 |
+
chunks.append(chunk)
|
| 357 |
+
|
| 358 |
+
return chunks
|
| 359 |
+
|
| 360 |
+
except Exception as e:
|
| 361 |
+
print(f"[ERROR] Error reading text file {file_path}: {e}")
|
| 362 |
+
return []
|
| 363 |
+
|
| 364 |
+
def _chunk_readme_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
|
| 365 |
+
"""Special handling for README/LICENSE files"""
|
| 366 |
+
try:
|
| 367 |
+
content = file_path.read_text(encoding='utf-8', errors='ignore')
|
| 368 |
+
file_name_lower = file_path.name.lower()
|
| 369 |
+
|
| 370 |
+
# Determine appropriate prefix
|
| 371 |
+
if 'readme' in file_name_lower:
|
| 372 |
+
prefix = "doc"
|
| 373 |
+
elif 'license' in file_name_lower:
|
| 374 |
+
prefix = "license"
|
| 375 |
+
else:
|
| 376 |
+
prefix = "doc"
|
| 377 |
+
|
| 378 |
+
# FIXED: Use deterministic SHA256 instead of hash()
|
| 379 |
+
chunk = CodeChunk(
|
| 380 |
+
chunk_id=self._generate_stable_id(content, prefix),
|
| 381 |
+
file_path=str(file_path),
|
| 382 |
+
language="markdown" if file_path.suffix in ['.md', '.mdx'] else "text",
|
| 383 |
+
chunk_type="documentation",
|
| 384 |
+
code=content,
|
| 385 |
+
ast=ChunkAST(
|
| 386 |
+
symbol_type="documentation",
|
| 387 |
+
name=file_path.name,
|
| 388 |
+
parent=None,
|
| 389 |
+
docstring=None
|
| 390 |
+
),
|
| 391 |
+
span=ChunkSpan(
|
| 392 |
+
start_line=1,
|
| 393 |
+
end_line=len(content.split('\n'))
|
| 394 |
+
),
|
| 395 |
+
metadata={
|
| 396 |
+
"file_type": "readme_license",
|
| 397 |
+
"is_readme": "readme" in file_name_lower,
|
| 398 |
+
"is_license": "license" in file_name_lower,
|
| 399 |
+
"repo_info": repo_metadata or {}
|
| 400 |
+
},
|
| 401 |
+
hierarchy=ChunkHierarchy(
|
| 402 |
+
is_primary=True,
|
| 403 |
+
is_extracted=False,
|
| 404 |
+
depth=0
|
| 405 |
+
)
|
| 406 |
+
)
|
| 407 |
+
|
| 408 |
+
return [chunk]
|
| 409 |
+
|
| 410 |
+
except Exception as e:
|
| 411 |
+
print(f"[ERROR] Error chunking README file {file_path}: {e}")
|
| 412 |
+
return self._chunk_text_file(file_path, repo_metadata)
|
| 413 |
+
|
| 414 |
+
def _chunk_other_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
|
| 415 |
+
"""Fallback for unknown file types (binary or unsupported)"""
|
| 416 |
+
try:
|
| 417 |
+
# Try to read as text first
|
| 418 |
+
content = file_path.read_text(encoding='utf-8', errors='ignore')
|
| 419 |
+
|
| 420 |
+
# If it looks like binary (mostly non-printable characters)
|
| 421 |
+
if self._looks_like_binary(content):
|
| 422 |
+
print(f"[SKIPPED] Skipping binary file: {file_path}")
|
| 423 |
+
return []
|
| 424 |
+
|
| 425 |
+
# If readable text, treat as text file
|
| 426 |
+
return self._chunk_text_file(file_path, repo_metadata)
|
| 427 |
+
|
| 428 |
+
except UnicodeDecodeError:
|
| 429 |
+
print(f"[SKIPPED] Skipping binary file: {file_path}")
|
| 430 |
+
return []
|
| 431 |
+
except Exception as e:
|
| 432 |
+
print(f"[ERROR] Error with file {file_path}: {e}")
|
| 433 |
+
return []
|
| 434 |
+
|
| 435 |
+
def _create_text_chunk(self, content: str, file_path: Path,
|
| 436 |
+
repo_metadata: Optional[Dict], chunk_index: int = 0) -> CodeChunk:
|
| 437 |
+
"""Helper to create a text chunk"""
|
| 438 |
+
lines = content.split('\n')
|
| 439 |
+
|
| 440 |
+
# ENHANCED: Use deterministic ID that includes chunk_index for uniqueness
|
| 441 |
+
id_payload = f"{content}_{chunk_index}"
|
| 442 |
+
|
| 443 |
+
return CodeChunk(
|
| 444 |
+
chunk_id=self._generate_stable_id(id_payload, "text"),
|
| 445 |
+
file_path=str(file_path),
|
| 446 |
+
language="text",
|
| 447 |
+
chunk_type="text",
|
| 448 |
+
code=content,
|
| 449 |
+
ast=ChunkAST(
|
| 450 |
+
symbol_type="text",
|
| 451 |
+
name=file_path.name,
|
| 452 |
+
parent=None,
|
| 453 |
+
docstring=None
|
| 454 |
+
),
|
| 455 |
+
span=ChunkSpan(
|
| 456 |
+
start_line=1,
|
| 457 |
+
end_line=len(lines)
|
| 458 |
+
),
|
| 459 |
+
metadata={
|
| 460 |
+
"file_type": "text",
|
| 461 |
+
"chunk_index": chunk_index,
|
| 462 |
+
"total_lines": len(lines),
|
| 463 |
+
"repo_info": repo_metadata or {}
|
| 464 |
+
},
|
| 465 |
+
hierarchy=ChunkHierarchy(
|
| 466 |
+
is_primary=True,
|
| 467 |
+
is_extracted=False,
|
| 468 |
+
depth=0
|
| 469 |
+
)
|
| 470 |
+
)
|
| 471 |
+
|
| 472 |
+
def _extract_dependencies(self, requirements_content: str) -> List[str]:
|
| 473 |
+
"""Extract package names from requirements.txt"""
|
| 474 |
+
dependencies = []
|
| 475 |
+
for line in requirements_content.split('\n'):
|
| 476 |
+
line = line.strip()
|
| 477 |
+
if line and not line.startswith('#'):
|
| 478 |
+
# Extract package name (before version specifiers)
|
| 479 |
+
package = line.split('==')[0].split('>=')[0].split('<=')[0].strip()
|
| 480 |
+
if package:
|
| 481 |
+
dependencies.append(package)
|
| 482 |
+
return dependencies
|
| 483 |
+
|
| 484 |
+
def _looks_like_binary(self, content: str, threshold: float = 0.3) -> bool:
|
| 485 |
+
"""Check if content looks like binary data"""
|
| 486 |
+
if not content:
|
| 487 |
+
return False
|
| 488 |
+
|
| 489 |
+
# Count printable vs non-printable characters
|
| 490 |
+
printable = sum(1 for c in content if 32 <= ord(c) <= 126 or c in '\n\r\t')
|
| 491 |
+
total = len(content)
|
| 492 |
+
|
| 493 |
+
if total == 0:
|
| 494 |
+
return False
|
| 495 |
+
|
| 496 |
+
ratio = printable / total
|
| 497 |
+
return ratio < threshold
|
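The chunker above routes each file by suffix to a format-specific handler and falls back to plain-text chunking when a specialised parser fails. A minimal standalone sketch of that dispatch pattern follows; `chunk_repository` and the handler functions are illustrative placeholders, not this module's actual API:

```python
# Sketch of suffix-based dispatch over a repository tree.
# NOTE: chunk_repository and the handlers are illustrative placeholders,
# not the actual API of chunk.py.
from pathlib import Path
from typing import Callable, Dict, List

def _python_handler(path: Path) -> List[str]:
    return [f"python chunk from {path.name}"]

def _markdown_handler(path: Path) -> List[str]:
    return [f"doc chunk from {path.name}"]

def _fallback_handler(path: Path) -> List[str]:
    return [f"text chunk from {path.name}"]

SUFFIX_HANDLERS: Dict[str, Callable[[Path], List[str]]] = {
    ".py": _python_handler,
    ".md": _markdown_handler,
    ".mdx": _markdown_handler,
    ".rst": _markdown_handler,
}

def chunk_repository(root: str) -> List[str]:
    """Walk a repo and route every file to the handler for its suffix."""
    chunks: List[str] = []
    for path in Path(root).rglob("*"):
        if not path.is_file():
            continue
        handler = SUFFIX_HANDLERS.get(path.suffix.lower(), _fallback_handler)
        chunks.extend(handler(path))
    return chunks
```

Keeping the suffix-to-handler mapping in one table makes it easy to add new formats without touching the traversal loop.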
scripts/core/ingestion/chunk_schema.py
ADDED
|
@@ -0,0 +1,112 @@
|
| 1 |
+
"""
|
| 2 |
+
chunk_schema.py - UPDATED with enhanced hierarchy
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Dict, List, Optional, Literal, Union
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# ✅ EXPANDED ChunkType to support ALL file types
|
| 10 |
+
ChunkType = Literal[
|
| 11 |
+
"module", # Python module
|
| 12 |
+
"class", # Python class
|
| 13 |
+
"function", # Python function
|
| 14 |
+
"method", # Python method
|
| 15 |
+
"context", # General context
|
| 16 |
+
"documentation", # Markdown/RST docs
|
| 17 |
+
"configuration", # Config files (JSON, YAML, TOML)
|
| 18 |
+
"notebook", # Jupyter notebook
|
| 19 |
+
"script", # Shell scripts
|
| 20 |
+
"dockerfile", # Docker files
|
| 21 |
+
"typescript", # TypeScript files
|
| 22 |
+
"javascript", # JavaScript files
|
| 23 |
+
"text", # Plain text
|
| 24 |
+
"imports", # Import statements
|
| 25 |
+
"unknown" # Unknown file type
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
# For AST symbol types
|
| 29 |
+
ASTSymbolType = Literal[
|
| 30 |
+
"module", "class", "function", "method", "context",
|
| 31 |
+
"documentation", "configuration", "notebook", "script",
|
| 32 |
+
"dockerfile", "typescript", "javascript", "text",
|
| 33 |
+
"imports",
|
| 34 |
+
"unknown"
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# @dataclass
|
| 39 |
+
# class ChunkHierarchy:
|
| 40 |
+
# """Enhanced hierarchical relationship metadata"""
|
| 41 |
+
# parent_id: Optional[str] = None
|
| 42 |
+
# children_ids: List[str] = field(default_factory=list)
|
| 43 |
+
# depth: int = 0
|
| 44 |
+
# is_primary: bool = True
|
| 45 |
+
# is_extracted: bool = False
|
| 46 |
+
# lineage: List[str] = field(default_factory=list) # Path from root
|
| 47 |
+
# sibling_index: int = 0 # Position among siblings
|
| 48 |
+
|
| 49 |
+
@dataclass
|
| 50 |
+
class ChunkHierarchy:
|
| 51 |
+
"""Enhanced hierarchical relationship metadata"""
|
| 52 |
+
parent_id: Optional[str] = None
|
| 53 |
+
children_ids: List[str] = field(default_factory=list)
|
| 54 |
+
depth: int = 0
|
| 55 |
+
is_primary: bool = True
|
| 56 |
+
is_extracted: bool = False
|
| 57 |
+
lineage: List[str] = field(default_factory=list) # Path from root
|
| 58 |
+
sibling_index: int = 0 # Position among siblings
|
| 59 |
+
|
| 60 |
+
# Optional: Add methods for type-safe operations
|
| 61 |
+
def add_child(self, child_id: str) -> None:
|
| 62 |
+
"""Type-safe method to add child"""
|
| 63 |
+
if child_id not in self.children_ids:
|
| 64 |
+
self.children_ids.append(child_id)
|
| 65 |
+
|
| 66 |
+
def remove_child(self, child_id: str) -> None:
|
| 67 |
+
"""Type-safe method to remove child"""
|
| 68 |
+
if child_id in self.children_ids:
|
| 69 |
+
self.children_ids.remove(child_id)
|
| 70 |
+
|
| 71 |
+
def set_parent(self, parent_id: Optional[str]) -> None:
|
| 72 |
+
"""Type-safe method to set parent"""
|
| 73 |
+
self.parent_id = parent_id
|
| 74 |
+
|
| 75 |
+
def increment_depth(self) -> None:
|
| 76 |
+
"""Increment depth by 1"""
|
| 77 |
+
self.depth += 1
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@dataclass
|
| 81 |
+
class ChunkAST:
|
| 82 |
+
symbol_type: Optional[ASTSymbolType] = None
|
| 83 |
+
name: Optional[str] = None
|
| 84 |
+
parent: Optional[str] = None
|
| 85 |
+
docstring: Optional[str] = None
|
| 86 |
+
decorators: List[str] = field(default_factory=list)
|
| 87 |
+
imports: List[str] = field(default_factory=list)
|
| 88 |
+
node_type: Optional[str] = None # Original AST node type
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@dataclass
|
| 92 |
+
class ChunkSpan:
|
| 93 |
+
start_byte: Optional[int] = None
|
| 94 |
+
end_byte: Optional[int] = None
|
| 95 |
+
start_line: Optional[int] = None
|
| 96 |
+
end_line: Optional[int] = None
|
| 97 |
+
char_count: Optional[int] = None # Character count for quick reference
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@dataclass
|
| 102 |
+
class CodeChunk:
|
| 103 |
+
chunk_id: str
|
| 104 |
+
file_path: str
|
| 105 |
+
language: str
|
| 106 |
+
chunk_type: ChunkType # ✅ Now accepts ALL types
|
| 107 |
+
code: str
|
| 108 |
+
ast: ChunkAST
|
| 109 |
+
span: ChunkSpan
|
| 110 |
+
metadata: Dict = field(default_factory=dict)
|
| 111 |
+
hierarchy: ChunkHierarchy = field(default_factory=ChunkHierarchy)
|
| 112 |
+
|
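For reference, the schema dataclasses above can be assembled by hand as in the sketch below; the import path is an assumption and depends on how the package sits on `sys.path`, and all IDs and paths are made up:

```python
# Assembling a CodeChunk by hand from the schema above.
# NOTE: the import path is an assumption; adjust it to your package layout.
from scripts.core.ingestion.chunk_schema import (
    ChunkAST,
    ChunkHierarchy,
    ChunkSpan,
    CodeChunk,
)

snippet = "def greet(name):\n    return f'hello {name}'"

chunk = CodeChunk(
    chunk_id="func_deadbeef",          # illustrative ID
    file_path="example/greet.py",      # illustrative path
    language="python",
    chunk_type="function",
    code=snippet,
    ast=ChunkAST(symbol_type="function", name="greet"),
    span=ChunkSpan(start_line=1, end_line=2, char_count=len(snippet)),
    metadata={"repo_info": {}},
    hierarchy=ChunkHierarchy(is_primary=True, depth=0),
)

# The hierarchy helpers keep parent/child links consistent.
chunk.hierarchy.add_child("method_cafebabe")
print(chunk.chunk_type, chunk.hierarchy.children_ids)
```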
scripts/core/ingestion/doc_chunker.py
ADDED
|
@@ -0,0 +1,446 @@
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import hashlib
|
| 4 |
+
import re
|
| 5 |
+
from typing import List, Dict, Optional
|
| 6 |
+
from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy
|
| 7 |
+
|
| 8 |
+
def _hash_id(text: str, prefix: str) -> str:
|
| 9 |
+
"""
|
| 10 |
+
Generate deterministic ID using SHA256 (standardized).
|
| 11 |
+
|
| 12 |
+
Previously used SHA1, now standardized to SHA256 for consistency
|
| 13 |
+
with repo_chunker.py and id_utils.py.
|
| 14 |
+
"""
|
| 15 |
+
# CHANGED: sha1 → sha256
|
| 16 |
+
h = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
|
| 17 |
+
return f"{prefix}_{h}"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _is_actual_code(text: str) -> bool:
|
| 21 |
+
"""
|
| 22 |
+
Check if text inside a fenced block is actual executable code
|
| 23 |
+
or just formatted text.
|
| 24 |
+
"""
|
| 25 |
+
text = text.strip()
|
| 26 |
+
|
| 27 |
+
# Common patterns that indicate formatted text, not code
|
| 28 |
+
formatted_text_patterns = [
|
| 29 |
+
# Lines with many = or - characters (dividers)
|
| 30 |
+
r'^=+\s*[A-Za-z\s]+\s*=+$',
|
| 31 |
+
r'^-+\s*[A-Za-z\s]+\s*-+$',
|
| 32 |
+
# Lines that look like headers/separators
|
| 33 |
+
r'^[=_-]{20,}$',
|
| 34 |
+
# Contains natural language sentences
|
| 35 |
+
r'\b(the|and|that|this|with|for|are|is|was|were|have|has|had)\b',
|
| 36 |
+
r'[.!?]\s+[A-Z]', # Sentence boundaries
|
| 37 |
+
# Message-like patterns
|
| 38 |
+
r'^\s*(Human|AI|Tool|System|User|Assistant)\s+(Message|Response|Input|Output)?\s*[:=-]',
|
| 39 |
+
r'^\s*[A-Z][a-z]+\s*:', # "Reasoning:", "Acting:", etc.
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
# Check if it looks like formatted text
|
| 43 |
+
lines = text.split('\n')
|
| 44 |
+
formatted_line_count = 0
|
| 45 |
+
code_line_count = 0
|
| 46 |
+
|
| 47 |
+
# Patterns that indicate actual code
|
| 48 |
+
code_patterns = [
|
| 49 |
+
r'^\s*(def|class|import|from|async|await|return|if|for|while|try|except|with)\b',
|
| 50 |
+
r'^\s*@\w+',
|
| 51 |
+
r'^\s*\w+\s*=\s*.+',
|
| 52 |
+
r'^\s*\w+\(.+\)',
|
| 53 |
+
r'^\s*print\(.+\)',
|
| 54 |
+
r'^\s*\{.*\}', # JSON/dict
|
| 55 |
+
r'^\s*\[.*\]', # List
|
| 56 |
+
]
|
| 57 |
+
|
| 58 |
+
for line in lines:
|
| 59 |
+
line = line.strip()
|
| 60 |
+
if not line:
|
| 61 |
+
continue
|
| 62 |
+
|
| 63 |
+
# Check for formatted text patterns
|
| 64 |
+
is_formatted = any(re.search(pattern, line, re.IGNORECASE) for pattern in formatted_text_patterns)
|
| 65 |
+
|
| 66 |
+
# Check for code patterns
|
| 67 |
+
is_code = any(re.search(pattern, line) for pattern in code_patterns)
|
| 68 |
+
|
| 69 |
+
if is_formatted:
|
| 70 |
+
formatted_line_count += 1
|
| 71 |
+
if is_code:
|
| 72 |
+
code_line_count += 1
|
| 73 |
+
|
| 74 |
+
# If it has many formatted text lines and few/no code lines, it's not actual code
|
| 75 |
+
if formatted_line_count > 1 and code_line_count == 0:
|
| 76 |
+
return False
|
| 77 |
+
|
| 78 |
+
# Default to treating fenced blocks as code (original behavior)
|
| 79 |
+
return True
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _looks_like_code_block(lines: List[str]) -> bool:
|
| 83 |
+
"""
|
| 84 |
+
Heuristic to recover code blocks when Markdown fences are missing
|
| 85 |
+
(common after HTML → MD conversion).
|
| 86 |
+
"""
|
| 87 |
+
if not lines:
|
| 88 |
+
return False
|
| 89 |
+
|
| 90 |
+
# Join lines and check for minimum length
|
| 91 |
+
joined = "\n".join(lines)
|
| 92 |
+
text = joined.strip()
|
| 93 |
+
|
| 94 |
+
# Too short? Probably not code
|
| 95 |
+
if len(text) < 50:
|
| 96 |
+
return False
|
| 97 |
+
|
| 98 |
+
# Check for code patterns
|
| 99 |
+
code_patterns = [
|
| 100 |
+
# Python keywords at line start
|
| 101 |
+
r'^\s*(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|from\s+\w+\s+import)',
|
| 102 |
+
# Function calls or assignments
|
| 103 |
+
r'^\s*\w+\s*=\s*.+|^\s*\w+\s*\(.+\)',
|
| 104 |
+
# Control structures
|
| 105 |
+
r'^\s*(if|for|while|with|try|except|finally|async|await)\s+',
|
| 106 |
+
# Decorators
|
| 107 |
+
r'^\s*@\w+',
|
| 108 |
+
# Return statements
|
| 109 |
+
r'^\s*return\b',
|
| 110 |
+
# Print statements
|
| 111 |
+
r'^\s*print\(',
|
| 112 |
+
# Indented blocks (common in Python)
|
| 113 |
+
r'^\s{4,}\S',
|
| 114 |
+
]
|
| 115 |
+
|
| 116 |
+
# Check for prose indicators (if these are present, it's likely text)
|
| 117 |
+
prose_indicators = [
|
| 118 |
+
# Common English words in prose
|
| 119 |
+
r'\b(the|and|that|this|with|for|are|is|was|were|have|has|had)\b',
|
| 120 |
+
# Sentence endings followed by capital
|
| 121 |
+
r'[.!?]\s+[A-Z]',
|
| 122 |
+
# Articles
|
| 123 |
+
r'\b(a|an|the)\s+\w+',
|
| 124 |
+
]
|
| 125 |
+
|
| 126 |
+
lines_list = text.split('\n')
|
| 127 |
+
code_line_count = 0
|
| 128 |
+
prose_line_count = 0
|
| 129 |
+
|
| 130 |
+
for line in lines_list:
|
| 131 |
+
line = line.strip()
|
| 132 |
+
if not line:
|
| 133 |
+
continue
|
| 134 |
+
|
| 135 |
+
# Check if line looks like code
|
| 136 |
+
is_code = any(re.search(pattern, line) for pattern in code_patterns)
|
| 137 |
+
|
| 138 |
+
# Check if line looks like prose (but only if it's not empty/short)
|
| 139 |
+
is_prose = len(line) > 20 and any(re.search(pattern, line, re.IGNORECASE) for pattern in prose_indicators)
|
| 140 |
+
|
| 141 |
+
if is_code:
|
| 142 |
+
code_line_count += 1
|
| 143 |
+
if is_prose:
|
| 144 |
+
prose_line_count += 1
|
| 145 |
+
|
| 146 |
+
# Need strong evidence for code
|
| 147 |
+
total_non_empty_lines = len([l for l in lines_list if l.strip()])
|
| 148 |
+
|
| 149 |
+
# If more than 2 lines look like code and not many look like prose
|
| 150 |
+
if code_line_count >= 2 and prose_line_count <= code_line_count // 2:
|
| 151 |
+
return True
|
| 152 |
+
|
| 153 |
+
# Special case: single strong code line in short text
|
| 154 |
+
if total_non_empty_lines <= 3 and code_line_count >= 1 and prose_line_count == 0:
|
| 155 |
+
return True
|
| 156 |
+
|
| 157 |
+
# Check for specific code-only patterns
|
| 158 |
+
code_only_patterns = [
|
| 159 |
+
r'^\s*from langchain\.',
|
| 160 |
+
r'^\s*import langchain',
|
| 161 |
+
r'^\s*@tool\b', # Decorator
|
| 162 |
+
r'^\s*agent = create_agent\(',
|
| 163 |
+
r'^\s*result = agent\.invoke\(',
|
| 164 |
+
]
|
| 165 |
+
|
| 166 |
+
if any(re.search(pattern, text) for pattern in code_only_patterns):
|
| 167 |
+
return True
|
| 168 |
+
|
| 169 |
+
return False
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def _looks_like_executable_code(text: str) -> bool:
|
| 173 |
+
"""Check if code looks like it could be executed"""
|
| 174 |
+
# First check if it's actually code (not formatted text)
|
| 175 |
+
if not _is_actual_code(text):
|
| 176 |
+
return False
|
| 177 |
+
|
| 178 |
+
# Check for actual Python syntax patterns
|
| 179 |
+
patterns = [
|
| 180 |
+
r'\bdef\s+\w+\s*\([^)]*\)\s*:',
|
| 181 |
+
r'\bclass\s+\w+\s*\(?[^:]*\)?\s*:',
|
| 182 |
+
r'^\s*from\s+\w+\s+import\s+\w+',
|
| 183 |
+
r'^\s*import\s+\w+',
|
| 184 |
+
r'\breturn\b',
|
| 185 |
+
r'\bprint\(',
|
| 186 |
+
r'^\s*\w+\s*=\s*[^=\n]+$', # Variable assignment
|
| 187 |
+
]
|
| 188 |
+
|
| 189 |
+
lines = text.split('\n')
|
| 190 |
+
executable_lines = 0
|
| 191 |
+
|
| 192 |
+
for line in lines:
|
| 193 |
+
line = line.strip()
|
| 194 |
+
if not line or line.startswith('#') or line.startswith('"""'):
|
| 195 |
+
continue
|
| 196 |
+
if any(re.search(pattern, line) for pattern in patterns):
|
| 197 |
+
executable_lines += 1
|
| 198 |
+
|
| 199 |
+
# Need at least 2 executable lines or 1 strong executable line
|
| 200 |
+
return executable_lines >= 2 or (
|
| 201 |
+
executable_lines >= 1 and len([l for l in lines if l.strip()]) <= 3
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def chunk_document(
|
| 206 |
+
raw_text: str,
|
| 207 |
+
source_name: str,
|
| 208 |
+
source_url: Optional[str] = None,
|
| 209 |
+
) -> List[Dict]:
|
| 210 |
+
"""
|
| 211 |
+
Chunk documentation text containing headings, prose, and code examples.
|
| 212 |
+
|
| 213 |
+
Design goals:
|
| 214 |
+
- Preserve document hierarchy
|
| 215 |
+
- Separate prose vs code
|
| 216 |
+
- Recover code even if Markdown fences are lost
|
| 217 |
+
- Deterministic chunk IDs
|
| 218 |
+
"""
|
| 219 |
+
|
| 220 |
+
chunks: List[Dict] = []
|
| 221 |
+
|
| 222 |
+
heading_stack: List[str] = []
|
| 223 |
+
current_heading: Optional[str] = None
|
| 224 |
+
current_heading_level: Optional[int] = None
|
| 225 |
+
|
| 226 |
+
buffer: List[str] = []
|
| 227 |
+
|
| 228 |
+
code_block = False
|
| 229 |
+
code_language: Optional[str] = None
|
| 230 |
+
code_lines: List[str] = []
|
| 231 |
+
|
| 232 |
+
lines = raw_text.splitlines()
|
| 233 |
+
chunk_index = 0
|
| 234 |
+
line_cursor = 0
|
| 235 |
+
|
| 236 |
+
def heading_path() -> Optional[str]:
|
| 237 |
+
return " > ".join(heading_stack) if heading_stack else None
|
| 238 |
+
|
| 239 |
+
def flush_text(start_line: int, end_line: int):
|
| 240 |
+
nonlocal buffer, chunk_index
|
| 241 |
+
if not buffer:
|
| 242 |
+
return
|
| 243 |
+
|
| 244 |
+
text = "\n".join(buffer).strip()
|
| 245 |
+
buffer = []
|
| 246 |
+
|
| 247 |
+
if not text:
|
| 248 |
+
return
|
| 249 |
+
|
| 250 |
+
lines_local = text.splitlines()
|
| 251 |
+
|
| 252 |
+
# 🔹 Recover unfenced code blocks - use stricter heuristic
|
| 253 |
+
# Only mark as code if it's very clearly code
|
| 254 |
+
if _looks_like_code_block(lines_local) and len(text) > 30:
|
| 255 |
+
# Double-check: make sure it doesn't look like prose
|
| 256 |
+
looks_like_prose = any(re.search(rf"\b{word}\b", text, re.IGNORECASE)
|
| 257 |
+
for word in ['the', 'and', 'that', 'this', 'with', 'for', 'are', 'is', 'was'])
|
| 258 |
+
|
| 259 |
+
if not looks_like_prose:
|
| 260 |
+
chunks.append(
|
| 261 |
+
{
|
| 262 |
+
"chunk_id": _hash_id(text, "doc_code"),
|
| 263 |
+
"source": "documentation",
|
| 264 |
+
"source_name": source_name,
|
| 265 |
+
"source_url": source_url,
|
| 266 |
+
"language": "python",
|
| 267 |
+
"chunk_type": "code",
|
| 268 |
+
"content": text,
|
| 269 |
+
"chunk_index": chunk_index,
|
| 270 |
+
"metadata": {
|
| 271 |
+
"heading": current_heading,
|
| 272 |
+
"heading_level": current_heading_level,
|
| 273 |
+
"heading_path": heading_path(),
|
| 274 |
+
"line_start": start_line,
|
| 275 |
+
"line_end": end_line,
|
| 276 |
+
"inferred_block": True,
|
| 277 |
+
},
|
| 278 |
+
}
|
| 279 |
+
)
|
| 280 |
+
chunk_index += 1
|
| 281 |
+
return
|
| 282 |
+
|
| 283 |
+
# Default to text
|
| 284 |
+
chunks.append(
|
| 285 |
+
{
|
| 286 |
+
"chunk_id": _hash_id(text, "doc_text"),
|
| 287 |
+
"source": "documentation",
|
| 288 |
+
"source_name": source_name,
|
| 289 |
+
"source_url": source_url,
|
| 290 |
+
"language": "markdown",
|
| 291 |
+
"chunk_type": "text",
|
| 292 |
+
"content": text,
|
| 293 |
+
"chunk_index": chunk_index,
|
| 294 |
+
"metadata": {
|
| 295 |
+
"heading": current_heading,
|
| 296 |
+
"heading_level": current_heading_level,
|
| 297 |
+
"heading_path": heading_path(),
|
| 298 |
+
"line_start": start_line,
|
| 299 |
+
"line_end": end_line,
|
| 300 |
+
},
|
| 301 |
+
}
|
| 302 |
+
)
|
| 303 |
+
chunk_index += 1
|
| 304 |
+
|
| 305 |
+
def flush_code(start_line: int, end_line: int):
|
| 306 |
+
nonlocal code_lines, code_language, chunk_index
|
| 307 |
+
if not code_lines:
|
| 308 |
+
return
|
| 309 |
+
|
| 310 |
+
code = "\n".join(code_lines)
|
| 311 |
+
code_lines = []
|
| 312 |
+
|
| 313 |
+
# Check if this is actually code or just formatted text
|
| 314 |
+
is_actual_code = _is_actual_code(code)
|
| 315 |
+
|
| 316 |
+
if is_actual_code:
|
| 317 |
+
chunks.append(
|
| 318 |
+
{
|
| 319 |
+
"chunk_id": _hash_id(code, "doc_code"),
|
| 320 |
+
"source": "documentation",
|
| 321 |
+
"source_name": source_name,
|
| 322 |
+
"source_url": source_url,
|
| 323 |
+
"language": code_language or "unknown",
|
| 324 |
+
"chunk_type": "code",
|
| 325 |
+
"content": code,
|
| 326 |
+
"chunk_index": chunk_index,
|
| 327 |
+
"metadata": {
|
| 328 |
+
"heading": current_heading,
|
| 329 |
+
"heading_level": current_heading_level,
|
| 330 |
+
"heading_path": heading_path(),
|
| 331 |
+
"fenced_block": True,
|
| 332 |
+
"line_start": start_line,
|
| 333 |
+
"line_end": end_line,
|
| 334 |
+
"looks_executable": _looks_like_executable_code(code),
|
| 335 |
+
},
|
| 336 |
+
}
|
| 337 |
+
)
|
| 338 |
+
else:
|
| 339 |
+
# It's formatted text, not actual code
|
| 340 |
+
chunks.append(
|
| 341 |
+
{
|
| 342 |
+
"chunk_id": _hash_id(code, "doc_text"),
|
| 343 |
+
"source": "documentation",
|
| 344 |
+
"source_name": source_name,
|
| 345 |
+
"source_url": source_url,
|
| 346 |
+
"language": "markdown",
|
| 347 |
+
"chunk_type": "text",
|
| 348 |
+
"content": code,
|
| 349 |
+
"chunk_index": chunk_index,
|
| 350 |
+
"metadata": {
|
| 351 |
+
"heading": current_heading,
|
| 352 |
+
"heading_level": current_heading_level,
|
| 353 |
+
"heading_path": heading_path(),
|
| 354 |
+
"line_start": start_line,
|
| 355 |
+
"line_end": end_line,
|
| 356 |
+
"was_fenced_block": True, # Note: was in ``` but isn't code
|
| 357 |
+
},
|
| 358 |
+
}
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
chunk_index += 1
|
| 362 |
+
code_language = None
|
| 363 |
+
|
| 364 |
+
buffer_start_line = 0
|
| 365 |
+
code_start_line = 0
|
| 366 |
+
|
| 367 |
+
for i, line in enumerate(lines):
|
| 368 |
+
line_cursor = i + 1
|
| 369 |
+
|
| 370 |
+
# ---- Heading detection ----
|
| 371 |
+
m = re.match(r"^(#{2,6})\s+(.*)", line)
|
| 372 |
+
if not code_block and m:
|
| 373 |
+
flush_text(buffer_start_line, line_cursor - 1)
|
| 374 |
+
|
| 375 |
+
level = len(m.group(1))
|
| 376 |
+
title = m.group(2).strip()
|
| 377 |
+
|
| 378 |
+
# Maintain heading stack
|
| 379 |
+
heading_stack[:] = heading_stack[: level - 2]
|
| 380 |
+
heading_stack.append(title)
|
| 381 |
+
|
| 382 |
+
current_heading = title
|
| 383 |
+
current_heading_level = level
|
| 384 |
+
buffer_start_line = line_cursor
|
| 385 |
+
continue
|
| 386 |
+
|
| 387 |
+
# ---- Code fence detection ----
|
| 388 |
+
if line.strip().startswith("```"):
|
| 389 |
+
if not code_block:
|
| 390 |
+
flush_text(buffer_start_line, line_cursor - 1)
|
| 391 |
+
code_block = True
|
| 392 |
+
code_language = line.strip().replace("```", "").strip() or None
|
| 393 |
+
code_start_line = line_cursor + 1
|
| 394 |
+
else:
|
| 395 |
+
code_block = False
|
| 396 |
+
flush_code(code_start_line, line_cursor - 1)
|
| 397 |
+
buffer_start_line = line_cursor + 1
|
| 398 |
+
continue
|
| 399 |
+
|
| 400 |
+
if code_block:
|
| 401 |
+
code_lines.append(line)
|
| 402 |
+
else:
|
| 403 |
+
if not buffer:
|
| 404 |
+
buffer_start_line = line_cursor
|
| 405 |
+
buffer.append(line)
|
| 406 |
+
|
| 407 |
+
flush_text(buffer_start_line, line_cursor)
|
| 408 |
+
flush_code(code_start_line, line_cursor)
|
| 409 |
+
|
| 410 |
+
return chunks
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
def wrap_doc_chunks(doc_chunks: List[dict]) -> List[CodeChunk]:
|
| 414 |
+
"""
|
| 415 |
+
Adapter: convert doc_chunker output (dict)
|
| 416 |
+
into CodeChunk(documentation).
|
| 417 |
+
Does NOT affect core doc_chunker parsing logic.
|
| 418 |
+
"""
|
| 419 |
+
wrapped: List[CodeChunk] = []
|
| 420 |
+
|
| 421 |
+
for d in doc_chunks:
|
| 422 |
+
wrapped.append(
|
| 423 |
+
CodeChunk(
|
| 424 |
+
chunk_id=d["chunk_id"],
|
| 425 |
+
file_path=d["source_name"],
|
| 426 |
+
language=d.get("language", "markdown"),
|
| 427 |
+
chunk_type="documentation",
|
| 428 |
+
code=d["content"],
|
| 429 |
+
ast=ChunkAST(
|
| 430 |
+
symbol_type="documentation",
|
| 431 |
+
name=d.get("metadata", {}).get("heading"),
|
| 432 |
+
parent=d.get("metadata", {}).get("heading_path"),
|
| 433 |
+
),
|
| 434 |
+
span=ChunkSpan(
|
| 435 |
+
start_line=d.get("metadata", {}).get("line_start"),
|
| 436 |
+
end_line=d.get("metadata", {}).get("line_end"),
|
| 437 |
+
),
|
| 438 |
+
hierarchy=ChunkHierarchy(
|
| 439 |
+
is_primary=True,
|
| 440 |
+
is_extracted=True,
|
| 441 |
+
),
|
| 442 |
+
metadata=d.get("metadata", {}),
|
| 443 |
+
)
|
| 444 |
+
)
|
| 445 |
+
|
| 446 |
+
return wrapped
|
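A quick way to exercise `chunk_document` above is to feed it a small Markdown string; the import path is an assumption, and the sample content (including `my_pkg`) is made up for illustration:

```python
# Running chunk_document on a small Markdown sample.
# NOTE: the import path is an assumption; adjust it to your package layout.
from scripts.core.ingestion.doc_chunker import chunk_document

fence = "```"  # build the sample without nesting literal fences here
sample = "\n".join([
    "## Quickstart",
    "",
    "Install the package, then build a graph.",
    "",
    fence + "python",
    "from my_pkg import build_graph",
    "graph = build_graph()",
    fence,
])

chunks = chunk_document(sample, source_name="quickstart.md")
for c in chunks:
    # Expect one 'text' chunk (the prose) and one 'code' chunk (the fenced
    # block), both carrying the 'Quickstart' heading in their metadata.
    print(c["chunk_type"], c["metadata"].get("heading"), c["chunk_id"])
```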
scripts/core/ingestion/generate_data.py
ADDED
|
@@ -0,0 +1,658 @@
|
| 1 |
+
"""
|
| 2 |
+
Positive Pairs and Triplets Generator for Training Data
|
| 3 |
+
|
| 4 |
+
This module generates positive pairs and triplets from code chunks for
|
| 5 |
+
contrastive learning and similarity-based model training.
|
| 6 |
+
|
| 7 |
+
ARCHITECTURE POSITION:
|
| 8 |
+
- Training Data Generator: Creates pairs/triplets from code chunks
|
| 9 |
+
- Question Generator: Creates natural language queries for code
|
| 10 |
+
- Variance Generator: Creates multiple variations of pairs
|
| 11 |
+
|
| 12 |
+
KEY FEATURES:
|
| 13 |
+
1. Positive Pairs: (question, code) with 4-5 variations per sample
|
| 14 |
+
2. Triplets: (anchor_question, positive_code, negative_code)
|
| 15 |
+
3. Global ID tracking via chunk_id
|
| 16 |
+
4. Supports code-to-question and question-to-code mappings
|
| 17 |
+
|
| 18 |
+
OUTPUT FORMATS:
|
| 19 |
+
Positive Pairs:
|
| 20 |
+
{
|
| 21 |
+
"id": "pair_001",
|
| 22 |
+
"global_id": "chunk_id",
|
| 23 |
+
"anchor": "How to create a state graph with conditional edges?",
|
| 24 |
+
"positive": "<code snippet>"
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
Triplets:
|
| 28 |
+
{
|
| 29 |
+
"id": "triplet_001",
|
| 30 |
+
"global_id": "chunk_id",
|
| 31 |
+
"anchor": "How to create a reusable prompt template?",
|
| 32 |
+
"positive": "<relevant code>",
|
| 33 |
+
"negative": "<irrelevant code>"
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
USAGE:
|
| 37 |
+
from export.pairs_triplets_generator import generate_pairs_and_triplets
|
| 38 |
+
|
| 39 |
+
pairs, triplets = generate_pairs_and_triplets(
|
| 40 |
+
chunks_path="data/processed/chunks/chunks.jsonl",
|
| 41 |
+
output_dir="data/processed/training",
|
| 42 |
+
num_pairs=100,
|
| 43 |
+
variance=5
|
| 44 |
+
)
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
import json
|
| 48 |
+
import random
|
| 49 |
+
import hashlib
|
| 50 |
+
from pathlib import Path
|
| 51 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 52 |
+
from dataclasses import dataclass, field, asdict
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@dataclass
|
| 56 |
+
class PositivePairVariation:
|
| 57 |
+
"""A single anchor-positive variation."""
|
| 58 |
+
anchor: str # Question (natural language query)
|
| 59 |
+
positive: str # Code snippet
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@dataclass
|
| 63 |
+
class PositivePair:
|
| 64 |
+
"""A positive pair document with multiple anchor-positive variations.
|
| 65 |
+
|
| 66 |
+
Format:
|
| 67 |
+
{
|
| 68 |
+
"document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
|
| 69 |
+
"variations": [
|
| 70 |
+
{"anchor": "How does async aadd_documents work in Python?", "positive": "<code>"},
|
| 71 |
+
{"anchor": "What is the implementation of aadd_documents?", "positive": "<code>"},
|
| 72 |
+
{"anchor": "How to implement async aadd_documents?", "positive": "<code>"},
|
| 73 |
+
{"anchor": "Show the async aadd_documents code", "positive": "<code>"},
|
| 74 |
+
{"anchor": "Explain async aadd_documents function", "positive": "<code>"}
|
| 75 |
+
],
|
| 76 |
+
"framework": "crewai"
|
| 77 |
+
}
|
| 78 |
+
"""
|
| 79 |
+
document_id: str # Original chunk_id
|
| 80 |
+
variations: List[PositivePairVariation] # List of (anchor, positive) pairs
|
| 81 |
+
framework: str # Framework name from file path
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@dataclass
|
| 85 |
+
class Triplet:
|
| 86 |
+
"""A triplet for contrastive learning.
|
| 87 |
+
|
| 88 |
+
Format:
|
| 89 |
+
{
|
| 90 |
+
"document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
|
| 91 |
+
"anchor": "Best practices for async aadd_documents",
|
| 92 |
+
"positive": "async def aadd_documents(...)",
|
| 93 |
+
"negative": "async def async_agent(self):...",
|
| 94 |
+
"framework": "crewai"
|
| 95 |
+
}
|
| 96 |
+
"""
|
| 97 |
+
document_id: str # Original chunk_id
|
| 98 |
+
anchor: str # Question (natural language query)
|
| 99 |
+
positive: str # Relevant code snippet
|
| 100 |
+
negative: str # Irrelevant/different code snippet
|
| 101 |
+
framework: str # Framework name from file path
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# Question templates for different code patterns - IMPROVED for cleaner questions
|
| 105 |
+
QUESTION_TEMPLATES = {
|
| 106 |
+
"class": [
|
| 107 |
+
"How does the {name} class work in Python?",
|
| 108 |
+
"What is the implementation of the {name} class?",
|
| 109 |
+
"How to create a {name} class?",
|
| 110 |
+
"Show me the {name} class implementation",
|
| 111 |
+
"Explain the {name} class structure",
|
| 112 |
+
],
|
| 113 |
+
"function": [
|
| 114 |
+
"How does {name} function work in Python?",
|
| 115 |
+
"What is the implementation of {name}?",
|
| 116 |
+
"How to implement the {name} function?",
|
| 117 |
+
"Show the code for {name} function",
|
| 118 |
+
"Explain how {name} works",
|
| 119 |
+
],
|
| 120 |
+
"method": [
|
| 121 |
+
"How does the {name} method work in Python?",
|
| 122 |
+
"What is the implementation of {name} method?",
|
| 123 |
+
"How to implement the {name} method?",
|
| 124 |
+
"Show me the {name} method code",
|
| 125 |
+
"Explain the {name} method",
|
| 126 |
+
],
|
| 127 |
+
"async_function": [
|
| 128 |
+
"How does async {name} work in Python?",
|
| 129 |
+
"What is the async implementation of {name}?",
|
| 130 |
+
"How to implement async {name}?",
|
| 131 |
+
"Show the async {name} code",
|
| 132 |
+
"Explain async {name} function",
|
| 133 |
+
],
|
| 134 |
+
"module": [
|
| 135 |
+
"How to implement {name} module?",
|
| 136 |
+
"What's the structure of {name}?",
|
| 137 |
+
"Show the {name} module implementation",
|
| 138 |
+
"Explain the {name} module",
|
| 139 |
+
"How does {name} module work?",
|
| 140 |
+
],
|
| 141 |
+
"workflow": [
|
| 142 |
+
"How to create a {name} workflow?",
|
| 143 |
+
"What's the pattern for {name}?",
|
| 144 |
+
"Show the {name} workflow implementation",
|
| 145 |
+
"Explain the {name} workflow",
|
| 146 |
+
"How does the {name} workflow work?",
|
| 147 |
+
],
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
# Variance templates to create multiple questions for the same code
|
| 151 |
+
VARIANCE_TEMPLATES = [
|
| 152 |
+
"How to {action}?",
|
| 153 |
+
"What's the code for {action}?",
|
| 154 |
+
"Show me how to {action}",
|
| 155 |
+
"Implement {action}",
|
| 156 |
+
"Write code that {action}",
|
| 157 |
+
]
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def extract_code_context(code: str, ast_info: Dict, file_path: str) -> Dict[str, str]:
|
| 161 |
+
"""Extract contextual information from code for question generation."""
|
| 162 |
+
context = {
|
| 163 |
+
"name": ast_info.get("name", "unknown"),
|
| 164 |
+
"parent": ast_info.get("parent", ""),
|
| 165 |
+
"symbol_type": ast_info.get("symbol_type", "unknown"),
|
| 166 |
+
"docstring": ast_info.get("docstring", ""),
|
| 167 |
+
"file_name": Path(file_path).stem if file_path else "unknown",
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
# Extract purpose/description from docstring or code patterns
|
| 171 |
+
if context["docstring"]:
|
| 172 |
+
# Use first sentence of docstring as description
|
| 173 |
+
desc = context["docstring"].split(".")[0].strip()
|
| 174 |
+
context["description"] = desc[:100] if len(desc) > 100 else desc
|
| 175 |
+
else:
|
| 176 |
+
# Generate description from code patterns
|
| 177 |
+
context["description"] = _infer_description(code, context["name"])
|
| 178 |
+
|
| 179 |
+
context["purpose"] = context["description"].lower()
|
| 180 |
+
|
| 181 |
+
return context
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def _infer_description(code: str, name: str) -> str:
|
| 185 |
+
"""Infer a description from code patterns when no docstring exists."""
|
| 186 |
+
code_lower = code.lower()
|
| 187 |
+
|
| 188 |
+
# Common patterns
|
| 189 |
+
if "stategraph" in code_lower or "workflow" in code_lower:
|
| 190 |
+
return f"building a stateful workflow"
|
| 191 |
+
elif "agent" in code_lower:
|
| 192 |
+
return f"creating an AI agent"
|
| 193 |
+
elif "tool" in code_lower or "@tool" in code:
|
| 194 |
+
return f"implementing a tool"
|
| 195 |
+
elif "async" in code_lower:
|
| 196 |
+
return f"async operations"
|
| 197 |
+
elif "api" in code_lower or "request" in code_lower:
|
| 198 |
+
return f"API interactions"
|
| 199 |
+
elif "database" in code_lower or "sql" in code_lower:
|
| 200 |
+
return f"database operations"
|
| 201 |
+
elif "parse" in code_lower:
|
| 202 |
+
return f"parsing data"
|
| 203 |
+
elif "format" in code_lower:
|
| 204 |
+
return f"formatting output"
|
| 205 |
+
elif "template" in code_lower:
|
| 206 |
+
return f"creating templates"
|
| 207 |
+
elif "filter" in code_lower:
|
| 208 |
+
return f"filtering data"
|
| 209 |
+
elif "search" in code_lower:
|
| 210 |
+
return f"search functionality"
|
| 211 |
+
elif "create" in code_lower or "build" in code_lower:
|
| 212 |
+
return f"building {name}"
|
| 213 |
+
else:
|
| 214 |
+
return f"implementing {name}"
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def generate_question(code: str, ast_info: Dict, file_path: str,
|
| 218 |
+
variation_index: int = 0) -> str:
|
| 219 |
+
"""Generate a clean natural language question for a code snippet."""
|
| 220 |
+
name = ast_info.get("name", "unknown")
|
| 221 |
+
symbol_type = ast_info.get("symbol_type", "function")
|
| 222 |
+
|
| 223 |
+
# Clean up the name for display
|
| 224 |
+
clean_name = name.replace("_", " ") if name else "this code"
|
| 225 |
+
|
| 226 |
+
# Check if it's async
|
| 227 |
+
is_async = code.strip().startswith("async ") or "async def" in code[:100]
|
| 228 |
+
|
| 229 |
+
# Determine template category
|
| 230 |
+
if is_async and symbol_type in ("function", "method"):
|
| 231 |
+
template_category = "async_function"
|
| 232 |
+
elif symbol_type in QUESTION_TEMPLATES:
|
| 233 |
+
template_category = symbol_type
|
| 234 |
+
elif "graph" in code.lower() or "workflow" in code.lower() or "state" in code.lower():
|
| 235 |
+
template_category = "workflow"
|
| 236 |
+
else:
|
| 237 |
+
template_category = "function"
|
| 238 |
+
|
| 239 |
+
templates = QUESTION_TEMPLATES[template_category]
|
| 240 |
+
|
| 241 |
+
# Select template based on variation index
|
| 242 |
+
template_idx = variation_index % len(templates)
|
| 243 |
+
template = templates[template_idx]
|
| 244 |
+
|
| 245 |
+
# Fill in template with clean name
|
| 246 |
+
question = template.format(name=name)
|
| 247 |
+
|
| 248 |
+
return question
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def generate_question_variations(code: str, ast_info: Dict, file_path: str,
|
| 252 |
+
num_variations: int = 5) -> List[str]:
|
| 253 |
+
"""Generate multiple unique question variations for a code snippet."""
|
| 254 |
+
questions = []
|
| 255 |
+
seen_questions = set()
|
| 256 |
+
|
| 257 |
+
# Generate primary variations using templates
|
| 258 |
+
for i in range(num_variations):
|
| 259 |
+
q = generate_question(code, ast_info, file_path, variation_index=i)
|
| 260 |
+
q_lower = q.lower()
|
| 261 |
+
if q_lower not in seen_questions:
|
| 262 |
+
questions.append(q)
|
| 263 |
+
seen_questions.add(q_lower)
|
| 264 |
+
|
| 265 |
+
# Return exactly num_variations (templates should provide enough)
|
| 266 |
+
return questions[:num_variations]
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def extract_framework(file_path: str) -> str:
|
| 270 |
+
"""Extract framework name from file path.
|
| 271 |
+
|
| 272 |
+
Examples:
|
| 273 |
+
'data/raw/codebases/crewai/...' -> 'crewai'
|
| 274 |
+
'data/raw/codebases/langgraph/...' -> 'langgraph'
|
| 275 |
+
'data/processed/repos/langgraph_20260116/...' -> 'langgraph'
|
| 276 |
+
"""
|
| 277 |
+
path_lower = file_path.lower()
|
| 278 |
+
|
| 279 |
+
# Known frameworks to detect
|
| 280 |
+
frameworks = [
|
| 281 |
+
"crewai", "langgraph", "langchain", "autogen", "llamaindex",
|
| 282 |
+
"dspy", "haystack", "semantic_kernel", "fastapi", "flask", "django"
|
| 283 |
+
]
|
| 284 |
+
|
| 285 |
+
for framework in frameworks:
|
| 286 |
+
if framework in path_lower:
|
| 287 |
+
return framework
|
| 288 |
+
|
| 289 |
+
# Try to extract from path structure
|
| 290 |
+
parts = file_path.replace("\\", "/").split("/")
|
| 291 |
+
for part in parts:
|
| 292 |
+
if "codebases" in parts or "repos" in parts:
|
| 293 |
+
# Get the next part after codebases/repos
|
| 294 |
+
try:
|
| 295 |
+
idx = parts.index("codebases") if "codebases" in parts else parts.index("repos")
|
| 296 |
+
if idx + 1 < len(parts):
|
| 297 |
+
framework_part = parts[idx + 1].split("_")[0] # Handle 'langgraph_20260116'
|
| 298 |
+
if framework_part and framework_part not in ["raw", "processed"]:
|
| 299 |
+
return framework_part
|
| 300 |
+
except (ValueError, IndexError):
|
| 301 |
+
pass
|
| 302 |
+
|
| 303 |
+
return "unknown"
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def is_semantically_different(chunk1: Dict, chunk2: Dict) -> bool:
|
| 307 |
+
"""Check if two chunks are semantically different (good for negative pairs)."""
|
| 308 |
+
# Different symbol types
|
| 309 |
+
type1 = chunk1.get("ast", {}).get("symbol_type", "")
|
| 310 |
+
type2 = chunk2.get("ast", {}).get("symbol_type", "")
|
| 311 |
+
|
| 312 |
+
# Different purposes (check for different keywords)
|
| 313 |
+
code1 = chunk1.get("code", "").lower()
|
| 314 |
+
code2 = chunk2.get("code", "").lower()
|
| 315 |
+
|
| 316 |
+
# Keywords that indicate different functionality
|
| 317 |
+
keywords = [
|
| 318 |
+
"parse", "format", "create", "delete", "update", "read", "write",
|
| 319 |
+
"input", "output", "agent", "tool", "graph", "state", "workflow",
|
| 320 |
+
"template", "filter", "search", "database", "api", "async"
|
| 321 |
+
]
|
| 322 |
+
|
| 323 |
+
keywords1 = set(k for k in keywords if k in code1)
|
| 324 |
+
keywords2 = set(k for k in keywords if k in code2)
|
| 325 |
+
|
| 326 |
+
# Consider different if keyword overlap is low
|
| 327 |
+
if not keywords1 or not keywords2:
|
| 328 |
+
return type1 != type2
|
| 329 |
+
|
| 330 |
+
overlap = len(keywords1 & keywords2) / len(keywords1 | keywords2)
|
| 331 |
+
return overlap < 0.3
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def select_negative_sample(anchor_chunk: Dict, all_chunks: List[Dict],
|
| 335 |
+
max_attempts: int = 50) -> Optional[Dict]:
|
| 336 |
+
"""Select a semantically different chunk as negative sample."""
|
| 337 |
+
anchor_id = anchor_chunk.get("chunk_id", "")
|
| 338 |
+
|
| 339 |
+
# Shuffle chunks for random selection
|
| 340 |
+
candidates = [c for c in all_chunks if c.get("chunk_id") != anchor_id]
|
| 341 |
+
random.shuffle(candidates)
|
| 342 |
+
|
| 343 |
+
for candidate in candidates[:max_attempts]:
|
| 344 |
+
if is_semantically_different(anchor_chunk, candidate):
|
| 345 |
+
return candidate
|
| 346 |
+
|
| 347 |
+
# Fallback: return any different chunk
|
| 348 |
+
if candidates:
|
| 349 |
+
return candidates[0]
|
| 350 |
+
return None
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def load_chunks(chunks_path: Path) -> List[Dict]:
|
| 354 |
+
"""Load chunks from JSONL file."""
|
| 355 |
+
chunks = []
|
| 356 |
+
with open(chunks_path, "r", encoding="utf-8") as f:
|
| 357 |
+
for line in f:
|
| 358 |
+
line = line.strip()
|
| 359 |
+
if line:
|
| 360 |
+
try:
|
| 361 |
+
chunks.append(json.loads(line))
|
| 362 |
+
except json.JSONDecodeError:
|
| 363 |
+
continue
|
| 364 |
+
return chunks
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def filter_valid_chunks(chunks: List[Dict], min_code_length: int = 50) -> List[Dict]:
|
| 368 |
+
"""Filter chunks that are suitable for training pairs."""
|
| 369 |
+
valid_chunks = []
|
| 370 |
+
|
| 371 |
+
for chunk in chunks:
|
| 372 |
+
code = chunk.get("code", "")
|
| 373 |
+
chunk_type = chunk.get("chunk_type", "")
|
| 374 |
+
ast_info = chunk.get("ast", {})
|
| 375 |
+
|
| 376 |
+
# Skip empty or very short chunks
|
| 377 |
+
if len(code) < min_code_length:
|
| 378 |
+
continue
|
| 379 |
+
|
| 380 |
+
# Skip pure imports or empty modules
|
| 381 |
+
if chunk_type == "imports" or (chunk_type == "module" and not ast_info.get("docstring")):
|
| 382 |
+
symbol_type = ast_info.get("symbol_type", "")
|
| 383 |
+
if symbol_type == "imports":
|
| 384 |
+
continue
|
| 385 |
+
|
| 386 |
+
# Skip __init__ files without content
|
| 387 |
+
if "__init__" in chunk.get("file_path", "") and len(code) < 100:
|
| 388 |
+
continue
|
| 389 |
+
|
| 390 |
+
valid_chunks.append(chunk)
|
| 391 |
+
|
| 392 |
+
return valid_chunks
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
def generate_positive_pairs(chunks: List[Dict], num_pairs: int = 100,
|
| 396 |
+
variance: int = 5) -> List[PositivePair]:
|
| 397 |
+
"""
|
| 398 |
+
Generate positive pairs from chunks with multiple (anchor, positive) variations per document.
|
| 399 |
+
|
| 400 |
+
Output format:
|
| 401 |
+
{
|
| 402 |
+
"document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
|
| 403 |
+
"variations": [
|
| 404 |
+
{"anchor": "How does async aadd_documents work in Python?", "positive": "<code>"},
|
| 405 |
+
{"anchor": "What is the implementation of aadd_documents?", "positive": "<code>"},
|
| 406 |
+
...
|
| 407 |
+
],
|
| 408 |
+
"framework": "crewai"
|
| 409 |
+
}
|
| 410 |
+
|
| 411 |
+
Args:
|
| 412 |
+
chunks: List of code chunks
|
| 413 |
+
num_pairs: Number of documents to generate (each with `variance` variations)
|
| 414 |
+
variance: Number of (anchor, positive) variations per document (4-5 recommended)
|
| 415 |
+
|
| 416 |
+
Returns:
|
| 417 |
+
List of PositivePair objects (one per document, each with multiple variations)
|
| 418 |
+
"""
|
| 419 |
+
pairs = []
|
| 420 |
+
|
| 421 |
+
# Filter valid chunks
|
| 422 |
+
valid_chunks = filter_valid_chunks(chunks)
|
| 423 |
+
|
| 424 |
+
# Sample chunks if needed
|
| 425 |
+
if len(valid_chunks) > num_pairs:
|
| 426 |
+
selected_chunks = random.sample(valid_chunks, num_pairs)
|
| 427 |
+
else:
|
| 428 |
+
selected_chunks = valid_chunks
|
| 429 |
+
|
| 430 |
+
for chunk in selected_chunks:
|
| 431 |
+
code = chunk.get("code", "")
|
| 432 |
+
ast_info = chunk.get("ast", {})
|
| 433 |
+
file_path = chunk.get("file_path", "")
|
| 434 |
+
document_id = chunk.get("chunk_id", "")
|
| 435 |
+
|
| 436 |
+
# Extract framework from file path
|
| 437 |
+
framework = extract_framework(file_path)
|
| 438 |
+
|
| 439 |
+
# Generate multiple question variations
|
| 440 |
+
anchors = generate_question_variations(code, ast_info, file_path, variance)
|
| 441 |
+
|
| 442 |
+
# Create variations list with (anchor, positive) pairs
|
| 443 |
+
variations = [
|
| 444 |
+
PositivePairVariation(anchor=anchor, positive=code)
|
| 445 |
+
for anchor in anchors
|
| 446 |
+
]
|
| 447 |
+
|
| 448 |
+
pair = PositivePair(
|
| 449 |
+
document_id=document_id,
|
| 450 |
+
variations=variations,
|
| 451 |
+
framework=framework
|
| 452 |
+
)
|
| 453 |
+
pairs.append(pair)
|
| 454 |
+
|
| 455 |
+
return pairs
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def generate_triplets(chunks: List[Dict], num_triplets: int = 100) -> List[Triplet]:
|
| 459 |
+
"""
|
| 460 |
+
Generate triplets from chunks (no variations, flat structure).
|
| 461 |
+
|
| 462 |
+
Output format:
|
| 463 |
+
{
|
| 464 |
+
"document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
|
| 465 |
+
"anchor": "Best practices for async aadd_documents",
|
| 466 |
+
"positive": "async def aadd_documents(...)",
|
| 467 |
+
"negative": "async def async_agent(self):...",
|
| 468 |
+
"framework": "crewai"
|
| 469 |
+
}
|
| 470 |
+
|
| 471 |
+
Args:
|
| 472 |
+
chunks: List of code chunks
|
| 473 |
+
num_triplets: Number of triplets to generate (100, no variance)
|
| 474 |
+
|
| 475 |
+
Returns:
|
| 476 |
+
List of Triplet objects
|
| 477 |
+
"""
|
| 478 |
+
triplets = []
|
| 479 |
+
|
| 480 |
+
# Filter valid chunks
|
| 481 |
+
valid_chunks = filter_valid_chunks(chunks)
|
| 482 |
+
|
| 483 |
+
if len(valid_chunks) < 2:
|
| 484 |
+
return triplets
|
| 485 |
+
|
| 486 |
+
# Sample chunks if needed
|
| 487 |
+
if len(valid_chunks) > num_triplets:
|
| 488 |
+
selected_chunks = random.sample(valid_chunks, num_triplets)
|
| 489 |
+
else:
|
| 490 |
+
selected_chunks = valid_chunks
|
| 491 |
+
|
| 492 |
+
for anchor_chunk in selected_chunks:
|
| 493 |
+
# Find a semantically different chunk as negative
|
| 494 |
+
negative_chunk = select_negative_sample(anchor_chunk, valid_chunks)
|
| 495 |
+
|
| 496 |
+
if negative_chunk is None:
|
| 497 |
+
continue
|
| 498 |
+
|
| 499 |
+
code = anchor_chunk.get("code", "")
|
| 500 |
+
ast_info = anchor_chunk.get("ast", {})
|
| 501 |
+
file_path = anchor_chunk.get("file_path", "")
|
| 502 |
+
document_id = anchor_chunk.get("chunk_id", "")
|
| 503 |
+
|
| 504 |
+
# Extract framework from file path
|
| 505 |
+
framework = extract_framework(file_path)
|
| 506 |
+
|
| 507 |
+
# Generate question for anchor
|
| 508 |
+
question = generate_question(code, ast_info, file_path)
|
| 509 |
+
|
| 510 |
+
triplet = Triplet(
|
| 511 |
+
document_id=document_id,
|
| 512 |
+
anchor=question,
|
| 513 |
+
positive=code,
|
| 514 |
+
negative=negative_chunk.get("code", ""),
|
| 515 |
+
framework=framework
|
| 516 |
+
)
|
| 517 |
+
triplets.append(triplet)
|
| 518 |
+
|
| 519 |
+
return triplets
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
def export_pairs_jsonl(pairs: List[PositivePair], output_path: Path) -> None:
|
| 523 |
+
"""Export positive pairs to JSONL file."""
|
| 524 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 525 |
+
|
| 526 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 527 |
+
for pair in pairs:
|
| 528 |
+
f.write(json.dumps(asdict(pair), ensure_ascii=False) + "\n")
|
| 529 |
+
|
| 530 |
+
print(f"Exported {len(pairs)} positive pairs to {output_path}")
|
| 531 |
+
|
| 532 |
+
def export_triplets_jsonl(triplets: List[Triplet], output_path: Path) -> None:
|
| 533 |
+
"""Export triplets to JSONL file."""
|
| 534 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 535 |
+
|
| 536 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 537 |
+
for triplet in triplets:
|
| 538 |
+
f.write(json.dumps(asdict(triplet), ensure_ascii=False) + "\n")
|
| 539 |
+
|
| 540 |
+
print(f"Exported {len(triplets)} triplets to {output_path}")
|
| 541 |
+
|
| 542 |
+
def export_pairs_json(pairs: List[PositivePair], output_path: Path) -> None:
|
| 543 |
+
"""Export positive pairs to JSON file (list format for easier inspection)."""
|
| 544 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 545 |
+
|
| 546 |
+
data = [asdict(p) for p in pairs]
|
| 547 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 548 |
+
json.dump(data, f, indent=2, ensure_ascii=False)
|
| 549 |
+
|
| 550 |
+
print(f"Exported {len(pairs)} positive pairs to {output_path}")
|
| 551 |
+
|
| 552 |
+
def export_triplets_json(triplets: List[Triplet], output_path: Path) -> None:
|
| 553 |
+
"""Export triplets to JSON file (flat list format)."""
|
| 554 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 555 |
+
|
| 556 |
+
data = [asdict(t) for t in triplets]
|
| 557 |
+
|
| 558 |
+
with open(output_path, "w", encoding="utf-8") as f:
|
| 559 |
+
json.dump(data, f, indent=2, ensure_ascii=False)
|
| 560 |
+
|
| 563 |
+
|
| 564 |
+
print(f"Exported {len(triplets)} triplets to {output_path}")
|
| 565 |
+
|
| 566 |
+
|
| 567 |
+
def generate_pairs_and_triplets(
|
| 568 |
+
chunks_path: Path,
|
| 569 |
+
output_dir: Path,
|
| 570 |
+
num_pairs: int = 100,
|
| 571 |
+
num_triplets: int = 100,
|
| 572 |
+
variance: int = 5,
|
| 573 |
+
export_format: str = "both" # "jsonl", "json", or "both"
|
| 574 |
+
) -> Tuple[List[PositivePair], List[Triplet]]:
|
| 575 |
+
"""
|
| 576 |
+
Main function to generate positive pairs and triplets from chunks.
|
| 577 |
+
|
| 578 |
+
Args:
|
| 579 |
+
chunks_path: Path to chunks JSONL file
|
| 580 |
+
output_dir: Directory to save output files
|
| 581 |
+
num_pairs: Number of base pairs (will generate num_pairs * variance total)
|
| 582 |
+
num_triplets: Number of triplets (no variance)
|
| 583 |
+
variance: Number of variations per positive pair (4-5)
|
| 584 |
+
export_format: Output format ("jsonl", "json", or "both")
|
| 585 |
+
|
| 586 |
+
Returns:
|
| 587 |
+
Tuple of (pairs, triplets)
|
| 588 |
+
"""
|
| 589 |
+
print(f"Loading chunks from {chunks_path}...")
|
| 590 |
+
chunks = load_chunks(chunks_path)
|
| 591 |
+
print(f" Loaded {len(chunks)} chunks")
|
| 592 |
+
|
| 593 |
+
# Generate positive pairs with variance
|
| 594 |
+
print(f"Generating positive pairs (base={num_pairs}, variance={variance})...")
|
| 595 |
+
pairs = generate_positive_pairs(chunks, num_pairs=num_pairs, variance=variance)
|
| 596 |
+
print(f" Generated {len(pairs)} positive pairs")
|
| 597 |
+
|
| 598 |
+
# Generate triplets (no variance)
|
| 599 |
+
print(f"Generating triplets (count={num_triplets})...")
|
| 600 |
+
triplets = generate_triplets(chunks, num_triplets=num_triplets)
|
| 601 |
+
print(f" Generated {len(triplets)} triplets")
|
| 602 |
+
|
| 603 |
+
# Create output directory
|
| 604 |
+
output_dir = Path(output_dir)
|
| 605 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 606 |
+
|
| 607 |
+
# Export based on format
|
| 608 |
+
if export_format in ("jsonl", "both"):
|
| 609 |
+
export_pairs_jsonl(pairs, output_dir / "positive_pairs.jsonl")
|
| 610 |
+
export_triplets_jsonl(triplets, output_dir / "triplets.jsonl")
|
| 611 |
+
|
| 612 |
+
if export_format in ("json", "both"):
|
| 613 |
+
export_pairs_json(pairs, output_dir / "positive_pairs.json")
|
| 614 |
+
export_triplets_json(triplets, output_dir / "triplets.json")
|
| 615 |
+
|
| 616 |
+
# Print summary statistics
|
| 617 |
+
print("Summary Statistics:")
|
| 618 |
+
print(f" Total Positive Pair Documents: {len(pairs)}")
|
| 619 |
+
print(f" Total Variations: {sum(len(p.variations) for p in pairs)}")
|
| 620 |
+
print(f" Total Triplets: {len(triplets)}")
|
| 621 |
+
|
| 622 |
+
return pairs, triplets
|
| 623 |
+
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
def main():
|
| 627 |
+
"""CLI entry point for generating pairs and triplets."""
|
| 628 |
+
import argparse
|
| 629 |
+
|
| 630 |
+
parser = argparse.ArgumentParser(description="Generate positive pairs and triplets from code chunks")
|
| 631 |
+
parser.add_argument("--chunks", "-c", type=str, required=True,
|
| 632 |
+
help="Path to chunks JSONL file")
|
| 633 |
+
parser.add_argument("--output", "-o", type=str, required=True,
|
| 634 |
+
help="Output directory for generated files")
|
| 635 |
+
parser.add_argument("--pairs", "-p", type=int, default=100,
|
| 636 |
+
help="Number of base positive pairs (default: 100)")
|
| 637 |
+
parser.add_argument("--triplets", "-t", type=int, default=100,
|
| 638 |
+
help="Number of triplets (default: 100)")
|
| 639 |
+
parser.add_argument("--variance", "-v", type=int, default=5,
|
| 640 |
+
help="Number of variations per pair (default: 5)")
|
| 641 |
+
parser.add_argument("--format", "-f", type=str, default="both",
|
| 642 |
+
choices=["jsonl", "json", "both"],
|
| 643 |
+
help="Output format (default: both)")
|
| 644 |
+
|
| 645 |
+
args = parser.parse_args()
|
| 646 |
+
|
| 647 |
+
generate_pairs_and_triplets(
|
| 648 |
+
chunks_path=Path(args.chunks),
|
| 649 |
+
output_dir=Path(args.output),
|
| 650 |
+
num_pairs=args.pairs,
|
| 651 |
+
num_triplets=args.triplets,
|
| 652 |
+
variance=args.variance,
|
| 653 |
+
export_format=args.format
|
| 654 |
+
)
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
if __name__ == "__main__":
|
| 658 |
+
main()
|
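For reference, a minimal sketch of driving this generator from Python instead of the CLI. The chunk and output paths are assumptions (adjust to wherever your chunking step wrote its JSONL), and the import path assumes you run from the repository root so `scripts` resolves as a package:

```python
from pathlib import Path

from scripts.core.ingestion.generate_data import generate_pairs_and_triplets

# Assumed locations -- not paths shipped with the repo.
pairs, triplets = generate_pairs_and_triplets(
    chunks_path=Path("data/processed/chunks.jsonl"),  # output of the chunking pipeline
    output_dir=Path("data/training"),
    num_pairs=100,
    num_triplets=100,
    variance=5,
    export_format="both",  # writes both .jsonl and .json variants
)
print(f"{len(pairs)} pair documents, {len(triplets)} triplets")
```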
scripts/core/ingestion/hierarchical_chunker.py
ADDED
|
@@ -0,0 +1,182 @@
|
| 1 |
+
"""
|
| 2 |
+
Hierarchical chunk coordinator - Orchestrates AST and Tree-sitter chunking.
|
| 3 |
+
|
| 4 |
+
This module serves as the coordination layer that integrates AST (semantic)
|
| 5 |
+
and Tree-sitter (syntactic) chunking. It ensures that:
|
| 6 |
+
1. AST chunks get precise byte spans from Tree-sitter
|
| 7 |
+
2. Hierarchy relationships are preserved across both sources
|
| 8 |
+
3. Parent-child relationships are correctly established
|
| 9 |
+
4. All chunks have consistent metadata and structure
|
| 10 |
+
|
| 11 |
+
ARCHITECTURE POSITION:
|
| 12 |
+
- Coordination Layer: Integrates AST and Tree-sitter
|
| 13 |
+
- Relationship Manager: Maintains parent-child links
|
| 14 |
+
- Quality Enforcer: Ensures consistent chunk structure
|
| 15 |
+
|
| 16 |
+
KEY RESPONSIBILITIES:
|
| 17 |
+
1. Enrich AST chunks with Tree-sitter byte spans
|
| 18 |
+
2. Build and verify hierarchy relationships
|
| 19 |
+
3. Create secondary chunks for extracted content
|
| 20 |
+
4. Ensure type safety across all chunk operations
|
| 21 |
+
|
| 22 |
+
FLOW:
|
| 23 |
+
File → AST chunks (semantic) + Tree-sitter chunks (spans)
|
| 24 |
+
→ HierarchicalChunker.enrich_and_link()
|
| 25 |
+
→ Final chunks with hierarchy + precise spans
|
| 26 |
+
|
| 27 |
+
USAGE:
|
| 28 |
+
chunker = HierarchicalChunker()
|
| 29 |
+
chunks = chunker.chunk_file(Path("file.py"))
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
from typing import List, Dict, Optional, Tuple, Set, cast
|
| 34 |
+
import uuid
|
| 35 |
+
|
| 36 |
+
from .ast_chunker import extract_ast_chunks
|
| 37 |
+
from .ts_chunker import extract_ts_chunks
|
| 38 |
+
from .chunk_schema import CodeChunk, ChunkHierarchy, ChunkType
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class HierarchicalChunker:
|
| 42 |
+
def __init__(self):
|
| 43 |
+
self.chunks_by_id: Dict[str, CodeChunk] = {}
|
| 44 |
+
self.imports_by_file: Dict[str, str] = {} # Track imports chunks by file
|
| 45 |
+
|
| 46 |
+
# ---------------- helpers ----------------
|
| 47 |
+
|
| 48 |
+
def _build_ts_span_map(
|
| 49 |
+
self, ts_chunks: List[CodeChunk]
|
| 50 |
+
) -> Dict[Tuple[int, int], CodeChunk]:
|
| 51 |
+
span_map: Dict[Tuple[int, int], CodeChunk] = {}
|
| 52 |
+
|
| 53 |
+
for c in ts_chunks:
|
| 54 |
+
if c.span.start_line is None or c.span.end_line is None:
|
| 55 |
+
continue
|
| 56 |
+
|
| 57 |
+
span_map[(c.span.start_line, c.span.end_line)] = c
|
| 58 |
+
|
| 59 |
+
return span_map
|
| 60 |
+
|
| 61 |
+
def _enrich_spans_with_tree_sitter(
|
| 62 |
+
self, ast_chunks: List[CodeChunk], ts_chunks: List[CodeChunk]
|
| 63 |
+
) -> List[CodeChunk]:
|
| 64 |
+
"""Enrich AST chunks with Tree-sitter precise byte spans"""
|
| 65 |
+
ts_span_map = self._build_ts_span_map(ts_chunks)
|
| 66 |
+
|
| 67 |
+
for ast_chunk in ast_chunks:
|
| 68 |
+
if ast_chunk.span.start_line is not None and ast_chunk.span.end_line is not None:
|
| 69 |
+
key: Tuple[int, int] = (ast_chunk.span.start_line, ast_chunk.span.end_line)
|
| 70 |
+
ts_match = ts_span_map.get(key)
|
| 71 |
+
|
| 72 |
+
if ts_match:
|
| 73 |
+
# Update byte spans from Tree-sitter
|
| 74 |
+
ast_chunk.span.start_byte = ts_match.span.start_byte
|
| 75 |
+
ast_chunk.span.end_byte = ts_match.span.end_byte
|
| 76 |
+
|
| 77 |
+
return ast_chunks
|
| 78 |
+
|
| 79 |
+
def _preserve_hierarchy_relationships(self, all_chunks: List[CodeChunk]) -> None:
|
| 80 |
+
"""Ensure all hierarchy relationships are preserved with proper typing"""
|
| 81 |
+
# Build mapping for quick lookup
|
| 82 |
+
for chunk in all_chunks:
|
| 83 |
+
self.chunks_by_id[chunk.chunk_id] = chunk
|
| 84 |
+
|
| 85 |
+
# Verify and fix parent-child relationships with type safety
|
| 86 |
+
for chunk in all_chunks:
|
| 87 |
+
# Ensure hierarchy exists
|
| 88 |
+
if not hasattr(chunk, 'hierarchy') or chunk.hierarchy is None:
|
| 89 |
+
chunk.hierarchy = ChunkHierarchy()
|
| 90 |
+
|
| 91 |
+
if chunk.hierarchy.parent_id:
|
| 92 |
+
parent = self.chunks_by_id.get(chunk.hierarchy.parent_id)
|
| 93 |
+
if parent:
|
| 94 |
+
# Ensure parent has hierarchy
|
| 95 |
+
if not hasattr(parent, 'hierarchy') or parent.hierarchy is None:
|
| 96 |
+
parent.hierarchy = ChunkHierarchy()
|
| 97 |
+
|
| 98 |
+
# Add child to parent with type safety
|
| 99 |
+
if chunk.chunk_id not in parent.hierarchy.children_ids:
|
| 100 |
+
parent.hierarchy.children_ids.append(chunk.chunk_id)
|
| 101 |
+
|
| 102 |
+
def _create_secondary_chunks_for_extracted_content(
|
| 103 |
+
self, ast_chunks: List[CodeChunk]
|
| 104 |
+
) -> List[CodeChunk]:
|
| 105 |
+
"""Create secondary chunks for extracted content (if needed)"""
|
| 106 |
+
secondary_chunks: List[CodeChunk] = []
|
| 107 |
+
|
| 108 |
+
# Currently, our AST chunker creates everything as primary
|
| 109 |
+
# This method is for future extensions
|
| 110 |
+
return secondary_chunks
|
| 111 |
+
|
| 112 |
+
def _update_hierarchy_relationships(self, all_chunks: List[CodeChunk]) -> None:
|
| 113 |
+
"""Update parent-child relationships based on AST parent field with proper typing"""
|
| 114 |
+
# Create mapping from (name, type) to chunk_id
|
| 115 |
+
chunk_map: Dict[Tuple[Optional[str], ChunkType], str] = {}
|
| 116 |
+
|
| 117 |
+
for chunk in all_chunks:
|
| 118 |
+
if chunk.ast and chunk.ast.name:
|
| 119 |
+
key = (chunk.ast.name, chunk.chunk_type)
|
| 120 |
+
chunk_map[key] = chunk.chunk_id
|
| 121 |
+
|
| 122 |
+
# Update parent relationships with type safety
|
| 123 |
+
for chunk in all_chunks:
|
| 124 |
+
# Ensure hierarchy exists
|
| 125 |
+
if not hasattr(chunk, 'hierarchy') or chunk.hierarchy is None:
|
| 126 |
+
chunk.hierarchy = ChunkHierarchy()
|
| 127 |
+
|
| 128 |
+
if chunk.ast and chunk.ast.parent and chunk.ast.parent != "None":
|
| 129 |
+
# Determine parent type based on current chunk type
|
| 130 |
+
parent_type: ChunkType = "class" if chunk.chunk_type == "method" else "module"
|
| 131 |
+
|
| 132 |
+
# Try to find parent chunk
|
| 133 |
+
parent_key = (chunk.ast.parent, parent_type)
|
| 134 |
+
parent_id = chunk_map.get(parent_key)
|
| 135 |
+
|
| 136 |
+
if parent_id and parent_id in self.chunks_by_id:
|
| 137 |
+
chunk.hierarchy.parent_id = parent_id
|
| 138 |
+
|
| 139 |
+
# Add this chunk to parent's children with type safety
|
| 140 |
+
parent_chunk = self.chunks_by_id.get(parent_id)
|
| 141 |
+
if parent_chunk:
|
| 142 |
+
# Ensure parent has hierarchy
|
| 143 |
+
if not hasattr(parent_chunk, 'hierarchy') or parent_chunk.hierarchy is None:
|
| 144 |
+
parent_chunk.hierarchy = ChunkHierarchy()
|
| 145 |
+
|
| 146 |
+
if chunk.chunk_id not in parent_chunk.hierarchy.children_ids:
|
| 147 |
+
parent_chunk.hierarchy.children_ids.append(chunk.chunk_id)
|
| 148 |
+
|
| 149 |
+
# Set depth based on parent relationships
|
| 150 |
+
for chunk in all_chunks:
|
| 151 |
+
if chunk.hierarchy.parent_id:
|
| 152 |
+
parent = self.chunks_by_id.get(chunk.hierarchy.parent_id)
|
| 153 |
+
if parent and hasattr(parent, 'hierarchy') and parent.hierarchy:
|
| 154 |
+
chunk.hierarchy.depth = parent.hierarchy.depth + 1
|
| 155 |
+
|
| 156 |
+
# ---------------- public API ----------------
|
| 157 |
+
|
| 158 |
+
def chunk_file(self, file_path: Path) -> List[CodeChunk]:
|
| 159 |
+
self.chunks_by_id.clear()
|
| 160 |
+
self.imports_by_file.clear()
|
| 161 |
+
|
| 162 |
+
try:
|
| 163 |
+
ast_chunks = extract_ast_chunks(file_path)
|
| 164 |
+
except SyntaxError:
|
| 165 |
+
ast_chunks = []
|
| 166 |
+
|
| 167 |
+
# Get Tree-sitter chunks for byte-level precision
|
| 168 |
+
ts_chunks = extract_ts_chunks(file_path)
|
| 169 |
+
|
| 170 |
+
# Enrich AST chunks with Tree-sitter byte spans
|
| 171 |
+
enriched_chunks = self._enrich_spans_with_tree_sitter(ast_chunks, ts_chunks)
|
| 172 |
+
|
| 173 |
+
# Update hierarchy relationships with proper typing
|
| 174 |
+
self._update_hierarchy_relationships(enriched_chunks)
|
| 175 |
+
|
| 176 |
+
# Preserve any existing relationships
|
| 177 |
+
self._preserve_hierarchy_relationships(enriched_chunks)
|
| 178 |
+
|
| 179 |
+
# Create any needed secondary chunks
|
| 180 |
+
secondary_chunks = self._create_secondary_chunks_for_extracted_content(enriched_chunks)
|
| 181 |
+
|
| 182 |
+
return enriched_chunks + secondary_chunks
|
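A short sketch of how this coordinator is typically driven. The target file is arbitrary, and the import path assumes the repository root is on `sys.path` (the module uses relative imports, so it must be loaded as part of the `scripts.core.ingestion` package):

```python
from pathlib import Path

from scripts.core.ingestion.hierarchical_chunker import HierarchicalChunker

chunker = HierarchicalChunker()
chunks = chunker.chunk_file(Path("scripts/core/ingestion/ingest.py"))  # any Python file

# Each chunk carries its AST name, Tree-sitter byte span, and parent/child links.
for chunk in chunks:
    name = chunk.ast.name if chunk.ast else None
    print(chunk.chunk_type, name,
          chunk.span.start_byte, chunk.span.end_byte,
          "parent:", chunk.hierarchy.parent_id)
```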
scripts/core/ingestion/ingest.py
ADDED
|
@@ -0,0 +1,380 @@
|
| 1 |
+
"""
|
| 2 |
+
Git Repository Crawler - Intelligent repository cloning and file listing system.
|
| 3 |
+
|
| 4 |
+
This module serves as the entry point for ingesting Git repositories into our
|
| 5 |
+
dataset pipeline. It handles cloning, file listing, metadata extraction, and
|
| 6 |
+
statistics generation with multiple strategies for different use cases.
|
| 7 |
+
|
| 8 |
+
ARCHITECTURE POSITION:
|
| 9 |
+
- Ingestion Layer: Entry point for Git repositories
|
| 10 |
+
- File Discovery: Finds and filters repository files
|
| 11 |
+
- Metadata Collector: Gathers repo-level information
|
| 12 |
+
|
| 13 |
+
KEY FEATURES:
|
| 14 |
+
1. Multi-strategy file listing (fast/rich/smart)
|
| 15 |
+
2. Intelligent binary detection and filtering
|
| 16 |
+
3. Repository metadata extraction with git history
|
| 17 |
+
4. Agentic framework detection (through RepoMetadataExtractor)
|
| 18 |
+
5. Repository statistics and cleanup utilities
|
| 19 |
+
|
| 20 |
+
DATA FLOW:
|
| 21 |
+
Repository URL → Clone → File Discovery → Filtering → File Info/Metadata → Output
|
| 22 |
+
|
| 23 |
+
USE CASES:
|
| 24 |
+
- FAST: When only file paths are needed (performance-critical)
|
| 25 |
+
- RICH: When full metadata is required (dataset building)
|
| 26 |
+
- SMART: Auto-chooses based on needs (balanced approach)
|
| 27 |
+
|
| 28 |
+
USAGE:
|
| 29 |
+
crawler = GitCrawler()
|
| 30 |
+
repo_path = crawler.clone_repository("https://github.com/org/repo.git")
|
| 31 |
+
files_fast = crawler.list_files_fast(repo_path, extensions={'.py'})
|
| 32 |
+
files_rich, stats = crawler.list_files_with_info(repo_path)
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
import subprocess
|
| 36 |
+
from pathlib import Path
|
| 37 |
+
from typing import List, Optional, Set, Dict, Tuple, Union, cast
|
| 38 |
+
import os
|
| 39 |
+
from dataclasses import dataclass
|
| 40 |
+
import time
|
| 41 |
+
from .repo_metadata import RepoMetadataExtractor
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
|
| 45 |
+
class RepoFileInfo:
|
| 46 |
+
"""Lightweight file info - optional for when you need it"""
|
| 47 |
+
path: Path
|
| 48 |
+
relative_path: str
|
| 49 |
+
size: int = 0
|
| 50 |
+
extension: str = ""
|
| 51 |
+
is_binary: Optional[bool] = None
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class GitCrawler:
|
| 55 |
+
"""
|
| 56 |
+
Optimized Git crawler with fast listing + optional rich info
|
| 57 |
+
"""
|
| 58 |
+
|
| 59 |
+
def __init__(self, cache_dir: Path = Path("data/raw/repos")):
|
| 60 |
+
self.cache_dir = cache_dir
|
| 61 |
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
| 62 |
+
|
| 63 |
+
# -------- CORE: Cloning (same for both) --------
|
| 64 |
+
def clone_repository(self, repo_url: str) -> Optional[Path]:
|
| 65 |
+
"""Clone a repository if not already cloned"""
|
| 66 |
+
repo_name = self._extract_repo_name(repo_url)
|
| 67 |
+
repo_path = self.cache_dir / repo_name
|
| 68 |
+
|
| 69 |
+
if repo_path.exists():
|
| 70 |
+
print(f"Repository already exists: {repo_path}")
|
| 71 |
+
return repo_path
|
| 72 |
+
|
| 73 |
+
print(f"Cloning {repo_url}...")
|
| 74 |
+
cmd = ["git", "clone", "--depth", "1", repo_url, str(repo_path)]
|
| 75 |
+
|
| 76 |
+
try:
|
| 77 |
+
start_time = time.time()
|
| 78 |
+
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
|
| 79 |
+
elapsed = time.time() - start_time
|
| 80 |
+
print(f"Cloned to {repo_path} ({elapsed:.1f}s)")
|
| 81 |
+
return repo_path
|
| 82 |
+
except subprocess.CalledProcessError as e:
|
| 83 |
+
print(f"Failed to clone {repo_url}: {e.stderr}")
|
| 84 |
+
return None
|
| 85 |
+
|
| 86 |
+
def extract_enhanced_metadata(self, repo_path: Path) -> Dict:
|
| 87 |
+
"""
|
| 88 |
+
Extract enhanced metadata including agentic framework detection
|
| 89 |
+
"""
|
| 90 |
+
extractor = RepoMetadataExtractor(repo_path)
|
| 91 |
+
return extractor.extract_comprehensive_metadata()
|
| 92 |
+
|
| 93 |
+
# -------- OPTION 1: FAST listing (old style) --------
|
| 94 |
+
def list_files_fast(self, repo_path: Path,
|
| 95 |
+
extensions: Optional[Set[str]] = None,
|
| 96 |
+
exclude_dirs: Optional[Set[str]] = None) -> List[Path]:
|
| 97 |
+
"""
|
| 98 |
+
FAST file listing - returns just Path objects
|
| 99 |
+
|
| 100 |
+
Use when you need speed and don't need metadata
|
| 101 |
+
"""
|
| 102 |
+
if exclude_dirs is None:
|
| 103 |
+
exclude_dirs = {'.git', '__pycache__', 'node_modules',
|
| 104 |
+
'build', 'dist', '.venv', 'venv'}
|
| 105 |
+
|
| 106 |
+
files = []
|
| 107 |
+
|
| 108 |
+
for root, dirs, filenames in os.walk(repo_path):
|
| 109 |
+
# Filter directories
|
| 110 |
+
dirs[:] = [d for d in dirs if d not in exclude_dirs and not d.startswith('.')]
|
| 111 |
+
|
| 112 |
+
for filename in filenames:
|
| 113 |
+
if filename.startswith('.'):
|
| 114 |
+
continue
|
| 115 |
+
|
| 116 |
+
file_path = Path(root) / filename
|
| 117 |
+
|
| 118 |
+
# Filter by extension if specified
|
| 119 |
+
if extensions:
|
| 120 |
+
if file_path.suffix.lower() in extensions:
|
| 121 |
+
files.append(file_path)
|
| 122 |
+
else:
|
| 123 |
+
files.append(file_path)
|
| 124 |
+
|
| 125 |
+
return sorted(files) # Sort for consistency
|
| 126 |
+
|
| 127 |
+
# -------- OPTION 2: RICH listing with metadata --------
|
| 128 |
+
def list_files_with_info(self, repo_path: Path,
|
| 129 |
+
extensions: Optional[Set[str]] = None,
|
| 130 |
+
exclude_dirs: Optional[Set[str]] = None,
|
| 131 |
+
skip_binary: bool = True) -> Tuple[List[RepoFileInfo], Dict]:
|
| 132 |
+
"""
|
| 133 |
+
RICH file listing - returns file info + statistics
|
| 134 |
+
|
| 135 |
+
Use when you need metadata for better chunking
|
| 136 |
+
"""
|
| 137 |
+
if exclude_dirs is None:
|
| 138 |
+
exclude_dirs = {'.git', '__pycache__', 'node_modules',
|
| 139 |
+
'build', 'dist', '.venv', 'venv', '.env'}
|
| 140 |
+
|
| 141 |
+
file_infos = []
|
| 142 |
+
stats = {
|
| 143 |
+
"total_files": 0,
|
| 144 |
+
"total_size": 0,
|
| 145 |
+
"by_extension": {},
|
| 146 |
+
"binary_files": 0,
|
| 147 |
+
"text_files": 0
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
for root, dirs, filenames in os.walk(repo_path):
|
| 151 |
+
# Filter directories
|
| 152 |
+
dirs[:] = [d for d in dirs if d not in exclude_dirs and not d.startswith('.')]
|
| 153 |
+
|
| 154 |
+
for filename in filenames:
|
| 155 |
+
if filename.startswith('.'):
|
| 156 |
+
continue
|
| 157 |
+
|
| 158 |
+
file_path = Path(root) / filename
|
| 159 |
+
relative_path = file_path.relative_to(repo_path)
|
| 160 |
+
extension = file_path.suffix.lower()
|
| 161 |
+
|
| 162 |
+
# Filter by extension
|
| 163 |
+
if extensions and extension not in extensions:
|
| 164 |
+
continue
|
| 165 |
+
|
| 166 |
+
try:
|
| 167 |
+
size = file_path.stat().st_size
|
| 168 |
+
is_binary = None
|
| 169 |
+
|
| 170 |
+
# Check if binary (only when needed)
|
| 171 |
+
if skip_binary:
|
| 172 |
+
is_binary = self._is_binary_file(file_path)
|
| 173 |
+
if is_binary:
|
| 174 |
+
stats["binary_files"] += 1
|
| 175 |
+
continue # Skip binary files
|
| 176 |
+
else:
|
| 177 |
+
stats["text_files"] += 1
|
| 178 |
+
|
| 179 |
+
# Create file info
|
| 180 |
+
file_info = RepoFileInfo(
|
| 181 |
+
path=file_path,
|
| 182 |
+
relative_path=str(relative_path),
|
| 183 |
+
size=size,
|
| 184 |
+
extension=extension,
|
| 185 |
+
is_binary=is_binary
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
file_infos.append(file_info)
|
| 189 |
+
|
| 190 |
+
# Update stats
|
| 191 |
+
stats["total_files"] += 1
|
| 192 |
+
stats["total_size"] += size
|
| 193 |
+
stats["by_extension"][extension] = stats["by_extension"].get(extension, 0) + 1
|
| 194 |
+
|
| 195 |
+
except (OSError, PermissionError) as e:
|
| 196 |
+
print(f"[WARNING] Could not read {file_path}: {e}")
|
| 197 |
+
continue
|
| 198 |
+
|
| 199 |
+
# Sort by relative path
|
| 200 |
+
file_infos.sort(key=lambda x: x.relative_path)
|
| 201 |
+
|
| 202 |
+
return file_infos, stats
|
| 203 |
+
|
| 204 |
+
# -------- OPTION 3: SMART listing (auto-chooses) --------
|
| 205 |
+
def list_files(self, repo_path: Path,
|
| 206 |
+
extensions: Optional[Set[str]] = None,
|
| 207 |
+
exclude_dirs: Optional[Set[str]] = None,
|
| 208 |
+
rich_metadata: bool = False,
|
| 209 |
+
skip_binary: bool = True) -> Union[List[Path], Tuple[List[RepoFileInfo], Dict]]:
|
| 210 |
+
"""
|
| 211 |
+
SMART file listing - chooses method based on needs
|
| 212 |
+
|
| 213 |
+
Args:
|
| 214 |
+
rich_metadata: True for RepoFileInfo + stats, False for just Paths
|
| 215 |
+
skip_binary: Skip binary files (only when rich_metadata=True)
|
| 216 |
+
"""
|
| 217 |
+
if rich_metadata:
|
| 218 |
+
return self.list_files_with_info(repo_path, extensions, exclude_dirs, skip_binary)
|
| 219 |
+
else:
|
| 220 |
+
return self.list_files_fast(repo_path, extensions, exclude_dirs)
|
| 221 |
+
|
| 222 |
+
# -------- HELPER: Get README --------
|
| 223 |
+
def get_readme_content(self, repo_path: Path) -> Optional[str]:
|
| 224 |
+
"""Quickly get README content if exists"""
|
| 225 |
+
for pattern in ['README.md', 'README.rst', 'README.txt', 'README', 'readme.md']:
|
| 226 |
+
readme_path = repo_path / pattern
|
| 227 |
+
if readme_path.exists():
|
| 228 |
+
try:
|
| 229 |
+
return readme_path.read_text(encoding='utf-8', errors='ignore')[:5000] # First 5k chars
|
| 230 |
+
except:
|
| 231 |
+
continue
|
| 232 |
+
return None
|
| 233 |
+
|
| 234 |
+
# -------- HELPER: Get repository stats --------
|
| 235 |
+
|
| 236 |
+
def get_repo_stats(self, repo_path: Path) -> Dict:
|
| 237 |
+
"""ACCURATE repository statistics (excludes .git)"""
|
| 238 |
+
try:
|
| 239 |
+
total_files = 0
|
| 240 |
+
total_size = 0
|
| 241 |
+
extensions = set()
|
| 242 |
+
|
| 243 |
+
for root, dirs, files in os.walk(repo_path):
|
| 244 |
+
# ✅ PROPERLY skip .git directory
|
| 245 |
+
root_path = Path(root)
|
| 246 |
+
if '.git' in root_path.parts:
|
| 247 |
+
continue # Skip entire .git directory
|
| 248 |
+
|
| 249 |
+
total_files += len(files)
|
| 250 |
+
for file in files:
|
| 251 |
+
file_path = Path(root) / file
|
| 252 |
+
try:
|
| 253 |
+
size = file_path.stat().st_size
|
| 254 |
+
total_size += size
|
| 255 |
+
if file_path.suffix:
|
| 256 |
+
extensions.add(file_path.suffix.lower())
|
| 257 |
+
except:
|
| 258 |
+
pass
|
| 259 |
+
|
| 260 |
+
return {
|
| 261 |
+
"total_files": total_files,
|
| 262 |
+
"total_size_mb": round(total_size / (1024 * 1024), 2),
|
| 263 |
+
"unique_extensions": sorted(list(extensions))[:20],
|
| 264 |
+
"path": str(repo_path),
|
| 265 |
+
"name": repo_path.name,
|
| 266 |
+
"note": "Size excludes .git directory" # ✅ Add note
|
| 267 |
+
}
|
| 268 |
+
except Exception as e:
|
| 269 |
+
return {"error": str(e)}
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
# -------- UTILITY METHODS --------
|
| 273 |
+
def _extract_repo_name(self, repo_url: str) -> str:
|
| 274 |
+
"""Extract repository name from URL"""
|
| 275 |
+
name = repo_url.rstrip('/').split('/')[-1]
|
| 276 |
+
if name.endswith('.git'):
|
| 277 |
+
name = name[:-4]
|
| 278 |
+
return name
|
| 279 |
+
|
| 280 |
+
def _is_binary_file(self, file_path: Path, sample_size: int = 1024) -> bool:
|
| 281 |
+
"""Quick binary detection by sampling"""
|
| 282 |
+
try:
|
| 283 |
+
with open(file_path, 'rb') as f:
|
| 284 |
+
sample = f.read(sample_size)
|
| 285 |
+
|
| 286 |
+
if not sample:
|
| 287 |
+
return False
|
| 288 |
+
|
| 289 |
+
# Check for null bytes (common in binaries)
|
| 290 |
+
if b'\x00' in sample:
|
| 291 |
+
return True
|
| 292 |
+
|
| 293 |
+
# Count printable ASCII
|
| 294 |
+
printable = sum(1 for byte in sample if 32 <= byte <= 126 or byte in (9, 10, 13))
|
| 295 |
+
return (printable / len(sample)) < 0.8 # Less than 80% printable
|
| 296 |
+
except:
|
| 297 |
+
return True # If we can't read, assume binary
|
| 298 |
+
|
| 299 |
+
def cleanup_old_repos(self, max_age_days: int = 7):
|
| 300 |
+
"""Cleanup old cached repositories (optional)"""
|
| 301 |
+
import shutil
|
| 302 |
+
from datetime import datetime, timedelta
|
| 303 |
+
|
| 304 |
+
cutoff = datetime.now() - timedelta(days=max_age_days)
|
| 305 |
+
|
| 306 |
+
for repo_dir in self.cache_dir.iterdir():
|
| 307 |
+
if repo_dir.is_dir():
|
| 308 |
+
try:
|
| 309 |
+
mtime = datetime.fromtimestamp(repo_dir.stat().st_mtime)
|
| 310 |
+
if mtime < cutoff:
|
| 311 |
+
print(f"🧹 Cleaning up old repo: {repo_dir.name}")
|
| 312 |
+
shutil.rmtree(repo_dir)
|
| 313 |
+
except:
|
| 314 |
+
pass
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
# -------- SIMPLE USAGE EXAMPLES --------
|
| 318 |
+
def example_usage():
|
| 319 |
+
"""Example of how to use the crawler - FIXED VERSION"""
|
| 320 |
+
crawler = GitCrawler()
|
| 321 |
+
|
| 322 |
+
# 1. Clone a repository
|
| 323 |
+
repo_path = crawler.clone_repository("https://github.com/microsoft/autogen.git")
|
| 324 |
+
if not repo_path:
|
| 325 |
+
print("❌ Failed to clone repository")
|
| 326 |
+
return
|
| 327 |
+
|
| 328 |
+
# 2. OPTION A: Fast listing (just paths)
|
| 329 |
+
print("\n=== FAST LISTING ===")
|
| 330 |
+
python_files = crawler.list_files_fast(repo_path, extensions={'.py'})
|
| 331 |
+
print(f"Found {len(python_files)} Python files")
|
| 332 |
+
|
| 333 |
+
# 3. OPTION B: Rich listing with metadata
|
| 334 |
+
print("\n=== RICH LISTING ===")
|
| 335 |
+
file_infos, stats = crawler.list_files_with_info(
|
| 336 |
+
repo_path,
|
| 337 |
+
extensions={'.py', '.md', '.json', '.yaml'},
|
| 338 |
+
skip_binary=True
|
| 339 |
+
)
|
| 340 |
+
print(f"Total files: {stats['total_files']}")
|
| 341 |
+
print(f"Total size: {stats['total_size'] / 1024 / 1024:.2f} MB")
|
| 342 |
+
print(f"Extensions: {stats['by_extension']}")
|
| 343 |
+
|
| 344 |
+
# 4. OPTION C: Smart listing (auto) - FIXED
|
| 345 |
+
print("\n=== SMART LISTING ===")
|
| 346 |
+
# Returns just paths (fast)
|
| 347 |
+
files_fast = crawler.list_files(repo_path, extensions={'.py'}, rich_metadata=False)
|
| 348 |
+
# Type check for PyLance
|
| 349 |
+
if isinstance(files_fast, list):
|
| 350 |
+
print(f"Fast count: {len(files_fast)}")
|
| 351 |
+
else:
|
| 352 |
+
# This shouldn't happen with rich_metadata=False
|
| 353 |
+
print("Unexpected return type from list_files()")
|
| 354 |
+
|
| 355 |
+
# Returns info + stats (rich) - FIXED
|
| 356 |
+
result = crawler.list_files(repo_path, extensions={'.py'}, rich_metadata=True)
|
| 357 |
+
if isinstance(result, tuple):
|
| 358 |
+
files_rich, stats = result
|
| 359 |
+
print(f"Rich count: {len(files_rich)}")
|
| 360 |
+
else:
|
| 361 |
+
# This shouldn't happen with rich_metadata=True
|
| 362 |
+
print("Unexpected return type from list_files()")
|
| 363 |
+
|
| 364 |
+
# 5. Get README
|
| 365 |
+
readme = crawler.get_readme_content(repo_path)
|
| 366 |
+
if readme:
|
| 367 |
+
print(f"\nREADME preview: {readme[:200]}...")
|
| 368 |
+
|
| 369 |
+
# 6. Get repo stats
|
| 370 |
+
repo_stats = crawler.get_repo_stats(repo_path)
|
| 371 |
+
print(f"\nRepository stats: {repo_stats}")
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
if __name__ == "__main__":
|
| 375 |
+
example_usage()
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
|
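A brief sketch of the clone-then-analyze flow that `extract_enhanced_metadata` enables, mirroring `example_usage()` above. The repository URL is only illustrative, and the import path assumes package-style execution from the repository root:

```python
from scripts.core.ingestion.ingest import GitCrawler

crawler = GitCrawler()
repo_path = crawler.clone_repository("https://github.com/microsoft/autogen.git")  # example repo
if repo_path:
    # Repo-level view: size, git history, dependencies, agentic framework signals.
    metadata = crawler.extract_enhanced_metadata(repo_path)
    print(metadata["basic"]["repo_name"], metadata["agentic_detection"])

    # File-level view: only Python sources, skipping binaries and vendored directories.
    files, stats = crawler.list_files_with_info(repo_path, extensions={".py"})
    print(f"{stats['total_files']} Python files, {stats['total_size'] / 1024 / 1024:.2f} MB")
```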
scripts/core/ingestion/repo_metadata.py
ADDED
|
@@ -0,0 +1,408 @@
|
| 1 |
+
"""
|
| 2 |
+
Repository Metadata Extractor - Advanced metadata extraction for Git repositories.
|
| 3 |
+
|
| 4 |
+
This module extracts comprehensive metadata from Git repositories with a
|
| 5 |
+
special focus on agentic framework detection. It analyzes repository structure,
|
| 6 |
+
dependencies, git history, and patterns to identify agentic code patterns.
|
| 7 |
+
|
| 8 |
+
ARCHITECTURE POSITION:
|
| 9 |
+
- Repository Analyzer: Deep analysis of Git repositories
|
| 10 |
+
- Agentic Detector: Identifies agentic framework usage
|
| 11 |
+
- Dependency Mapper: Extracts dependency information
|
| 12 |
+
|
| 13 |
+
KEY FEATURES:
|
| 14 |
+
1. Agentic framework detection across multiple frameworks
|
| 15 |
+
2. Comprehensive dependency extraction (Python, Node.js, Docker)
|
| 16 |
+
3. Git metadata extraction (commits, branches, tags)
|
| 17 |
+
4. Repository structure analysis
|
| 18 |
+
5. Entry point and configuration file discovery
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import json
|
| 22 |
+
import re
|
| 23 |
+
import subprocess
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from typing import Dict, List, Optional
|
| 26 |
+
from datetime import datetime
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class RepoMetadataExtractor:
|
| 30 |
+
"""Enhanced metadata extractor for agentic codebases"""
|
| 31 |
+
|
| 32 |
+
AGENTIC_FRAMEWORKS = {
|
| 33 |
+
"langchain": ["langchain", "langsmith", "lc", "chain", "agent"],
|
| 34 |
+
"autogen": ["autogen", "agent", "groupchat"],
|
| 35 |
+
"crewai": ["crewai", "crew", "task", "agent"],
|
| 36 |
+
"haystack": ["haystack", "pipeline", "node"],
|
| 37 |
+
"llamaindex": ["llama_index", "query_engine", "index"],
|
| 38 |
+
"semantic_kernel": ["semantic_kernel", "sk"],
|
| 39 |
+
"transformers_agents": ["transformers_agents", "huggingface"],
|
| 40 |
+
"camel": ["camel", "role_playing"],
|
| 41 |
+
"agents": ["agent", "tool", "workflow", "orchestrator"],
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
def __init__(self, repo_path: Path):
|
| 45 |
+
self.repo_path = repo_path
|
| 46 |
+
|
| 47 |
+
# ---------------------------------------------------------------------
|
| 48 |
+
# Public API
|
| 49 |
+
# ---------------------------------------------------------------------
|
| 50 |
+
|
| 51 |
+
def extract_comprehensive_metadata(self) -> Dict:
|
| 52 |
+
return {
|
| 53 |
+
"basic": self.extract_basic_metadata(),
|
| 54 |
+
"git": self.extract_git_metadata(),
|
| 55 |
+
"dependencies": self.extract_dependency_info(),
|
| 56 |
+
"structure": self.extract_structure_info(),
|
| 57 |
+
"agentic_detection": self.detect_agentic_frameworks(),
|
| 58 |
+
"entry_points": self.find_entry_points(),
|
| 59 |
+
"config_files": self.find_config_files(),
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
# 🔧 FIXED: Now returns actual repo name, not folder name
|
| 63 |
+
def extract_basic_metadata(self) -> Dict:
|
| 64 |
+
"""Extract basic repository metadata"""
|
| 65 |
+
return {
|
| 66 |
+
"repo_name": self._get_actual_repo_name(), # 🎯 FIXED LINE
|
| 67 |
+
"local_path": str(self.repo_path),
|
| 68 |
+
"size_mb": self._get_repo_size_mb(),
|
| 69 |
+
"file_count": self._count_files(),
|
| 70 |
+
"extracted_at": datetime.now().isoformat(),
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
# 🆕 NEW HELPER METHOD
|
| 74 |
+
def _get_actual_repo_name(self) -> str:
|
| 75 |
+
"""
|
| 76 |
+
Get actual repository name from Git remote or folder structure.
|
| 77 |
+
Returns 'crewAI' not 'crewai_test'.
|
| 78 |
+
"""
|
| 79 |
+
# 1. Try to get from git remote URL
|
| 80 |
+
try:
|
| 81 |
+
remote_url = self._run_git_command(["config", "--get", "remote.origin.url"])
|
| 82 |
+
if remote_url:
|
| 83 |
+
remote_url = remote_url.strip()
|
| 84 |
+
# Extract repo name from URL
|
| 85 |
+
# github.com/owner/repo.git -> repo
|
| 86 |
+
if '/' in remote_url:
|
| 87 |
+
repo_name = remote_url.split('/')[-1]
|
| 88 |
+
if repo_name.endswith('.git'):
|
| 89 |
+
repo_name = repo_name[:-4]
|
| 90 |
+
return repo_name
|
| 91 |
+
except Exception:
|
| 92 |
+
pass
|
| 93 |
+
|
| 94 |
+
# 2. Fallback: clean folder name
|
| 95 |
+
folder_name = self.repo_path.name
|
| 96 |
+
|
| 97 |
+
# Remove common suffixes
|
| 98 |
+
for suffix in ['_test', '_copy', '_backup', '_temp', '_local']:
|
| 99 |
+
if folder_name.lower().endswith(suffix.lower()):
|
| 100 |
+
return folder_name[:-len(suffix)]
|
| 101 |
+
|
| 102 |
+
return folder_name
|
| 103 |
+
|
| 104 |
+
def extract_git_metadata(self) -> Dict:
|
| 105 |
+
try:
|
| 106 |
+
remote_url = self._run_git_command(
|
| 107 |
+
["config", "--get", "remote.origin.url"]
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
latest_commit = self._run_git_command(
|
| 111 |
+
["log", "-1", "--pretty=format:%H|%an|%ae|%ad|%s"]
|
| 112 |
+
)
|
| 113 |
+
commit_parts = latest_commit.split("|") if latest_commit else []
|
| 114 |
+
|
| 115 |
+
branches_raw = self._run_git_command(["branch", "-a"])
|
| 116 |
+
branch_list = (
|
| 117 |
+
[
|
| 118 |
+
b.strip().replace("* ", "")
|
| 119 |
+
for b in branches_raw.split("\n")
|
| 120 |
+
if b.strip()
|
| 121 |
+
]
|
| 122 |
+
if branches_raw
|
| 123 |
+
else []
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
tags_raw = self._run_git_command(["tag", "-l"])
|
| 127 |
+
tag_list = (
|
| 128 |
+
[t.strip() for t in tags_raw.split("\n") if t.strip()]
|
| 129 |
+
if tags_raw
|
| 130 |
+
else []
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
current_branch = self._run_git_command(["branch", "--show-current"])
|
| 134 |
+
|
| 135 |
+
return {
|
| 136 |
+
"remote_url": remote_url or "",
|
| 137 |
+
"branch": current_branch or "",
|
| 138 |
+
"latest_commit": {
|
| 139 |
+
"hash": commit_parts[0] if len(commit_parts) > 0 else "",
|
| 140 |
+
"author": commit_parts[1] if len(commit_parts) > 1 else "",
|
| 141 |
+
"email": commit_parts[2] if len(commit_parts) > 2 else "",
|
| 142 |
+
"date": commit_parts[3] if len(commit_parts) > 3 else "",
|
| 143 |
+
"message": commit_parts[4] if len(commit_parts) > 4 else "",
|
| 144 |
+
},
|
| 145 |
+
"branch_count": len(branch_list),
|
| 146 |
+
"branches": branch_list[:10],
|
| 147 |
+
"tag_count": len(tag_list),
|
| 148 |
+
"tags": tag_list[:10],
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
except Exception as e:
|
| 152 |
+
return {"error": str(e)}
|
| 153 |
+
|
| 154 |
+
# ---------------------------------------------------------------------
|
| 155 |
+
# Agentic detection
|
| 156 |
+
# ---------------------------------------------------------------------
|
| 157 |
+
|
| 158 |
+
def detect_agentic_frameworks(self) -> Dict:
|
| 159 |
+
detected: Dict[str, str] = {}
|
| 160 |
+
|
| 161 |
+
deps = self.extract_dependency_info()
|
| 162 |
+
python_packages = deps.get("python_packages", [])
|
| 163 |
+
|
| 164 |
+
for framework, keywords in self.AGENTIC_FRAMEWORKS.items():
|
| 165 |
+
for package in python_packages:
|
| 166 |
+
if any(k in package.lower() for k in keywords):
|
| 167 |
+
detected[framework] = "dependency"
|
| 168 |
+
break
|
| 169 |
+
else:
|
| 170 |
+
if self._scan_for_framework(keywords):
|
| 171 |
+
detected[framework] = "usage"
|
| 172 |
+
|
| 173 |
+
if self._has_agent_patterns():
|
| 174 |
+
detected["custom_agents"] = "implementation"
|
| 175 |
+
|
| 176 |
+
return detected
|
| 177 |
+
|
| 178 |
+
def _scan_for_framework(self, keywords: List[str]) -> bool:
|
| 179 |
+
python_files = list(self.repo_path.rglob("*.py"))[:50]
|
| 180 |
+
|
| 181 |
+
for py_file in python_files:
|
| 182 |
+
try:
|
| 183 |
+
content = py_file.read_text(encoding="utf-8", errors="ignore").lower()
|
| 184 |
+
|
| 185 |
+
if any(f"import {k}" in content or f"from {k}" in content for k in keywords):
|
| 186 |
+
return True
|
| 187 |
+
|
| 188 |
+
if any(re.search(rf"class.*{k}", content) for k in keywords):
|
| 189 |
+
return True
|
| 190 |
+
|
| 191 |
+
except Exception:
|
| 192 |
+
continue
|
| 193 |
+
|
| 194 |
+
return False
|
| 195 |
+
|
| 196 |
+
def _has_agent_patterns(self) -> bool:
|
| 197 |
+
patterns = [
|
| 198 |
+
r"class.*Agent",
|
| 199 |
+
r"def.*agent",
|
| 200 |
+
r"class.*Tool",
|
| 201 |
+
r"def.*tool",
|
| 202 |
+
r"class.*Workflow",
|
| 203 |
+
r"def.*workflow",
|
| 204 |
+
r"class.*Orchestrator",
|
| 205 |
+
r"def.*orchestrator",
|
| 206 |
+
r"@tool",
|
| 207 |
+
r"@agent",
|
| 208 |
+
r"@workflow",
|
| 209 |
+
]
|
| 210 |
+
|
| 211 |
+
python_files = list(self.repo_path.rglob("*.py"))[:20]
|
| 212 |
+
|
| 213 |
+
for py_file in python_files:
|
| 214 |
+
try:
|
| 215 |
+
content = py_file.read_text(encoding="utf-8", errors="ignore")
|
| 216 |
+
if any(re.search(p, content, re.IGNORECASE) for p in patterns):
|
| 217 |
+
return True
|
| 218 |
+
except Exception:
|
| 219 |
+
continue
|
| 220 |
+
|
| 221 |
+
return False
|
| 222 |
+
|
| 223 |
+
# ---------------------------------------------------------------------
|
| 224 |
+
# Dependencies
|
| 225 |
+
# ---------------------------------------------------------------------
|
| 226 |
+
|
| 227 |
+
def extract_dependency_info(self) -> Dict:
|
| 228 |
+
deps = {
|
| 229 |
+
"python_packages": [],
|
| 230 |
+
"nodejs_packages": [],
|
| 231 |
+
"docker": False,
|
| 232 |
+
"other_dependencies": [],
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
req_files = [
|
| 236 |
+
"requirements.txt",
|
| 237 |
+
"pyproject.toml",
|
| 238 |
+
"setup.py",
|
| 239 |
+
"setup.cfg",
|
| 240 |
+
"Pipfile",
|
| 241 |
+
"environment.yml",
|
| 242 |
+
]
|
| 243 |
+
|
| 244 |
+
for req_file in req_files:
|
| 245 |
+
path = self.repo_path / req_file
|
| 246 |
+
if path.exists():
|
| 247 |
+
try:
|
| 248 |
+
deps["python_packages"].extend(
|
| 249 |
+
self._parse_python_dependencies(path, req_file)
|
| 250 |
+
)
|
| 251 |
+
except Exception as e:
|
| 252 |
+
print(f"⚠️ Error parsing {req_file}: {e}")
|
| 253 |
+
|
| 254 |
+
package_json = self.repo_path / "package.json"
|
| 255 |
+
if package_json.exists():
|
| 256 |
+
try:
|
| 257 |
+
data = json.loads(package_json.read_text())
|
| 258 |
+
deps["nodejs_packages"].extend(data.get("dependencies", {}).keys())
|
| 259 |
+
deps["nodejs_packages"].extend(data.get("devDependencies", {}).keys())
|
| 260 |
+
except Exception:
|
| 261 |
+
pass
|
| 262 |
+
|
| 263 |
+
deps["docker"] = any(
|
| 264 |
+
(self.repo_path / f).exists()
|
| 265 |
+
for f in ["Dockerfile", "docker-compose.yml", "docker-compose.yaml"]
|
| 266 |
+
)
|
| 267 |
+
|
| 268 |
+
return deps
|
| 269 |
+
|
| 270 |
+
def _parse_python_dependencies(self, path: Path, file_name: str) -> List[str]:
|
| 271 |
+
packages: List[str] = []
|
| 272 |
+
|
| 273 |
+
if file_name == "requirements.txt":
|
| 274 |
+
for line in path.read_text().splitlines():
|
| 275 |
+
line = line.strip()
|
| 276 |
+
if line and not line.startswith("#"):
|
| 277 |
+
pkg = (
|
| 278 |
+
line.split("==")[0]
|
| 279 |
+
.split(">=")[0]
|
| 280 |
+
.split("<=")[0]
|
| 281 |
+
.split("~=")[0]
|
| 282 |
+
.strip()
|
| 283 |
+
)
|
| 284 |
+
if pkg and not pkg.startswith("-"):
|
| 285 |
+
packages.append(pkg)
|
| 286 |
+
|
| 287 |
+
elif file_name == "pyproject.toml":
|
| 288 |
+
import toml
|
| 289 |
+
|
| 290 |
+
data = toml.load(path)
|
| 291 |
+
deps = data.get("project", {}).get("dependencies", [])
|
| 292 |
+
for d in deps:
|
| 293 |
+
packages.append(d.split("==")[0].split(">=")[0].strip())
|
| 294 |
+
|
| 295 |
+
return packages
|
| 296 |
+
|
| 297 |
+
# ---------------------------------------------------------------------
|
| 298 |
+
# Structure & utilities
|
| 299 |
+
# ---------------------------------------------------------------------
|
| 300 |
+
|
| 301 |
+
def extract_structure_info(self) -> Dict:
|
| 302 |
+
structure = {
|
| 303 |
+
"directories": [],
|
| 304 |
+
"file_types": {},
|
| 305 |
+
"has_agentic_structure": False,
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
for item in self.repo_path.iterdir():
|
| 309 |
+
if item.is_dir() and item.name != ".git":
|
| 310 |
+
structure["directories"].append(item.name)
|
| 311 |
+
|
| 312 |
+
ext_count: Dict[str, int] = {}
|
| 313 |
+
for f in self.repo_path.rglob("*"):
|
| 314 |
+
if f.is_file():
|
| 315 |
+
ext_count[f.suffix.lower()] = ext_count.get(f.suffix.lower(), 0) + 1
|
| 316 |
+
|
| 317 |
+
structure["file_types"] = dict(
|
| 318 |
+
sorted(ext_count.items(), key=lambda x: x[1], reverse=True)[:10]
|
| 319 |
+
)
|
| 320 |
+
|
| 321 |
+
agentic_dirs = {
|
| 322 |
+
"agent",
|
| 323 |
+
"agents",
|
| 324 |
+
"workflow",
|
| 325 |
+
"workflows",
|
| 326 |
+
"tool",
|
| 327 |
+
"tools",
|
| 328 |
+
"pipeline",
|
| 329 |
+
"pipelines",
|
| 330 |
+
"orchestrator",
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
structure["has_agentic_structure"] = any(
|
| 334 |
+
any(k in d.lower() for k in agentic_dirs)
|
| 335 |
+
for d in structure["directories"]
|
| 336 |
+
)
|
| 337 |
+
|
| 338 |
+
return structure
|
| 339 |
+
|
| 340 |
+
def find_entry_points(self) -> List[str]:
|
| 341 |
+
patterns = [
|
| 342 |
+
"main.py",
|
| 343 |
+
"app.py",
|
| 344 |
+
"run.py",
|
| 345 |
+
"cli.py",
|
| 346 |
+
"server.py",
|
| 347 |
+
"agent.py",
|
| 348 |
+
"pipeline.py",
|
| 349 |
+
"__main__.py",
|
| 350 |
+
]
|
| 351 |
+
|
| 352 |
+
return [
|
| 353 |
+
str(p.relative_to(self.repo_path))
|
| 354 |
+
for pat in patterns
|
| 355 |
+
for p in self.repo_path.rglob(pat)
|
| 356 |
+
][:5]
|
| 357 |
+
|
| 358 |
+
def find_config_files(self) -> List[str]:
|
| 359 |
+
patterns = [
|
| 360 |
+
"config*.py",
|
| 361 |
+
"settings*.py",
|
| 362 |
+
".env*",
|
| 363 |
+
"*.toml",
|
| 364 |
+
"*.yaml",
|
| 365 |
+
"*.yml",
|
| 366 |
+
"*.json",
|
| 367 |
+
"*.cfg",
|
| 368 |
+
"*.ini",
|
| 369 |
+
]
|
| 370 |
+
|
| 371 |
+
files: List[str] = []
|
| 372 |
+
for pat in patterns:
|
| 373 |
+
for p in self.repo_path.rglob(pat):
|
| 374 |
+
rel = str(p.relative_to(self.repo_path))
|
| 375 |
+
if not any(x in rel for x in [".git", "__pycache__", "node_modules"]):
|
| 376 |
+
files.append(rel)
|
| 377 |
+
|
| 378 |
+
return sorted(files)[:10]
|
| 379 |
+
|
| 380 |
+
# ---------------------------------------------------------------------
|
| 381 |
+
# Internals
|
| 382 |
+
# ---------------------------------------------------------------------
|
| 383 |
+
|
| 384 |
+
def _get_repo_size_mb(self) -> float:
|
| 385 |
+
total = sum(
|
| 386 |
+
f.stat().st_size for f in self.repo_path.rglob("*") if f.is_file()
|
| 387 |
+
)
|
| 388 |
+
return round(total / (1024 * 1024), 2)
|
| 389 |
+
|
| 390 |
+
def _count_files(self) -> int:
|
| 391 |
+
return sum(
|
| 392 |
+
1
|
| 393 |
+
for f in self.repo_path.rglob("*")
|
| 394 |
+
if f.is_file() and ".git" not in str(f)
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
def _run_git_command(self, args: List[str]) -> Optional[str]:
|
| 398 |
+
try:
|
| 399 |
+
result = subprocess.run(
|
| 400 |
+
["git", "-C", str(self.repo_path)] + args,
|
| 401 |
+
capture_output=True,
|
| 402 |
+
text=True,
|
| 403 |
+
check=True,
|
| 404 |
+
)
|
| 405 |
+
return result.stdout.strip() or None
|
| 406 |
+
except Exception:
|
| 407 |
+
return None
|
| 408 |
+
|
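To use the extractor on its own (outside `GitCrawler`), something like the following works. The path is an assumption about where a repository has already been cloned (the crawler's default cache is `data/raw/repos`):

```python
from pathlib import Path

from scripts.core.ingestion.repo_metadata import RepoMetadataExtractor

extractor = RepoMetadataExtractor(Path("data/raw/repos/autogen"))  # assumed clone location

# Values indicate how a framework was detected:
#   "dependency"     -> listed in requirements.txt / pyproject.toml
#   "usage"          -> imported or subclassed in source files
#   "implementation" -> custom Agent/Tool/Workflow patterns matched by regex
print(extractor.detect_agentic_frameworks())

# Full report: basic info, git history, dependencies, structure, entry points, configs.
report = extractor.extract_comprehensive_metadata()
print(report["dependencies"]["python_packages"][:10])
```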
scripts/core/ingestion/ts_chunker.py
ADDED
|
@@ -0,0 +1,155 @@
|
| 1 |
+
"""
|
| 2 |
+
Tree-sitter based syntactic chunker - Span enrichment and fallback parser.
|
| 3 |
+
|
| 4 |
+
This module provides byte-level precise chunking using Tree-sitter, which
|
| 5 |
+
serves as a structural fallback and span enrichment layer. Tree-sitter is
|
| 6 |
+
language-aware and robust against malformed code, making it ideal for
|
| 7 |
+
extracting exact byte spans and as a backup parser.
|
| 8 |
+
|
| 9 |
+
ARCHITECTURE POSITION:
|
| 10 |
+
- Enrichment Layer: Provides byte-level precision
|
| 11 |
+
- Fallback Parser: Robust parsing for malformed code
|
| 12 |
+
- Span Authority: Source of truth for byte positions
|
| 13 |
+
|
| 14 |
+
KEY FEATURES:
|
| 15 |
+
1. Byte-level accurate spans (exact source positions)
|
| 16 |
+
2. Language-aware parsing (supports multiple languages)
|
| 17 |
+
3. Robust against syntax errors
|
| 18 |
+
4. Extracts structural nodes even from partial code
|
| 19 |
+
|
| 20 |
+
FLOW:
|
| 21 |
+
File → Tree-sitter parser → Structural nodes → Spans for enrichment
|
| 22 |
+
|
| 23 |
+
USAGE:
|
| 24 |
+
from ts_chunker import extract_ts_chunks
|
| 25 |
+
chunks = extract_ts_chunks(Path("file.py"))
|
| 26 |
+
|
| 27 |
+
NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with
|
| 28 |
+
precise byte spans and serve as fallback for syntax errors.
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
from typing import List, Optional, Literal, Dict, Tuple
|
| 33 |
+
|
| 34 |
+
from tree_sitter import Parser, Language, Node
|
| 35 |
+
import tree_sitter_python as tspython
|
| 36 |
+
|
| 37 |
+
from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType
|
| 38 |
+
|
| 39 |
+
# ----------------------------
|
| 40 |
+
# Types
|
| 41 |
+
# ----------------------------
|
| 42 |
+
|
| 43 |
+
TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = {
|
| 44 |
+
"module": "module",
|
| 45 |
+
"class_definition": "class",
|
| 46 |
+
"function_definition": "function",
|
| 47 |
+
"async_function_definition": "function",
|
| 48 |
+
"import_statement": "imports",
|
| 49 |
+
"import_from_statement": "imports",
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
MAX_TS_DEPTH = 3 # module → imports → class/function → method
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ----------------------------
|
| 56 |
+
# Helpers
|
| 57 |
+
# ----------------------------
|
| 58 |
+
|
| 59 |
+
def _safe_decode(data: bytes) -> str:
|
| 60 |
+
try:
|
| 61 |
+
return data.decode("utf-8")
|
| 62 |
+
except UnicodeDecodeError:
|
| 63 |
+
return data.decode("utf-8", errors="ignore")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _get_node_name(node: Node) -> Optional[str]:
|
| 67 |
+
"""
|
| 68 |
+
Extract identifier name for class / function nodes.
|
| 69 |
+
"""
|
| 70 |
+
for child in node.children:
|
| 71 |
+
if child.type == "identifier":
|
| 72 |
+
text = child.text
|
| 73 |
+
if isinstance(text, (bytes, bytearray)):
|
| 74 |
+
return _safe_decode(text)
|
| 75 |
+
return None
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# ----------------------------
|
| 79 |
+
# Public API
|
| 80 |
+
# ----------------------------
|
| 81 |
+
|
| 82 |
+
def extract_ts_chunks(file_path: Path) -> List[CodeChunk]:
|
| 83 |
+
source_bytes = file_path.read_bytes()
|
| 84 |
+
|
| 85 |
+
language = Language(tspython.language())
|
| 86 |
+
parser = Parser(language=language)
|
| 87 |
+
|
| 88 |
+
tree = parser.parse(source_bytes)
|
| 89 |
+
root = tree.root_node
|
| 90 |
+
|
| 91 |
+
chunks: List[CodeChunk] = []
|
| 92 |
+
|
| 93 |
+
def walk(node: Node, depth: int = 0, parent_node: Optional[Node] = None) -> None:
|
| 94 |
+
if depth > MAX_TS_DEPTH:
|
| 95 |
+
return
|
| 96 |
+
|
| 97 |
+
node_type = node.type
|
| 98 |
+
|
| 99 |
+
if node_type in TS_TO_CHUNK_TYPE:
|
| 100 |
+
code_bytes = source_bytes[node.start_byte : node.end_byte]
|
| 101 |
+
code = _safe_decode(code_bytes)
|
| 102 |
+
|
| 103 |
+
chunk_type = TS_TO_CHUNK_TYPE[node_type]
|
| 104 |
+
name = _get_node_name(node)
|
| 105 |
+
|
| 106 |
+
# For imports, use the full import as name
|
| 107 |
+
if chunk_type == "imports":
|
| 108 |
+
name = code.strip()
|
| 109 |
+
|
| 110 |
+
# Create chunk with byte-level precision
|
| 111 |
+
chunks.append(
|
| 112 |
+
CodeChunk(
|
| 113 |
+
chunk_id=f"ts_{node.start_byte}_{node.end_byte}",
|
| 114 |
+
file_path=str(file_path),
|
| 115 |
+
language="python",
|
| 116 |
+
chunk_type=chunk_type,
|
| 117 |
+
code=code,
|
| 118 |
+
ast=ChunkAST(
|
| 119 |
+
symbol_type=None, # TS doesn't provide semantic types
|
| 120 |
+
name=name,
|
| 121 |
+
parent=None, # Parent relationships from AST
|
| 122 |
+
docstring=None,
|
| 123 |
+
decorators=[],
|
| 124 |
+
imports=[],
|
| 125 |
+
node_type=node_type,
|
| 126 |
+
),
|
| 127 |
+
span=ChunkSpan(
|
| 128 |
+
start_byte=node.start_byte,
|
| 129 |
+
end_byte=node.end_byte,
|
| 130 |
+
start_line=node.start_point[0] + 1,
|
| 131 |
+
end_line=node.end_point[0] + 1,
|
| 132 |
+
char_count=len(code),
|
| 133 |
+
),
|
| 134 |
+
hierarchy=ChunkHierarchy(
|
| 135 |
+
is_primary=False, # Tree-sitter chunks are for span enrichment only
|
| 136 |
+
is_extracted=True,
|
| 137 |
+
depth=depth,
|
| 138 |
+
parent_id=None, # Parent relationships from AST
|
| 139 |
+
),
|
| 140 |
+
metadata={
|
| 141 |
+
"byte_span": {
|
| 142 |
+
"start": node.start_byte,
|
| 143 |
+
"end": node.end_byte,
|
| 144 |
+
},
|
| 145 |
+
"tree_sitter_node_type": node_type,
|
| 146 |
+
"is_exact_span": True,
|
| 147 |
+
},
|
| 148 |
+
)
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
for child in node.children:
|
| 152 |
+
walk(child, depth + 1, node)
|
| 153 |
+
|
| 154 |
+
walk(root)
|
| 155 |
+
return chunks
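A possible way to exercise `extract_ts_chunks` end to end, assuming the project root is on `PYTHONPATH` and the `tree_sitter` / `tree_sitter_python` packages from requirements.txt are installed; the sample file content is made up for illustration.

```python
# Usage sketch for the Tree-sitter chunker; field names follow the CodeChunk
# construction above (chunk_type, ast.name, span.start_byte, ...).
from pathlib import Path

from scripts.core.ingestion.ts_chunker import extract_ts_chunks

sample = Path("sample_ts_demo.py")
sample.write_text(
    "import os\n\n"
    "class Greeter:\n"
    "    def hello(self):\n"
    "        return 'hi'\n",
    encoding="utf-8",
)

for chunk in extract_ts_chunks(sample):
    print(
        f"{chunk.chunk_type:<9} name={chunk.ast.name!r:<12} "
        f"bytes={chunk.span.start_byte}-{chunk.span.end_byte} "
        f"lines={chunk.span.start_line}-{chunk.span.end_line}"
    )
```

Because `is_primary` is set to `False`, these chunks are meant to be joined back onto the AST chunks by byte span rather than indexed directly.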
|
scripts/core/training/__init__.py
ADDED
|
File without changes
|
scripts/core/training/model.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from transformers import AutoModel, AutoConfig
|
| 4 |
+
|
| 5 |
+
class CodeEmbedder(nn.Module):
|
| 6 |
+
"""
|
| 7 |
+
A wrapper around a Transformer model (default: CodeBERT) to produce
|
| 8 |
+
dense vector embeddings for code snippets using Mean Pooling.
|
| 9 |
+
"""
|
| 10 |
+
def __init__(self, model_name_or_path="microsoft/codebert-base", trust_remote_code=False):
|
| 11 |
+
super(CodeEmbedder, self).__init__()
|
| 12 |
+
self.config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
|
| 13 |
+
self.encoder = AutoModel.from_pretrained(model_name_or_path, config=self.config, trust_remote_code=trust_remote_code)
|
| 14 |
+
|
| 15 |
+
def mean_pooling(self, token_embeddings, attention_mask):
|
| 16 |
+
"""
|
| 17 |
+
Average the token embeddings, ignoring padding tokens.
|
| 18 |
+
"""
|
| 19 |
+
# attention_mask: (batch_size, seq_len)
|
| 20 |
+
# token_embeddings: (batch_size, seq_len, hidden_dim)
|
| 21 |
+
|
| 22 |
+
# Expand mask to match embedding dimensions
|
| 23 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
| 24 |
+
|
| 25 |
+
# Sum embeddings (ignoring padding)
|
| 26 |
+
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
|
| 27 |
+
|
| 28 |
+
# Count non-padding tokens (prevent division by zero with clamp)
|
| 29 |
+
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
| 30 |
+
|
| 31 |
+
return sum_embeddings / sum_mask
|
| 32 |
+
|
| 33 |
+
def forward(self, input_ids, attention_mask):
|
| 34 |
+
# Pass through the transformer
|
| 35 |
+
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
|
| 36 |
+
|
| 37 |
+
# Extract last hidden state
|
| 38 |
+
# Shape: (batch_size, seq_len, hidden_dim)
|
| 39 |
+
last_hidden_state = outputs.last_hidden_state
|
| 40 |
+
|
| 41 |
+
# Perform Mean Pooling (Better than CLS token for sentence similarity)
|
| 42 |
+
embeddings = self.mean_pooling(last_hidden_state, attention_mask)
|
| 43 |
+
|
| 44 |
+
# Normalize embeddings (Optional but recommended for cosine similarity)
|
| 45 |
+
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
|
| 46 |
+
|
| 47 |
+
return embeddings
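A minimal inference sketch for `CodeEmbedder`, pairing it with the matching tokenizer; the model name and snippets below are illustrative defaults, not a prescribed configuration.

```python
import torch
from transformers import AutoTokenizer

from scripts.core.training.model import CodeEmbedder

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = CodeEmbedder("microsoft/codebert-base")
model.eval()

batch = tokenizer(
    ["def add(a, b): return a + b", "class Agent:\n    pass"],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

with torch.no_grad():
    embeddings = model(batch["input_ids"], batch["attention_mask"])

print(embeddings.shape)           # (2, hidden_dim); rows are L2-normalised
print(embeddings @ embeddings.T)  # dot products equal cosine similarities here
```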
|
scripts/core/training/test_model.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
from transformers import AutoTokenizer, AutoModel
|
| 4 |
+
|
| 5 |
+
# 1. Load Model from Hugging Face (Your Team's Checkpoint)
|
| 6 |
+
MODEL_NAME = "shubharuidas/codebert-base-code-embed-mrl-langchain-langgraph"
|
| 7 |
+
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
print(f"Downloading model: {MODEL_NAME}...")
|
| 11 |
+
MAX_RETRIES = 3
|
| 12 |
+
for attempt in range(MAX_RETRIES):
|
| 13 |
+
try:
|
| 14 |
+
print(f"Attempt {attempt+1}/{MAX_RETRIES}...")
|
| 15 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 16 |
+
model = AutoModel.from_pretrained(MODEL_NAME)
|
| 17 |
+
print("Model loaded successfully!")
|
| 18 |
+
break
|
| 19 |
+
except Exception as e:
|
| 20 |
+
print(f"Attempt {attempt+1} failed: {e}")
|
| 21 |
+
if attempt == MAX_RETRIES - 1:
|
| 22 |
+
print("Failed to load model after multiple attempts.")
|
| 23 |
+
print("Tip: Check internet connection or repo visibility.")
|
| 24 |
+
exit(1)
|
| 25 |
+
time.sleep(5) # Wait before retry
|
| 26 |
+
|
| 27 |
+
# 2. Define Inputs (Query vs Code)
|
| 28 |
+
query = "How to create a state graph in langgraph?"
|
| 29 |
+
code = """
|
| 30 |
+
from langgraph.graph import StateGraph
|
| 31 |
+
|
| 32 |
+
def create_workflow():
|
| 33 |
+
workflow = StateGraph(AgentState)
|
| 34 |
+
workflow.add_node("agent", agent_node)
|
| 35 |
+
return workflow.compile()
|
| 36 |
+
"""
|
| 37 |
+
irrelevant_code = "def fast_inverse_sqrt(number): return number ** -0.5"
|
| 38 |
+
|
| 39 |
+
# 3. Embed & Compare
|
| 40 |
+
def embed(text):
|
| 41 |
+
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
|
| 42 |
+
with torch.no_grad():
|
| 43 |
+
outputs = model(**inputs)
|
| 44 |
+
# Mean pooling for sentence representation
|
| 45 |
+
embeddings = outputs.last_hidden_state.mean(dim=1)
|
| 46 |
+
return F.normalize(embeddings, p=2, dim=1)
|
| 47 |
+
|
| 48 |
+
print("\nRunning Inference Test...")
|
| 49 |
+
query_emb = embed(query)
|
| 50 |
+
code_emb = embed(code)
|
| 51 |
+
irrelevant_emb = embed(irrelevant_code)
|
| 52 |
+
|
| 53 |
+
# 4. Calculate Similarity
|
| 54 |
+
sim_positive = F.cosine_similarity(query_emb, code_emb).item()
|
| 55 |
+
sim_negative = F.cosine_similarity(query_emb, irrelevant_emb).item()
|
| 56 |
+
|
| 57 |
+
print(f"Query: '{query}'")
|
| 58 |
+
print(f"Similarity to Relevant Code: {sim_positive:.4f} (Should be high)")
|
| 59 |
+
print(f"Similarity to Irrelevant Code: {sim_negative:.4f} (Should be low)")
|
| 60 |
+
|
| 61 |
+
if sim_positive > sim_negative:
|
| 62 |
+
print("\nSUCCESS: Model correctly ranks relevant code higher.")
|
| 63 |
+
else:
|
| 64 |
+
print("\n⚠️ WARNING: Model performance might be poor.")
|
scripts/core/training/train.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import torch
|
| 4 |
+
from torch.utils.data import DataLoader, Dataset
|
| 5 |
+
from transformers import AutoTokenizer
|
| 6 |
+
|
| 7 |
+
from scripts.core.training.model import CodeEmbedder
|
| 8 |
+
from scripts.core.training.trainer import CodeTrainer
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
# Real Dataset class for Triplet Training
|
| 13 |
+
class RealCodeDataset(Dataset):
|
| 14 |
+
def __init__(self, jsonl_path, tokenizer, max_length=512):
|
| 15 |
+
self.tokenizer = tokenizer
|
| 16 |
+
self.max_length = max_length
|
| 17 |
+
self.data = []
|
| 18 |
+
|
| 19 |
+
print(f"Loading data from {jsonl_path}...")
|
| 20 |
+
with open(jsonl_path, 'r', encoding='utf-8') as f:
|
| 21 |
+
for line in f:
|
| 22 |
+
if line.strip():
|
| 23 |
+
self.data.append(json.loads(line))
|
| 24 |
+
print(f"Loaded {len(self.data)} triplets.")
|
| 25 |
+
|
| 26 |
+
def __len__(self):
|
| 27 |
+
return len(self.data)
|
| 28 |
+
|
| 29 |
+
def __getitem__(self, idx):
|
| 30 |
+
item = self.data[idx]
|
| 31 |
+
|
| 32 |
+
# Helper to tokenize
|
| 33 |
+
def tokenize_text(text):
|
| 34 |
+
return self.tokenizer(
|
| 35 |
+
text,
|
| 36 |
+
return_tensors='pt',
|
| 37 |
+
padding='max_length',
|
| 38 |
+
truncation=True,
|
| 39 |
+
max_length=self.max_length
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
# Tokenize all three parts
|
| 43 |
+
anchor = tokenize_text(item['anchor'])
|
| 44 |
+
positive = tokenize_text(item['positive'])
|
| 45 |
+
negative = tokenize_text(item['negative'])
|
| 46 |
+
|
| 47 |
+
# Return a flat dict with prefixed keys
|
| 48 |
+
return {
|
| 49 |
+
'anchor_input_ids': anchor['input_ids'].squeeze(0),
|
| 50 |
+
'anchor_attention_mask': anchor['attention_mask'].squeeze(0),
|
| 51 |
+
'positive_input_ids': positive['input_ids'].squeeze(0),
|
| 52 |
+
'positive_attention_mask': positive['attention_mask'].squeeze(0),
|
| 53 |
+
'negative_input_ids': negative['input_ids'].squeeze(0),
|
| 54 |
+
'negative_attention_mask': negative['attention_mask'].squeeze(0)
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# Dummy Dataset class for MVP testing when the real data pipeline outputs are not available
|
| 58 |
+
class DummyCodeDataset(Dataset):
|
| 59 |
+
def __init__(self, tokenizer, size=100):
|
| 60 |
+
self.tokenizer = tokenizer
|
| 61 |
+
self.size = size
|
| 62 |
+
# Generate dummy triplet structure
|
| 63 |
+
self.data = [{"anchor": "def hello(): return 'world'", "positive": "def hi(): return 'earth'", "negative": "class Foo: pass"}] * size
|
| 64 |
+
|
| 65 |
+
def __len__(self):
|
| 66 |
+
return self.size
|
| 67 |
+
|
| 68 |
+
def __getitem__(self, idx):
|
| 69 |
+
item = self.data[idx]
|
| 70 |
+
|
| 71 |
+
# Helper to tokenize
|
| 72 |
+
def tokenize_text(text):
|
| 73 |
+
return self.tokenizer(
|
| 74 |
+
text,
|
| 75 |
+
return_tensors='pt',
|
| 76 |
+
padding='max_length',
|
| 77 |
+
truncation=True,
|
| 78 |
+
max_length=128
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
anchor = tokenize_text(item['anchor'])
|
| 82 |
+
positive = tokenize_text(item['positive'])
|
| 83 |
+
negative = tokenize_text(item['negative'])
|
| 84 |
+
|
| 85 |
+
return {
|
| 86 |
+
'anchor_input_ids': anchor['input_ids'].squeeze(0),
|
| 87 |
+
'anchor_attention_mask': anchor['attention_mask'].squeeze(0),
|
| 88 |
+
'positive_input_ids': positive['input_ids'].squeeze(0),
|
| 89 |
+
'positive_attention_mask': positive['attention_mask'].squeeze(0),
|
| 90 |
+
'negative_input_ids': negative['input_ids'].squeeze(0),
|
| 91 |
+
'negative_attention_mask': negative['attention_mask'].squeeze(0)
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
def main():
|
| 95 |
+
parser = argparse.ArgumentParser(description="Train CodeMode Embeddings")
|
| 96 |
+
|
| 97 |
+
parser.add_argument("--model_name", type=str, default="microsoft/codebert-base", help="Hub model name")
|
| 98 |
+
parser.add_argument("--data_path", type=str, required=False, help="Path to parsed chunks.jsonl")
|
| 99 |
+
parser.add_argument("--output_dir", type=str, default="./output", help="Where to save checkpoints")
|
| 100 |
+
parser.add_argument("--epochs", type=int, default=3)
|
| 101 |
+
parser.add_argument("--batch_size", type=int, default=8)
|
| 102 |
+
parser.add_argument("--accumulation_steps", type=int, default=4, help="Gradient Accumulation Steps")
|
| 103 |
+
parser.add_argument("--lr", type=float, default=2e-5)
|
| 104 |
+
parser.add_argument("--dry_run", action="store_true", help="Run with dummy data for 1 epoch")
|
| 105 |
+
|
| 106 |
+
args = parser.parse_args()
|
| 107 |
+
|
| 108 |
+
print(f"Initializing Training Pipeline...")
|
| 109 |
+
print(f" Model: {args.model_name}")
|
| 110 |
+
print(f" Output: {args.output_dir}")
|
| 111 |
+
print(f" Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
|
| 112 |
+
|
| 113 |
+
# 1. Initialize Tokenizer
|
| 114 |
+
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
|
| 115 |
+
|
| 116 |
+
# 2. Load Dataset (Real or Dummy)
|
| 117 |
+
if args.data_path and os.path.exists(args.data_path):
|
| 118 |
+
train_dataset = RealCodeDataset(args.data_path, tokenizer)
|
| 119 |
+
else:
|
| 120 |
+
print("No data path provided or file missing. Using DUMMY data for verification.")
|
| 121 |
+
train_dataset = DummyCodeDataset(tokenizer, size=100)
|
| 122 |
+
|
| 123 |
+
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
|
| 124 |
+
|
| 125 |
+
# 3. Initialize Model
|
| 126 |
+
model = CodeEmbedder(model_name_or_path=args.model_name)
|
| 127 |
+
|
| 128 |
+
# 4. Initialize Trainer
|
| 129 |
+
trainer = CodeTrainer(
|
| 130 |
+
model=model,
|
| 131 |
+
train_loader=train_loader,
|
| 132 |
+
epochs=args.epochs,
|
| 133 |
+
learning_rate=args.lr,
|
| 134 |
+
accumulation_steps=args.accumulation_steps,
|
| 135 |
+
mixed_precision=True, # Hardcoded True for the "Zero-Cost" philosophy
|
| 136 |
+
output_dir=args.output_dir
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
# 5. Connect and Train
|
| 140 |
+
trainer.train()
|
| 141 |
+
|
| 142 |
+
print("Training Complete.")
|
| 143 |
+
|
| 144 |
+
if __name__ == "__main__":
|
| 145 |
+
main()
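`RealCodeDataset` expects one JSON object per line with `anchor`, `positive`, and `negative` string fields. Here is a small sketch that writes such a file, followed by the corresponding smoke-test invocation; the paths are hypothetical.

```python
import json
from pathlib import Path

triplets = [
    {
        "anchor": "How do I add two numbers?",
        "positive": "def add(a, b):\n    return a + b",
        "negative": "def read_file(path):\n    return open(path).read()",
    },
]

out = Path("data/processed/training/tiny_triplets.jsonl")  # hypothetical location
out.parent.mkdir(parents=True, exist_ok=True)
with out.open("w", encoding="utf-8") as f:
    for t in triplets:
        f.write(json.dumps(t, ensure_ascii=False) + "\n")

# Then, from the project root:
#   python -m scripts.core.training.train \
#       --data_path data/processed/training/tiny_triplets.jsonl \
#       --epochs 1 --batch_size 2 --accumulation_steps 1
```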
|
scripts/core/training/trainer.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from torch.optim import AdamW
|
| 4 |
+
from torch.utils.data import DataLoader
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
from .model import CodeEmbedder
|
| 9 |
+
|
| 10 |
+
# Setup Logger
|
| 11 |
+
logging.basicConfig(level=logging.INFO)
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
class CodeTrainer:
|
| 15 |
+
def __init__(
|
| 16 |
+
self,
|
| 17 |
+
model: CodeEmbedder,
|
| 18 |
+
train_loader: DataLoader,
|
| 19 |
+
val_loader: DataLoader = None,
|
| 20 |
+
epochs: int = 3,
|
| 21 |
+
learning_rate: float = 2e-5,
|
| 22 |
+
accumulation_steps: int = 1,
|
| 23 |
+
mixed_precision: bool = True,
|
| 24 |
+
output_dir: str = "./output",
|
| 25 |
+
device: str = "cuda" if torch.cuda.is_available() else "cpu"
|
| 26 |
+
):
|
| 27 |
+
self.model = model.to(device)
|
| 28 |
+
self.train_loader = train_loader
|
| 29 |
+
self.val_loader = val_loader
|
| 30 |
+
self.epochs = epochs
|
| 31 |
+
self.lr = learning_rate
|
| 32 |
+
self.accumulation_steps = accumulation_steps
|
| 33 |
+
self.mixed_precision = mixed_precision
|
| 34 |
+
self.output_dir = output_dir
|
| 35 |
+
self.device = device
|
| 36 |
+
|
| 37 |
+
# Optimizer
|
| 38 |
+
self.optimizer = AdamW(self.model.parameters(), lr=self.lr)
|
| 39 |
+
|
| 40 |
+
# Scheduler (optional: constant LR for now; could switch to a linear warmup/decay schedule later)
|
| 41 |
+
# self.scheduler = ...
|
| 42 |
+
|
| 43 |
+
# Mixed Precision Scaler
|
| 44 |
+
self.scaler = torch.cuda.amp.GradScaler(enabled=self.mixed_precision)
|
| 45 |
+
|
| 46 |
+
# Loss Function: Triplet Margin Loss (Standard for Sentence Embeddings)
|
| 47 |
+
# Tries to maximize distance between Anchor-Negative and minimize Anchor-Positive
|
| 48 |
+
self.criterion = nn.TripletMarginLoss(margin=1.0, p=2)
|
| 49 |
+
|
| 50 |
+
def train_step(self, batch):
|
| 51 |
+
"""
|
| 52 |
+
Runs one training step. Returns loss.
|
| 53 |
+
"""
|
| 54 |
+
# Unpack the Triplet Batch
|
| 55 |
+
# We assume the Dataset returns keys: 'anchor_input_ids', 'anchor_attention_mask', etc.
|
| 56 |
+
|
| 57 |
+
# Helper to move dict to device
|
| 58 |
+
to_device = lambda x: x.to(self.device)
|
| 59 |
+
|
| 60 |
+
# Autocast for Mixed Precision
|
| 61 |
+
with torch.cuda.amp.autocast(enabled=self.mixed_precision):
|
| 62 |
+
# 1. Forward Pass for all 3 components
|
| 63 |
+
anchor_emb = self.model(to_device(batch['anchor_input_ids']), to_device(batch['anchor_attention_mask']))
|
| 64 |
+
positive_emb = self.model(to_device(batch['positive_input_ids']), to_device(batch['positive_attention_mask']))
|
| 65 |
+
negative_emb = self.model(to_device(batch['negative_input_ids']), to_device(batch['negative_attention_mask']))
|
| 66 |
+
|
| 67 |
+
# 2. Compute Triplet Loss
|
| 68 |
+
loss = self.criterion(anchor_emb, positive_emb, negative_emb)
|
| 69 |
+
|
| 70 |
+
return loss
|
| 71 |
+
|
| 72 |
+
def train(self):
|
| 73 |
+
logger.info(f"Starting training on {self.device}...")
|
| 74 |
+
logger.info(f"Batch Size: {self.train_loader.batch_size}, Accumulation Steps: {self.accumulation_steps}")
|
| 75 |
+
logger.info(f"Effective Batch Size: {self.train_loader.batch_size * self.accumulation_steps}")
|
| 76 |
+
|
| 77 |
+
self.model.train()
|
| 78 |
+
|
| 79 |
+
for epoch in range(self.epochs):
|
| 80 |
+
total_loss = 0
|
| 81 |
+
self.optimizer.zero_grad()
|
| 82 |
+
|
| 83 |
+
progress_bar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{self.epochs}")
|
| 84 |
+
|
| 85 |
+
for step, batch in enumerate(progress_bar):
|
| 86 |
+
|
| 87 |
+
# Forward + Loss Calculation
|
| 88 |
+
loss = self.train_step(batch)
|
| 89 |
+
|
| 90 |
+
# Gradient Accumulation: Normalize loss
|
| 91 |
+
loss = loss / self.accumulation_steps
|
| 92 |
+
|
| 93 |
+
# Backward Pass (Scaled)
|
| 94 |
+
self.scaler.scale(loss).backward()
|
| 95 |
+
|
| 96 |
+
if (step + 1) % self.accumulation_steps == 0:
|
| 97 |
+
# Update Weights
|
| 98 |
+
self.scaler.step(self.optimizer)
|
| 99 |
+
self.scaler.update()
|
| 100 |
+
self.optimizer.zero_grad()
|
| 101 |
+
|
| 102 |
+
total_loss += loss.item() * self.accumulation_steps
|
| 103 |
+
progress_bar.set_postfix({'loss': total_loss / (step + 1)})
|
| 104 |
+
|
| 105 |
+
# Save Checkpoint
|
| 106 |
+
self.save_model(epoch+1)
|
| 107 |
+
|
| 108 |
+
def save_model(self, epoch):
|
| 109 |
+
save_path = os.path.join(self.output_dir, f"checkpoint-{epoch}")
|
| 110 |
+
os.makedirs(save_path, exist_ok=True)
|
| 111 |
+
|
| 112 |
+
logger.info(f"Saving model to {save_path}...")
|
| 113 |
+
|
| 114 |
+
# Save explicitly as safetensors via transformers API
|
| 115 |
+
self.model.encoder.save_pretrained(save_path, safe_serialization=True)
|
| 116 |
+
self.model.config.save_pretrained(save_path)
|
| 117 |
+
# Note: We save the 'encoder' which is the AutoModel,
|
| 118 |
+
# so it can be loaded easily by others.
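For intuition only, the snippet below shows how the `TripletMarginLoss` configured in `CodeTrainer` behaves on toy embeddings; it is an illustration, not part of the trainer itself.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
criterion = nn.TripletMarginLoss(margin=1.0, p=2)

anchor = F.normalize(torch.randn(4, 768), dim=1)
positive = F.normalize(anchor + 0.05 * torch.randn(4, 768), dim=1)  # near the anchor
negative = F.normalize(torch.randn(4, 768), dim=1)                  # unrelated vectors

# Loss is low when positives sit closer to the anchor than negatives by at least the margin.
print("triplet loss:", criterion(anchor, positive, negative).item())
```

With `batch_size=8` and `accumulation_steps=4`, the optimizer steps once every 4 batches, so the effective batch size logged by `train()` is 32.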
|
scripts/core/utils/__init__.py
ADDED
|
File without changes
|
scripts/core/utils/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (183 Bytes).
|
|
|
scripts/core/utils/__pycache__/id_utils.cpython-311.pyc
ADDED
|
Binary file (3.18 kB).
|
|
|
scripts/core/utils/id_utils.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Deterministic ID generation for code chunks.
|
| 3 |
+
|
| 4 |
+
This module provides deterministic hashing for chunk IDs, ensuring that
|
| 5 |
+
identical code chunks receive the same ID across runs. This is crucial for:
|
| 6 |
+
1. Version tracking and change detection
|
| 7 |
+
2. Cache consistency
|
| 8 |
+
3. Reproducible datasets
|
| 9 |
+
4. Efficient deduplication
|
| 10 |
+
|
| 11 |
+
ID GENERATION STRATEGY:
|
| 12 |
+
Hash = SHA256(file_path + chunk_type + name + parent +
|
| 13 |
+
start_line + end_line + code + byte_spans)
|
| 14 |
+
|
| 15 |
+
Result: prefix_hash (e.g., "primary_5c442008")
|
| 16 |
+
|
| 17 |
+
KEY PROPERTIES:
|
| 18 |
+
1. Deterministic: Same input → same ID
|
| 19 |
+
2. Content-aware: Code changes → ID changes
|
| 20 |
+
3. Position-aware: Line/byte changes → ID changes
|
| 21 |
+
4. Hierarchical: Parent relationships affect ID
|
| 22 |
+
|
| 23 |
+
USE CASE:
|
| 24 |
+
Ensures that during RAG operations, identical code chunks are
|
| 25 |
+
recognized as the same entity, improving retrieval accuracy.
|
| 26 |
+
|
| 27 |
+
EXAMPLE:
|
| 28 |
+
deterministic_chunk_id(
|
| 29 |
+
file_path="src/module.py",
|
| 30 |
+
chunk_type="class",
|
| 31 |
+
name="MyClass",
|
| 32 |
+
parent="module",
|
| 33 |
+
start_line=10,
|
| 34 |
+
end_line=50,
|
| 35 |
+
code="class MyClass: ...",
|
| 36 |
+
start_byte=100,
|
| 37 |
+
end_byte=500
|
| 38 |
+
)
|
| 39 |
+
→ "primary_a1b2c3d4"
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
import hashlib
|
| 43 |
+
from typing import Optional
|
| 44 |
+
|
| 45 |
+
def deterministic_chunk_id(
|
| 46 |
+
*,
|
| 47 |
+
file_path: str,
|
| 48 |
+
chunk_type: str,
|
| 49 |
+
name: Optional[str],
|
| 50 |
+
parent: Optional[str],
|
| 51 |
+
start_line: Optional[int],
|
| 52 |
+
end_line: Optional[int],
|
| 53 |
+
code: str,
|
| 54 |
+
prefix: str = "primary",
|
| 55 |
+
start_byte: Optional[int] = None,
|
| 56 |
+
end_byte: Optional[int] = None,
|
| 57 |
+
) -> str:
|
| 58 |
+
"""
|
| 59 |
+
Generate deterministic chunk ID that includes code content.
|
| 60 |
+
|
| 61 |
+
Args:
|
| 62 |
+
file_path: Path to source file
|
| 63 |
+
chunk_type: Type of chunk (function, class, method, etc.)
|
| 64 |
+
name: Name of the symbol
|
| 65 |
+
parent: Parent symbol name
|
| 66 |
+
start_line: Starting line number
|
| 67 |
+
end_line: Ending line number
|
| 68 |
+
code: Actual code content
|
| 69 |
+
prefix: ID prefix (primary/secondary)
|
| 70 |
+
start_byte: Starting byte offset
|
| 71 |
+
end_byte: Ending byte offset
|
| 72 |
+
|
| 73 |
+
Returns:
|
| 74 |
+
Deterministic chunk ID
|
| 75 |
+
"""
|
| 76 |
+
# Create a payload that uniquely identifies this chunk
|
| 77 |
+
payload = f"""
|
| 78 |
+
{file_path}
|
| 79 |
+
{chunk_type}
|
| 80 |
+
{name}
|
| 81 |
+
{parent}
|
| 82 |
+
{start_line}
|
| 83 |
+
{end_line}
|
| 84 |
+
{start_byte}
|
| 85 |
+
{end_byte}
|
| 86 |
+
{code}
|
| 87 |
+
""".strip()
|
| 88 |
+
|
| 89 |
+
# Generate hash and use first 8 chars for readability
|
| 90 |
+
hash_digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()[:8]
|
| 91 |
+
return f"{prefix}_{hash_digest}"
|
scripts/generate_all_frameworks.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate training datasets for ALL frameworks automatically.
|
| 3 |
+
|
| 4 |
+
This script auto-discovers all chunk files and processes them,
|
| 5 |
+
generating separate datasets for each framework PLUS a combined dataset.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python scripts/generate_all_frameworks.py
|
| 9 |
+
|
| 10 |
+
Output Structure:
|
| 11 |
+
data/processed/training_crewai/
|
| 12 |
+
- positive_pairs.json
|
| 13 |
+
- triplets.json
|
| 14 |
+
data/processed/training_langgraph/
|
| 15 |
+
- positive_pairs.json
|
| 16 |
+
- triplets.json
|
| 17 |
+
data/processed/training_combined/
|
| 18 |
+
- positive_pairs.json (ALL frameworks merged)
|
| 19 |
+
- triplets.json (ALL frameworks merged)
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import sys
|
| 23 |
+
import json
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from typing import List, Tuple
|
| 26 |
+
from dataclasses import asdict
|
| 27 |
+
|
| 28 |
+
# Add project root to path
|
| 29 |
+
PROJECT_ROOT = Path(__file__).parent.parent
|
| 30 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 31 |
+
|
| 32 |
+
from src.task_3_data_engineering.export.pairs_triplets_generator import (
|
| 33 |
+
generate_pairs_and_triplets,
|
| 34 |
+
PositivePair,
|
| 35 |
+
Triplet
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def discover_all_chunk_files() -> List[Tuple[Path, str]]:
|
| 40 |
+
"""
|
| 41 |
+
Discover all chunk files in the workspace.
|
| 42 |
+
|
| 43 |
+
Returns:
|
| 44 |
+
List of (chunk_path, framework_name) tuples
|
| 45 |
+
"""
|
| 46 |
+
chunk_files = []
|
| 47 |
+
|
| 48 |
+
# Check local chunks
|
| 49 |
+
local_paths = [
|
| 50 |
+
PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl",
|
| 51 |
+
PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl",
|
| 52 |
+
]
|
| 53 |
+
|
| 54 |
+
for path in local_paths:
|
| 55 |
+
if path.exists():
|
| 56 |
+
# Extract framework from parent directory or use "local"
|
| 57 |
+
if "Local_saved_files" in str(path):
|
| 58 |
+
framework = "crewai"
|
| 59 |
+
elif "sample_code" in str(path):
|
| 60 |
+
framework = "sample"
|
| 61 |
+
else:
|
| 62 |
+
framework = path.parent.name
|
| 63 |
+
chunk_files.append((path, framework))
|
| 64 |
+
|
| 65 |
+
# Check repository chunks
|
| 66 |
+
repos_dir = PROJECT_ROOT / "data" / "processed" / "repos"
|
| 67 |
+
if repos_dir.exists():
|
| 68 |
+
for repo_dir in repos_dir.iterdir():
|
| 69 |
+
if repo_dir.is_dir():
|
| 70 |
+
for jsonl_file in repo_dir.glob("*_chunks.jsonl"):
|
| 71 |
+
# Extract framework from filename or directory
|
| 72 |
+
framework = jsonl_file.stem.replace("_chunks", "").split("_")[0]
|
| 73 |
+
chunk_files.append((jsonl_file, framework))
|
| 74 |
+
|
| 75 |
+
return chunk_files
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def merge_datasets(all_pairs: List[List[PositivePair]],
|
| 79 |
+
all_triplets: List[List[Triplet]],
|
| 80 |
+
output_dir: Path) -> Tuple[int, int]:
|
| 81 |
+
"""Merge all framework datasets into combined files (JSON + JSONL)."""
|
| 82 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 83 |
+
|
| 84 |
+
# Flatten lists
|
| 85 |
+
combined_pairs = []
|
| 86 |
+
for pairs in all_pairs:
|
| 87 |
+
combined_pairs.extend(pairs)
|
| 88 |
+
|
| 89 |
+
combined_triplets = []
|
| 90 |
+
for triplets in all_triplets:
|
| 91 |
+
combined_triplets.extend(triplets)
|
| 92 |
+
|
| 93 |
+
# Export combined positive pairs - JSON
|
| 94 |
+
pairs_json_path = output_dir / "positive_pairs.json"
|
| 95 |
+
with open(pairs_json_path, "w", encoding="utf-8") as f:
|
| 96 |
+
json.dump([asdict(p) for p in combined_pairs], f, indent=2, ensure_ascii=False)
|
| 97 |
+
print(f"✅ Combined positive pairs (JSON): {pairs_json_path}")
|
| 98 |
+
|
| 99 |
+
# Export combined positive pairs - JSONL
|
| 100 |
+
pairs_jsonl_path = output_dir / "positive_pairs.jsonl"
|
| 101 |
+
with open(pairs_jsonl_path, "w", encoding="utf-8") as f:
|
| 102 |
+
for p in combined_pairs:
|
| 103 |
+
f.write(json.dumps(asdict(p), ensure_ascii=False) + "\n")
|
| 104 |
+
print(f"✅ Combined positive pairs (JSONL): {pairs_jsonl_path}")
|
| 105 |
+
|
| 106 |
+
# Export combined triplets - JSON
|
| 107 |
+
triplets_json_path = output_dir / "triplets.json"
|
| 108 |
+
with open(triplets_json_path, "w", encoding="utf-8") as f:
|
| 109 |
+
json.dump([asdict(t) for t in combined_triplets], f, indent=2, ensure_ascii=False)
|
| 110 |
+
print(f"✅ Combined triplets (JSON): {triplets_json_path}")
|
| 111 |
+
|
| 112 |
+
# Export combined triplets - JSONL
|
| 113 |
+
triplets_jsonl_path = output_dir / "triplets.jsonl"
|
| 114 |
+
with open(triplets_jsonl_path, "w", encoding="utf-8") as f:
|
| 115 |
+
for t in combined_triplets:
|
| 116 |
+
f.write(json.dumps(asdict(t), ensure_ascii=False) + "\n")
|
| 117 |
+
print(f"✅ Combined triplets (JSONL): {triplets_jsonl_path}")
|
| 118 |
+
|
| 119 |
+
return len(combined_pairs), len(combined_triplets)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def main():
|
| 123 |
+
"""Generate datasets for all discovered frameworks + combined dataset."""
|
| 124 |
+
print("=" * 80)
|
| 125 |
+
print("🚀 MULTI-FRAMEWORK TRAINING DATA GENERATOR")
|
| 126 |
+
print("=" * 80)
|
| 127 |
+
|
| 128 |
+
# Discover all chunk files
|
| 129 |
+
print("\n🔍 Discovering chunk files...")
|
| 130 |
+
chunk_files = discover_all_chunk_files()
|
| 131 |
+
|
| 132 |
+
if not chunk_files:
|
| 133 |
+
print("❌ No chunk files found!")
|
| 134 |
+
print("\nPlease ensure chunks exist in:")
|
| 135 |
+
print(" - data/processed/chunks/Local_saved_files/")
|
| 136 |
+
print(" - data/processed/repos/*/")
|
| 137 |
+
return
|
| 138 |
+
|
| 139 |
+
print(f"✅ Found {len(chunk_files)} chunk file(s):\n")
|
| 140 |
+
for path, framework in chunk_files:
|
| 141 |
+
print(f" 📦 {framework}: {path.name}")
|
| 142 |
+
|
| 143 |
+
# Process each framework
|
| 144 |
+
print("\n" + "=" * 80)
|
| 145 |
+
print("🔄 PROCESSING INDIVIDUAL FRAMEWORKS")
|
| 146 |
+
print("=" * 80 + "\n")
|
| 147 |
+
|
| 148 |
+
results = []
|
| 149 |
+
all_pairs = []
|
| 150 |
+
all_triplets = []
|
| 151 |
+
|
| 152 |
+
for i, (chunks_path, framework) in enumerate(chunk_files, 1):
|
| 153 |
+
print(f"\n[{i}/{len(chunk_files)}] Processing {framework.upper()}...")
|
| 154 |
+
print("-" * 60)
|
| 155 |
+
|
| 156 |
+
output_dir = PROJECT_ROOT / "data" / "processed" / f"training_{framework}"
|
| 157 |
+
|
| 158 |
+
try:
|
| 159 |
+
pairs, triplets = generate_pairs_and_triplets(
|
| 160 |
+
chunks_path=chunks_path,
|
| 161 |
+
output_dir=output_dir,
|
| 162 |
+
num_pairs=100,
|
| 163 |
+
num_triplets=100,
|
| 164 |
+
variance=5,
|
| 165 |
+
export_format="both" # JSON + JSONL
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
# Collect for combined dataset
|
| 169 |
+
all_pairs.append(pairs)
|
| 170 |
+
all_triplets.append(triplets)
|
| 171 |
+
|
| 172 |
+
results.append({
|
| 173 |
+
"framework": framework,
|
| 174 |
+
"status": "✅ SUCCESS",
|
| 175 |
+
"pairs": len(pairs),
|
| 176 |
+
"variations": sum(len(p.variations) for p in pairs),
|
| 177 |
+
"triplets": len(triplets),
|
| 178 |
+
"output": output_dir
|
| 179 |
+
})
|
| 180 |
+
|
| 181 |
+
except Exception as e:
|
| 182 |
+
results.append({
|
| 183 |
+
"framework": framework,
|
| 184 |
+
"status": f"❌ FAILED: {str(e)}",
|
| 185 |
+
"output": output_dir
|
| 186 |
+
})
|
| 187 |
+
|
| 188 |
+
# Create combined dataset
|
| 189 |
+
print("\n" + "=" * 80)
|
| 190 |
+
print("🔗 CREATING COMBINED DATASET (ALL FRAMEWORKS)")
|
| 191 |
+
print("=" * 80 + "\n")
|
| 192 |
+
|
| 193 |
+
combined_dir = PROJECT_ROOT / "data" / "processed" / "training_combined"
|
| 194 |
+
total_pairs, total_triplets = merge_datasets(all_pairs, all_triplets, combined_dir)
|
| 195 |
+
|
| 196 |
+
# Final summary
|
| 197 |
+
print("\n" + "=" * 80)
|
| 198 |
+
print("📊 FINAL SUMMARY")
|
| 199 |
+
print("=" * 80 + "\n")
|
| 200 |
+
|
| 201 |
+
print("INDIVIDUAL FRAMEWORK DATASETS:")
|
| 202 |
+
print("-" * 40)
|
| 203 |
+
for result in results:
|
| 204 |
+
print(f"\n📦 {result['framework'].upper()}")
|
| 205 |
+
print(f" Status: {result['status']}")
|
| 206 |
+
if "pairs" in result:
|
| 207 |
+
print(f" - positive_pairs.json: {result['pairs']} docs ({result['variations']} variations)")
|
| 208 |
+
print(f" - triplets.json: {result['triplets']} docs")
|
| 209 |
+
print(f" 📁 {result['output']}")
|
| 210 |
+
|
| 211 |
+
print("\n\nCOMBINED DATASET (ALL FRAMEWORKS):")
|
| 212 |
+
print("-" * 40)
|
| 213 |
+
print(f"📁 {combined_dir}")
|
| 214 |
+
print(f" - positive_pairs.json: {total_pairs} docs")
|
| 215 |
+
print(f" - triplets.json: {total_triplets} docs")
|
| 216 |
+
|
| 217 |
+
# File count summary
|
| 218 |
+
successful = sum(1 for r in results if "SUCCESS" in r["status"])
|
| 219 |
+
total_files = (successful * 4) + 4 # 4 per framework + 4 combined
|
| 220 |
+
|
| 221 |
+
print(f"\n\n📄 TOTAL FILES GENERATED: {total_files}")
|
| 222 |
+
print(f" - {successful} frameworks × 4 files = {successful * 4} files")
|
| 223 |
+
print(f" - Combined dataset = 4 files")
|
| 224 |
+
print("=" * 80)
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
if __name__ == "__main__":
|
| 228 |
+
main()
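After a run, a quick sanity check over the combined JSONL outputs might look like the following; the paths follow the output structure in the module docstring, and the record schema is whatever `PositivePair` / `Triplet` serialise to.

```python
import json
from pathlib import Path

combined = Path("data/processed/training_combined")

for name in ("positive_pairs.jsonl", "triplets.jsonl"):
    path = combined / name
    if not path.exists():
        print(f"missing: {path}")
        continue
    with path.open(encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]
    keys = sorted(rows[0]) if rows else []
    print(f"{name}: {len(rows)} records, first-record keys: {keys}")
```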
|
scripts/run_pairs_triplets_pipeline.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Script to generate positive pairs and triplets from code chunks.
|
| 3 |
+
|
| 4 |
+
This script loads code chunks and generates:
|
| 5 |
+
1. Positive Pairs: (question, code) with 4-5 variations per sample
|
| 6 |
+
2. Triplets: (anchor_question, positive_code, negative_code)
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python -m scripts.run_pairs_triplets_pipeline --chunks <path> --output <dir>
|
| 10 |
+
python -m scripts.run_pairs_triplets_pipeline --help
|
| 11 |
+
|
| 12 |
+
Examples:
|
| 13 |
+
# Generate from local chunks with default settings
|
| 14 |
+
python -m scripts.run_pairs_triplets_pipeline \\
|
| 15 |
+
--chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
|
| 16 |
+
--output data/processed/training
|
| 17 |
+
|
| 18 |
+
# Generate from repository chunks
|
| 19 |
+
python -m scripts.run_pairs_triplets_pipeline \\
|
| 20 |
+
--chunks data/processed/repos/langgraph_20260116_123638/langgraph_chunks.jsonl \\
|
| 21 |
+
--output data/processed/training/langgraph
|
| 22 |
+
|
| 23 |
+
# Custom settings
|
| 24 |
+
python -m scripts.run_pairs_triplets_pipeline \\
|
| 25 |
+
--chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
|
| 26 |
+
--output data/processed/training \\
|
| 27 |
+
--pairs 100 --triplets 100 --variance 5
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
import sys
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
|
| 33 |
+
# Add project root to path
|
| 34 |
+
PROJECT_ROOT = Path(__file__).parent.parent
|
| 35 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 36 |
+
|
| 37 |
+
from src.task_3_data_engineering.export.pairs_triplets_generator import (
|
| 38 |
+
generate_pairs_and_triplets,
|
| 39 |
+
main as cli_main
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def run_default_pipeline():
|
| 44 |
+
"""Run with default settings for the available chunks."""
|
| 45 |
+
|
| 46 |
+
# Try multiple possible chunk locations
|
| 47 |
+
possible_paths = [
|
| 48 |
+
PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl",
|
| 49 |
+
PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl",
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
# Find all chunks.jsonl files in chunks folder subdirectories
|
| 53 |
+
chunks_dir = PROJECT_ROOT / "data" / "processed" / "chunks"
|
| 54 |
+
if chunks_dir.exists():
|
| 55 |
+
for subdir in chunks_dir.iterdir():
|
| 56 |
+
if subdir.is_dir():
|
| 57 |
+
chunks_file = subdir / "chunks.jsonl"
|
| 58 |
+
if chunks_file.exists() and chunks_file not in possible_paths:
|
| 59 |
+
possible_paths.append(chunks_file)
|
| 60 |
+
|
| 61 |
+
# Find repository chunks
|
| 62 |
+
repos_dir = PROJECT_ROOT / "data" / "processed" / "repos"
|
| 63 |
+
if repos_dir.exists():
|
| 64 |
+
for repo_dir in repos_dir.iterdir():
|
| 65 |
+
if repo_dir.is_dir():
|
| 66 |
+
for jsonl_file in repo_dir.glob("*_chunks.jsonl"):
|
| 67 |
+
possible_paths.append(jsonl_file)
|
| 68 |
+
|
| 69 |
+
chunks_path = None
|
| 70 |
+
for path in possible_paths:
|
| 71 |
+
if path.exists():
|
| 72 |
+
chunks_path = path
|
| 73 |
+
break
|
| 74 |
+
|
| 75 |
+
if chunks_path is None:
|
| 76 |
+
print("❌ No chunks files found. Please specify a chunks file with --chunks")
|
| 77 |
+
print("\nPossible locations checked:")
|
| 78 |
+
for p in possible_paths[:5]:
|
| 79 |
+
print(f" - {p}")
|
| 80 |
+
return
|
| 81 |
+
|
| 82 |
+
output_dir = PROJECT_ROOT / "data" / "processed" / "training"
|
| 83 |
+
|
| 84 |
+
print("=" * 60)
|
| 85 |
+
print("🚀 Positive Pairs & Triplets Generator")
|
| 86 |
+
print("=" * 60)
|
| 87 |
+
print(f"\n📂 Chunks Path: {chunks_path}")
|
| 88 |
+
print(f"📁 Output Dir: {output_dir}")
|
| 89 |
+
print(f"📊 Settings: pairs=100, triplets=100, variance=5")
|
| 90 |
+
print("\n" + "-" * 60)
|
| 91 |
+
|
| 92 |
+
pairs, triplets = generate_pairs_and_triplets(
|
| 93 |
+
chunks_path=chunks_path,
|
| 94 |
+
output_dir=output_dir,
|
| 95 |
+
num_pairs=100,
|
| 96 |
+
num_triplets=100,
|
| 97 |
+
variance=5,
|
| 98 |
+
export_format="both"
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
print("\n" + "=" * 60)
|
| 102 |
+
print("✅ Pipeline Complete!")
|
| 103 |
+
print("=" * 60)
|
| 104 |
+
print(f"\n📁 Output files saved to: {output_dir}")
|
| 105 |
+
print(" - positive_pairs.jsonl")
|
| 106 |
+
print(" - positive_pairs.json")
|
| 107 |
+
print(" - triplets.jsonl")
|
| 108 |
+
print(" - triplets.json")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
if __name__ == "__main__":
|
| 112 |
+
import argparse
|
| 113 |
+
|
| 114 |
+
# Check if any arguments provided
|
| 115 |
+
if len(sys.argv) > 1:
|
| 116 |
+
# Use CLI with provided arguments
|
| 117 |
+
cli_main()
|
| 118 |
+
else:
|
| 119 |
+
# Run with defaults
|
| 120 |
+
run_default_pipeline()
|
scripts/run_python_pipeline.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Local Codebase Pipeline Runner - Processes local codebases for dataset creation.
|
| 3 |
+
|
| 4 |
+
This is the main entry point for processing LOCAL CODEBASES (not Git repos).
|
| 5 |
+
It orchestrates the entire chunking pipeline for local files, handling both
|
| 6 |
+
code files and documentation with intelligent fallback strategies.
|
| 7 |
+
|
| 8 |
+
ARCHITECTURE POSITION:
|
| 9 |
+
- Local Pipeline Orchestrator: Coordinates local file processing
|
| 10 |
+
- Fallback Handler: Intelligent fallback from code to documentation
|
| 11 |
+
- Dataset Exporter: Creates final JSONL datasets with statistics
|
| 12 |
+
|
| 13 |
+
KEY FEATURES:
|
| 14 |
+
1. Unified processing of Python files and documentation
|
| 15 |
+
2. Intelligent fallback (failed code chunking → documentation chunking)
|
| 16 |
+
3. Hierarchical chunking for Python files
|
| 17 |
+
4. Documentation-aware chunking for markdown/text files
|
| 18 |
+
5. Dataset statistics and metadata generation
|
| 19 |
+
|
| 20 |
+
DATA FLOW:
|
| 21 |
+
Local files → Type detection → Python chunking (or fallback) →
|
| 22 |
+
Documentation chunking → JSONL export → Statistics
|
| 23 |
+
|
| 24 |
+
USE CASES:
|
| 25 |
+
- Processing locally saved code examples
|
| 26 |
+
- Creating datasets from example repositories
|
| 27 |
+
- Testing chunking strategies on local files
|
| 28 |
+
|
| 29 |
+
USAGE:
|
| 30 |
+
python run_python_pipeline.py --name crewai_examples --include crewai
|
| 31 |
+
python run_python_pipeline.py --name test_dataset --exclude large_repos
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
from pathlib import Path
|
| 35 |
+
import json
|
| 36 |
+
import argparse
|
| 37 |
+
|
| 38 |
+
from src.task_3_data_engineering.chunking.hierarchical_chunker import HierarchicalChunker
|
| 39 |
+
from src.task_3_data_engineering.export.jsonl_exporter import export_chunks_jsonl
|
| 40 |
+
from src.task_3_data_engineering.analysis.dataset_stats import compute_dataset_stats
|
| 41 |
+
from src.task_3_data_engineering.export.dataset_metadata import write_dataset_metadata
|
| 42 |
+
from src.task_3_data_engineering.chunking.doc_chunker import chunk_document, wrap_doc_chunks
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
INPUT_DIR = Path("data/raw/codebases")
|
| 46 |
+
BASE_OUTPUT_DIR = Path("data/processed/chunks")
|
| 47 |
+
|
| 48 |
+
DOC_EXTS = {".md", ".txt", ".rst"}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def run(dataset_name: str, include: list[str] | None, exclude: list[str] | None):
|
| 52 |
+
output_dir = BASE_OUTPUT_DIR / dataset_name
|
| 53 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 54 |
+
|
| 55 |
+
chunker = HierarchicalChunker()
|
| 56 |
+
all_chunks = []
|
| 57 |
+
|
| 58 |
+
files = [p for p in INPUT_DIR.rglob("*") if p.is_file()]
|
| 59 |
+
|
| 60 |
+
for file_path in files:
|
| 61 |
+
rel = file_path.relative_to(INPUT_DIR).parts
|
| 62 |
+
if include and rel[0] not in include:
|
| 63 |
+
continue
|
| 64 |
+
if exclude and rel[0] in exclude:
|
| 65 |
+
continue
|
| 66 |
+
|
| 67 |
+
print(f"Processing: {file_path}")
|
| 68 |
+
|
| 69 |
+
# ---- Python files ----
|
| 70 |
+
if file_path.suffix == ".py":
|
| 71 |
+
try:
|
| 72 |
+
code_chunks = chunker.chunk_file(file_path)
|
| 73 |
+
if code_chunks:
|
| 74 |
+
all_chunks.extend(code_chunks)
|
| 75 |
+
continue
|
| 76 |
+
except Exception:
|
| 77 |
+
pass # fallback to doc mode
|
| 78 |
+
|
| 79 |
+
# ---- Documentation / text ----
|
| 80 |
+
if file_path.suffix.lower() in DOC_EXTS or file_path.suffix == ".py":
|
| 81 |
+
try:
|
| 82 |
+
raw_text = file_path.read_text(encoding="utf-8", errors="ignore")
|
| 83 |
+
except Exception:
|
| 84 |
+
continue
|
| 85 |
+
|
| 86 |
+
if not raw_text.strip():
|
| 87 |
+
continue
|
| 88 |
+
|
| 89 |
+
doc_chunks = chunk_document(
|
| 90 |
+
raw_text=raw_text,
|
| 91 |
+
source_name=str(file_path),
|
| 92 |
+
source_url=None,
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
all_chunks.extend(wrap_doc_chunks(doc_chunks))
|
| 96 |
+
|
| 97 |
+
# ---- Export ----
|
| 98 |
+
export_chunks_jsonl(all_chunks, output_dir / "chunks.jsonl", print_stats=True)
|
| 99 |
+
|
| 100 |
+
stats = compute_dataset_stats(all_chunks)
|
| 101 |
+
|
| 102 |
+
primary = [c for c in all_chunks if c.hierarchy.is_primary]
|
| 103 |
+
stats["hierarchy"] = {
|
| 104 |
+
"primary_chunks": len(primary),
|
| 105 |
+
"secondary_chunks": len(all_chunks) - len(primary),
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
with (output_dir / "dataset_stats.json").open("w", encoding="utf-8") as f:
|
| 109 |
+
json.dump(stats, f, indent=2)
|
| 110 |
+
|
| 111 |
+
write_dataset_metadata(
|
| 112 |
+
chunks=all_chunks,
|
| 113 |
+
output_path=output_dir / "dataset_metadata.json",
|
| 114 |
+
dataset_name=dataset_name,
|
| 115 |
+
dataset_version="v1",
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
print("\n✅ Dataset built successfully")
|
| 119 |
+
print(f" - Files: {len({c.file_path for c in all_chunks})}")
|
| 120 |
+
print(f" - Chunks: {len(all_chunks)}")
|
| 121 |
+
print(f" - Output: {output_dir}")
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
if __name__ == "__main__":
|
| 125 |
+
parser = argparse.ArgumentParser()
|
| 126 |
+
parser.add_argument("--name", required=True)
|
| 127 |
+
parser.add_argument("--include", nargs="+")
|
| 128 |
+
parser.add_argument("--exclude", nargs="+")
|
| 129 |
+
args = parser.parse_args()
|
| 130 |
+
|
| 131 |
+
run(args.name, args.include, args.exclude)
|
scripts/run_repo_pipeline.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Git Repository Pipeline Runner - Processes Git repositories at scale.
|
| 3 |
+
|
| 4 |
+
This is the main entry point for processing GIT REPOSITORIES. It provides
|
| 5 |
+
enhanced features for repository analysis, including git metadata extraction,
|
| 6 |
+
agentic framework detection, and comprehensive statistics generation.
|
| 7 |
+
|
| 8 |
+
ARCHITECTURE POSITION:
|
| 9 |
+
- Repository Pipeline Orchestrator: Coordinates Git repo processing
|
| 10 |
+
- Enhanced Metadata Collector: Extracts git history and agentic patterns
|
| 11 |
+
- Production Pipeline: Handles large repositories with performance tracking
|
| 12 |
+
|
| 13 |
+
KEY FEATURES:
|
| 14 |
+
1. Complete repository processing with git metadata
|
| 15 |
+
2. Extension-aware filtering (None = full repository)
|
| 16 |
+
3. Performance tracking (files/sec, chunks/sec)
|
| 17 |
+
4. Agentic framework detection (via RepoMetadataExtractor)
|
| 18 |
+
5. Comprehensive output (JSONL chunks + metadata + statistics)
|
| 19 |
+
|
| 20 |
+
DATA FLOW:
|
| 21 |
+
Repo URL → Clone → Metadata extraction → File listing → Chunking →
|
| 22 |
+
Enhanced export → Statistics → Comprehensive output package
|
| 23 |
+
|
| 24 |
+
USE CASES:
|
| 25 |
+
- Processing complete Git repositories for training data
|
| 26 |
+
- Creating agentic-aware datasets
|
| 27 |
+
- Benchmarking chunking performance
|
| 28 |
+
- Production dataset generation
|
| 29 |
+
|
| 30 |
+
USAGE:
|
| 31 |
+
python run_repo_pipeline.py single https://github.com/crewAIInc/crewAI
|
| 32 |
+
python run_repo_pipeline.py single https://github.com/autogen/autogen --extensions .py .md
|
| 33 |
+
python run_repo_pipeline.py single https://github.com/langchain --max-files 1000
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
from pathlib import Path
|
| 37 |
+
import json
|
| 38 |
+
from typing import Dict, Any, Optional, Set, List
|
| 39 |
+
import argparse
|
| 40 |
+
import time
|
| 41 |
+
from datetime import datetime
|
| 42 |
+
|
| 43 |
+
# Import enhanced components
|
| 44 |
+
from src.task_3_data_engineering.ingestion.git_crawler import GitCrawler
|
| 45 |
+
from src.task_3_data_engineering.ingestion.repo_metadata import RepoMetadataExtractor
|
| 46 |
+
from src.task_3_data_engineering.chunking.repo_chunker import RepoChunker
|
| 47 |
+
from src.task_3_data_engineering.analysis.dataset_stats import compute_dataset_stats
|
| 48 |
+
from src.task_3_data_engineering.export.enhanced_jsonl_exporter import export_repo_chunks_jsonl
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class EnhancedRepoPipeline:
|
| 52 |
+
"""Enhanced pipeline with agentic focus"""
|
| 53 |
+
|
| 54 |
+
def __init__(
|
| 55 |
+
self,
|
| 56 |
+
output_base: Path = Path("data/processed/repos"),
|
| 57 |
+
use_hierarchical: bool = True,
|
| 58 |
+
):
|
| 59 |
+
self.crawler = GitCrawler()
|
| 60 |
+
self.chunker = RepoChunker(use_hierarchical=use_hierarchical)
|
| 61 |
+
self.output_base = output_base
|
| 62 |
+
self.output_base.mkdir(parents=True, exist_ok=True)
|
| 63 |
+
|
| 64 |
+
def process_repository(
|
| 65 |
+
self,
|
| 66 |
+
repo_url: str,
|
| 67 |
+
extensions: Optional[Set[str]] = None,
|
| 68 |
+
output_name: Optional[str] = None,
|
| 69 |
+
include_binary: bool = False,
|
| 70 |
+
max_files: Optional[int] = None,
|
| 71 |
+
skip_git_metadata: bool = False,
|
| 72 |
+
) -> Dict[str, Any]:
|
| 73 |
+
"""
|
| 74 |
+
Process repository with enhanced features
|
| 75 |
+
|
| 76 |
+
IMPORTANT FIX:
|
| 77 |
+
        - extensions=None => FULL repository (no filtering)
        - extensions=set() => filtered repository
        """

        start_time = time.time()
        print(f"🚀 Processing repository: {repo_url}")
        print("-" * 60)

        # 1. Clone repository
        repo_path = self.crawler.clone_repository(repo_url)
        if not repo_path:
            raise RuntimeError(f"Failed to clone {repo_url}")

        # 2. Determine output name
        if not output_name:
            output_name = repo_path.name

        # 3. Log extension behavior (FIXED)
        if extensions:
            print(f"📁 Extension filter enabled: {sorted(extensions)}")
        else:
            print("📁 No extension filter → processing FULL repository")

        # 4. Extract repository metadata
        print("📊 Extracting repository metadata...")
        metadata = {}

        if not skip_git_metadata:
            extractor = RepoMetadataExtractor(repo_path)
            metadata = extractor.extract_comprehensive_metadata()

        # 5. List files (CORE LOGIC UNCHANGED)
        print("📁 Listing repository files...")
        file_infos, file_stats = self.crawler.list_files_with_info(
            repo_path,
            extensions=extensions,  # None => full repo
            skip_binary=not include_binary,
        )

        # 6. Optional file limiting
        if max_files and len(file_infos) > max_files:
            print(f"⚠️ Limiting to {max_files} files (out of {len(file_infos)})")
            file_infos = file_infos[:max_files]

        print(f"📊 Found {len(file_infos)} files to process")

        # 7. Create output directory
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = self.output_base / f"{output_name}_{timestamp}"
        output_dir.mkdir(parents=True, exist_ok=True)

        # 8. Repository-level metadata
        # Get actual repo name from metadata
        actual_repo_name = metadata.get("basic", {}).get("repo_name", output_name)

        repo_metadata = {
            "repo_url": repo_url,
            "repo_name": actual_repo_name,  # ✅ Use actual repo name
            "folder_name": output_name,     # ✅ Track user's folder
            "local_path": str(repo_path),
            "extensions_included": list(extensions) if extensions else "ALL",
            "timestamp": timestamp,
            **metadata,
        }

        metadata_file = output_dir / "repository_metadata.json"
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(repo_metadata, f, indent=2, default=str)

        # 9. Chunk processing
        all_chunks = []
        processing_stats = {
            "total_files": len(file_infos),
            "processed": 0,
            "failed": 0,
            "file_types": {},
            "chunk_types": {},
        }

        print("\n🔧 Processing files...")
        print("-" * 60)

        for idx, file_info in enumerate(file_infos, start=1):
            try:
                if idx % 10 == 0:
                    print(f"  [{idx}/{len(file_infos)}] Processing...")

                file_metadata = {
                    **repo_metadata,
                    "file_info": {
                        "relative_path": file_info.relative_path,
                        "size_bytes": file_info.size,
                        "extension": file_info.extension,
                        "is_binary": file_info.is_binary,
                    },
                }

                chunks = self.chunker.chunk_file(
                    file_info.path,
                    file_metadata,
                )

                all_chunks.extend(chunks)
                processing_stats["processed"] += 1
                processing_stats["file_types"][file_info.extension] = (
                    processing_stats["file_types"].get(file_info.extension, 0) + 1
                )

                for chunk in chunks:
                    ct = chunk.chunk_type
                    processing_stats["chunk_types"][ct] = (
                        processing_stats["chunk_types"].get(ct, 0) + 1
                    )

            except Exception as e:
                print(f"⚠️ Error processing {file_info.relative_path}: {str(e)[:120]}")
                processing_stats["failed"] += 1

        # 10. Export chunks
        print("\n💾 Exporting chunks...")
        output_file = output_dir / f"{output_name}_chunks.jsonl"

        export_repo_chunks_jsonl(
            chunks=all_chunks,
            output_path=output_file,
            repo_metadata=repo_metadata,
            print_stats=True,
        )

        # 11. Compute statistics
        print("📈 Computing statistics...")
        chunk_stats = compute_dataset_stats(all_chunks)

        total_time = time.time() - start_time

        final_stats = {
            "repository_info": {
                "name": actual_repo_name,    # ✅ USE actual_repo_name
                "folder_name": output_name,  # ✅ ADD folder_name field
                "url": repo_url,
                "path": str(repo_path),
                "timestamp": timestamp,
            },
            "processing_stats": processing_stats,
            "chunk_statistics": chunk_stats,
            "performance": {
                "total_time_seconds": round(total_time, 2),
                "files_per_second": round(len(file_infos) / total_time, 2),
                "chunks_per_second": round(len(all_chunks) / total_time, 2),
            },
            "output_files": {
                "chunks": str(output_file),
                "metadata": str(metadata_file),
            },
        }

        stats_file = output_dir / f"{output_name}_stats.json"
        with open(stats_file, "w", encoding="utf-8") as f:
            json.dump(final_stats, f, indent=2)

        # 12. Summary
        print("\n" + "=" * 70)
        print("✅ REPOSITORY PROCESSING COMPLETE")
        print("=" * 70)
        print(f"📁 Repository: {output_name}")
        print(f"📄 Files: {len(file_infos)}")
        print(f"🧩 Chunks: {len(all_chunks)}")
        print(f"⏱️ Time: {final_stats['performance']['total_time_seconds']}s")
        print(f"💾 Output: {output_dir}")
        print("=" * 70)

        return final_stats


def main():
    """Enhanced CLI for repository pipeline (FIXED)"""

    parser = argparse.ArgumentParser(
        description="Process Git repositories for agentic datasets"
    )

    subparsers = parser.add_subparsers(dest="command", required=True)

    # ---- Single repo ----
    single = subparsers.add_parser("single", help="Process single repository")
    single.add_argument("repo_url", help="Git repository URL")
    single.add_argument("--name", help="Custom output name")
    single.add_argument(
        "--extensions",
        nargs="+",
        default=None,
        help="Optional file extensions (.py .md). If omitted, FULL repo is processed.",
    )
    single.add_argument("--max-files", type=int, help="Limit number of files")
    single.add_argument("--skip-git-metadata", action="store_true")
    single.add_argument("--include-binary", action="store_true")

    args = parser.parse_args()
    pipeline = EnhancedRepoPipeline()

    if args.command == "single":
        pipeline.process_repository(
            repo_url=args.repo_url,
            output_name=args.name,
            extensions=set(args.extensions) if args.extensions else None,
            max_files=args.max_files,
            skip_git_metadata=args.skip_git_metadata,
            include_binary=args.include_binary,
        )


if __name__ == "__main__":
    main()
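For reference, a minimal sketch of driving the pipeline above programmatically rather than through the `single` subcommand. The repository URL and limits are illustrative, and the import path assumes `scripts/` is importable as a package:

```python
# Hypothetical usage sketch of EnhancedRepoPipeline.process_repository (values are illustrative).
from scripts.run_repo_pipeline import EnhancedRepoPipeline

pipeline = EnhancedRepoPipeline()
stats = pipeline.process_repository(
    repo_url="https://github.com/langchain-ai/langchain",  # illustrative repo
    output_name="langchain",
    extensions={".py", ".md"},  # pass None to process the FULL repository
    max_files=500,
    skip_git_metadata=False,
    include_binary=False,
)
print(stats["output_files"]["chunks"])  # path to the exported *_chunks.jsonl
```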
scripts/triplets_synthesis.py
ADDED
@@ -0,0 +1,259 @@
'''
Synthesize triplet and positive pair datasets from chunked code files.'''

import argparse
import json
import random
import hashlib
from pathlib import Path
from typing import Dict, List
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# ============================
# CONFIG
# ============================

MAX_DOCUMENTS = 200
POSITIVE_VARIANTS = 5
TFIDF_MAX_FEATURES = 5000
RANDOM_SEED = 42

BASE_OUTPUT_DIR = Path("data/synthetic")

random.seed(RANDOM_SEED)


# ============================
# UTILITIES
# ============================

def load_chunks(file_path):
    path = Path(file_path)

    if path.suffix == ".jsonl":
        chunks = []
        with open(path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    chunks.append(json.loads(line))
                except json.JSONDecodeError as e:
                    raise ValueError(
                        f"Invalid JSON on line {line_no} in {path}"
                    ) from e
        return chunks

    elif path.suffix == ".json":
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, list):
            raise ValueError(f"{path} must contain a list of chunks")
        return data

    else:
        raise ValueError(
            f"Unsupported file format {path.suffix}. Use .json or .jsonl"
        )


def save_jsonl(path: Path, records: List[Dict]):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


def save_json(path: Path, data):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


def stable_document_id(chunk: Dict, idx: int) -> str:
    """
    Generate a canonical, stable document_id.
    """
    base = f"{chunk.get('file_path', 'unknown')}::{idx}"
    return "doc_" + hashlib.sha1(base.encode()).hexdigest()


def infer_framework(input_path: Path) -> str:
    """
    Infer framework from path (fallback-safe).
    """
    parts = [p.lower() for p in input_path.parts]
    for fw in ["crewai", "langchain", "langgraph", "autogen"]:
        if fw in parts:
            return fw
    return "unknown"


# ============================
# ANCHOR GENERATION (LLM PLACEHOLDER)
# ============================

def generate_anchor_questions(code: str, n: int) -> List[str]:
    """
    Deterministic placeholder (LLM-ready).
    """
    symbol = code.split("(")[0].replace("def ", "").replace("class ", "").strip()

    templates = [
        f"How does {symbol} work in Python?",
        f"How to implement {symbol}?",
        f"Example usage of {symbol}",
        f"Explain the {symbol} logic",
        f"Best practices for {symbol}",
    ]

    random.shuffle(templates)
    return templates[:n]


# ============================
# NEGATIVE MINING
# ============================

def build_tfidf(chunks: List[Dict]):
    corpus = [c["code"] for c in chunks]
    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=TFIDF_MAX_FEATURES,
    )
    matrix = vectorizer.fit_transform(corpus)
    return vectorizer, matrix


def mine_hard_negative(
    anchor: str,
    positive_idx: int,
    chunks: List[Dict],
    vectorizer,
    matrix,
) -> Dict:
    query_vec = vectorizer.transform([anchor])
    scores = cosine_similarity(query_vec, matrix)[0]

    ranked = sorted(
        [(i, s) for i, s in enumerate(scores)],
        key=lambda x: x[1],
        reverse=True,
    )

    for idx, _ in ranked:
        if idx != positive_idx:
            return chunks[idx]

    raise RuntimeError("No negative candidate found")


# ============================
# MAIN PIPELINE
# ============================

def generate_datasets(input_path: Path, run_name: str):
    output_dir = BASE_OUTPUT_DIR / run_name
    framework = infer_framework(input_path)

    chunks = load_chunks(input_path)
    # Filter only semantic code chunks
    chunks = [
        c for c in chunks
        if c.get("chunk_type") in {"class", "method", "function"}
        and "code" in c
    ]

    random.shuffle(chunks)
    chunks = chunks[:MAX_DOCUMENTS]

    # Assign canonical document_id
    for idx, c in enumerate(chunks):
        c["document_id"] = stable_document_id(c, idx)

    vectorizer, matrix = build_tfidf(chunks)

    positive_pairs = []
    triplets = []

    for idx, chunk in enumerate(chunks):
        code = chunk["code"]
        doc_id = chunk["document_id"]

        # -------- POSITIVE PAIRS --------
        anchors = generate_anchor_questions(code, POSITIVE_VARIANTS)
        for a in anchors:
            positive_pairs.append({
                "document_id": doc_id,
                "anchor": a,
                "positive": code,
                "framework": framework,
                "source": "synthetic_positive_v2",
            })

        # -------- TRIPLET --------
        anchor = anchors[0]
        negative_chunk = mine_hard_negative(
            anchor, idx, chunks, vectorizer, matrix
        )

        triplets.append({
            "document_id": doc_id,
            "anchor": anchor,
            "positive": code,
            "negative": negative_chunk["code"],
            "framework": framework,
            "source": "synthetic_triplet_v2",
        })

    # -------- SAVE --------
    save_jsonl(output_dir / "positive_pairs.jsonl", positive_pairs)
    save_jsonl(output_dir / "triplets.jsonl", triplets)

    save_json(output_dir / "positive_pairs.json", positive_pairs)
    save_json(output_dir / "triplets.json", triplets)

    metadata = {
        "name": run_name,
        "framework": framework,
        "input_file": str(input_path),
        "num_chunks": len(chunks),
        "positive_pairs": len(positive_pairs),
        "triplets": len(triplets),
        "created_at": datetime.utcnow().isoformat(),
        "random_seed": RANDOM_SEED,
    }

    save_json(output_dir / "metadata.json", metadata)

    print(f"✅ Dataset generated at: {output_dir}")


# ============================
# ENTRY POINT
# ============================

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="Chunked JSONL file")
    parser.add_argument("--name", required=True, help="Synthetic dataset name")

    args = parser.parse_args()

    generate_datasets(
        input_path=Path(args.input),
        run_name=args.name,
    )

# # For document id
# document_id := sha1(
#     normalized_repo_path +
#     file_path +
#     top_level_symbol
# )
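The trailing comment above describes a path- and symbol-based ID, which would be more stable across runs than the index-based `stable_document_id`. A minimal sketch of that scheme, assuming the three components are available as strings (the `::` separator and `doc_` prefix mirror `stable_document_id` and are assumptions):

```python
import hashlib

def sha1_document_id(normalized_repo_path: str, file_path: str, top_level_symbol: str) -> str:
    # Sketch of the sha1-based document_id described in the comment above.
    # The "::" separator and "doc_" prefix follow stable_document_id and are assumptions.
    base = f"{normalized_repo_path}::{file_path}::{top_level_symbol}"
    return "doc_" + hashlib.sha1(base.encode("utf-8")).hexdigest()
```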
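A minimal invocation sketch for this script; the input path is illustrative and assumes a chunked JSONL produced by the repository pipeline:

```python
# Equivalent to: python scripts/triplets_synthesis.py --input <chunks.jsonl> --name <run_name>
from pathlib import Path
from scripts.triplets_synthesis import generate_datasets

generate_datasets(
    input_path=Path("data/processed/langchain_chunks.jsonl"),  # illustrative path
    run_name="langchain_synthetic_v2",
)
# Results are written under data/synthetic/<run_name>/:
#   positive_pairs.jsonl, triplets.jsonl, positive_pairs.json, triplets.json, metadata.json
```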