CodeMode Agent committed
Commit 463fc7e · Parent: 17cc505

Deploy CodeMode via Agent

Files changed (42)
  1. README.md +20 -5
  2. app.py +430 -0
  3. requirements.txt +9 -0
  4. scripts/__init__.py +0 -0
  5. scripts/__pycache__/__init__.cpython-311.pyc +0 -0
  6. scripts/aggregate_datasets.py +77 -0
  7. scripts/core/README.md +37 -0
  8. scripts/core/__init__.py +0 -0
  9. scripts/core/__pycache__/__init__.cpython-311.pyc +0 -0
  10. scripts/core/ingestion/__init__.py +0 -0
  11. scripts/core/ingestion/__pycache__/__init__.cpython-311.pyc +0 -0
  12. scripts/core/ingestion/__pycache__/ast_chunker.cpython-311.pyc +0 -0
  13. scripts/core/ingestion/__pycache__/chunk.cpython-311.pyc +0 -0
  14. scripts/core/ingestion/__pycache__/chunk_schema.cpython-311.pyc +0 -0
  15. scripts/core/ingestion/__pycache__/doc_chunker.cpython-311.pyc +0 -0
  16. scripts/core/ingestion/__pycache__/hierarchical_chunker.cpython-311.pyc +0 -0
  17. scripts/core/ingestion/__pycache__/ingest.cpython-311.pyc +0 -0
  18. scripts/core/ingestion/__pycache__/repo_metadata.cpython-311.pyc +0 -0
  19. scripts/core/ingestion/__pycache__/ts_chunker.cpython-311.pyc +0 -0
  20. scripts/core/ingestion/ast_chunker.py +390 -0
  21. scripts/core/ingestion/chunk.py +497 -0
  22. scripts/core/ingestion/chunk_schema.py +112 -0
  23. scripts/core/ingestion/doc_chunker.py +446 -0
  24. scripts/core/ingestion/generate_data.py +658 -0
  25. scripts/core/ingestion/hierarchical_chunker.py +182 -0
  26. scripts/core/ingestion/ingest.py +380 -0
  27. scripts/core/ingestion/repo_metadata.py +408 -0
  28. scripts/core/ingestion/ts_chunker.py +155 -0
  29. scripts/core/training/__init__.py +0 -0
  30. scripts/core/training/model.py +47 -0
  31. scripts/core/training/test_model.py +64 -0
  32. scripts/core/training/train.py +145 -0
  33. scripts/core/training/trainer.py +118 -0
  34. scripts/core/utils/__init__.py +0 -0
  35. scripts/core/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  36. scripts/core/utils/__pycache__/id_utils.cpython-311.pyc +0 -0
  37. scripts/core/utils/id_utils.py +91 -0
  38. scripts/generate_all_frameworks.py +228 -0
  39. scripts/run_pairs_triplets_pipeline.py +120 -0
  40. scripts/run_python_pipeline.py +131 -0
  41. scripts/run_repo_pipeline.py +289 -0
  42. scripts/triplets_synthesis.py +259 -0
README.md CHANGED
@@ -1,12 +1,27 @@
1
  ---
2
  title: CodeMode
3
- emoji: 🏢
4
- colorFrom: purple
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: CodeMode
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.19.2
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # CodeMode: Agentic RAG Engine
14
+
15
+ This is the official demo for CodeMode, an advanced RAG engine for codebases.
16
+
17
+ ## Features
18
+ - **Ingest**: Clone and index any public GitHub repository.
19
+ - **Semantic Search**: Find relevant code using natural language.
20
+ - **Code-to-Code**: Find similar functions using code snippets.
21
+ - **MLOps**: Analyze embedding quality and diversity.
22
+
23
+ ## Local Setup
24
+ ```bash
25
+ pip install -r requirements.txt
26
+ python app.py
27
+ ```
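For readers who want to call the Space's embedding model outside the Gradio UI, a minimal sketch follows. The model name, mean pooling over the last hidden state, and L2 normalization mirror `compute_embeddings()` in the `app.py` added by this commit; the snippet itself is illustrative and not part of the committed files.

```python
# Minimal sketch: embed a query with the model this Space loads in app.py.
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "shubharuidas/codebert-base-code-embed-mrl-langchain-langgraph"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).eval()

def embed(texts):
    # Truncate to 512 tokens, mean-pool the last hidden state, then L2-normalize,
    # matching compute_embeddings() in app.py.
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        out = model(**inputs)
    return F.normalize(out.last_hidden_state.mean(dim=1), p=2, dim=1)

print(embed(["how to create a state graph"]).shape)  # e.g. torch.Size([1, 768]) for a BERT-base encoder
```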
app.py ADDED
@@ -0,0 +1,430 @@
1
+ import gradio as gr
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from transformers import AutoTokenizer, AutoModel
5
+ import pandas as pd
6
+ import sys
7
+ import os
8
+ import shutil
9
+ from pathlib import Path
10
+ import chromadb
11
+ from chromadb.config import Settings
12
+ import uuid
13
+
14
+ # --- Add scripts to path so we can import ingestion modules ---
16
+ sys.path.append(os.path.dirname(__file__))
17
+ from scripts.core.ingestion.ingest import GitCrawler
18
+ from scripts.core.ingestion.chunk import RepoChunker
19
+
20
+ # --- Configuration ---
21
+ MODEL_NAME = "shubharuidas/codebert-base-code-embed-mrl-langchain-langgraph"
22
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
23
+ DB_DIR = Path("data/chroma_db")
24
+ DB_DIR.mkdir(parents=True, exist_ok=True)
25
+
26
+ print(f"Loading model: {MODEL_NAME} on {DEVICE}...")
27
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
28
+ model = AutoModel.from_pretrained(MODEL_NAME)
29
+ model.to(DEVICE)
30
+ model.eval()
31
+ print("Model loaded!")
32
+
33
+ # --- Vector Database Setup ---
34
+ # Initialize ChromaDB Client (Persistent)
35
+ chroma_client = chromadb.PersistentClient(path=str(DB_DIR))
36
+
37
+ # Create or Get Collection
38
+ # We use cosine similarity space
39
+ collection = chroma_client.get_or_create_collection(name="codemode_rag", metadata={"hnsw:space": "cosine"})
40
+
41
+ # --- Helper Functions ---
42
+ def compute_embeddings(text_list):
43
+ """Batch compute embeddings"""
44
+ if not text_list: return None
45
+ # Truncate to 512 tokens to avoid errors
46
+ inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
47
+ with torch.no_grad():
48
+ out = model(**inputs)
49
+ emb = out.last_hidden_state.mean(dim=1)
50
+ return F.normalize(emb, p=2, dim=1)
51
+
52
+ def reset_db():
53
+ """Clear database"""
54
+ try:
55
+ chroma_client.delete_collection("codemode_rag")
56
+ chroma_client.get_or_create_collection(name="codemode_rag", metadata={"hnsw:space": "cosine"})
57
+ return "Database reset (All embeddings deleted)."
58
+ except Exception as e:
59
+ return f"Error resetting DB: {e}"
60
+
61
+ def search_codebase(query, top_k=5):
62
+ """Semantic Search using ChromaDB"""
63
+ if collection.count() == 0: return []
64
+
65
+ query_emb = compute_embeddings([query])
66
+ if query_emb is None: return []
67
+
68
+ # Convert tensor to list for Chroma
69
+ query_vec = query_emb.cpu().numpy().tolist()[0]
70
+
71
+ results = collection.query(
72
+ query_embeddings=[query_vec],
73
+ n_results=min(top_k, collection.count()),
74
+ include=["metadatas", "documents", "distances"]
75
+ )
76
+
77
+ # Parse items
78
+ output = []
79
+ if results['ids']:
80
+ for i in range(len(results['ids'][0])):
81
+ meta = results['metadatas'][0][i]
82
+ code = results['documents'][0][i]
83
+ dist = results['distances'][0][i]
84
+ score = 1 - dist # Cosine distance to similarity
85
+
86
+ link_icon = "[Link]" if score > 0.7 else ""
87
+ output.append([meta.get("file_name", "unknown"), f"{score:.4f} {link_icon}", code[:300] + "..."])
88
+
89
+ return output
90
+
91
+ def fn_ingest(repo_url):
92
+ """
93
+ 1. Clone Repo
94
+ 2. Chunk Files
95
+ 3. Compute Embeddings (Batched)
96
+ 4. Store in ChromaDB
97
+ """
98
+ if not repo_url.startswith("http"):
99
+ return "Invalid URL"
100
+
101
+ DATA_DIR = Path(os.path.abspath("data/raw_ingest"))
102
+ import stat
103
+ def remove_readonly(func, path, _):
104
+ os.chmod(path, stat.S_IWRITE)
105
+ func(path)
106
+
107
+ try:
108
+ # Clean up old raw data
109
+ if DATA_DIR.exists():
110
+ shutil.rmtree(DATA_DIR, onerror=remove_readonly)
111
+
112
+ # 1. Clone
113
+ yield f"Cloning {repo_url}..."
114
+ crawler = GitCrawler(cache_dir=DATA_DIR)
115
+ repo_path = crawler.clone_repository(repo_url)
116
+
117
+ if not repo_path:
118
+ return "Failed to clone repository."
119
+
120
+ # 2. Chunk
121
+ yield "Listing files..."
122
+ files = crawler.list_files(repo_path, extensions={'.py', '.md', '.json', '.js', '.ts', '.java', '.cpp'})
123
+ if isinstance(files, tuple): files = [f.path for f in files[0]]
124
+
125
+ total_files = len(files)
126
+ yield f"Found {total_files} files. Chunking..."
127
+
128
+ chunker = RepoChunker()
129
+ all_chunks = []
130
+
131
+ for i, file_path in enumerate(files):
132
+ yield f"Chunking: {i+1}/{total_files} ({file_path.name})"
133
+ try:
134
+ meta = {"file_name": file_path.name, "url": repo_url}
135
+ file_chunks = chunker.chunk_file(file_path, repo_metadata=meta)
136
+ all_chunks.extend(file_chunks)
137
+ except Exception as e:
138
+ print(f"Skipping {file_path}: {e}")
139
+
140
+ if not all_chunks:
141
+ return "No valid chunks found."
142
+
143
+ # 3. Indexing Loop (Batched)
144
+ total_chunks = len(all_chunks)
145
+ yield f"Generated {total_chunks} chunks. Embedding & Indexing into ChromaDB..."
146
+
147
+ batch_size = 64
148
+ for i in range(0, total_chunks, batch_size):
149
+ batch = all_chunks[i:i+batch_size]
150
+
151
+ # Prepare data
152
+ texts = [c.code for c in batch]
153
+ ids = [str(uuid.uuid4()) for _ in batch]
154
+ metadatas = [{"file_name": Path(c.file_path).name, "url": repo_url} for c in batch]
155
+
156
+ # Compute Embeddings
157
+ embeddings = compute_embeddings(texts)
158
+ if embeddings is not None:
159
+ # Add to Chroma
160
+ collection.add(
161
+ ids=ids,
162
+ embeddings=embeddings.cpu().numpy().tolist(),
163
+ metadatas=metadatas,
164
+ documents=texts
165
+ )
166
+
167
+ progress = int((i / total_chunks) * 100)
168
+ yield f"Indexed {min(i+batch_size, total_chunks)}/{total_chunks} ({progress}%)"
169
+
170
+ count = collection.count()
171
+ yield f"Success! Database now has {count} code chunks. Ready for search."
172
+
173
+ except Exception as e:
174
+ import traceback
175
+ traceback.print_exc()
176
+ yield f"Error: {str(e)}"
177
+
178
+ # --- Analysis Functions ---
179
+ def fn_analyze_embeddings():
180
+ count = collection.count()
181
+ if count < 5:
182
+ return "Not enough data (Need > 5 chunks).", None
183
+
184
+ try:
185
+ # Fetch all embeddings (Limit to 2000 for visualization speed)
186
+ limit = min(count, 2000)
187
+ data = collection.get(limit=limit, include=["embeddings", "metadatas"])
188
+
189
+ X = torch.tensor(data['embeddings'])
190
+
191
+ # PCA
192
+ X_mean = torch.mean(X, 0)
193
+ X_centered = X - X_mean
194
+ U, S, V = torch.pca_lowrank(X_centered, q=2)
195
+ projected = torch.matmul(X_centered, V[:, :2]).numpy()
196
+
197
+ # Diversity
198
+ indices = torch.randint(0, len(X), (min(100, len(X)),))
199
+ sample = X[indices]
200
+ sim_matrix = torch.mm(sample, sample.t())
201
+ mask = ~torch.eye(len(sample), dtype=bool)
202
+ avg_sim = sim_matrix[mask].mean().item()
203
+ diversity_score = 1.0 - avg_sim
204
+
205
+ metrics = (
206
+ f"Total Chunks: {count}\n"
207
+ f"Analyzed: {len(X)} (Sampled)\n"
208
+ f"Diversity Score: {diversity_score:.4f}\n"
209
+ f"Est. Avg Similarity: {avg_sim:.4f}"
210
+ )
211
+
212
+ plot_df = pd.DataFrame({
213
+ "x": projected[:, 0],
214
+ "y": projected[:, 1],
215
+ "topic": [m.get("file_name", "unknown") for m in data['metadatas']]
216
+ })
217
+
218
+ return metrics, gr.ScatterPlot(value=plot_df, x="x", y="y", color="topic", title="Semantic Space", tooltip="topic")
219
+
220
+ except Exception as e:
221
+ import traceback
222
+ traceback.print_exc()
223
+ return f"Analysis Error: {e}", None
224
+
225
+ def fn_evaluate_retrieval(sample_limit):
226
+ count = collection.count()
227
+ if count < 10: return "Not enough data for evaluation (Need > 10 chunks)."
228
+
229
+ try:
230
+ # Sample random chunks
231
+ # Chroma doesn't support random sample easily, so we get a larger batch and pick random
232
+ fetch_limit = min(count, 2000) # Fetch up to 2k to sample from
233
+ data = collection.get(limit=fetch_limit, include=["documents", "ids"])
234
+
235
+ import random
236
+ actual_sample_size = min(sample_limit, len(data['ids']))
237
+ sample_indices = random.sample(range(len(data['ids'])), actual_sample_size)
238
+
239
+ hits_at_1 = 0
240
+ hits_at_5 = 0
241
+ mrr_sum = 0
242
+
243
+ # Generator for progress updates
244
+ yield f"Running evaluation on {actual_sample_size} chunks..."
245
+
246
+ for i, idx in enumerate(sample_indices):
247
+ target_id = data['ids'][idx]
248
+ code = data['documents'][idx]
249
+
250
+ # Synthetic Query
251
+ query = "\n".join(code.split("\n")[:3])
252
+ query_emb = compute_embeddings([query]).cpu().numpy().tolist()[0]
253
+
254
+ # Query DB
255
+ results = collection.query(query_embeddings=[query_emb], n_results=10)
256
+
257
+ # Check results
258
+ found_ids = results['ids'][0]
259
+ if target_id in found_ids:
260
+ rank = found_ids.index(target_id) + 1
261
+ mrr_sum += 1.0 / rank
262
+ if rank == 1: hits_at_1 += 1
263
+ if rank <= 5: hits_at_5 += 1
264
+
265
+ if i % 10 == 0:
266
+ yield f"Evaluated {i}/{actual_sample_size}..."
267
+
268
+ recall_1 = hits_at_1 / actual_sample_size
269
+ recall_5 = hits_at_5 / actual_sample_size
270
+ mrr = mrr_sum / actual_sample_size
271
+
272
+ report = (
273
+ f"Evaluation on {actual_sample_size} random chunks:\n"
274
+ f"--------------------------------------------\n"
275
+ f"Recall@1: {recall_1:.4f}\n"
276
+ f"Recall@5: {recall_5:.4f}\n"
277
+ f"MRR: {mrr:.4f}\n"
278
+ f"\n(Note: Using ChromaDB for retrieval)"
279
+ )
280
+ yield report
281
+ except Exception as e:
282
+ import traceback
283
+ traceback.print_exc()
284
+ yield f"Eval Error: {e}"
285
+
286
+
287
+ # --- UI Layout ---
288
+ theme = gr.themes.Soft(
289
+ primary_hue="slate",
290
+ neutral_hue="slate",
291
+ spacing_size="sm",
292
+ radius_size="md"
293
+ ).set(
294
+ body_background_fill="*neutral_50",
295
+ block_background_fill="white",
296
+ block_border_width="1px",
297
+ block_title_text_weight="600"
298
+ )
299
+
300
+ css = """
301
+ h1 {
302
+ text-align: center;
303
+ font-family: 'Inter', sans-serif;
304
+ margin-bottom: 1rem;
305
+ color: #1e293b;
306
+ }
307
+ .gradio-container {
308
+ max-width: 1200px !important;
309
+ margin: auto;
310
+ }
311
+ """
312
+
313
+ with gr.Blocks(theme=theme, css=css, title="CodeMode") as demo:
314
+ gr.Markdown("# CodeMode")
315
+
316
+ with gr.Tabs():
317
+ # --- TAB 1: INGEST ---
318
+ with gr.Tab("1. Ingest GitHub Repo"):
319
+ gr.Markdown("### Connect a Repository")
320
+ with gr.Row():
321
+ repo_input = gr.Textbox(label="GitHub URL", placeholder="https://github.com/fastapi/fastapi", value="https://github.com/langchain-ai/langgraph")
322
+ ingest_btn = gr.Button("Ingest & Index", variant="primary")
323
+
324
+ with gr.Row():
325
+ reset_btn = gr.Button("Reset Database", variant="stop")
326
+ ingest_status = gr.Textbox(label="Status")
327
+
328
+ with gr.Accordion("Database Inspector", open=False):
329
+ list_files_btn = gr.Button("Refresh File List")
330
+ files_df = gr.Dataframe(
331
+ headers=["File Name", "Chunks", "Source URL"],
332
+ datatype=["str", "number", "str"],
333
+ interactive=False
334
+ )
335
+
336
+ def fn_list_files():
337
+ count = collection.count()
338
+ if count == 0: return [["Database Empty", 0, "-"]]
339
+
340
+ try:
341
+ # Fetch all metadata (limit to 10k to prevent UI freeze)
342
+ limit = min(count, 10000)
343
+ data = collection.get(limit=limit, include=["metadatas"])
344
+
345
+ if not data or 'metadatas' not in data or data['metadatas'] is None:
346
+ return [["Error: No metadata found", 0, "-"]]
347
+
348
+ # Aggregate stats
349
+ file_counts = {} # filename -> count
350
+ file_urls = {} # filename -> url
351
+
352
+ for meta in data['metadatas']:
353
+ if meta is None: continue # Skip None entries
354
+ fname = meta.get("file_name", "unknown")
355
+ url = meta.get("url", "-")
356
+ file_counts[fname] = file_counts.get(fname, 0) + 1
357
+ file_urls[fname] = url
358
+
359
+ # Convert to list
360
+ output = []
361
+ for fname, count in file_counts.items():
362
+ output.append([fname, count, file_urls[fname]])
363
+
364
+ if not output:
365
+ return [["No files found in metadata", 0, "-"]]
366
+
367
+ # Sort by chunk count (descending)
368
+ output.sort(key=lambda x: x[1], reverse=True)
369
+ return output
370
+ except Exception as e:
371
+ import traceback
372
+ traceback.print_exc()
373
+ return [[f"Error: {str(e)}", 0, "-"]]
374
+
375
+ ingest_btn.click(fn_ingest, inputs=repo_input, outputs=[ingest_status])
376
+ reset_btn.click(fn=reset_db, inputs=[], outputs=[ingest_status])
377
+ list_files_btn.click(fn_list_files, inputs=[], outputs=[files_df])
378
+
379
+ # --- TAB 2: SEARCH ---
380
+ with gr.Tab("2. Semantic Search"):
381
+ gr.Markdown("### Search the Ingested Code")
382
+ with gr.Row():
383
+ search_box = gr.Textbox(label="Search Query", placeholder="e.g., 'how to create a state graph'")
384
+ search_btn = gr.Button("Search", variant="primary")
385
+
386
+ results_df = gr.Dataframe(
387
+ headers=["File Name", "Score", "Code Snippet"],
388
+ datatype=["str", "str", "str"],
389
+ interactive=False,
390
+ wrap=True
391
+ )
392
+ search_btn.click(fn=search_codebase, inputs=search_box, outputs=results_df)
393
+
394
+ # --- TAB 3: CODE SEARCH ---
395
+ with gr.Tab("3. Find Similar Code"):
396
+ gr.Markdown("### Code-to-Code Retrieval")
397
+ with gr.Row():
398
+ code_input = gr.Code(label="Reference Code", language="python")
399
+ code_search_btn = gr.Button("Find Matches", variant="primary")
400
+
401
+ code_results_df = gr.Dataframe(
402
+ headers=["File Name", "Score", "Matched Code"],
403
+ datatype=["str", "str", "str"],
404
+ interactive=False,
405
+ wrap=True
406
+ )
407
+ code_search_btn.click(fn=search_codebase, inputs=code_input, outputs=code_results_df)
408
+
409
+ # --- TAB 4: MLOps MONITORING ---
410
+ with gr.Tab("4. Deployment Monitoring"):
411
+ gr.Markdown("### Embedding Quality Analysis")
412
+ analyze_btn = gr.Button("Analyze Embeddings", variant="secondary")
413
+
414
+ with gr.Row():
415
+ quality_metrics = gr.Textbox(label="Quality Metrics")
416
+ plot_output = gr.ScatterPlot(label="Semantic Space (PCA)")
417
+
418
+ analyze_btn.click(fn_analyze_embeddings, inputs=[], outputs=[quality_metrics, plot_output])
419
+
420
+ gr.Markdown("### Extrinsic Evaluation (Retrieval Performance)")
421
+ with gr.Row():
422
+ eval_size = gr.Slider(minimum=10, maximum=1000, value=50, step=10, label="Sample Size (Chunks)")
423
+ eval_btn = gr.Button("Run Retrieval Evaluation", variant="primary")
424
+
425
+ eval_output = gr.Textbox(label="Evaluation Report")
426
+
427
+ eval_btn.click(fn_evaluate_retrieval, inputs=[eval_size], outputs=eval_output)
428
+
429
+ if __name__ == "__main__":
430
+ demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=False)
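For reference, the report produced by the "Deployment Monitoring" tab reduces to the Recall@k and MRR arithmetic used in `fn_evaluate_retrieval` above. A standalone sketch with made-up ranks (example values only, not data from the Space):

```python
# The metric arithmetic from fn_evaluate_retrieval, isolated for reference.
# ranks holds the 1-based rank of each target chunk in its top-10 result list,
# or None when the target was not retrieved (example values only).
ranks = [1, 3, None, 2, 7, 1]

n = len(ranks)
recall_at_1 = sum(r == 1 for r in ranks if r is not None) / n
recall_at_5 = sum(r <= 5 for r in ranks if r is not None) / n
mrr = sum(1.0 / r for r in ranks if r is not None) / n
print(f"Recall@1={recall_at_1:.4f}  Recall@5={recall_at_5:.4f}  MRR={mrr:.4f}")
```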
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ gradio>=4.0.0
2
+ chromadb>=0.4.0
3
+ torch
4
+ transformers
5
+ pandas
6
+ scikit-learn
7
+ tree-sitter==0.21.3
8
+ tree-sitter-languages
9
+ gitpython
scripts/__init__.py ADDED
File without changes
scripts/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes).
scripts/aggregate_datasets.py ADDED
@@ -0,0 +1,77 @@
1
+ '''
2
+
3
+ Aggregate the synthetic datasets generated by triplets_synthesis.py across multiple runs into a single combined dataset.
4
+
5
+ '''
6
+
7
+ import json
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+ from typing import List, Dict
11
+
12
+ BASE_SYNTHETIC_DIR = Path("data/synthetic")
13
+ OUTPUT_DIR = BASE_SYNTHETIC_DIR / "combined"
14
+
15
+
16
+ def load_jsonl(path: Path) -> List[Dict]:
17
+ with path.open("r", encoding="utf-8") as f:
18
+ return [json.loads(line) for line in f]
19
+
20
+
21
+ def save_jsonl(path: Path, records: List[Dict]):
22
+ path.parent.mkdir(parents=True, exist_ok=True)
23
+ with path.open("w", encoding="utf-8") as f:
24
+ for r in records:
25
+ f.write(json.dumps(r, ensure_ascii=False) + "\n")
26
+
27
+
28
+ def save_json(path: Path, records: List[Dict]):
29
+ path.parent.mkdir(parents=True, exist_ok=True)
30
+ with path.open("w", encoding="utf-8") as f:
31
+ json.dump(records, f, indent=2)
32
+
33
+
34
+ def aggregate():
35
+ positive_pairs_all = []
36
+ triplets_all = []
37
+ included_runs = []
38
+
39
+ for run_dir in BASE_SYNTHETIC_DIR.iterdir():
40
+ if not run_dir.is_dir():
41
+ continue
42
+ if run_dir.name == "combined":
43
+ continue
44
+
45
+ pos_path = run_dir / "positive_pairs.jsonl"
46
+ tri_path = run_dir / "triplets.jsonl"
47
+
48
+ if pos_path.exists() and tri_path.exists():
49
+ positive_pairs_all.extend(load_jsonl(pos_path))
50
+ triplets_all.extend(load_jsonl(tri_path))
51
+ included_runs.append(run_dir.name)
52
+
53
+ # Save JSONL (training)
54
+ save_jsonl(OUTPUT_DIR / "positive_pairs.jsonl", positive_pairs_all)
55
+ save_jsonl(OUTPUT_DIR / "triplets.jsonl", triplets_all)
56
+
57
+ # Save JSON (inspection / upload)
58
+ save_json(OUTPUT_DIR / "positive_pairs.json", positive_pairs_all)
59
+ save_json(OUTPUT_DIR / "triplets.json", triplets_all)
60
+
61
+ # Metadata
62
+ metadata = {
63
+ "type": "combined_dataset",
64
+ "included_runs": included_runs,
65
+ "total_positive_pairs": len(positive_pairs_all),
66
+ "total_triplets": len(triplets_all),
67
+ "created_at": datetime.utcnow().isoformat(),
68
+ }
69
+
70
+ with (OUTPUT_DIR / "metadata.json").open("w", encoding="utf-8") as f:
71
+ json.dump(metadata, f, indent=2)
72
+
73
+ print("✅ Combined dataset created at:", OUTPUT_DIR)
74
+
75
+
76
+ if __name__ == "__main__":
77
+ aggregate()
scripts/core/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # CodeMode Core Scripts 🚀
2
+
3
+ This directory contains the **modular core logic** for the CodeMode pipeline. It is designed to be cleaner and more production-ready than the experimental notebooks.
4
+
5
+ ## Structure
6
+
7
+ ### 1. Ingestion (`scripts/core/ingestion`)
8
+ Handles data collection and processing.
9
+ - `ingest.py`: The Git Crawler (formerly `git_crawler.py`).
10
+ - `chunk.py`: The Universal Chunker (formerly `repo_chunker.py`).
11
+ - `generate_data.py`: Creates training triplets (formerly `pairs_triplets_generator.py`).
12
+
13
+ **Usage:**
14
+ ```bash
15
+ # Example: Ingest a repo
16
+ python -m scripts.core.ingestion.ingest --url https://github.com/crewAIInc/crewAI
17
+
18
+ # Example: Generate Triplets
19
+ python -m scripts.core.ingestion.generate_data --chunks data/processed/chunks.jsonl --output data/training
20
+ ```
21
+
22
+ ### 2. Training (`scripts/core/training`)
23
+ Handles model training and embedding generation.
24
+ - `train.py`: Main training loop.
25
+ - `model.py`: The CodeEmbedder model architecture.
26
+ - `trainer.py`: The training loop logic.
27
+
28
+ **Usage:**
29
+ ```bash
30
+ # Example: Train the model
31
+ python -m scripts.core.training.train --data_path data/training/triplets.jsonl --epochs 3
32
+ ```
33
+
34
+ ## Why this structure?
35
+ - **Separation of Concerns:** Training logic doesn't depend on web scraping libraries.
36
+ - **Reusability:** You can import `CodeEmbedder` or `RepoChunker` in other projects easily.
37
+ - **Production Ready:** Direct python scripts instead of notebooks.
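As a sketch of the "Reusability" point above, importing `RepoChunker` from another project might look like the following. The `chunk_file(file_path, repo_metadata)` interface comes from `scripts/core/ingestion/chunk.py` later in this commit; the target path and metadata dict are illustrative.

```python
# Minimal sketch: reuse RepoChunker outside the CodeMode pipeline.
from pathlib import Path
from scripts.core.ingestion.chunk import RepoChunker

chunker = RepoChunker()
chunks = chunker.chunk_file(
    Path("scripts/core/ingestion/ingest.py"),      # any file in a checkout; illustrative
    repo_metadata={"repo": "CodeMode"},            # attached to every chunk's metadata
)
for chunk in chunks[:3]:
    print(chunk.chunk_type, chunk.ast.name, chunk.span.start_line, chunk.span.end_line)
```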
scripts/core/__init__.py ADDED
File without changes
scripts/core/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (177 Bytes).
scripts/core/ingestion/__init__.py ADDED
File without changes
scripts/core/ingestion/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (187 Bytes).
scripts/core/ingestion/__pycache__/ast_chunker.cpython-311.pyc ADDED
Binary file (14.9 kB).
scripts/core/ingestion/__pycache__/chunk.cpython-311.pyc ADDED
Binary file (20.4 kB).
scripts/core/ingestion/__pycache__/chunk_schema.cpython-311.pyc ADDED
Binary file (4.74 kB).
scripts/core/ingestion/__pycache__/doc_chunker.cpython-311.pyc ADDED
Binary file (14.8 kB).
scripts/core/ingestion/__pycache__/hierarchical_chunker.cpython-311.pyc ADDED
Binary file (8.04 kB).
scripts/core/ingestion/__pycache__/ingest.cpython-311.pyc ADDED
Binary file (18 kB).
scripts/core/ingestion/__pycache__/repo_metadata.cpython-311.pyc ADDED
Binary file (21.7 kB).
scripts/core/ingestion/__pycache__/ts_chunker.cpython-311.pyc ADDED
Binary file (5.77 kB).
scripts/core/ingestion/ast_chunker.py ADDED
@@ -0,0 +1,390 @@
1
+ """
2
+ AST-based semantic code chunker - Primary source of truth for code structure.
3
+
4
+ This module implements the core AST-based chunking strategy that forms the
5
+ authority layer of our hybrid chunking pipeline. It uses Python's built-in
6
+ AST parser to extract semantic chunks (modules, classes, functions, methods)
7
+ while preserving hierarchical relationships.
8
+
9
+ ARCHITECTURE POSITION:
10
+ - Authority Layer: Source of truth for semantic structure
11
+ - Primary Chunker: Generates all primary chunks
12
+ - Hierarchy Builder: Establishes parent-child relationships
13
+
14
+ KEY FEATURES:
15
+ 1. AST-first parsing for semantic accuracy
16
+ 2. Hierarchical chunk generation with depth tracking
17
+ 3. Byte-level span calculation for precise positioning
18
+ 4. Import and decorator extraction per node
19
+ 5. Deterministic chunk ID generation
20
+
21
+ FLOW:
22
+ File → Python AST → ASTChunker visitor → Semantic chunks with hierarchy
23
+
24
+ USAGE:
25
+ from ast_chunker import extract_ast_chunks
26
+ chunks = extract_ast_chunks(Path("file.py"))
27
+ """
28
+
29
+ import ast
30
+ from pathlib import Path
31
+ from typing import List, Optional, Union, Dict, Tuple
32
+ import hashlib
33
+
34
+ from ..utils.id_utils import deterministic_chunk_id
35
+ from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ASTSymbolType, ChunkType
36
+
37
+ DocNode = Union[
38
+ ast.Module,
39
+ ast.ClassDef,
40
+ ast.FunctionDef,
41
+ ast.AsyncFunctionDef,
42
+ ]
43
+
44
+
45
+ class ASTChunker(ast.NodeVisitor):
46
+ def __init__(self, source: str, file_path: str):
47
+ self.source = source
48
+ self.file_path = file_path
49
+ self.source_bytes = source.encode('utf-8')
50
+ self.chunks: List[CodeChunk] = []
51
+ self.tree = ast.parse(source)
52
+
53
+ # Track hierarchy
54
+ self.current_class: Optional[str] = None
55
+ self.imports_list: List[str] = []
56
+
57
+ # For hierarchy tracking
58
+ self.parent_stack: List[CodeChunk] = []
59
+ self.sibling_counters: Dict[str, int] = {}
60
+
61
+ # Attach parents to nodes
62
+ for node in ast.walk(self.tree):
63
+ for child in ast.iter_child_nodes(node):
64
+ setattr(child, "parent", node)
65
+
66
+ # ---------------- utilities ----------------
67
+
68
+ def _get_code(self, node: ast.AST) -> str:
69
+ code = ast.get_source_segment(self.source, node)
70
+ return code.strip() if code else ""
71
+
72
+ def _get_byte_span(self, start_line: int, end_line: int) -> Tuple[int, int]:
73
+ """Convert line numbers to byte positions"""
74
+ lines = self.source.split('\n')
75
+
76
+ # Calculate start byte
77
+ start_byte = sum(len(line.encode()) + 1 for line in lines[:start_line-1])
78
+
79
+ # Calculate end byte (up to end_line)
80
+ end_byte = sum(len(line.encode()) + 1 for line in lines[:end_line])
81
+
82
+ return start_byte, end_byte
83
+
84
+ def _extract_node_imports(self, node: ast.AST) -> List[str]:
85
+ """Extract imports specific to this node (not all module imports)"""
86
+ imports: List[str] = []
87
+
88
+ # Walk through this node's body
89
+ for child in ast.walk(node):
90
+ if isinstance(child, (ast.Import, ast.ImportFrom)):
91
+ try:
92
+ imports.append(ast.unparse(child))
93
+ except Exception:
94
+ imports.append(str(child))
95
+ return imports
96
+
97
+ def _extract_decorators(self, node: ast.AST) -> List[str]:
98
+ decorators: List[str] = []
99
+ if hasattr(node, "decorator_list"):
100
+ for d in node.decorator_list: # type: ignore[attr-defined]
101
+ try:
102
+ decorators.append(ast.unparse(d))
103
+ except Exception:
104
+ decorators.append(str(d))
105
+ return decorators
106
+
107
+ # ---------------- chunk creation ----------------
108
+
109
+ def _create_chunk(
110
+ self,
111
+ node: DocNode,
112
+ chunk_type: ChunkType,
113
+ name: str,
114
+ parent: Optional[str] = None,
115
+ parent_chunk: Optional[CodeChunk] = None,
116
+ ) -> CodeChunk:
117
+ code = self._get_code(node)
118
+
119
+ # Get line numbers
120
+ start_line = getattr(node, "lineno", None)
121
+ end_line = getattr(node, "end_lineno", None)
122
+
123
+ # Calculate byte span
124
+ start_byte, end_byte = None, None
125
+ if start_line and end_line:
126
+ start_byte, end_byte = self._get_byte_span(start_line, end_line)
127
+
128
+ # Determine parent if not provided
129
+ if parent is None and chunk_type == "method":
130
+ parent = self.current_class
131
+
132
+ decorators: List[str] = []
133
+ if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
134
+ decorators = self._extract_decorators(node)
135
+
136
+ # Get imports specific to this node (not all module imports)
137
+ node_imports = self._extract_node_imports(node)
138
+
139
+ # Get docstring only for nodes that can have one
140
+ docstring: Optional[str] = None
141
+ if hasattr(node, 'body'):
142
+ docstring = ast.get_docstring(node)
143
+
144
+ # Determine hierarchy depth
145
+ depth = 0
146
+ lineage: List[str] = []
147
+ sibling_index = 0
148
+
149
+ if parent_chunk:
150
+ depth = parent_chunk.hierarchy.depth + 1
151
+ lineage = parent_chunk.hierarchy.lineage.copy()
152
+ lineage.append(parent_chunk.chunk_id)
153
+
154
+ # Update sibling counter
155
+ parent_key = parent_chunk.chunk_id
156
+ self.sibling_counters[parent_key] = self.sibling_counters.get(parent_key, 0) + 1
157
+ sibling_index = self.sibling_counters[parent_key] - 1
158
+
159
+ ast_info = ChunkAST(
160
+ symbol_type=chunk_type,
161
+ name=name,
162
+ parent=parent,
163
+ docstring=docstring,
164
+ decorators=decorators,
165
+ imports=node_imports,
166
+ )
167
+
168
+ span = ChunkSpan(
169
+ start_byte=start_byte,
170
+ end_byte=end_byte,
171
+ start_line=start_line,
172
+ end_line=end_line,
173
+ )
174
+
175
+ # Generate chunk ID
176
+ chunk_id = deterministic_chunk_id(
177
+ file_path=self.file_path,
178
+ chunk_type=chunk_type,
179
+ name=name,
180
+ parent=parent,
181
+ start_line=start_line,
182
+ end_line=end_line,
183
+ code=code,
184
+ )
185
+
186
+ chunk = CodeChunk(
187
+ chunk_id=chunk_id,
188
+ file_path=self.file_path,
189
+ language="python",
190
+ chunk_type=chunk_type,
191
+ code=code,
192
+ ast=ast_info,
193
+ span=span,
194
+ hierarchy=ChunkHierarchy(
195
+ parent_id=parent_chunk.chunk_id if parent_chunk else None,
196
+ children_ids=[],
197
+ depth=depth,
198
+ is_primary=True,
199
+ is_extracted=False,
200
+ lineage=lineage,
201
+ sibling_index=sibling_index,
202
+ ),
203
+ )
204
+
205
+ # Add to parent's children if parent exists
206
+ if parent_chunk:
207
+ parent_chunk.hierarchy.children_ids.append(chunk_id)
208
+
209
+ self.chunks.append(chunk)
210
+ return chunk
211
+
212
+ def _create_module_chunk(self) -> CodeChunk:
213
+ """Create module chunk with all imports"""
214
+ module_name = Path(self.file_path).stem
215
+ start_line = 1
216
+ end_line = len(self.source.split('\n'))
217
+ start_byte, end_byte = self._get_byte_span(start_line, end_line)
218
+
219
+ # Module code - entire file
220
+ module_code = self.source
221
+
222
+ # Extract ALL imports for module
223
+ module_imports: List[str] = []
224
+ for node in ast.walk(self.tree):
225
+ if isinstance(node, (ast.Import, ast.ImportFrom)):
226
+ try:
227
+ module_imports.append(ast.unparse(node))
228
+ except Exception:
229
+ pass
230
+
231
+ chunk_id = deterministic_chunk_id(
232
+ file_path=self.file_path,
233
+ chunk_type="module",
234
+ name=module_name,
235
+ parent=None,
236
+ start_line=start_line,
237
+ end_line=end_line,
238
+ code=module_code,
239
+ )
240
+
241
+ ast_info = ChunkAST(
242
+ symbol_type="module",
243
+ name=module_name,
244
+ parent=None,
245
+ docstring=ast.get_docstring(self.tree),
246
+ decorators=[],
247
+ imports=module_imports, # ALL imports in module
248
+ )
249
+
250
+ span = ChunkSpan(
251
+ start_byte=start_byte,
252
+ end_byte=end_byte,
253
+ start_line=start_line,
254
+ end_line=end_line,
255
+ )
256
+
257
+ chunk = CodeChunk(
258
+ chunk_id=chunk_id,
259
+ file_path=self.file_path,
260
+ language="python",
261
+ chunk_type="module",
262
+ code=module_code,
263
+ ast=ast_info,
264
+ span=span,
265
+ hierarchy=ChunkHierarchy(
266
+ parent_id=None,
267
+ children_ids=[],
268
+ depth=0,
269
+ is_primary=True,
270
+ is_extracted=False,
271
+ lineage=[],
272
+ sibling_index=0,
273
+ ),
274
+ )
275
+
276
+ self.chunks.append(chunk)
277
+ return chunk
278
+
279
+ # ---------------- visitors ----------------
280
+
281
+ def visit_Import(self, node: ast.Import) -> None:
282
+ try:
283
+ self.imports_list.append(ast.unparse(node))
284
+ except Exception:
285
+ pass
286
+ self.generic_visit(node)
287
+
288
+ def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
289
+ try:
290
+ self.imports_list.append(ast.unparse(node))
291
+ except Exception:
292
+ pass
293
+ self.generic_visit(node)
294
+
295
+ def visit_ClassDef(self, node: ast.ClassDef) -> None:
296
+ # Create class chunk
297
+ class_chunk = self._create_chunk(
298
+ node,
299
+ "class",
300
+ node.name,
301
+ parent="module",
302
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
303
+ )
304
+
305
+ # Save current class context
306
+ previous_class = self.current_class
307
+ self.current_class = node.name
308
+
309
+ # Push class to stack
310
+ self.parent_stack.append(class_chunk)
311
+
312
+ # Visit class body
313
+ self.generic_visit(node)
314
+
315
+ # Restore previous context
316
+ self.current_class = previous_class
317
+ self.parent_stack.pop()
318
+
319
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
320
+ parent = getattr(node, "parent", None)
321
+
322
+ if isinstance(parent, ast.Module):
323
+ # Top-level function
324
+ self._create_chunk(
325
+ node,
326
+ "function",
327
+ node.name,
328
+ parent="module",
329
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
330
+ )
331
+ elif isinstance(parent, ast.ClassDef):
332
+ # Method inside class
333
+ self._create_chunk(
334
+ node,
335
+ "method",
336
+ node.name,
337
+ parent=parent.name,
338
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
339
+ )
340
+
341
+ self.generic_visit(node)
342
+
343
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
344
+ parent = getattr(node, "parent", None)
345
+
346
+ if isinstance(parent, ast.Module):
347
+ # Top-level async function
348
+ self._create_chunk(
349
+ node,
350
+ "function",
351
+ node.name,
352
+ parent="module",
353
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
354
+ )
355
+ elif isinstance(parent, ast.ClassDef):
356
+ # Async method inside class
357
+ self._create_chunk(
358
+ node,
359
+ "method",
360
+ node.name,
361
+ parent=parent.name,
362
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
363
+ )
364
+
365
+ self.generic_visit(node)
366
+
367
+ def visit_Module(self, node: ast.Module) -> None:
368
+ # Create module chunk first (root)
369
+ module_chunk = self._create_module_chunk()
370
+
371
+ # Push module to stack
372
+ self.parent_stack.append(module_chunk)
373
+
374
+ # Visit children to create classes and functions
375
+ self.generic_visit(node)
376
+
377
+ # Pop module from stack
378
+ self.parent_stack.pop()
379
+
380
+
381
+ # ---------------- public API ----------------
382
+
383
+ def extract_ast_chunks(file_path: Path) -> List[CodeChunk]:
384
+ source = file_path.read_text(encoding="utf-8")
385
+ chunker = ASTChunker(source, str(file_path))
386
+
387
+ # Visit the tree (creates all chunks with relationships)
388
+ chunker.visit(chunker.tree)
389
+
390
+ return chunker.chunks
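A short, illustrative walk over the output of `extract_ast_chunks` (the public API named in the module docstring above), using the hierarchy fields defined in `chunk_schema.py`. The file being chunked is an arbitrary example.

```python
# Illustrative use of extract_ast_chunks: print the chunk hierarchy of one file.
from pathlib import Path
from scripts.core.ingestion.ast_chunker import extract_ast_chunks

chunks = extract_ast_chunks(Path("scripts/core/ingestion/chunk.py"))
for chunk in chunks:
    indent = "  " * chunk.hierarchy.depth
    print(f"{indent}{chunk.chunk_type}: {chunk.ast.name} "
          f"(lines {chunk.span.start_line}-{chunk.span.end_line}, "
          f"{len(chunk.hierarchy.children_ids)} children)")
```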
scripts/core/ingestion/chunk.py ADDED
@@ -0,0 +1,497 @@
1
+ """
2
+ Repository File Type Chunker - Universal chunker for all file types.
3
+
4
+ This module provides file-type-aware chunking for repositories, handling
5
+ everything from Python code to configuration files, documentation, and
6
+ special files. It's the universal interface that delegates to specialized
7
+ chunkers based on file type.
8
+
9
+ ARCHITECTURE POSITION:
10
+ - File Type Dispatcher: Routes files to appropriate chunkers
11
+ - Universal Interface: Single entry point for all file types
12
+ - Metadata Enricher: Adds repository context to all chunks
13
+
14
+ KEY FEATURES:
15
+ 1. File type detection and intelligent routing
16
+ 2. Hierarchical chunking for Python files
17
+ 3. Documentation chunking for markdown/RST
18
+ 4. Configuration file handling (JSON/YAML/TOML)
19
+ 5. Special file handling (README, requirements.txt, Dockerfile)
20
+ 6. Binary file detection and skipping
21
+
22
+ FILE TYPE SUPPORT:
23
+ - .py: HierarchicalChunker (AST + Tree-sitter)
24
+ - .md/.mdx/.rst: Documentation chunker
25
+ - .json/.yaml/.toml: Configuration chunker
26
+ - requirements.txt/Dockerfile: Special chunker
27
+ - .txt/.cfg/.ini: Text chunker
28
+ - README/LICENSE: Documentation chunker
29
+ - Others: Text chunker with binary detection
30
+
31
+ DATA FLOW:
32
+ File → Type detection → Route to specialized chunker →
33
+ Add repo metadata → Return chunks
34
+
35
+ USAGE:
36
+ chunker = RepoChunker()
37
+ chunks = chunker.chunk_file(Path("file.py"), repo_metadata)
38
+ """
39
+
40
+ from pathlib import Path
41
+ from typing import List, Dict, Optional, cast
42
+ import json
43
+ import yaml
44
+ import re
45
+ import hashlib
46
+ from .hierarchical_chunker import HierarchicalChunker
47
+ from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType, ASTSymbolType
48
+ from .doc_chunker import chunk_document as chunk_markdown_file
49
+
50
+
51
+ class RepoChunker:
52
+ """
53
+ Repository chunker that handles ALL file types with proper structure
54
+ """
55
+
56
+ def __init__(self, use_hierarchical: bool = True):
57
+ if use_hierarchical:
58
+ self.hierarchical_chunker = HierarchicalChunker()
59
+ self.use_hierarchical = use_hierarchical
60
+
61
+ def _generate_stable_id(self, content: str, prefix: str = "stable") -> str:
62
+ """
63
+ Generate deterministic chunk ID using SHA256.
64
+
65
+ IMPORTANT: This ensures IDs are stable across runs, processes,
66
+ and Python versions - crucial for RAG reproducibility.
67
+
68
+ Args:
69
+ content: The text content to hash
70
+ prefix: ID prefix (config, doc, text, etc.)
71
+
72
+ Returns:
73
+ Deterministic ID like "config_8a3b2c1d"
74
+ """
75
+ # Use SHA256 for consistency with id_utils.py
76
+ hash_digest = hashlib.sha256(content.encode("utf-8")).hexdigest()[:8]
77
+ return f"{prefix}_{hash_digest}"
78
+
79
+ def chunk_file(self, file_path: Path, repo_metadata: Optional[Dict] = None) -> List[CodeChunk]:
80
+ """
81
+ Chunk ANY file type with repository context
82
+
83
+ Args:
84
+ file_path: Path to the file
85
+ repo_metadata: Optional dict with repo metadata
86
+ """
87
+ suffix = file_path.suffix.lower()
88
+
89
+ # Python files - use your advanced hierarchical chunker
90
+ if suffix == '.py':
91
+ return self._chunk_python_file(file_path, repo_metadata)
92
+
93
+ # Markdown/RST documentation
94
+ elif suffix in ['.md', '.mdx', '.rst']:
95
+ return self._chunk_markdown_file_wrapper(file_path, repo_metadata)
96
+
97
+ # JSON config files
98
+ elif suffix == '.json':
99
+ return self._chunk_json_file(file_path, repo_metadata)
100
+
101
+ # YAML/TOML config files
102
+ elif suffix in ['.yaml', '.yml', '.toml']:
103
+ return self._chunk_config_file(file_path, repo_metadata)
104
+
105
+ # Requirements/Docker files
106
+ elif file_path.name.lower() in ['requirements.txt', 'dockerfile', 'docker-compose.yml']:
107
+ return self._chunk_special_file(file_path, repo_metadata)
108
+
109
+ # Text files
110
+ elif suffix in ['.txt', '.cfg', '.ini', '.conf']:
111
+ return self._chunk_text_file(file_path, repo_metadata)
112
+
113
+ # README/LICENSE files
114
+ elif file_path.name.lower() in ['readme', 'readme.md', 'license', 'license.txt', 'license.md']:
115
+ return self._chunk_readme_file(file_path, repo_metadata)
116
+
117
+ # All other files
118
+ else:
119
+ return self._chunk_other_file(file_path, repo_metadata)
120
+
121
+ def _chunk_python_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
122
+ """Use our hierarchical chunker for Python files"""
123
+ try:
124
+ if self.use_hierarchical:
125
+ chunks = self.hierarchical_chunker.chunk_file(file_path)
126
+ else:
127
+ # Fallback to basic text chunking instead of hybrid
128
+ return self._chunk_text_file(file_path, repo_metadata)
129
+
130
+ # Add repository metadata
131
+ if repo_metadata:
132
+ for chunk in chunks:
133
+ if "repo_info" not in chunk.metadata:
134
+ chunk.metadata["repo_info"] = {}
135
+ chunk.metadata["repo_info"].update(repo_metadata)
136
+
137
+ return chunks
138
+
139
+ except Exception as e:
140
+ print(f"[ERROR] Error chunking Python file {file_path}: {e}")
141
+ return self._chunk_text_file(file_path, repo_metadata)
142
+
143
+ def _chunk_markdown_file_wrapper(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
144
+ """Chunk markdown files using our doc_chunker"""
145
+ try:
146
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
147
+
148
+ # Use your existing doc_chunker
149
+ doc_chunks = chunk_markdown_file(
150
+ content,
151
+ source_name=file_path.name,
152
+ source_url=f"file://{file_path}"
153
+ )
154
+
155
+ # Convert to CodeChunk schema
156
+ code_chunks = []
157
+ for doc_chunk in doc_chunks:
158
+ code_chunk = CodeChunk(
159
+ chunk_id=doc_chunk["chunk_id"], # Deterministic SHA256 ID from doc_chunker.py
160
+ file_path=str(file_path),
161
+ language=doc_chunk.get("language", "markdown"),
162
+ chunk_type="documentation",
163
+ code=doc_chunk["content"],
164
+ ast=ChunkAST(
165
+ symbol_type="documentation",
166
+ name=file_path.name,
167
+ parent=None,
168
+ docstring=None
169
+ ),
170
+ span=ChunkSpan(
171
+ start_line=doc_chunk.get("metadata", {}).get("line_start", 1),
172
+ end_line=doc_chunk.get("metadata", {}).get("line_end", 1)
173
+ ),
174
+ metadata={
175
+ "doc_chunk_type": doc_chunk.get("chunk_type", "text"),
176
+ "repo_info": repo_metadata or {},
177
+ **doc_chunk.get("metadata", {})
178
+ },
179
+ hierarchy=ChunkHierarchy(
180
+ is_primary=True,
181
+ is_extracted=False,
182
+ depth=0
183
+ )
184
+ )
185
+ code_chunks.append(code_chunk)
186
+
187
+ return code_chunks
188
+
189
+ except Exception as e:
190
+ print(f"[ERROR] Error chunking markdown file {file_path}: {e}")
191
+ return self._chunk_text_file(file_path, repo_metadata)
192
+
193
+ def _chunk_json_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
194
+ """Chunk JSON config files"""
195
+ try:
196
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
197
+ data = json.loads(content)
198
+
199
+ pretty_content = json.dumps(data, indent=2)
200
+
201
+ # FIXED: Use deterministic SHA256 instead of hash()
202
+ chunk = CodeChunk(
203
+ chunk_id=self._generate_stable_id(pretty_content, "config"),
204
+ file_path=str(file_path),
205
+ language="json",
206
+ chunk_type="configuration",
207
+ code=pretty_content,
208
+ ast=ChunkAST(
209
+ symbol_type="configuration",
210
+ name=file_path.name,
211
+ parent=None,
212
+ docstring=None
213
+ ),
214
+ span=ChunkSpan(
215
+ start_line=1,
216
+ end_line=len(pretty_content.split('\n'))
217
+ ),
218
+ metadata={
219
+ "file_type": "json_config",
220
+ "config_keys": list(data.keys()) if isinstance(data, dict) else [],
221
+ "repo_info": repo_metadata or {}
222
+ },
223
+ hierarchy=ChunkHierarchy(
224
+ is_primary=True,
225
+ is_extracted=False,
226
+ depth=0
227
+ )
228
+ )
229
+
230
+ return [chunk]
231
+
232
+ except Exception as e:
233
+ print(f"[ERROR] Error chunking JSON file {file_path}: {e}")
234
+ return self._chunk_text_file(file_path, repo_metadata)
235
+
236
+ def _chunk_config_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
237
+ """Chunk YAML/TOML config files"""
238
+ try:
239
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
240
+ suffix = file_path.suffix.lower()
241
+
242
+ language = "yaml" if suffix in ['.yaml', '.yml'] else "toml"
243
+
244
+ # FIXED: Use deterministic SHA256 instead of hash()
245
+ chunk = CodeChunk(
246
+ chunk_id=self._generate_stable_id(content, "config"),
247
+ file_path=str(file_path),
248
+ language=language,
249
+ chunk_type="configuration",
250
+ code=content,
251
+ ast=ChunkAST(
252
+ symbol_type="configuration",
253
+ name=file_path.name,
254
+ parent=None,
255
+ docstring=None
256
+ ),
257
+ span=ChunkSpan(
258
+ start_line=1,
259
+ end_line=len(content.split('\n'))
260
+ ),
261
+ metadata={
262
+ "file_type": f"{language}_config",
263
+ "repo_info": repo_metadata or {}
264
+ },
265
+ hierarchy=ChunkHierarchy(
266
+ is_primary=True,
267
+ is_extracted=False,
268
+ depth=0
269
+ )
270
+ )
271
+
272
+ return [chunk]
273
+
274
+ except Exception as e:
275
+ print(f"[ERROR] Error chunking config file {file_path}: {e}")
276
+ return self._chunk_text_file(file_path, repo_metadata)
277
+
278
+ def _chunk_special_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
279
+ """Chunk special files (requirements.txt, Dockerfile, etc.)"""
280
+ try:
281
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
282
+ file_name = file_path.name.lower()
283
+
284
+ if 'requirements' in file_name:
285
+ language = "requirements"
286
+ chunk_type = "configuration"
287
+ prefix = "config"
288
+ elif 'docker' in file_name:
289
+ language = "dockerfile"
290
+ chunk_type = "script"
291
+ prefix = "script"
292
+ else:
293
+ language = "text"
294
+ chunk_type = "text"
295
+ prefix = "text"
296
+
297
+ # FIXED: Use deterministic SHA256 instead of hash()
298
+ chunk = CodeChunk(
299
+ chunk_id=self._generate_stable_id(content, prefix),
300
+ file_path=str(file_path),
301
+ language=language,
302
+ chunk_type=chunk_type,
303
+ code=content,
304
+ ast=ChunkAST(
305
+ symbol_type=chunk_type,
306
+ name=file_path.name,
307
+ parent=None,
308
+ docstring=None
309
+ ),
310
+ span=ChunkSpan(
311
+ start_line=1,
312
+ end_line=len(content.split('\n'))
313
+ ),
314
+ metadata={
315
+ "file_type": file_name,
316
+ "repo_info": repo_metadata or {},
317
+ "dependencies": self._extract_dependencies(content) if "requirements" in file_name else []
318
+ },
319
+ hierarchy=ChunkHierarchy(
320
+ is_primary=True,
321
+ is_extracted=False,
322
+ depth=0
323
+ )
324
+ )
325
+
326
+ return [chunk]
327
+
328
+ except Exception as e:
329
+ print(f"[ERROR] Error chunking special file {file_path}: {e}")
330
+ return self._chunk_text_file(file_path, repo_metadata)
331
+
332
+ def _chunk_text_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
333
+ """Chunk plain text files"""
334
+ try:
335
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
336
+
337
+ # Create a single chunk for small files, multiple for large ones
338
+ if len(content.split('\n')) <= 200:
339
+ chunks = [self._create_text_chunk(content, file_path, repo_metadata)]
340
+ else:
341
+ # Split large text files into reasonable chunks
342
+ chunks = []
343
+ lines = content.split('\n')
344
+ chunk_size = 100
345
+
346
+ for i in range(0, len(lines), chunk_size):
347
+ chunk_lines = lines[i:i + chunk_size]
348
+ chunk_content = '\n'.join(chunk_lines)
349
+
350
+ chunk = self._create_text_chunk(
351
+ chunk_content,
352
+ file_path,
353
+ repo_metadata,
354
+ chunk_index=i // chunk_size
355
+ )
356
+ chunks.append(chunk)
357
+
358
+ return chunks
359
+
360
+ except Exception as e:
361
+ print(f"[ERROR] Error reading text file {file_path}: {e}")
362
+ return []
363
+
364
+ def _chunk_readme_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
365
+ """Special handling for README/LICENSE files"""
366
+ try:
367
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
368
+ file_name_lower = file_path.name.lower()
369
+
370
+ # Determine appropriate prefix
371
+ if 'readme' in file_name_lower:
372
+ prefix = "doc"
373
+ elif 'license' in file_name_lower:
374
+ prefix = "license"
375
+ else:
376
+ prefix = "doc"
377
+
378
+ # FIXED: Use deterministic SHA256 instead of hash()
379
+ chunk = CodeChunk(
380
+ chunk_id=self._generate_stable_id(content, prefix),
381
+ file_path=str(file_path),
382
+ language="markdown" if file_path.suffix in ['.md', '.mdx'] else "text",
383
+ chunk_type="documentation",
384
+ code=content,
385
+ ast=ChunkAST(
386
+ symbol_type="documentation",
387
+ name=file_path.name,
388
+ parent=None,
389
+ docstring=None
390
+ ),
391
+ span=ChunkSpan(
392
+ start_line=1,
393
+ end_line=len(content.split('\n'))
394
+ ),
395
+ metadata={
396
+ "file_type": "readme_license",
397
+ "is_readme": "readme" in file_name_lower,
398
+ "is_license": "license" in file_name_lower,
399
+ "repo_info": repo_metadata or {}
400
+ },
401
+ hierarchy=ChunkHierarchy(
402
+ is_primary=True,
403
+ is_extracted=False,
404
+ depth=0
405
+ )
406
+ )
407
+
408
+ return [chunk]
409
+
410
+ except Exception as e:
411
+ print(f"[ERROR] Error chunking README file {file_path}: {e}")
412
+ return self._chunk_text_file(file_path, repo_metadata)
413
+
414
+ def _chunk_other_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
415
+ """Fallback for unknown file types (binary or unsupported)"""
416
+ try:
417
+ # Try to read as text first
418
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
419
+
420
+ # If it looks like binary (mostly non-printable characters)
421
+ if self._looks_like_binary(content):
422
+ print(f"[SKIPPED] Skipping binary file: {file_path}")
423
+ return []
424
+
425
+ # If readable text, treat as text file
426
+ return self._chunk_text_file(file_path, repo_metadata)
427
+
428
+ except UnicodeDecodeError:
429
+ print(f"[SKIPPED] Skipping binary file: {file_path}")
430
+ return []
431
+ except Exception as e:
432
+ print(f"[ERROR] Error with file {file_path}: {e}")
433
+ return []
434
+
435
+ def _create_text_chunk(self, content: str, file_path: Path,
436
+ repo_metadata: Optional[Dict], chunk_index: int = 0) -> CodeChunk:
437
+ """Helper to create a text chunk"""
438
+ lines = content.split('\n')
439
+
440
+ # ENHANCED: Use deterministic ID that includes chunk_index for uniqueness
441
+ id_payload = f"{content}_{chunk_index}"
442
+
443
+ return CodeChunk(
444
+ chunk_id=self._generate_stable_id(id_payload, "text"),
445
+ file_path=str(file_path),
446
+ language="text",
447
+ chunk_type="text",
448
+ code=content,
449
+ ast=ChunkAST(
450
+ symbol_type="text",
451
+ name=file_path.name,
452
+ parent=None,
453
+ docstring=None
454
+ ),
455
+ span=ChunkSpan(
456
+ start_line=1,
457
+ end_line=len(lines)
458
+ ),
459
+ metadata={
460
+ "file_type": "text",
461
+ "chunk_index": chunk_index,
462
+ "total_lines": len(lines),
463
+ "repo_info": repo_metadata or {}
464
+ },
465
+ hierarchy=ChunkHierarchy(
466
+ is_primary=True,
467
+ is_extracted=False,
468
+ depth=0
469
+ )
470
+ )
471
+
472
+ def _extract_dependencies(self, requirements_content: str) -> List[str]:
473
+ """Extract package names from requirements.txt"""
474
+ dependencies = []
475
+ for line in requirements_content.split('\n'):
476
+ line = line.strip()
477
+ if line and not line.startswith('#'):
478
+ # Extract package name (before version specifiers)
479
+ package = line.split('==')[0].split('>=')[0].split('<=')[0].strip()
480
+ if package:
481
+ dependencies.append(package)
482
+ return dependencies
483
+
484
+ def _looks_like_binary(self, content: str, threshold: float = 0.3) -> bool:
485
+ """Check if content looks like binary data"""
486
+ if not content:
487
+ return False
488
+
489
+ # Count printable vs non-printable characters
490
+ printable = sum(1 for c in content if 32 <= ord(c) <= 126 or c in '\n\r\t')
491
+ total = len(content)
492
+
493
+ if total == 0:
494
+ return False
495
+
496
+ ratio = printable / total
497
+ return ratio < threshold
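The reproducibility guarantee documented in `_generate_stable_id` above ("stable across runs, processes, and Python versions") reduces to a content hash. A standalone sketch mirroring that helper (the inputs are invented):

```python
# Sketch of the deterministic chunk-ID scheme used by RepoChunker._generate_stable_id:
# prefix + first 8 hex chars of the SHA-256 of the content.
import hashlib

def stable_id(content: str, prefix: str = "config") -> str:
    return f"{prefix}_{hashlib.sha256(content.encode('utf-8')).hexdigest()[:8]}"

assert stable_id('{"a": 1}') == stable_id('{"a": 1}')  # deterministic across runs
assert stable_id('{"a": 1}') != stable_id('{"a": 2}')  # content-sensitive
print(stable_id('{"a": 1}'))
```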
scripts/core/ingestion/chunk_schema.py ADDED
@@ -0,0 +1,112 @@
1
+ """
2
+ chunk_schema.py - UPDATED with enhanced hierarchy
3
+ """
4
+
5
+ from typing import Dict, List, Optional, Literal, Union
6
+ from dataclasses import dataclass, field
7
+
8
+
9
+ # ✅ EXPANDED ChunkType to support ALL file types
10
+ ChunkType = Literal[
11
+ "module", # Python module
12
+ "class", # Python class
13
+ "function", # Python function
14
+ "method", # Python method
15
+ "context", # General context
16
+ "documentation", # Markdown/RST docs
17
+ "configuration", # Config files (JSON, YAML, TOML)
18
+ "notebook", # Jupyter notebook
19
+ "script", # Shell scripts
20
+ "dockerfile", # Docker files
21
+ "typescript", # TypeScript files
22
+ "javascript", # JavaScript files
23
+ "text", # Plain text
24
+ "imports", # Import statements
25
+ "unknown" # Unknown file type
26
+ ]
27
+
28
+ # For AST symbol types
29
+ ASTSymbolType = Literal[
30
+ "module", "class", "function", "method", "context",
31
+ "documentation", "configuration", "notebook", "script",
32
+ "dockerfile", "typescript", "javascript", "text",
33
+ "imports",
34
+ "unknown"
35
+ ]
36
+
37
+
38
+ # @dataclass
39
+ # class ChunkHierarchy:
40
+ # """Enhanced hierarchical relationship metadata"""
41
+ # parent_id: Optional[str] = None
42
+ # children_ids: List[str] = field(default_factory=list)
43
+ # depth: int = 0
44
+ # is_primary: bool = True
45
+ # is_extracted: bool = False
46
+ # lineage: List[str] = field(default_factory=list) # Path from root
47
+ # sibling_index: int = 0 # Position among siblings
48
+
49
+ @dataclass
50
+ class ChunkHierarchy:
51
+ """Enhanced hierarchical relationship metadata"""
52
+ parent_id: Optional[str] = None
53
+ children_ids: List[str] = field(default_factory=list)
54
+ depth: int = 0
55
+ is_primary: bool = True
56
+ is_extracted: bool = False
57
+ lineage: List[str] = field(default_factory=list) # Path from root
58
+ sibling_index: int = 0 # Position among siblings
59
+
60
+ # Optional: Add methods for type-safe operations
61
+ def add_child(self, child_id: str) -> None:
62
+ """Type-safe method to add child"""
63
+ if child_id not in self.children_ids:
64
+ self.children_ids.append(child_id)
65
+
66
+ def remove_child(self, child_id: str) -> None:
67
+ """Type-safe method to remove child"""
68
+ if child_id in self.children_ids:
69
+ self.children_ids.remove(child_id)
70
+
71
+ def set_parent(self, parent_id: Optional[str]) -> None:
72
+ """Type-safe method to set parent"""
73
+ self.parent_id = parent_id
74
+
75
+ def increment_depth(self) -> None:
76
+ """Increment depth by 1"""
77
+ self.depth += 1
78
+
79
+
80
+ @dataclass
81
+ class ChunkAST:
82
+ symbol_type: Optional[ASTSymbolType] = None
83
+ name: Optional[str] = None
84
+ parent: Optional[str] = None
85
+ docstring: Optional[str] = None
86
+ decorators: List[str] = field(default_factory=list)
87
+ imports: List[str] = field(default_factory=list)
88
+ node_type: Optional[str] = None # Original AST node type
89
+
90
+
91
+ @dataclass
92
+ class ChunkSpan:
93
+ start_byte: Optional[int] = None
94
+ end_byte: Optional[int] = None
95
+ start_line: Optional[int] = None
96
+ end_line: Optional[int] = None
97
+ char_count: Optional[int] = None # Character count for quick reference
98
+
99
+
100
+
101
+ @dataclass
102
+ class CodeChunk:
103
+ chunk_id: str
104
+ file_path: str
105
+ language: str
106
+ chunk_type: ChunkType # ✅ Now accepts ALL types
107
+ code: str
108
+ ast: ChunkAST
109
+ span: ChunkSpan
110
+ metadata: Dict = field(default_factory=dict)
111
+ hierarchy: ChunkHierarchy = field(default_factory=ChunkHierarchy)
112
+
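Taken together, these dataclasses describe one chunk and its place in the hierarchy. A minimal construction sketch, assuming the package is importable as `scripts.core.ingestion` (adjust the import to your layout); the IDs, paths, and spans below are invented for illustration:

```python
from scripts.core.ingestion.chunk_schema import (
    CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy,
)

# A module-level chunk acting as the parent.
parent = CodeChunk(
    chunk_id="mod_ab12cd34", file_path="pkg/mod.py", language="python",
    chunk_type="module", code="...module source...",
    ast=ChunkAST(symbol_type="module", name="mod"),
    span=ChunkSpan(start_line=1, end_line=120),
)

# A function chunk that points back to the module.
child = CodeChunk(
    chunk_id="fn_9f8e7d6c", file_path="pkg/mod.py", language="python",
    chunk_type="function", code="def load(path): ...",
    ast=ChunkAST(symbol_type="function", name="load", parent="mod"),
    span=ChunkSpan(start_line=10, end_line=20),
    hierarchy=ChunkHierarchy(parent_id="mod_ab12cd34", depth=1),
)

parent.hierarchy.add_child(child.chunk_id)  # type-safe helper defined above
print(parent.hierarchy.children_ids)        # ['fn_9f8e7d6c']
```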
scripts/core/ingestion/doc_chunker.py ADDED
@@ -0,0 +1,446 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import re
5
+ from typing import List, Dict, Optional
6
+ from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy
7
+
8
+ def _hash_id(text: str, prefix: str) -> str:
9
+ """
10
+ Generate deterministic ID using SHA256 (standardized).
11
+
12
+ Previously used SHA1, now standardized to SHA256 for consistency
13
+ with repo_chunker.py and id_utils.py.
14
+ """
15
+ # CHANGED: sha1 → sha256
16
+ h = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
17
+ return f"{prefix}_{h}"
18
+
19
+
20
+ def _is_actual_code(text: str) -> bool:
21
+ """
22
+ Check if text inside a fenced block is actual executable code
23
+ or just formatted text.
24
+ """
25
+ text = text.strip()
26
+
27
+ # Common patterns that indicate formatted text, not code
28
+ formatted_text_patterns = [
29
+ # Lines with many = or - characters (dividers)
30
+ r'^=+\s*[A-Za-z\s]+\s*=+$',
31
+ r'^-+\s*[A-Za-z\s]+\s*-+$',
32
+ # Lines that look like headers/separators
33
+ r'^[=_-]{20,}$',
34
+ # Contains natural language sentences
35
+ r'\b(the|and|that|this|with|for|are|is|was|were|have|has|had)\b',
36
+ r'[.!?]\s+[A-Z]', # Sentence boundaries
37
+ # Message-like patterns
38
+ r'^\s*(Human|AI|Tool|System|User|Assistant)\s+(Message|Response|Input|Output)?\s*[:=-]',
39
+ r'^\s*[A-Z][a-z]+\s*:', # "Reasoning:", "Acting:", etc.
40
+ ]
41
+
42
+ # Check if it looks like formatted text
43
+ lines = text.split('\n')
44
+ formatted_line_count = 0
45
+ code_line_count = 0
46
+
47
+ # Patterns that indicate actual code
48
+ code_patterns = [
49
+ r'^\s*(def|class|import|from|async|await|return|if|for|while|try|except|with)\b',
50
+ r'^\s*@\w+',
51
+ r'^\s*\w+\s*=\s*.+',
52
+ r'^\s*\w+\(.+\)',
53
+ r'^\s*print\(.+\)',
54
+ r'^\s*\{.*\}', # JSON/dict
55
+ r'^\s*\[.*\]', # List
56
+ ]
57
+
58
+ for line in lines:
59
+ line = line.strip()
60
+ if not line:
61
+ continue
62
+
63
+ # Check for formatted text patterns
64
+ is_formatted = any(re.search(pattern, line, re.IGNORECASE) for pattern in formatted_text_patterns)
65
+
66
+ # Check for code patterns
67
+ is_code = any(re.search(pattern, line) for pattern in code_patterns)
68
+
69
+ if is_formatted:
70
+ formatted_line_count += 1
71
+ if is_code:
72
+ code_line_count += 1
73
+
74
+ # If it has many formatted text lines and few/no code lines, it's not actual code
75
+ if formatted_line_count > 1 and code_line_count == 0:
76
+ return False
77
+
78
+ # Default to treating fenced blocks as code (original behavior)
79
+ return True
80
+
81
+
82
+ def _looks_like_code_block(lines: List[str]) -> bool:
83
+ """
84
+ Heuristic to recover code blocks when Markdown fences are missing
85
+ (common after HTML → MD conversion).
86
+ """
87
+ if not lines:
88
+ return False
89
+
90
+ # Join lines and check for minimum length
91
+ joined = "\n".join(lines)
92
+ text = joined.strip()
93
+
94
+ # Too short? Probably not code
95
+ if len(text) < 50:
96
+ return False
97
+
98
+ # Check for code patterns
99
+ code_patterns = [
100
+ # Python keywords at line start
101
+ r'^\s*(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|from\s+\w+\s+import)',
102
+ # Function calls or assignments
103
+ r'^\s*\w+\s*=\s*.+|^\s*\w+\s*\(.+\)',
104
+ # Control structures
105
+ r'^\s*(if|for|while|with|try|except|finally|async|await)\s+',
106
+ # Decorators
107
+ r'^\s*@\w+',
108
+ # Return statements
109
+ r'^\s*return\b',
110
+ # Print statements
111
+ r'^\s*print\(',
112
+ # Indented blocks (common in Python)
113
+ r'^\s{4,}\S',
114
+ ]
115
+
116
+ # Check for prose indicators (if these are present, it's likely text)
117
+ prose_indicators = [
118
+ # Common English words in prose
119
+ r'\b(the|and|that|this|with|for|are|is|was|were|have|has|had)\b',
120
+ # Sentence endings followed by capital
121
+ r'[.!?]\s+[A-Z]',
122
+ # Articles
123
+ r'\b(a|an|the)\s+\w+',
124
+ ]
125
+
126
+ lines_list = text.split('\n')
127
+ code_line_count = 0
128
+ prose_line_count = 0
129
+
130
+ for line in lines_list:
131
+ line = line.strip()
132
+ if not line:
133
+ continue
134
+
135
+ # Check if line looks like code
136
+ is_code = any(re.search(pattern, line) for pattern in code_patterns)
137
+
138
+ # Check if line looks like prose (but only if it's not empty/short)
139
+ is_prose = len(line) > 20 and any(re.search(pattern, line, re.IGNORECASE) for pattern in prose_indicators)
140
+
141
+ if is_code:
142
+ code_line_count += 1
143
+ if is_prose:
144
+ prose_line_count += 1
145
+
146
+ # Need strong evidence for code
147
+ total_non_empty_lines = len([l for l in lines_list if l.strip()])
148
+
149
+ # If more than 2 lines look like code and not many look like prose
150
+ if code_line_count >= 2 and prose_line_count <= code_line_count // 2:
151
+ return True
152
+
153
+ # Special case: single strong code line in short text
154
+ if total_non_empty_lines <= 3 and code_line_count >= 1 and prose_line_count == 0:
155
+ return True
156
+
157
+ # Check for specific code-only patterns
158
+ code_only_patterns = [
159
+ r'^\s*from langchain\.',
160
+ r'^\s*import langchain',
161
+ r'^\s*@tool\b', # Decorator
162
+ r'^\s*agent = create_agent\(',
163
+ r'^\s*result = agent\.invoke\(',
164
+ ]
165
+
166
+ if any(re.search(pattern, text) for pattern in code_only_patterns):
167
+ return True
168
+
169
+ return False
170
+
171
+
172
+ def _looks_like_executable_code(text: str) -> bool:
173
+ """Check if code looks like it could be executed"""
174
+ # First check if it's actually code (not formatted text)
175
+ if not _is_actual_code(text):
176
+ return False
177
+
178
+ # Check for actual Python syntax patterns
179
+ patterns = [
180
+ r'\bdef\s+\w+\s*\([^)]*\)\s*:',
181
+ r'\bclass\s+\w+\s*\(?[^:]*\)?\s*:',
182
+ r'^\s*from\s+\w+\s+import\s+\w+',
183
+ r'^\s*import\s+\w+',
184
+ r'\breturn\b',
185
+ r'\bprint\(',
186
+ r'^\s*\w+\s*=\s*[^=\n]+$', # Variable assignment
187
+ ]
188
+
189
+ lines = text.split('\n')
190
+ executable_lines = 0
191
+
192
+ for line in lines:
193
+ line = line.strip()
194
+ if not line or line.startswith('#') or line.startswith('"""'):
195
+ continue
196
+ if any(re.search(pattern, line) for pattern in patterns):
197
+ executable_lines += 1
198
+
199
+ # Need at least 2 executable lines or 1 strong executable line
200
+ return executable_lines >= 2 or (
201
+ executable_lines >= 1 and len([l for l in lines if l.strip()]) <= 3
202
+ )
203
+
204
+
205
+ def chunk_document(
206
+ raw_text: str,
207
+ source_name: str,
208
+ source_url: Optional[str] = None,
209
+ ) -> List[Dict]:
210
+ """
211
+ Chunk documentation text containing headings, prose, and code examples.
212
+
213
+ Design goals:
214
+ - Preserve document hierarchy
215
+ - Separate prose vs code
216
+ - Recover code even if Markdown fences are lost
217
+ - Deterministic chunk IDs
218
+ """
219
+
220
+ chunks: List[Dict] = []
221
+
222
+ heading_stack: List[str] = []
223
+ current_heading: Optional[str] = None
224
+ current_heading_level: Optional[int] = None
225
+
226
+ buffer: List[str] = []
227
+
228
+ code_block = False
229
+ code_language: Optional[str] = None
230
+ code_lines: List[str] = []
231
+
232
+ lines = raw_text.splitlines()
233
+ chunk_index = 0
234
+ line_cursor = 0
235
+
236
+ def heading_path() -> Optional[str]:
237
+ return " > ".join(heading_stack) if heading_stack else None
238
+
239
+ def flush_text(start_line: int, end_line: int):
240
+ nonlocal buffer, chunk_index
241
+ if not buffer:
242
+ return
243
+
244
+ text = "\n".join(buffer).strip()
245
+ buffer = []
246
+
247
+ if not text:
248
+ return
249
+
250
+ lines_local = text.splitlines()
251
+
252
+ # 🔹 Recover unfenced code blocks - use stricter heuristic
253
+ # Only mark as code if it's very clearly code
254
+ if _looks_like_code_block(lines_local) and len(text) > 30:
255
+ # Double-check: make sure it doesn't look like prose
256
+ looks_like_prose = any(word in text.lower() for word in
257
+ ['the', 'and', 'that', 'this', 'with', 'for', 'are', 'is', 'was'])
258
+
259
+ if not looks_like_prose:
260
+ chunks.append(
261
+ {
262
+ "chunk_id": _hash_id(text, "doc_code"),
263
+ "source": "documentation",
264
+ "source_name": source_name,
265
+ "source_url": source_url,
266
+ "language": "python",
267
+ "chunk_type": "code",
268
+ "content": text,
269
+ "chunk_index": chunk_index,
270
+ "metadata": {
271
+ "heading": current_heading,
272
+ "heading_level": current_heading_level,
273
+ "heading_path": heading_path(),
274
+ "line_start": start_line,
275
+ "line_end": end_line,
276
+ "inferred_block": True,
277
+ },
278
+ }
279
+ )
280
+ chunk_index += 1
281
+ return
282
+
283
+ # Default to text
284
+ chunks.append(
285
+ {
286
+ "chunk_id": _hash_id(text, "doc_text"),
287
+ "source": "documentation",
288
+ "source_name": source_name,
289
+ "source_url": source_url,
290
+ "language": "markdown",
291
+ "chunk_type": "text",
292
+ "content": text,
293
+ "chunk_index": chunk_index,
294
+ "metadata": {
295
+ "heading": current_heading,
296
+ "heading_level": current_heading_level,
297
+ "heading_path": heading_path(),
298
+ "line_start": start_line,
299
+ "line_end": end_line,
300
+ },
301
+ }
302
+ )
303
+ chunk_index += 1
304
+
305
+ def flush_code(start_line: int, end_line: int):
306
+ nonlocal code_lines, code_language, chunk_index
307
+ if not code_lines:
308
+ return
309
+
310
+ code = "\n".join(code_lines)
311
+ code_lines = []
312
+
313
+ # Check if this is actually code or just formatted text
314
+ is_actual_code = _is_actual_code(code)
315
+
316
+ if is_actual_code:
317
+ chunks.append(
318
+ {
319
+ "chunk_id": _hash_id(code, "doc_code"),
320
+ "source": "documentation",
321
+ "source_name": source_name,
322
+ "source_url": source_url,
323
+ "language": code_language or "unknown",
324
+ "chunk_type": "code",
325
+ "content": code,
326
+ "chunk_index": chunk_index,
327
+ "metadata": {
328
+ "heading": current_heading,
329
+ "heading_level": current_heading_level,
330
+ "heading_path": heading_path(),
331
+ "fenced_block": True,
332
+ "line_start": start_line,
333
+ "line_end": end_line,
334
+ "looks_executable": _looks_like_executable_code(code),
335
+ },
336
+ }
337
+ )
338
+ else:
339
+ # It's formatted text, not actual code
340
+ chunks.append(
341
+ {
342
+ "chunk_id": _hash_id(code, "doc_text"),
343
+ "source": "documentation",
344
+ "source_name": source_name,
345
+ "source_url": source_url,
346
+ "language": "markdown",
347
+ "chunk_type": "text",
348
+ "content": code,
349
+ "chunk_index": chunk_index,
350
+ "metadata": {
351
+ "heading": current_heading,
352
+ "heading_level": current_heading_level,
353
+ "heading_path": heading_path(),
354
+ "line_start": start_line,
355
+ "line_end": end_line,
356
+ "was_fenced_block": True, # Note: was in ``` but isn't code
357
+ },
358
+ }
359
+ )
360
+
361
+ chunk_index += 1
362
+ code_language = None
363
+
364
+ buffer_start_line = 0
365
+ code_start_line = 0
366
+
367
+ for i, line in enumerate(lines):
368
+ line_cursor = i + 1
369
+
370
+ # ---- Heading detection ----
371
+ m = re.match(r"^(#{2,6})\s+(.*)", line)
372
+ if not code_block and m:
373
+ flush_text(buffer_start_line, line_cursor - 1)
374
+
375
+ level = len(m.group(1))
376
+ title = m.group(2).strip()
377
+
378
+ # Maintain heading stack
379
+ heading_stack[:] = heading_stack[: level - 2]
380
+ heading_stack.append(title)
381
+
382
+ current_heading = title
383
+ current_heading_level = level
384
+ buffer_start_line = line_cursor
385
+ continue
386
+
387
+ # ---- Code fence detection ----
388
+ if line.strip().startswith("```"):
389
+ if not code_block:
390
+ flush_text(buffer_start_line, line_cursor - 1)
391
+ code_block = True
392
+ code_language = line.strip().replace("```", "").strip() or None
393
+ code_start_line = line_cursor + 1
394
+ else:
395
+ code_block = False
396
+ flush_code(code_start_line, line_cursor - 1)
397
+ buffer_start_line = line_cursor + 1
398
+ continue
399
+
400
+ if code_block:
401
+ code_lines.append(line)
402
+ else:
403
+ if not buffer:
404
+ buffer_start_line = line_cursor
405
+ buffer.append(line)
406
+
407
+ flush_text(buffer_start_line, line_cursor)
408
+ flush_code(code_start_line, line_cursor)
409
+
410
+ return chunks
411
+
412
+
413
+ def wrap_doc_chunks(doc_chunks: List[dict]) -> List[CodeChunk]:
414
+ """
415
+ Adapter: convert doc_chunker output (dict)
416
+ into CodeChunk(documentation).
417
+ Does NOT affect core doc_chunker parsing logic.
418
+ """
419
+ wrapped: List[CodeChunk] = []
420
+
421
+ for d in doc_chunks:
422
+ wrapped.append(
423
+ CodeChunk(
424
+ chunk_id=d["chunk_id"],
425
+ file_path=d["source_name"],
426
+ language=d.get("language", "markdown"),
427
+ chunk_type="documentation",
428
+ code=d["content"],
429
+ ast=ChunkAST(
430
+ symbol_type="documentation",
431
+ name=d.get("metadata", {}).get("heading"),
432
+ parent=d.get("metadata", {}).get("heading_path"),
433
+ ),
434
+ span=ChunkSpan(
435
+ start_line=d.get("metadata", {}).get("line_start"),
436
+ end_line=d.get("metadata", {}).get("line_end"),
437
+ ),
438
+ hierarchy=ChunkHierarchy(
439
+ is_primary=True,
440
+ is_extracted=True,
441
+ ),
442
+ metadata=d.get("metadata", {}),
443
+ )
444
+ )
445
+
446
+ return wrapped
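A quick way to exercise `chunk_document` and `wrap_doc_chunks` end to end. The sample Markdown is assembled line by line only so the fenced block stays readable here, and the source name is illustrative:

```python
from scripts.core.ingestion.doc_chunker import chunk_document, wrap_doc_chunks

sample_md = "\n".join([
    "## Quickstart",
    "Install the package, then run the snippet below.",
    "```python",
    "from mylib import hello",
    "hello()",
    "```",
])

raw = chunk_document(sample_md, source_name="quickstart.md")
for c in raw:
    # Expect one "text" chunk and one "code" chunk under the "Quickstart" heading.
    print(c["chunk_type"], c["metadata"]["heading"], len(c["content"]))

wrapped = wrap_doc_chunks(raw)  # -> List[CodeChunk] with chunk_type="documentation"
print(wrapped[0].ast.name, wrapped[0].span.start_line)
```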
scripts/core/ingestion/generate_data.py ADDED
@@ -0,0 +1,658 @@
1
+ """
2
+ Positive Pairs and Triplets Generator for Training Data
3
+
4
+ This module generates positive pairs and triplets from code chunks for
5
+ contrastive learning and similarity-based model training.
6
+
7
+ ARCHITECTURE POSITION:
8
+ - Training Data Generator: Creates pairs/triplets from code chunks
9
+ - Question Generator: Creates natural language queries for code
10
+ - Variance Generator: Creates multiple variations of pairs
11
+
12
+ KEY FEATURES:
13
+ 1. Positive Pairs: (question, code) with 4-5 variations per sample
14
+ 2. Triplets: (anchor_question, positive_code, negative_code)
15
+ 3. Global ID tracking via chunk_id
16
+ 4. Supports code-to-question and question-to-code mappings
17
+
18
+ OUTPUT FORMATS:
19
+ Positive Pairs:
20
+ {
+ "document_id": "<chunk_id>",
+ "variations": [
+ {"anchor": "How to create a state graph with conditional edges?", "positive": "<code snippet>"},
+ ...
+ ],
+ "framework": "<framework name>"
+ }
26
+
27
+ Triplets:
28
+ {
+ "document_id": "<chunk_id>",
+ "anchor": "How to create a reusable prompt template?",
+ "positive": "<relevant code>",
+ "negative": "<irrelevant code>",
+ "framework": "<framework name>"
+ }
35
+
36
+ USAGE:
37
+ from scripts.core.ingestion.generate_data import generate_pairs_and_triplets
38
+
39
+ pairs, triplets = generate_pairs_and_triplets(
40
+ chunks_path="data/processed/chunks/chunks.jsonl",
41
+ output_dir="data/processed/training",
42
+ num_pairs=100,
43
+ variance=5
44
+ )
45
+ """
46
+
47
+ import json
48
+ import random
49
+ import hashlib
50
+ from pathlib import Path
51
+ from typing import List, Dict, Any, Optional, Tuple
52
+ from dataclasses import dataclass, field, asdict
53
+
54
+
55
+ @dataclass
56
+ class PositivePairVariation:
57
+ """A single anchor-positive variation."""
58
+ anchor: str # Question (natural language query)
59
+ positive: str # Code snippet
60
+
61
+
62
+ @dataclass
63
+ class PositivePair:
64
+ """A positive pair document with multiple anchor-positive variations.
65
+
66
+ Format:
67
+ {
68
+ "document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
69
+ "variations": [
70
+ {"anchor": "How does async aadd_documents work in Python?", "positive": "<code>"},
71
+ {"anchor": "What is the implementation of aadd_documents?", "positive": "<code>"},
72
+ {"anchor": "How to implement async aadd_documents?", "positive": "<code>"},
73
+ {"anchor": "Show the async aadd_documents code", "positive": "<code>"},
74
+ {"anchor": "Explain async aadd_documents function", "positive": "<code>"}
75
+ ],
76
+ "framework": "crewai"
77
+ }
78
+ """
79
+ document_id: str # Original chunk_id
80
+ variations: List[PositivePairVariation] # List of (anchor, positive) pairs
81
+ framework: str # Framework name from file path
82
+
83
+
84
+ @dataclass
85
+ class Triplet:
86
+ """A triplet for contrastive learning.
87
+
88
+ Format:
89
+ {
90
+ "document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
91
+ "anchor": "Best practices for async aadd_documents",
92
+ "positive": "async def aadd_documents(...)",
93
+ "negative": "async def async_agent(self):...",
94
+ "framework": "crewai"
95
+ }
96
+ """
97
+ document_id: str # Original chunk_id
98
+ anchor: str # Question (natural language query)
99
+ positive: str # Relevant code snippet
100
+ negative: str # Irrelevant/different code snippet
101
+ framework: str # Framework name from file path
102
+
103
+
104
+ # Question templates for different code patterns - IMPROVED for cleaner questions
105
+ QUESTION_TEMPLATES = {
106
+ "class": [
107
+ "How does the {name} class work in Python?",
108
+ "What is the implementation of the {name} class?",
109
+ "How to create a {name} class?",
110
+ "Show me the {name} class implementation",
111
+ "Explain the {name} class structure",
112
+ ],
113
+ "function": [
114
+ "How does {name} function work in Python?",
115
+ "What is the implementation of {name}?",
116
+ "How to implement the {name} function?",
117
+ "Show the code for {name} function",
118
+ "Explain how {name} works",
119
+ ],
120
+ "method": [
121
+ "How does the {name} method work in Python?",
122
+ "What is the implementation of {name} method?",
123
+ "How to implement the {name} method?",
124
+ "Show me the {name} method code",
125
+ "Explain the {name} method",
126
+ ],
127
+ "async_function": [
128
+ "How does async {name} work in Python?",
129
+ "What is the async implementation of {name}?",
130
+ "How to implement async {name}?",
131
+ "Show the async {name} code",
132
+ "Explain async {name} function",
133
+ ],
134
+ "module": [
135
+ "How to implement {name} module?",
136
+ "What's the structure of {name}?",
137
+ "Show the {name} module implementation",
138
+ "Explain the {name} module",
139
+ "How does {name} module work?",
140
+ ],
141
+ "workflow": [
142
+ "How to create a {name} workflow?",
143
+ "What's the pattern for {name}?",
144
+ "Show the {name} workflow implementation",
145
+ "Explain the {name} workflow",
146
+ "How does the {name} workflow work?",
147
+ ],
148
+ }
149
+
150
+ # Variance templates to create multiple questions for the same code
151
+ VARIANCE_TEMPLATES = [
152
+ "How to {action}?",
153
+ "What's the code for {action}?",
154
+ "Show me how to {action}",
155
+ "Implement {action}",
156
+ "Write code that {action}",
157
+ ]
158
+
159
+
160
+ def extract_code_context(code: str, ast_info: Dict, file_path: str) -> Dict[str, str]:
161
+ """Extract contextual information from code for question generation."""
162
+ context = {
163
+ "name": ast_info.get("name", "unknown"),
164
+ "parent": ast_info.get("parent", ""),
165
+ "symbol_type": ast_info.get("symbol_type", "unknown"),
166
+ "docstring": ast_info.get("docstring", ""),
167
+ "file_name": Path(file_path).stem if file_path else "unknown",
168
+ }
169
+
170
+ # Extract purpose/description from docstring or code patterns
171
+ if context["docstring"]:
172
+ # Use first sentence of docstring as description
173
+ desc = context["docstring"].split(".")[0].strip()
174
+ context["description"] = desc[:100] if len(desc) > 100 else desc
175
+ else:
176
+ # Generate description from code patterns
177
+ context["description"] = _infer_description(code, context["name"])
178
+
179
+ context["purpose"] = context["description"].lower()
180
+
181
+ return context
182
+
183
+
184
+ def _infer_description(code: str, name: str) -> str:
185
+ """Infer a description from code patterns when no docstring exists."""
186
+ code_lower = code.lower()
187
+
188
+ # Common patterns
189
+ if "stategraph" in code_lower or "workflow" in code_lower:
190
+ return f"building a stateful workflow"
191
+ elif "agent" in code_lower:
192
+ return f"creating an AI agent"
193
+ elif "tool" in code_lower or "@tool" in code:
194
+ return f"implementing a tool"
195
+ elif "async" in code_lower:
196
+ return f"async operations"
197
+ elif "api" in code_lower or "request" in code_lower:
198
+ return f"API interactions"
199
+ elif "database" in code_lower or "sql" in code_lower:
200
+ return f"database operations"
201
+ elif "parse" in code_lower:
202
+ return f"parsing data"
203
+ elif "format" in code_lower:
204
+ return f"formatting output"
205
+ elif "template" in code_lower:
206
+ return f"creating templates"
207
+ elif "filter" in code_lower:
208
+ return f"filtering data"
209
+ elif "search" in code_lower:
210
+ return f"search functionality"
211
+ elif "create" in code_lower or "build" in code_lower:
212
+ return f"building {name}"
213
+ else:
214
+ return f"implementing {name}"
215
+
216
+
217
+ def generate_question(code: str, ast_info: Dict, file_path: str,
218
+ variation_index: int = 0) -> str:
219
+ """Generate a clean natural language question for a code snippet."""
220
+ name = ast_info.get("name", "unknown")
221
+ symbol_type = ast_info.get("symbol_type", "function")
222
+
223
+ # Clean up the name for display
224
+ clean_name = name.replace("_", " ") if name else "this code"
225
+
226
+ # Check if it's async
227
+ is_async = code.strip().startswith("async ") or "async def" in code[:100]
228
+
229
+ # Determine template category
230
+ if is_async and symbol_type in ("function", "method"):
231
+ template_category = "async_function"
232
+ elif symbol_type in QUESTION_TEMPLATES:
233
+ template_category = symbol_type
234
+ elif "graph" in code.lower() or "workflow" in code.lower() or "state" in code.lower():
235
+ template_category = "workflow"
236
+ else:
237
+ template_category = "function"
238
+
239
+ templates = QUESTION_TEMPLATES[template_category]
240
+
241
+ # Select template based on variation index
242
+ template_idx = variation_index % len(templates)
243
+ template = templates[template_idx]
244
+
245
+ # Fill in template with clean name
246
+ question = template.format(name=name)
247
+
248
+ return question
249
+
250
+
251
+ def generate_question_variations(code: str, ast_info: Dict, file_path: str,
252
+ num_variations: int = 5) -> List[str]:
253
+ """Generate multiple unique question variations for a code snippet."""
254
+ questions = []
255
+ seen_questions = set()
256
+
257
+ # Generate primary variations using templates
258
+ for i in range(num_variations):
259
+ q = generate_question(code, ast_info, file_path, variation_index=i)
260
+ q_lower = q.lower()
261
+ if q_lower not in seen_questions:
262
+ questions.append(q)
263
+ seen_questions.add(q_lower)
264
+
265
+ # Return exactly num_variations (templates should provide enough)
266
+ return questions[:num_variations]
267
+
268
+
269
+ def extract_framework(file_path: str) -> str:
270
+ """Extract framework name from file path.
271
+
272
+ Examples:
273
+ 'data/raw/codebases/crewai/...' -> 'crewai'
274
+ 'data/raw/codebases/langgraph/...' -> 'langgraph'
275
+ 'data/processed/repos/langgraph_20260116/...' -> 'langgraph'
276
+ """
277
+ path_lower = file_path.lower()
278
+
279
+ # Known frameworks to detect
280
+ frameworks = [
281
+ "crewai", "langgraph", "langchain", "autogen", "llamaindex",
282
+ "dspy", "haystack", "semantic_kernel", "fastapi", "flask", "django"
283
+ ]
284
+
285
+ for framework in frameworks:
286
+ if framework in path_lower:
287
+ return framework
288
+
289
+ # Try to extract from path structure
290
+ parts = file_path.replace("\\", "/").split("/")
291
+ for part in parts:
292
+ if "codebases" in parts or "repos" in parts:
293
+ # Get the next part after codebases/repos
294
+ try:
295
+ idx = parts.index("codebases") if "codebases" in parts else parts.index("repos")
296
+ if idx + 1 < len(parts):
297
+ framework_part = parts[idx + 1].split("_")[0] # Handle 'langgraph_20260116'
298
+ if framework_part and framework_part not in ["raw", "processed"]:
299
+ return framework_part
300
+ except (ValueError, IndexError):
301
+ pass
302
+
303
+ return "unknown"
304
+
305
+
306
+ def is_semantically_different(chunk1: Dict, chunk2: Dict) -> bool:
307
+ """Check if two chunks are semantically different (good for negative pairs)."""
308
+ # Different symbol types
309
+ type1 = chunk1.get("ast", {}).get("symbol_type", "")
310
+ type2 = chunk2.get("ast", {}).get("symbol_type", "")
311
+
312
+ # Different purposes (check for different keywords)
313
+ code1 = chunk1.get("code", "").lower()
314
+ code2 = chunk2.get("code", "").lower()
315
+
316
+ # Keywords that indicate different functionality
317
+ keywords = [
318
+ "parse", "format", "create", "delete", "update", "read", "write",
319
+ "input", "output", "agent", "tool", "graph", "state", "workflow",
320
+ "template", "filter", "search", "database", "api", "async"
321
+ ]
322
+
323
+ keywords1 = set(k for k in keywords if k in code1)
324
+ keywords2 = set(k for k in keywords if k in code2)
325
+
326
+ # Consider different if keyword overlap is low
327
+ if not keywords1 or not keywords2:
328
+ return type1 != type2
329
+
330
+ overlap = len(keywords1 & keywords2) / len(keywords1 | keywords2)
331
+ return overlap < 0.3
332
+
333
+
334
+ def select_negative_sample(anchor_chunk: Dict, all_chunks: List[Dict],
335
+ max_attempts: int = 50) -> Optional[Dict]:
336
+ """Select a semantically different chunk as negative sample."""
337
+ anchor_id = anchor_chunk.get("chunk_id", "")
338
+
339
+ # Shuffle chunks for random selection
340
+ candidates = [c for c in all_chunks if c.get("chunk_id") != anchor_id]
341
+ random.shuffle(candidates)
342
+
343
+ for candidate in candidates[:max_attempts]:
344
+ if is_semantically_different(anchor_chunk, candidate):
345
+ return candidate
346
+
347
+ # Fallback: return any different chunk
348
+ if candidates:
349
+ return candidates[0]
350
+ return None
351
+
352
+
353
+ def load_chunks(chunks_path: Path) -> List[Dict]:
354
+ """Load chunks from JSONL file."""
355
+ chunks = []
356
+ with open(chunks_path, "r", encoding="utf-8") as f:
357
+ for line in f:
358
+ line = line.strip()
359
+ if line:
360
+ try:
361
+ chunks.append(json.loads(line))
362
+ except json.JSONDecodeError:
363
+ continue
364
+ return chunks
365
+
366
+
367
+ def filter_valid_chunks(chunks: List[Dict], min_code_length: int = 50) -> List[Dict]:
368
+ """Filter chunks that are suitable for training pairs."""
369
+ valid_chunks = []
370
+
371
+ for chunk in chunks:
372
+ code = chunk.get("code", "")
373
+ chunk_type = chunk.get("chunk_type", "")
374
+ ast_info = chunk.get("ast", {})
375
+
376
+ # Skip empty or very short chunks
377
+ if len(code) < min_code_length:
378
+ continue
379
+
380
+ # Skip pure imports or empty modules
381
+ if chunk_type == "imports" or (chunk_type == "module" and not ast_info.get("docstring")):
382
+ symbol_type = ast_info.get("symbol_type", "")
383
+ if symbol_type == "imports":
384
+ continue
385
+
386
+ # Skip __init__ files without content
387
+ if "__init__" in chunk.get("file_path", "") and len(code) < 100:
388
+ continue
389
+
390
+ valid_chunks.append(chunk)
391
+
392
+ return valid_chunks
393
+
394
+
395
+ def generate_positive_pairs(chunks: List[Dict], num_pairs: int = 100,
396
+ variance: int = 5) -> List[PositivePair]:
397
+ """
398
+ Generate positive pairs from chunks with multiple (anchor, positive) variations per document.
399
+
400
+ Output format:
401
+ {
402
+ "document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
403
+ "variations": [
404
+ {"anchor": "How does async aadd_documents work in Python?", "positive": "<code>"},
405
+ {"anchor": "What is the implementation of aadd_documents?", "positive": "<code>"},
406
+ ...
407
+ ],
408
+ "framework": "crewai"
409
+ }
410
+
411
+ Args:
412
+ chunks: List of code chunks
413
+ num_pairs: Number of documents to generate (each with `variance` variations)
414
+ variance: Number of (anchor, positive) variations per document (4-5 recommended)
415
+
416
+ Returns:
417
+ List of PositivePair objects (one per document, each with multiple variations)
418
+ """
419
+ pairs = []
420
+
421
+ # Filter valid chunks
422
+ valid_chunks = filter_valid_chunks(chunks)
423
+
424
+ # Sample chunks if needed
425
+ if len(valid_chunks) > num_pairs:
426
+ selected_chunks = random.sample(valid_chunks, num_pairs)
427
+ else:
428
+ selected_chunks = valid_chunks
429
+
430
+ for chunk in selected_chunks:
431
+ code = chunk.get("code", "")
432
+ ast_info = chunk.get("ast", {})
433
+ file_path = chunk.get("file_path", "")
434
+ document_id = chunk.get("chunk_id", "")
435
+
436
+ # Extract framework from file path
437
+ framework = extract_framework(file_path)
438
+
439
+ # Generate multiple question variations
440
+ anchors = generate_question_variations(code, ast_info, file_path, variance)
441
+
442
+ # Create variations list with (anchor, positive) pairs
443
+ variations = [
444
+ PositivePairVariation(anchor=anchor, positive=code)
445
+ for anchor in anchors
446
+ ]
447
+
448
+ pair = PositivePair(
449
+ document_id=document_id,
450
+ variations=variations,
451
+ framework=framework
452
+ )
453
+ pairs.append(pair)
454
+
455
+ return pairs
456
+
457
+
458
+ def generate_triplets(chunks: List[Dict], num_triplets: int = 100) -> List[Triplet]:
459
+ """
460
+ Generate triplets from chunks (no variations, flat structure).
461
+
462
+ Output format:
463
+ {
464
+ "document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
465
+ "anchor": "Best practices for async aadd_documents",
466
+ "positive": "async def aadd_documents(...)",
467
+ "negative": "async def async_agent(self):...",
468
+ "framework": "crewai"
469
+ }
470
+
471
+ Args:
472
+ chunks: List of code chunks
473
+ num_triplets: Number of triplets to generate (100, no variance)
474
+
475
+ Returns:
476
+ List of Triplet objects
477
+ """
478
+ triplets = []
479
+
480
+ # Filter valid chunks
481
+ valid_chunks = filter_valid_chunks(chunks)
482
+
483
+ if len(valid_chunks) < 2:
484
+ return triplets
485
+
486
+ # Sample chunks if needed
487
+ if len(valid_chunks) > num_triplets:
488
+ selected_chunks = random.sample(valid_chunks, num_triplets)
489
+ else:
490
+ selected_chunks = valid_chunks
491
+
492
+ for anchor_chunk in selected_chunks:
493
+ # Find a semantically different chunk as negative
494
+ negative_chunk = select_negative_sample(anchor_chunk, valid_chunks)
495
+
496
+ if negative_chunk is None:
497
+ continue
498
+
499
+ code = anchor_chunk.get("code", "")
500
+ ast_info = anchor_chunk.get("ast", {})
501
+ file_path = anchor_chunk.get("file_path", "")
502
+ document_id = anchor_chunk.get("chunk_id", "")
503
+
504
+ # Extract framework from file path
505
+ framework = extract_framework(file_path)
506
+
507
+ # Generate question for anchor
508
+ question = generate_question(code, ast_info, file_path)
509
+
510
+ triplet = Triplet(
511
+ document_id=document_id,
512
+ anchor=question,
513
+ positive=code,
514
+ negative=negative_chunk.get("code", ""),
515
+ framework=framework
516
+ )
517
+ triplets.append(triplet)
518
+
519
+ return triplets
520
+
521
+
522
+ def export_pairs_jsonl(pairs: List[PositivePair], output_path: Path) -> None:
523
+ """Export positive pairs to JSONL file."""
524
+ output_path.parent.mkdir(parents=True, exist_ok=True)
525
+
526
+ with open(output_path, "w", encoding="utf-8") as f:
527
+ for pair in pairs:
528
+ f.write(json.dumps(asdict(pair), ensure_ascii=False) + "\n")
529
+
530
+ print(f"Exported {len(pairs)} positive pairs to {output_path}")
531
+
532
+ def export_triplets_jsonl(triplets: List[Triplet], output_path: Path) -> None:
533
+ """Export triplets to JSONL file."""
534
+ output_path.parent.mkdir(parents=True, exist_ok=True)
535
+
536
+ with open(output_path, "w", encoding="utf-8") as f:
537
+ for triplet in triplets:
538
+ f.write(json.dumps(asdict(triplet), ensure_ascii=False) + "\n")
539
+
540
+ print(f"Exported {len(triplets)} triplets to {output_path}")
541
+
542
+ def export_pairs_json(pairs: List[PositivePair], output_path: Path) -> None:
543
+ """Export positive pairs to JSON file (list format for easier inspection)."""
544
+ output_path.parent.mkdir(parents=True, exist_ok=True)
545
+
546
+ data = [asdict(p) for p in pairs]
547
+ with open(output_path, "w", encoding="utf-8") as f:
548
+ json.dump(data, f, indent=2, ensure_ascii=False)
549
+
550
+ print(f"Exported {len(pairs)} positive pairs to {output_path}")
551
+
552
+ def export_triplets_json(triplets: List[Triplet], output_path: Path) -> None:
553
+ """Export triplets to JSON file (flat list format)."""
554
+ output_path.parent.mkdir(parents=True, exist_ok=True)
555
+
556
+ data = [asdict(t) for t in triplets]
557
+
558
+ with open(output_path, "w", encoding="utf-8") as f:
559
+ json.dump(data, f, indent=2, ensure_ascii=False)
560
+
564
+ print(f"Exported {len(triplets)} triplets to {output_path}")
565
+
566
+
567
+ def generate_pairs_and_triplets(
568
+ chunks_path: Path,
569
+ output_dir: Path,
570
+ num_pairs: int = 100,
571
+ num_triplets: int = 100,
572
+ variance: int = 5,
573
+ export_format: str = "both" # "jsonl", "json", or "both"
574
+ ) -> Tuple[List[PositivePair], List[Triplet]]:
575
+ """
576
+ Main function to generate positive pairs and triplets from chunks.
577
+
578
+ Args:
579
+ chunks_path: Path to chunks JSONL file
580
+ output_dir: Directory to save output files
581
+ num_pairs: Number of base pairs (will generate num_pairs * variance total)
582
+ num_triplets: Number of triplets (no variance)
583
+ variance: Number of variations per positive pair (4-5)
584
+ export_format: Output format ("jsonl", "json", or "both")
585
+
586
+ Returns:
587
+ Tuple of (pairs, triplets)
588
+ """
589
+ print(f"Loading chunks from {chunks_path}...")
590
+ chunks = load_chunks(chunks_path)
591
+ print(f" Loaded {len(chunks)} chunks")
592
+
593
+ # Generate positive pairs with variance
594
+ print(f"Generating positive pairs (base={num_pairs}, variance={variance})...")
595
+ pairs = generate_positive_pairs(chunks, num_pairs=num_pairs, variance=variance)
596
+ print(f" Generated {len(pairs)} positive pairs")
597
+
598
+ # Generate triplets (no variance)
599
+ print(f"Generating triplets (count={num_triplets})...")
600
+ triplets = generate_triplets(chunks, num_triplets=num_triplets)
601
+ print(f" Generated {len(triplets)} triplets")
602
+
603
+ # Create output directory
604
+ output_dir = Path(output_dir)
605
+ output_dir.mkdir(parents=True, exist_ok=True)
606
+
607
+ # Export based on format
608
+ if export_format in ("jsonl", "both"):
609
+ export_pairs_jsonl(pairs, output_dir / "positive_pairs.jsonl")
610
+ export_triplets_jsonl(triplets, output_dir / "triplets.jsonl")
611
+
612
+ if export_format in ("json", "both"):
613
+ export_pairs_json(pairs, output_dir / "positive_pairs.json")
614
+ export_triplets_json(triplets, output_dir / "triplets.json")
615
+
616
+ # Print summary statistics
617
+ print("Summary Statistics:")
618
+ print(f" Total Positive Pair Documents: {len(pairs)}")
619
+ print(f" Total Variations: {sum(len(p.variations) for p in pairs)}")
620
+ print(f" Total Triplets: {len(triplets)}")
621
+
622
+ return pairs, triplets
623
+
624
+
625
+
626
+ def main():
627
+ """CLI entry point for generating pairs and triplets."""
628
+ import argparse
629
+
630
+ parser = argparse.ArgumentParser(description="Generate positive pairs and triplets from code chunks")
631
+ parser.add_argument("--chunks", "-c", type=str, required=True,
632
+ help="Path to chunks JSONL file")
633
+ parser.add_argument("--output", "-o", type=str, required=True,
634
+ help="Output directory for generated files")
635
+ parser.add_argument("--pairs", "-p", type=int, default=100,
636
+ help="Number of base positive pairs (default: 100)")
637
+ parser.add_argument("--triplets", "-t", type=int, default=100,
638
+ help="Number of triplets (default: 100)")
639
+ parser.add_argument("--variance", "-v", type=int, default=5,
640
+ help="Number of variations per pair (default: 5)")
641
+ parser.add_argument("--format", "-f", type=str, default="both",
642
+ choices=["jsonl", "json", "both"],
643
+ help="Output format (default: both)")
644
+
645
+ args = parser.parse_args()
646
+
647
+ generate_pairs_and_triplets(
648
+ chunks_path=Path(args.chunks),
649
+ output_dir=Path(args.output),
650
+ num_pairs=args.pairs,
651
+ num_triplets=args.triplets,
652
+ variance=args.variance,
653
+ export_format=args.format
654
+ )
655
+
656
+
657
+ if __name__ == "__main__":
658
+ main()
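Besides the CLI entry point above, the generator can be called directly. A minimal programmatic sketch, assuming the module is importable as `scripts.core.ingestion.generate_data` and that a chunks JSONL file already exists at the path shown (both are assumptions about the local layout):

```python
from pathlib import Path
from scripts.core.ingestion.generate_data import generate_pairs_and_triplets

pairs, triplets = generate_pairs_and_triplets(
    chunks_path=Path("data/processed/chunks/chunks.jsonl"),  # produced by the ingestion pipeline
    output_dir=Path("data/processed/training"),
    num_pairs=50,
    num_triplets=50,
    variance=5,
    export_format="jsonl",
)
print(f"{len(pairs)} pair documents, {len(triplets)} triplets")
```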
scripts/core/ingestion/hierarchical_chunker.py ADDED
@@ -0,0 +1,182 @@
1
+ """
2
+ Hierarchical chunk coordinator - Orchestrates AST and Tree-sitter chunking.
3
+
4
+ This module serves as the coordination layer that integrates AST (semantic)
5
+ and Tree-sitter (syntactic) chunking. It ensures that:
6
+ 1. AST chunks get precise byte spans from Tree-sitter
7
+ 2. Hierarchy relationships are preserved across both sources
8
+ 3. Parent-child relationships are correctly established
9
+ 4. All chunks have consistent metadata and structure
10
+
11
+ ARCHITECTURE POSITION:
12
+ - Coordination Layer: Integrates AST and Tree-sitter
13
+ - Relationship Manager: Maintains parent-child links
14
+ - Quality Enforcer: Ensures consistent chunk structure
15
+
16
+ KEY RESPONSIBILITIES:
17
+ 1. Enrich AST chunks with Tree-sitter byte spans
18
+ 2. Build and verify hierarchy relationships
19
+ 3. Create secondary chunks for extracted content
20
+ 4. Ensure type safety across all chunk operations
21
+
22
+ FLOW:
23
+ File → AST chunks (semantic) + Tree-sitter chunks (spans)
24
+ → HierarchicalChunker.enrich_and_link()
25
+ → Final chunks with hierarchy + precise spans
26
+
27
+ USAGE:
28
+ chunker = HierarchicalChunker()
29
+ chunks = chunker.chunk_file(Path("file.py"))
30
+ """
31
+
32
+ from pathlib import Path
33
+ from typing import List, Dict, Optional, Tuple, Set, cast
34
+ import uuid
35
+
36
+ from .ast_chunker import extract_ast_chunks
37
+ from .ts_chunker import extract_ts_chunks
38
+ from .chunk_schema import CodeChunk, ChunkHierarchy, ChunkType
39
+
40
+
41
+ class HierarchicalChunker:
42
+ def __init__(self):
43
+ self.chunks_by_id: Dict[str, CodeChunk] = {}
44
+ self.imports_by_file: Dict[str, str] = {} # Track imports chunks by file
45
+
46
+ # ---------------- helpers ----------------
47
+
48
+ def _build_ts_span_map(
49
+ self, ts_chunks: List[CodeChunk]
50
+ ) -> Dict[Tuple[int, int], CodeChunk]:
51
+ span_map: Dict[Tuple[int, int], CodeChunk] = {}
52
+
53
+ for c in ts_chunks:
54
+ if c.span.start_line is None or c.span.end_line is None:
55
+ continue
56
+
57
+ span_map[(c.span.start_line, c.span.end_line)] = c
58
+
59
+ return span_map
60
+
61
+ def _enrich_spans_with_tree_sitter(
62
+ self, ast_chunks: List[CodeChunk], ts_chunks: List[CodeChunk]
63
+ ) -> List[CodeChunk]:
64
+ """Enrich AST chunks with Tree-sitter precise byte spans"""
65
+ ts_span_map = self._build_ts_span_map(ts_chunks)
66
+
67
+ for ast_chunk in ast_chunks:
68
+ if ast_chunk.span.start_line is not None and ast_chunk.span.end_line is not None:
69
+ key: Tuple[int, int] = (ast_chunk.span.start_line, ast_chunk.span.end_line)
70
+ ts_match = ts_span_map.get(key)
71
+
72
+ if ts_match:
73
+ # Update byte spans from Tree-sitter
74
+ ast_chunk.span.start_byte = ts_match.span.start_byte
75
+ ast_chunk.span.end_byte = ts_match.span.end_byte
76
+
77
+ return ast_chunks
78
+
79
+ def _preserve_hierarchy_relationships(self, all_chunks: List[CodeChunk]) -> None:
80
+ """Ensure all hierarchy relationships are preserved with proper typing"""
81
+ # Build mapping for quick lookup
82
+ for chunk in all_chunks:
83
+ self.chunks_by_id[chunk.chunk_id] = chunk
84
+
85
+ # Verify and fix parent-child relationships with type safety
86
+ for chunk in all_chunks:
87
+ # Ensure hierarchy exists
88
+ if not hasattr(chunk, 'hierarchy') or chunk.hierarchy is None:
89
+ chunk.hierarchy = ChunkHierarchy()
90
+
91
+ if chunk.hierarchy.parent_id:
92
+ parent = self.chunks_by_id.get(chunk.hierarchy.parent_id)
93
+ if parent:
94
+ # Ensure parent has hierarchy
95
+ if not hasattr(parent, 'hierarchy') or parent.hierarchy is None:
96
+ parent.hierarchy = ChunkHierarchy()
97
+
98
+ # Add child to parent with type safety
99
+ if chunk.chunk_id not in parent.hierarchy.children_ids:
100
+ parent.hierarchy.children_ids.append(chunk.chunk_id)
101
+
102
+ def _create_secondary_chunks_for_extracted_content(
103
+ self, ast_chunks: List[CodeChunk]
104
+ ) -> List[CodeChunk]:
105
+ """Create secondary chunks for extracted content (if needed)"""
106
+ secondary_chunks: List[CodeChunk] = []
107
+
108
+ # Currently, our AST chunker creates everything as primary
109
+ # This method is for future extensions
110
+ return secondary_chunks
111
+
112
+ def _update_hierarchy_relationships(self, all_chunks: List[CodeChunk]) -> None:
113
+ """Update parent-child relationships based on AST parent field with proper typing"""
114
+ # Create mapping from (name, type) to chunk_id
115
+ chunk_map: Dict[Tuple[Optional[str], ChunkType], str] = {}
116
+
117
+ for chunk in all_chunks:
118
+ if chunk.ast and chunk.ast.name:
119
+ key = (chunk.ast.name, chunk.chunk_type)
120
+ chunk_map[key] = chunk.chunk_id
121
+
122
+ # Update parent relationships with type safety
123
+ for chunk in all_chunks:
124
+ # Ensure hierarchy exists
125
+ if not hasattr(chunk, 'hierarchy') or chunk.hierarchy is None:
126
+ chunk.hierarchy = ChunkHierarchy()
127
+
128
+ if chunk.ast and chunk.ast.parent and chunk.ast.parent != "None":
129
+ # Determine parent type based on current chunk type
130
+ parent_type: ChunkType = "class" if chunk.chunk_type == "method" else "module"
131
+
132
+ # Try to find parent chunk
133
+ parent_key = (chunk.ast.parent, parent_type)
134
+ parent_id = chunk_map.get(parent_key)
135
+
136
+ if parent_id and parent_id in self.chunks_by_id:
137
+ chunk.hierarchy.parent_id = parent_id
138
+
139
+ # Add this chunk to parent's children with type safety
140
+ parent_chunk = self.chunks_by_id.get(parent_id)
141
+ if parent_chunk:
142
+ # Ensure parent has hierarchy
143
+ if not hasattr(parent_chunk, 'hierarchy') or parent_chunk.hierarchy is None:
144
+ parent_chunk.hierarchy = ChunkHierarchy()
145
+
146
+ if chunk.chunk_id not in parent_chunk.hierarchy.children_ids:
147
+ parent_chunk.hierarchy.children_ids.append(chunk.chunk_id)
148
+
149
+ # Set depth based on parent relationships
150
+ for chunk in all_chunks:
151
+ if chunk.hierarchy.parent_id:
152
+ parent = self.chunks_by_id.get(chunk.hierarchy.parent_id)
153
+ if parent and hasattr(parent, 'hierarchy') and parent.hierarchy:
154
+ chunk.hierarchy.depth = parent.hierarchy.depth + 1
155
+
156
+ # ---------------- public API ----------------
157
+
158
+ def chunk_file(self, file_path: Path) -> List[CodeChunk]:
159
+ self.chunks_by_id.clear()
160
+ self.imports_by_file.clear()
161
+
162
+ try:
163
+ ast_chunks = extract_ast_chunks(file_path)
164
+ except SyntaxError:
165
+ ast_chunks = []
166
+
167
+ # Get Tree-sitter chunks for byte-level precision
168
+ ts_chunks = extract_ts_chunks(file_path)
169
+
170
+ # Enrich AST chunks with Tree-sitter byte spans
171
+ enriched_chunks = self._enrich_spans_with_tree_sitter(ast_chunks, ts_chunks)
172
+
173
+ # Update hierarchy relationships with proper typing
174
+ self._update_hierarchy_relationships(enriched_chunks)
175
+
176
+ # Preserve any existing relationships
177
+ self._preserve_hierarchy_relationships(enriched_chunks)
178
+
179
+ # Create any needed secondary chunks
180
+ secondary_chunks = self._create_secondary_chunks_for_extracted_content(enriched_chunks)
181
+
182
+ return enriched_chunks + secondary_chunks
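A minimal usage sketch for the coordinator; the target file path is illustrative and the import assumes the `scripts.core.ingestion` package layout:

```python
from pathlib import Path
from scripts.core.ingestion.hierarchical_chunker import HierarchicalChunker

chunker = HierarchicalChunker()
chunks = chunker.chunk_file(Path("scripts/core/ingestion/chunk.py"))

for c in chunks:
    # Each chunk carries its semantic type, AST name, and hierarchy links.
    print(c.chunk_type, c.ast.name, "parent:", c.hierarchy.parent_id, "depth:", c.hierarchy.depth)
```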
scripts/core/ingestion/ingest.py ADDED
@@ -0,0 +1,380 @@
1
+ """
2
+ Git Repository Crawler - Intelligent repository cloning and file listing system.
3
+
4
+ This module serves as the entry point for ingesting Git repositories into our
5
+ dataset pipeline. It handles cloning, file listing, metadata extraction, and
6
+ statistics generation with multiple strategies for different use cases.
7
+
8
+ ARCHITECTURE POSITION:
9
+ - Ingestion Layer: Entry point for Git repositories
10
+ - File Discovery: Finds and filters repository files
11
+ - Metadata Collector: Gathers repo-level information
12
+
13
+ KEY FEATURES:
14
+ 1. Multi-strategy file listing (fast/rich/smart)
15
+ 2. Intelligent binary detection and filtering
16
+ 3. Repository metadata extraction with git history
17
+ 4. Agentic framework detection (through RepoMetadataExtractor)
18
+ 5. Repository statistics and cleanup utilities
19
+
20
+ DATA FLOW:
21
+ Repository URL → Clone → File Discovery → Filtering → File Info/Metadata → Output
22
+
23
+ USE CASES:
24
+ - FAST: When only file paths are needed (performance-critical)
25
+ - RICH: When full metadata is required (dataset building)
26
+ - SMART: Auto-chooses based on needs (balanced approach)
27
+
28
+ USAGE:
29
+ crawler = GitCrawler()
30
+ repo_path = crawler.clone_repository("https://github.com/org/repo.git")
31
+ files_fast = crawler.list_files_fast(repo_path, extensions={'.py'})
32
+ files_rich, stats = crawler.list_files_with_info(repo_path)
33
+ """
34
+
35
+ import subprocess
36
+ from pathlib import Path
37
+ from typing import List, Optional, Set, Dict, Tuple, Union, cast
38
+ import os
39
+ from dataclasses import dataclass
40
+ import time
41
+ from .repo_metadata import RepoMetadataExtractor
42
+
43
+
44
+ @dataclass
45
+ class RepoFileInfo:
46
+ """Lightweight file info - optional for when you need it"""
47
+ path: Path
48
+ relative_path: str
49
+ size: int = 0
50
+ extension: str = ""
51
+ is_binary: Optional[bool] = None
52
+
53
+
54
+ class GitCrawler:
55
+ """
56
+ Optimized Git crawler with fast listing + optional rich info
57
+ """
58
+
59
+ def __init__(self, cache_dir: Path = Path("data/raw/repos")):
60
+ self.cache_dir = cache_dir
61
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
62
+
63
+ # -------- CORE: Cloning (same for both) --------
64
+ def clone_repository(self, repo_url: str) -> Optional[Path]:
65
+ """Clone a repository if not already cloned"""
66
+ repo_name = self._extract_repo_name(repo_url)
67
+ repo_path = self.cache_dir / repo_name
68
+
69
+ if repo_path.exists():
70
+ print(f"Repository already exists: {repo_path}")
71
+ return repo_path
72
+
73
+ print(f"Cloning {repo_url}...")
74
+ cmd = ["git", "clone", "--depth", "1", repo_url, str(repo_path)]
75
+
76
+ try:
77
+ start_time = time.time()
78
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True)
79
+ elapsed = time.time() - start_time
80
+ print(f"Cloned to {repo_path} ({elapsed:.1f}s)")
81
+ return repo_path
82
+ except subprocess.CalledProcessError as e:
83
+ print(f"Failed to clone {repo_url}: {e.stderr}")
84
+ return None
85
+
86
+ def extract_enhanced_metadata(self, repo_path: Path) -> Dict:
87
+ """
88
+ Extract enhanced metadata including agentic framework detection
89
+ """
90
+ extractor = RepoMetadataExtractor(repo_path)
91
+ return extractor.extract_comprehensive_metadata()
92
+
93
+ # -------- OPTION 1: FAST listing (old style) --------
94
+ def list_files_fast(self, repo_path: Path,
95
+ extensions: Optional[Set[str]] = None,
96
+ exclude_dirs: Optional[Set[str]] = None) -> List[Path]:
97
+ """
98
+ FAST file listing - returns just Path objects
99
+
100
+ Use when you need speed and don't need metadata
101
+ """
102
+ if exclude_dirs is None:
103
+ exclude_dirs = {'.git', '__pycache__', 'node_modules',
104
+ 'build', 'dist', '.venv', 'venv'}
105
+
106
+ files = []
107
+
108
+ for root, dirs, filenames in os.walk(repo_path):
109
+ # Filter directories
110
+ dirs[:] = [d for d in dirs if d not in exclude_dirs and not d.startswith('.')]
111
+
112
+ for filename in filenames:
113
+ if filename.startswith('.'):
114
+ continue
115
+
116
+ file_path = Path(root) / filename
117
+
118
+ # Filter by extension if specified
119
+ if extensions:
120
+ if file_path.suffix.lower() in extensions:
121
+ files.append(file_path)
122
+ else:
123
+ files.append(file_path)
124
+
125
+ return sorted(files) # Sort for consistency
126
+
127
+ # -------- OPTION 2: RICH listing with metadata --------
128
+ def list_files_with_info(self, repo_path: Path,
129
+ extensions: Optional[Set[str]] = None,
130
+ exclude_dirs: Optional[Set[str]] = None,
131
+ skip_binary: bool = True) -> Tuple[List[RepoFileInfo], Dict]:
132
+ """
133
+ RICH file listing - returns file info + statistics
134
+
135
+ Use when you need metadata for better chunking
136
+ """
137
+ if exclude_dirs is None:
138
+ exclude_dirs = {'.git', '__pycache__', 'node_modules',
139
+ 'build', 'dist', '.venv', 'venv', '.env'}
140
+
141
+ file_infos = []
142
+ stats = {
143
+ "total_files": 0,
144
+ "total_size": 0,
145
+ "by_extension": {},
146
+ "binary_files": 0,
147
+ "text_files": 0
148
+ }
149
+
150
+ for root, dirs, filenames in os.walk(repo_path):
151
+ # Filter directories
152
+ dirs[:] = [d for d in dirs if d not in exclude_dirs and not d.startswith('.')]
153
+
154
+ for filename in filenames:
155
+ if filename.startswith('.'):
156
+ continue
157
+
158
+ file_path = Path(root) / filename
159
+ relative_path = file_path.relative_to(repo_path)
160
+ extension = file_path.suffix.lower()
161
+
162
+ # Filter by extension
163
+ if extensions and extension not in extensions:
164
+ continue
165
+
166
+ try:
167
+ size = file_path.stat().st_size
168
+ is_binary = None
169
+
170
+ # Check if binary (only when needed)
171
+ if skip_binary:
172
+ is_binary = self._is_binary_file(file_path)
173
+ if is_binary:
174
+ stats["binary_files"] += 1
175
+ continue # Skip binary files
176
+ else:
177
+ stats["text_files"] += 1
178
+
179
+ # Create file info
180
+ file_info = RepoFileInfo(
181
+ path=file_path,
182
+ relative_path=str(relative_path),
183
+ size=size,
184
+ extension=extension,
185
+ is_binary=is_binary
186
+ )
187
+
188
+ file_infos.append(file_info)
189
+
190
+ # Update stats
191
+ stats["total_files"] += 1
192
+ stats["total_size"] += size
193
+ stats["by_extension"][extension] = stats["by_extension"].get(extension, 0) + 1
194
+
195
+ except (OSError, PermissionError) as e:
196
+ print(f"[WARNING] Could not read {file_path}: {e}")
197
+ continue
198
+
199
+ # Sort by relative path
200
+ file_infos.sort(key=lambda x: x.relative_path)
201
+
202
+ return file_infos, stats
203
+
204
+ # -------- OPTION 3: SMART listing (auto-chooses) --------
205
+ def list_files(self, repo_path: Path,
206
+ extensions: Optional[Set[str]] = None,
207
+ exclude_dirs: Optional[Set[str]] = None,
208
+ rich_metadata: bool = False,
209
+ skip_binary: bool = True) -> Union[List[Path], Tuple[List[RepoFileInfo], Dict]]:
210
+ """
211
+ SMART file listing - chooses method based on needs
212
+
213
+ Args:
214
+ rich_metadata: True for RepoFileInfo + stats, False for just Paths
215
+ skip_binary: Skip binary files (only when rich_metadata=True)
216
+ """
217
+ if rich_metadata:
218
+ return self.list_files_with_info(repo_path, extensions, exclude_dirs, skip_binary)
219
+ else:
220
+ return self.list_files_fast(repo_path, extensions, exclude_dirs)
221
+
222
+ # -------- HELPER: Get README --------
223
+ def get_readme_content(self, repo_path: Path) -> Optional[str]:
224
+ """Quickly get README content if exists"""
225
+ for pattern in ['README.md', 'README.rst', 'README.txt', 'README', 'readme.md']:
226
+ readme_path = repo_path / pattern
227
+ if readme_path.exists():
228
+ try:
229
+ return readme_path.read_text(encoding='utf-8', errors='ignore')[:5000] # First 5k chars
230
+ except:
231
+ continue
232
+ return None
233
+
234
+ # -------- HELPER: Get repository stats --------
235
+
236
+ def get_repo_stats(self, repo_path: Path) -> Dict:
237
+ """ACCURATE repository statistics (excludes .git)"""
238
+ try:
239
+ total_files = 0
240
+ total_size = 0
241
+ extensions = set()
242
+
243
+ for root, dirs, files in os.walk(repo_path):
244
+ # ✅ PROPERLY skip .git directory
245
+ root_path = Path(root)
246
+ if '.git' in root_path.parts:
247
+ continue # Skip entire .git directory
248
+
249
+ total_files += len(files)
250
+ for file in files:
251
+ file_path = Path(root) / file
252
+ try:
253
+ size = file_path.stat().st_size
254
+ total_size += size
255
+ if file_path.suffix:
256
+ extensions.add(file_path.suffix.lower())
257
+ except Exception:
258
+ pass
259
+
260
+ return {
261
+ "total_files": total_files,
262
+ "total_size_mb": round(total_size / (1024 * 1024), 2),
263
+ "unique_extensions": sorted(list(extensions))[:20],
264
+ "path": str(repo_path),
265
+ "name": repo_path.name,
266
+ "note": "Size excludes .git directory" # ✅ Add note
267
+ }
268
+ except Exception as e:
269
+ return {"error": str(e)}
270
+
271
+
272
+ # -------- UTILITY METHODS --------
273
+ def _extract_repo_name(self, repo_url: str) -> str:
274
+ """Extract repository name from URL"""
275
+ name = repo_url.rstrip('/').split('/')[-1]
276
+ if name.endswith('.git'):
277
+ name = name[:-4]
278
+ return name
279
+
280
+ def _is_binary_file(self, file_path: Path, sample_size: int = 1024) -> bool:
281
+ """Quick binary detection by sampling"""
282
+ try:
283
+ with open(file_path, 'rb') as f:
284
+ sample = f.read(sample_size)
285
+
286
+ if not sample:
287
+ return False
288
+
289
+ # Check for null bytes (common in binaries)
290
+ if b'\x00' in sample:
291
+ return True
292
+
293
+ # Count printable ASCII
294
+ printable = sum(1 for byte in sample if 32 <= byte <= 126 or byte in (9, 10, 13))
295
+ return (printable / len(sample)) < 0.8 # Less than 80% printable
296
+ except Exception:
297
+ return True # If we can't read, assume binary
298
+
299
+ def cleanup_old_repos(self, max_age_days: int = 7):
300
+ """Cleanup old cached repositories (optional)"""
301
+ import shutil
302
+ from datetime import datetime, timedelta
303
+
304
+ cutoff = datetime.now() - timedelta(days=max_age_days)
305
+
306
+ for repo_dir in self.cache_dir.iterdir():
307
+ if repo_dir.is_dir():
308
+ try:
309
+ mtime = datetime.fromtimestamp(repo_dir.stat().st_mtime)
310
+ if mtime < cutoff:
311
+ print(f"🧹 Cleaning up old repo: {repo_dir.name}")
312
+ shutil.rmtree(repo_dir)
313
+ except Exception:
314
+ pass
315
+
316
+
317
+ # -------- SIMPLE USAGE EXAMPLES --------
318
+ def example_usage():
319
+ """Example of how to use the crawler - FIXED VERSION"""
320
+ crawler = GitCrawler()
321
+
322
+ # 1. Clone a repository
323
+ repo_path = crawler.clone_repository("https://github.com/microsoft/autogen.git")
324
+ if not repo_path:
325
+ print("❌ Failed to clone repository")
326
+ return
327
+
328
+ # 2. OPTION A: Fast listing (just paths)
329
+ print("\n=== FAST LISTING ===")
330
+ python_files = crawler.list_files_fast(repo_path, extensions={'.py'})
331
+ print(f"Found {len(python_files)} Python files")
332
+
333
+ # 3. OPTION B: Rich listing with metadata
334
+ print("\n=== RICH LISTING ===")
335
+ file_infos, stats = crawler.list_files_with_info(
336
+ repo_path,
337
+ extensions={'.py', '.md', '.json', '.yaml'},
338
+ skip_binary=True
339
+ )
340
+ print(f"Total files: {stats['total_files']}")
341
+ print(f"Total size: {stats['total_size'] / 1024 / 1024:.2f} MB")
342
+ print(f"Extensions: {stats['by_extension']}")
343
+
344
+ # 4. OPTION C: Smart listing (auto) - FIXED
345
+ print("\n=== SMART LISTING ===")
346
+ # Returns just paths (fast)
347
+ files_fast = crawler.list_files(repo_path, extensions={'.py'}, rich_metadata=False)
348
+ # Type check for PyLance
349
+ if isinstance(files_fast, list):
350
+ print(f"Fast count: {len(files_fast)}")
351
+ else:
352
+ # This shouldn't happen with rich_metadata=False
353
+ print("Unexpected return type from list_files()")
354
+
355
+ # Returns info + stats (rich) - FIXED
356
+ result = crawler.list_files(repo_path, extensions={'.py'}, rich_metadata=True)
357
+ if isinstance(result, tuple):
358
+ files_rich, stats = result
359
+ print(f"Rich count: {len(files_rich)}")
360
+ else:
361
+ # This shouldn't happen with rich_metadata=True
362
+ print("Unexpected return type from list_files()")
363
+
364
+ # 5. Get README
365
+ readme = crawler.get_readme_content(repo_path)
366
+ if readme:
367
+ print(f"\nREADME preview: {readme[:200]}...")
368
+
369
+ # 6. Get repo stats
370
+ repo_stats = crawler.get_repo_stats(repo_path)
371
+ print(f"\nRepository stats: {repo_stats}")
372
+
373
+
374
+ if __name__ == "__main__":
375
+ example_usage()
376
+
377
+
378
+
379
+
380
+
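For reference, the binary-detection heuristic used by `_is_binary_file` above (a null-byte check plus a printable-ASCII ratio over a 1 KB sample) can be exercised on its own. The following is a minimal standalone sketch that mirrors that logic rather than calling the class; the temporary files are illustrative.

```python
import tempfile
from pathlib import Path

def looks_binary(path: Path, sample_size: int = 1024, threshold: float = 0.8) -> bool:
    """Mirror of GitCrawler._is_binary_file: null bytes or a low printable-ASCII ratio."""
    sample = path.read_bytes()[:sample_size]
    if not sample:
        return False
    if b"\x00" in sample:
        return True
    printable = sum(1 for b in sample if 32 <= b <= 126 or b in (9, 10, 13))
    return printable / len(sample) < threshold

with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
    f.write(b"plain text content\n")
print(looks_binary(Path(f.name)))   # False: all bytes are printable

with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
    f.write(b"\x00\x01\x02 not text")
print(looks_binary(Path(f.name)))   # True: null bytes present
```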
scripts/core/ingestion/repo_metadata.py ADDED
@@ -0,0 +1,408 @@
1
+ """
2
+ Repository Metadata Extractor - Advanced metadata extraction for Git repositories.
3
+
4
+ This module extracts comprehensive metadata from Git repositories with a
5
+ special focus on agentic framework detection. It analyzes repository structure,
6
+ dependencies, git history, and patterns to identify agentic code patterns.
7
+
8
+ ARCHITECTURE POSITION:
9
+ - Repository Analyzer: Deep analysis of Git repositories
10
+ - Agentic Detector: Identifies agentic framework usage
11
+ - Dependency Mapper: Extracts dependency information
12
+
13
+ KEY FEATURES:
14
+ 1. Agentic framework detection across multiple frameworks
15
+ 2. Comprehensive dependency extraction (Python, Node.js, Docker)
16
+ 3. Git metadata extraction (commits, branches, tags)
17
+ 4. Repository structure analysis
18
+ 5. Entry point and configuration file discovery
19
+ """
20
+
21
+ import json
22
+ import re
23
+ import subprocess
24
+ from pathlib import Path
25
+ from typing import Dict, List, Optional
26
+ from datetime import datetime
27
+
28
+
29
+ class RepoMetadataExtractor:
30
+ """Enhanced metadata extractor for agentic codebases"""
31
+
32
+ AGENTIC_FRAMEWORKS = {
33
+ "langchain": ["langchain", "langsmith", "lc", "chain", "agent"],
34
+ "autogen": ["autogen", "agent", "groupchat"],
35
+ "crewai": ["crewai", "crew", "task", "agent"],
36
+ "haystack": ["haystack", "pipeline", "node"],
37
+ "llamaindex": ["llama_index", "query_engine", "index"],
38
+ "semantic_kernel": ["semantic_kernel", "sk"],
39
+ "transformers_agents": ["transformers_agents", "huggingface"],
40
+ "camel": ["camel", "role_playing"],
41
+ "agents": ["agent", "tool", "workflow", "orchestrator"],
42
+ }
43
+
44
+ def __init__(self, repo_path: Path):
45
+ self.repo_path = repo_path
46
+
47
+ # ---------------------------------------------------------------------
48
+ # Public API
49
+ # ---------------------------------------------------------------------
50
+
51
+ def extract_comprehensive_metadata(self) -> Dict:
52
+ return {
53
+ "basic": self.extract_basic_metadata(),
54
+ "git": self.extract_git_metadata(),
55
+ "dependencies": self.extract_dependency_info(),
56
+ "structure": self.extract_structure_info(),
57
+ "agentic_detection": self.detect_agentic_frameworks(),
58
+ "entry_points": self.find_entry_points(),
59
+ "config_files": self.find_config_files(),
60
+ }
61
+
62
+ # 🔧 FIXED: Now returns actual repo name, not folder name
63
+ def extract_basic_metadata(self) -> Dict:
64
+ """Extract basic repository metadata"""
65
+ return {
66
+ "repo_name": self._get_actual_repo_name(), # 🎯 FIXED LINE
67
+ "local_path": str(self.repo_path),
68
+ "size_mb": self._get_repo_size_mb(),
69
+ "file_count": self._count_files(),
70
+ "extracted_at": datetime.now().isoformat(),
71
+ }
72
+
73
+ # 🆕 NEW HELPER METHOD
74
+ def _get_actual_repo_name(self) -> str:
75
+ """
76
+ Get actual repository name from Git remote or folder structure.
77
+ Returns 'crewAI' not 'crewai_test'.
78
+ """
79
+ # 1. Try to get from git remote URL
80
+ try:
81
+ remote_url = self._run_git_command(["config", "--get", "remote.origin.url"])
82
+ if remote_url:
83
+ remote_url = remote_url.strip()
84
+ # Extract repo name from URL
85
+ # github.com/owner/repo.git -> repo
86
+ if '/' in remote_url:
87
+ repo_name = remote_url.split('/')[-1]
88
+ if repo_name.endswith('.git'):
89
+ repo_name = repo_name[:-4]
90
+ return repo_name
91
+ except Exception:
92
+ pass
93
+
94
+ # 2. Fallback: clean folder name
95
+ folder_name = self.repo_path.name
96
+
97
+ # Remove common suffixes
98
+ for suffix in ['_test', '_copy', '_backup', '_temp', '_local']:
99
+ if folder_name.lower().endswith(suffix.lower()):
100
+ return folder_name[:-len(suffix)]
101
+
102
+ return folder_name
103
+
104
+ def extract_git_metadata(self) -> Dict:
105
+ try:
106
+ remote_url = self._run_git_command(
107
+ ["config", "--get", "remote.origin.url"]
108
+ )
109
+
110
+ latest_commit = self._run_git_command(
111
+ ["log", "-1", "--pretty=format:%H|%an|%ae|%ad|%s"]
112
+ )
113
+ commit_parts = latest_commit.split("|") if latest_commit else []
114
+
115
+ branches_raw = self._run_git_command(["branch", "-a"])
116
+ branch_list = (
117
+ [
118
+ b.strip().replace("* ", "")
119
+ for b in branches_raw.split("\n")
120
+ if b.strip()
121
+ ]
122
+ if branches_raw
123
+ else []
124
+ )
125
+
126
+ tags_raw = self._run_git_command(["tag", "-l"])
127
+ tag_list = (
128
+ [t.strip() for t in tags_raw.split("\n") if t.strip()]
129
+ if tags_raw
130
+ else []
131
+ )
132
+
133
+ current_branch = self._run_git_command(["branch", "--show-current"])
134
+
135
+ return {
136
+ "remote_url": remote_url or "",
137
+ "branch": current_branch or "",
138
+ "latest_commit": {
139
+ "hash": commit_parts[0] if len(commit_parts) > 0 else "",
140
+ "author": commit_parts[1] if len(commit_parts) > 1 else "",
141
+ "email": commit_parts[2] if len(commit_parts) > 2 else "",
142
+ "date": commit_parts[3] if len(commit_parts) > 3 else "",
143
+ "message": commit_parts[4] if len(commit_parts) > 4 else "",
144
+ },
145
+ "branch_count": len(branch_list),
146
+ "branches": branch_list[:10],
147
+ "tag_count": len(tag_list),
148
+ "tags": tag_list[:10],
149
+ }
150
+
151
+ except Exception as e:
152
+ return {"error": str(e)}
153
+
154
+ # ---------------------------------------------------------------------
155
+ # Agentic detection
156
+ # ---------------------------------------------------------------------
157
+
158
+ def detect_agentic_frameworks(self) -> Dict:
159
+ detected: Dict[str, str] = {}
160
+
161
+ deps = self.extract_dependency_info()
162
+ python_packages = deps.get("python_packages", [])
163
+
164
+ for framework, keywords in self.AGENTIC_FRAMEWORKS.items():
165
+ for package in python_packages:
166
+ if any(k in package.lower() for k in keywords):
167
+ detected[framework] = "dependency"
168
+ break
169
+ else:
170
+ if self._scan_for_framework(keywords):
171
+ detected[framework] = "usage"
172
+
173
+ if self._has_agent_patterns():
174
+ detected["custom_agents"] = "implementation"
175
+
176
+ return detected
177
+
178
+ def _scan_for_framework(self, keywords: List[str]) -> bool:
179
+ python_files = list(self.repo_path.rglob("*.py"))[:50]
180
+
181
+ for py_file in python_files:
182
+ try:
183
+ content = py_file.read_text(encoding="utf-8", errors="ignore").lower()
184
+
185
+ if any(f"import {k}" in content or f"from {k}" in content for k in keywords):
186
+ return True
187
+
188
+ if any(re.search(rf"class.*{k}", content) for k in keywords):
189
+ return True
190
+
191
+ except Exception:
192
+ continue
193
+
194
+ return False
195
+
196
+ def _has_agent_patterns(self) -> bool:
197
+ patterns = [
198
+ r"class.*Agent",
199
+ r"def.*agent",
200
+ r"class.*Tool",
201
+ r"def.*tool",
202
+ r"class.*Workflow",
203
+ r"def.*workflow",
204
+ r"class.*Orchestrator",
205
+ r"def.*orchestrator",
206
+ r"@tool",
207
+ r"@agent",
208
+ r"@workflow",
209
+ ]
210
+
211
+ python_files = list(self.repo_path.rglob("*.py"))[:20]
212
+
213
+ for py_file in python_files:
214
+ try:
215
+ content = py_file.read_text(encoding="utf-8", errors="ignore")
216
+ if any(re.search(p, content, re.IGNORECASE) for p in patterns):
217
+ return True
218
+ except Exception:
219
+ continue
220
+
221
+ return False
222
+
223
+ # ---------------------------------------------------------------------
224
+ # Dependencies
225
+ # ---------------------------------------------------------------------
226
+
227
+ def extract_dependency_info(self) -> Dict:
228
+ deps = {
229
+ "python_packages": [],
230
+ "nodejs_packages": [],
231
+ "docker": False,
232
+ "other_dependencies": [],
233
+ }
234
+
235
+ req_files = [
236
+ "requirements.txt",
237
+ "pyproject.toml",
238
+ "setup.py",
239
+ "setup.cfg",
240
+ "Pipfile",
241
+ "environment.yml",
242
+ ]
243
+
244
+ for req_file in req_files:
245
+ path = self.repo_path / req_file
246
+ if path.exists():
247
+ try:
248
+ deps["python_packages"].extend(
249
+ self._parse_python_dependencies(path, req_file)
250
+ )
251
+ except Exception as e:
252
+ print(f"⚠️ Error parsing {req_file}: {e}")
253
+
254
+ package_json = self.repo_path / "package.json"
255
+ if package_json.exists():
256
+ try:
257
+ data = json.loads(package_json.read_text())
258
+ deps["nodejs_packages"].extend(data.get("dependencies", {}).keys())
259
+ deps["nodejs_packages"].extend(data.get("devDependencies", {}).keys())
260
+ except Exception:
261
+ pass
262
+
263
+ deps["docker"] = any(
264
+ (self.repo_path / f).exists()
265
+ for f in ["Dockerfile", "docker-compose.yml", "docker-compose.yaml"]
266
+ )
267
+
268
+ return deps
269
+
270
+ def _parse_python_dependencies(self, path: Path, file_name: str) -> List[str]:
271
+ packages: List[str] = []
272
+
273
+ if file_name == "requirements.txt":
274
+ for line in path.read_text().splitlines():
275
+ line = line.strip()
276
+ if line and not line.startswith("#"):
277
+ pkg = (
278
+ line.split("==")[0]
279
+ .split(">=")[0]
280
+ .split("<=")[0]
281
+ .split("~=")[0]
282
+ .strip()
283
+ )
284
+ if pkg and not pkg.startswith("-"):
285
+ packages.append(pkg)
286
+
287
+ elif file_name == "pyproject.toml":
288
+ import toml
289
+
290
+ data = toml.load(path)
291
+ deps = data.get("project", {}).get("dependencies", [])
292
+ for d in deps:
293
+ packages.append(d.split("==")[0].split(">=")[0].strip())
294
+
295
+ return packages
296
+
297
+ # ---------------------------------------------------------------------
298
+ # Structure & utilities
299
+ # ---------------------------------------------------------------------
300
+
301
+ def extract_structure_info(self) -> Dict:
302
+ structure = {
303
+ "directories": [],
304
+ "file_types": {},
305
+ "has_agentic_structure": False,
306
+ }
307
+
308
+ for item in self.repo_path.iterdir():
309
+ if item.is_dir() and item.name != ".git":
310
+ structure["directories"].append(item.name)
311
+
312
+ ext_count: Dict[str, int] = {}
313
+ for f in self.repo_path.rglob("*"):
314
+ if f.is_file():
315
+ ext_count[f.suffix.lower()] = ext_count.get(f.suffix.lower(), 0) + 1
316
+
317
+ structure["file_types"] = dict(
318
+ sorted(ext_count.items(), key=lambda x: x[1], reverse=True)[:10]
319
+ )
320
+
321
+ agentic_dirs = {
322
+ "agent",
323
+ "agents",
324
+ "workflow",
325
+ "workflows",
326
+ "tool",
327
+ "tools",
328
+ "pipeline",
329
+ "pipelines",
330
+ "orchestrator",
331
+ }
332
+
333
+ structure["has_agentic_structure"] = any(
334
+ any(k in d.lower() for k in agentic_dirs)
335
+ for d in structure["directories"]
336
+ )
337
+
338
+ return structure
339
+
340
+ def find_entry_points(self) -> List[str]:
341
+ patterns = [
342
+ "main.py",
343
+ "app.py",
344
+ "run.py",
345
+ "cli.py",
346
+ "server.py",
347
+ "agent.py",
348
+ "pipeline.py",
349
+ "__main__.py",
350
+ ]
351
+
352
+ return [
353
+ str(p.relative_to(self.repo_path))
354
+ for pat in patterns
355
+ for p in self.repo_path.rglob(pat)
356
+ ][:5]
357
+
358
+ def find_config_files(self) -> List[str]:
359
+ patterns = [
360
+ "config*.py",
361
+ "settings*.py",
362
+ ".env*",
363
+ "*.toml",
364
+ "*.yaml",
365
+ "*.yml",
366
+ "*.json",
367
+ "*.cfg",
368
+ "*.ini",
369
+ ]
370
+
371
+ files: List[str] = []
372
+ for pat in patterns:
373
+ for p in self.repo_path.rglob(pat):
374
+ rel = str(p.relative_to(self.repo_path))
375
+ if not any(x in rel for x in [".git", "__pycache__", "node_modules"]):
376
+ files.append(rel)
377
+
378
+ return sorted(files)[:10]
379
+
380
+ # ---------------------------------------------------------------------
381
+ # Internals
382
+ # ---------------------------------------------------------------------
383
+
384
+ def _get_repo_size_mb(self) -> float:
385
+ total = sum(
386
+ f.stat().st_size for f in self.repo_path.rglob("*") if f.is_file()
387
+ )
388
+ return round(total / (1024 * 1024), 2)
389
+
390
+ def _count_files(self) -> int:
391
+ return sum(
392
+ 1
393
+ for f in self.repo_path.rglob("*")
394
+ if f.is_file() and ".git" not in str(f)
395
+ )
396
+
397
+ def _run_git_command(self, args: List[str]) -> Optional[str]:
398
+ try:
399
+ result = subprocess.run(
400
+ ["git", "-C", str(self.repo_path)] + args,
401
+ capture_output=True,
402
+ text=True,
403
+ check=True,
404
+ )
405
+ return result.stdout.strip() or None
406
+ except Exception:
407
+ return None
408
+
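A minimal usage sketch for RepoMetadataExtractor, assuming the `scripts` package is importable from the project root and a repository has already been cloned; the path and printed values are illustrative placeholders.

```python
from pathlib import Path

from scripts.core.ingestion.repo_metadata import RepoMetadataExtractor

# Hypothetical path to a repository cloned earlier by GitCrawler
repo_path = Path("data/raw/repos/crewAI")

extractor = RepoMetadataExtractor(repo_path)
meta = extractor.extract_comprehensive_metadata()

print(meta["basic"]["repo_name"])      # repo name taken from the git remote, not the folder
print(meta["agentic_detection"])       # e.g. {"crewai": "dependency", "custom_agents": "implementation"}
print(meta["dependencies"]["docker"])  # True if a Dockerfile or docker-compose file exists
```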
scripts/core/ingestion/ts_chunker.py ADDED
@@ -0,0 +1,155 @@
1
+ """
2
+ Tree-sitter based syntactic chunker - Span enrichment and fallback parser.
3
+
4
+ This module provides byte-level precise chunking using Tree-sitter, which
5
+ serves as a structural fallback and span enrichment layer. Tree-sitter is
6
+ language-aware and robust against malformed code, making it ideal for
7
+ extracting exact byte spans and as a backup parser.
8
+
9
+ ARCHITECTURE POSITION:
10
+ - Enrichment Layer: Provides byte-level precision
11
+ - Fallback Parser: Robust parsing for malformed code
12
+ - Span Authority: Source of truth for byte positions
13
+
14
+ KEY FEATURES:
15
+ 1. Byte-level accurate spans (exact source positions)
16
+ 2. Language-aware parsing (supports multiple languages)
17
+ 3. Robust against syntax errors
18
+ 4. Extracts structural nodes even from partial code
19
+
20
+ FLOW:
21
+ File → Tree-sitter parser → Structural nodes → Spans for enrichment
22
+
23
+ USAGE:
24
+ from ts_chunker import extract_ts_chunks
25
+ chunks = extract_ts_chunks(Path("file.py"))
26
+
27
+ NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with
28
+ precise byte spans and serve as fallback for syntax errors.
29
+ """
30
+
31
+ from pathlib import Path
32
+ from typing import List, Optional, Literal, Dict, Tuple
33
+
34
+ from tree_sitter import Parser, Language, Node
35
+ import tree_sitter_python as tspython
36
+
37
+ from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType
38
+
39
+ # ----------------------------
40
+ # Types
41
+ # ----------------------------
42
+
43
+ TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = {
44
+ "module": "module",
45
+ "class_definition": "class",
46
+ "function_definition": "function",
47
+ "async_function_definition": "function",
48
+ "import_statement": "imports",
49
+ "import_from_statement": "imports",
50
+ }
51
+
52
+ MAX_TS_DEPTH = 3 # module → imports → class/function → method
53
+
54
+
55
+ # ----------------------------
56
+ # Helpers
57
+ # ----------------------------
58
+
59
+ def _safe_decode(data: bytes) -> str:
60
+ try:
61
+ return data.decode("utf-8")
62
+ except UnicodeDecodeError:
63
+ return data.decode("utf-8", errors="ignore")
64
+
65
+
66
+ def _get_node_name(node: Node) -> Optional[str]:
67
+ """
68
+ Extract identifier name for class / function nodes.
69
+ """
70
+ for child in node.children:
71
+ if child.type == "identifier":
72
+ text = child.text
73
+ if isinstance(text, (bytes, bytearray)):
74
+ return _safe_decode(text)
75
+ return None
76
+
77
+
78
+ # ----------------------------
79
+ # Public API
80
+ # ----------------------------
81
+
82
+ def extract_ts_chunks(file_path: Path) -> List[CodeChunk]:
83
+ source_bytes = file_path.read_bytes()
84
+
85
+ language = Language(tspython.language())
86
+ parser = Parser(language=language)
87
+
88
+ tree = parser.parse(source_bytes)
89
+ root = tree.root_node
90
+
91
+ chunks: List[CodeChunk] = []
92
+
93
+ def walk(node: Node, depth: int = 0, parent_node: Optional[Node] = None) -> None:
94
+ if depth > MAX_TS_DEPTH:
95
+ return
96
+
97
+ node_type = node.type
98
+
99
+ if node_type in TS_TO_CHUNK_TYPE:
100
+ code_bytes = source_bytes[node.start_byte : node.end_byte]
101
+ code = _safe_decode(code_bytes)
102
+
103
+ chunk_type = TS_TO_CHUNK_TYPE[node_type]
104
+ name = _get_node_name(node)
105
+
106
+ # For imports, use the full import as name
107
+ if chunk_type == "imports":
108
+ name = code.strip()
109
+
110
+ # Create chunk with byte-level precision
111
+ chunks.append(
112
+ CodeChunk(
113
+ chunk_id=f"ts_{node.start_byte}_{node.end_byte}",
114
+ file_path=str(file_path),
115
+ language="python",
116
+ chunk_type=chunk_type,
117
+ code=code,
118
+ ast=ChunkAST(
119
+ symbol_type=None, # TS doesn't provide semantic types
120
+ name=name,
121
+ parent=None, # Parent relationships from AST
122
+ docstring=None,
123
+ decorators=[],
124
+ imports=[],
125
+ node_type=node_type,
126
+ ),
127
+ span=ChunkSpan(
128
+ start_byte=node.start_byte,
129
+ end_byte=node.end_byte,
130
+ start_line=node.start_point[0] + 1,
131
+ end_line=node.end_point[0] + 1,
132
+ char_count=len(code),
133
+ ),
134
+ hierarchy=ChunkHierarchy(
135
+ is_primary=False, # Tree-sitter chunks are for span enrichment only
136
+ is_extracted=True,
137
+ depth=depth,
138
+ parent_id=None, # Parent relationships from AST
139
+ ),
140
+ metadata={
141
+ "byte_span": {
142
+ "start": node.start_byte,
143
+ "end": node.end_byte,
144
+ },
145
+ "tree_sitter_node_type": node_type,
146
+ "is_exact_span": True,
147
+ },
148
+ )
149
+ )
150
+
151
+ for child in node.children:
152
+ walk(child, depth + 1, node)
153
+
154
+ walk(root)
155
+ return chunks
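A minimal sketch of the Tree-sitter chunker on a throwaway file, assuming `tree-sitter` and `tree-sitter-python` are installed and the `scripts` package is importable; the attribute access mirrors the `CodeChunk` fields populated above.

```python
import tempfile
from pathlib import Path

from scripts.core.ingestion.ts_chunker import extract_ts_chunks

source = (
    "import os\n"
    "\n"
    "class Greeter:\n"
    "    def hello(self, name):\n"
    "        return 'hi ' + name\n"
)

with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
    f.write(source)
    tmp_path = Path(f.name)

for chunk in extract_ts_chunks(tmp_path):
    # Byte spans come straight from Tree-sitter, so they are exact source offsets.
    print(chunk.chunk_type, chunk.span.start_line, chunk.span.end_line, chunk.metadata["byte_span"])
```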
scripts/core/training/__init__.py ADDED
File without changes
scripts/core/training/model.py ADDED
@@ -0,0 +1,47 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import AutoModel, AutoConfig
4
+
5
+ class CodeEmbedder(nn.Module):
6
+ """
7
+ A wrapper around a Transformer model (default: CodeBERT) to produce
8
+ dense vector embeddings for code snippets using Mean Pooling.
9
+ """
10
+ def __init__(self, model_name_or_path="microsoft/codebert-base", trust_remote_code=False):
11
+ super(CodeEmbedder, self).__init__()
12
+ self.config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
13
+ self.encoder = AutoModel.from_pretrained(model_name_or_path, config=self.config, trust_remote_code=trust_remote_code)
14
+
15
+ def mean_pooling(self, token_embeddings, attention_mask):
16
+ """
17
+ Average the token embeddings, ignoring padding tokens.
18
+ """
19
+ # attention_mask: (batch_size, seq_len)
20
+ # token_embeddings: (batch_size, seq_len, hidden_dim)
21
+
22
+ # Expand mask to match embedding dimensions
23
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
24
+
25
+ # Sum embeddings (ignoring padding)
26
+ sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
27
+
28
+ # Count non-padding tokens (prevent division by zero with clamp)
29
+ sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
30
+
31
+ return sum_embeddings / sum_mask
32
+
33
+ def forward(self, input_ids, attention_mask):
34
+ # Pass through the transformer
35
+ outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
36
+
37
+ # Extract last hidden state
38
+ # Shape: (batch_size, seq_len, hidden_dim)
39
+ last_hidden_state = outputs.last_hidden_state
40
+
41
+ # Perform Mean Pooling (Better than CLS token for sentence similarity)
42
+ embeddings = self.mean_pooling(last_hidden_state, attention_mask)
43
+
44
+ # Normalize embeddings (Optional but recommended for cosine similarity)
45
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
46
+
47
+ return embeddings
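A minimal inference sketch for `CodeEmbedder`, assuming network access to download `microsoft/codebert-base` and that the `scripts` package is importable; because the forward pass L2-normalises its output, the dot product of two embeddings equals their cosine similarity.

```python
import torch
from transformers import AutoTokenizer

from scripts.core.training.model import CodeEmbedder

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = CodeEmbedder("microsoft/codebert-base")
model.eval()

snippets = ["def add(a, b): return a + b", "class Agent:\n    pass"]
batch = tokenizer(snippets, return_tensors="pt", padding=True, truncation=True, max_length=128)

with torch.no_grad():
    embeddings = model(batch["input_ids"], batch["attention_mask"])

print(embeddings.shape)                        # (2, 768) for codebert-base
print((embeddings[0] @ embeddings[1]).item())  # cosine similarity of the two snippets
```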
scripts/core/training/test_model.py ADDED
@@ -0,0 +1,64 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from transformers import AutoTokenizer, AutoModel
4
+
5
+ # 1. Load Model from Hugging Face (Your Team's Checkpoint)
6
+ MODEL_NAME = "shubharuidas/codebert-base-code-embed-mrl-langchain-langgraph"
7
+
8
+ import time
9
+
10
+ print(f"Downloading model: {MODEL_NAME}...")
11
+ MAX_RETRIES = 3
12
+ for attempt in range(MAX_RETRIES):
13
+ try:
14
+ print(f"Attempt {attempt+1}/{MAX_RETRIES}...")
15
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
16
+ model = AutoModel.from_pretrained(MODEL_NAME)
17
+ print("Model loaded successfully!")
18
+ break
19
+ except Exception as e:
20
+ print(f"Attempt {attempt+1} failed: {e}")
21
+ if attempt == MAX_RETRIES - 1:
22
+ print("Failed to load model after multiple attempts.")
23
+ print("Tip: Check internet connection or repo visibility.")
24
+ exit(1)
25
+ time.sleep(5) # Wait before retry
26
+
27
+ # 2. Define Inputs (Query vs Code)
28
+ query = "How to create a state graph in langgraph?"
29
+ code = """
30
+ from langgraph.graph import StateGraph
31
+
32
+ def create_workflow():
33
+ workflow = StateGraph(AgentState)
34
+ workflow.add_node("agent", agent_node)
35
+ return workflow.compile()
36
+ """
37
+ irrelevant_code = "def fast_inverse_sqrt(number): return number ** -0.5"
38
+
39
+ # 3. Embed & Compare
40
+ def embed(text):
41
+ inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
42
+ with torch.no_grad():
43
+ outputs = model(**inputs)
44
+ # Mean pooling for sentence representation
45
+ embeddings = outputs.last_hidden_state.mean(dim=1)
46
+ return F.normalize(embeddings, p=2, dim=1)
47
+
48
+ print("\nRunning Inference Test...")
49
+ query_emb = embed(query)
50
+ code_emb = embed(code)
51
+ irrelevant_emb = embed(irrelevant_code)
52
+
53
+ # 4. Calculate Similarity
54
+ sim_positive = F.cosine_similarity(query_emb, code_emb).item()
55
+ sim_negative = F.cosine_similarity(query_emb, irrelevant_emb).item()
56
+
57
+ print(f"Query: '{query}'")
58
+ print(f"Similarity to Relevant Code: {sim_positive:.4f} (Should be high)")
59
+ print(f"Similarity to Irrelevant Code: {sim_negative:.4f} (Should be low)")
60
+
61
+ if sim_positive > sim_negative:
62
+ print("\nSUCCESS: Model correctly ranks relevant code higher.")
63
+ else:
64
+ print("\n⚠️ WARNING: Model performance might be poor.")
scripts/core/training/train.py ADDED
@@ -0,0 +1,145 @@
1
+ import argparse
2
+ import os
3
+ import torch
4
+ from torch.utils.data import DataLoader, Dataset
5
+ from transformers import AutoTokenizer
6
+
7
+ from scripts.core.training.model import CodeEmbedder
8
+ from scripts.core.training.trainer import CodeTrainer
9
+
10
+ import json
11
+
12
+ # Real Dataset class for Triplet Training
13
+ class RealCodeDataset(Dataset):
14
+ def __init__(self, jsonl_path, tokenizer, max_length=512):
15
+ self.tokenizer = tokenizer
16
+ self.max_length = max_length
17
+ self.data = []
18
+
19
+ print(f"Loading data from {jsonl_path}...")
20
+ with open(jsonl_path, 'r', encoding='utf-8') as f:
21
+ for line in f:
22
+ if line.strip():
23
+ self.data.append(json.loads(line))
24
+ print(f"Loaded {len(self.data)} triplets.")
25
+
26
+ def __len__(self):
27
+ return len(self.data)
28
+
29
+ def __getitem__(self, idx):
30
+ item = self.data[idx]
31
+
32
+ # Helper to tokenize
33
+ def tokenize_text(text):
34
+ return self.tokenizer(
35
+ text,
36
+ return_tensors='pt',
37
+ padding='max_length',
38
+ truncation=True,
39
+ max_length=self.max_length
40
+ )
41
+
42
+ # Tokenize all three parts
43
+ anchor = tokenize_text(item['anchor'])
44
+ positive = tokenize_text(item['positive'])
45
+ negative = tokenize_text(item['negative'])
46
+
47
+ # Return a flat dict with prefixed keys
48
+ return {
49
+ 'anchor_input_ids': anchor['input_ids'].squeeze(0),
50
+ 'anchor_attention_mask': anchor['attention_mask'].squeeze(0),
51
+ 'positive_input_ids': positive['input_ids'].squeeze(0),
52
+ 'positive_attention_mask': positive['attention_mask'].squeeze(0),
53
+ 'negative_input_ids': negative['input_ids'].squeeze(0),
54
+ 'negative_attention_mask': negative['attention_mask'].squeeze(0)
55
+ }
56
+
57
+ # Dummy Dataset class for MVP testing without the robust data pipeline availability
58
+ class DummyCodeDataset(Dataset):
59
+ def __init__(self, tokenizer, size=100):
60
+ self.tokenizer = tokenizer
61
+ self.size = size
62
+ # Generate dummy triplet structure
63
+ self.data = [{"anchor": "def hello(): return 'world'", "positive": "def hi(): return 'earth'", "negative": "class Foo: pass"}] * size
64
+
65
+ def __len__(self):
66
+ return self.size
67
+
68
+ def __getitem__(self, idx):
69
+ item = self.data[idx]
70
+
71
+ # Helper to tokenize
72
+ def tokenize_text(text):
73
+ return self.tokenizer(
74
+ text,
75
+ return_tensors='pt',
76
+ padding='max_length',
77
+ truncation=True,
78
+ max_length=128
79
+ )
80
+
81
+ anchor = tokenize_text(item['anchor'])
82
+ positive = tokenize_text(item['positive'])
83
+ negative = tokenize_text(item['negative'])
84
+
85
+ return {
86
+ 'anchor_input_ids': anchor['input_ids'].squeeze(0),
87
+ 'anchor_attention_mask': anchor['attention_mask'].squeeze(0),
88
+ 'positive_input_ids': positive['input_ids'].squeeze(0),
89
+ 'positive_attention_mask': positive['attention_mask'].squeeze(0),
90
+ 'negative_input_ids': negative['input_ids'].squeeze(0),
91
+ 'negative_attention_mask': negative['attention_mask'].squeeze(0)
92
+ }
93
+
94
+ def main():
95
+ parser = argparse.ArgumentParser(description="Train CodeMode Embeddings")
96
+
97
+ parser.add_argument("--model_name", type=str, default="microsoft/codebert-base", help="Hub model name")
98
+ parser.add_argument("--data_path", type=str, required=False, help="Path to parsed chunks.jsonl")
99
+ parser.add_argument("--output_dir", type=str, default="./output", help="Where to save checkpoints")
100
+ parser.add_argument("--epochs", type=int, default=3)
101
+ parser.add_argument("--batch_size", type=int, default=8)
102
+ parser.add_argument("--accumulation_steps", type=int, default=4, help="Gradient Accumulation Steps")
103
+ parser.add_argument("--lr", type=float, default=2e-5)
104
+ parser.add_argument("--dry_run", action="store_true", help="Run with dummy data for 1 epoch")
105
+
106
+ args = parser.parse_args()
107
+
108
+ print(f"Initializing Training Pipeline...")
109
+ print(f" Model: {args.model_name}")
110
+ print(f" Output: {args.output_dir}")
111
+ print(f" Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
112
+
113
+ # 1. Initialize Tokenizer
114
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name)
115
+
116
+ # 2. Load Dataset (Real or Dummy)
117
+ if args.data_path and os.path.exists(args.data_path):
118
+ train_dataset = RealCodeDataset(args.data_path, tokenizer)
119
+ else:
120
+ print("No data path provided or file missing. Using DUMMY data for verification.")
121
+ train_dataset = DummyCodeDataset(tokenizer, size=100)
122
+
123
+ train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
124
+
125
+ # 3. Initialize Model
126
+ model = CodeEmbedder(model_name_or_path=args.model_name)
127
+
128
+ # 4. Initialize Trainer
129
+ trainer = CodeTrainer(
130
+ model=model,
131
+ train_loader=train_loader,
132
+ epochs=args.epochs,
133
+ learning_rate=args.lr,
134
+ accumulation_steps=args.accumulation_steps,
135
+ mixed_precision=True, # Hardcoded True for the "Zero-Cost" philosophy
136
+ output_dir=args.output_dir
137
+ )
138
+
139
+ # 5. Connect and Train
140
+ trainer.train()
141
+
142
+ print("Training Complete.")
143
+
144
+ if __name__ == "__main__":
145
+ main()
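For reference, `RealCodeDataset` expects `--data_path` to point at a JSONL file where each line is an object with `anchor`, `positive`, and `negative` string fields. A tiny synthetic example (texts and paths are illustrative only):

```python
import json

triplets = [
    {
        "anchor": "How do I add a node to a LangGraph workflow?",
        "positive": "workflow = StateGraph(AgentState)\nworkflow.add_node('agent', agent_node)",
        "negative": "def bubble_sort(xs):\n    return sorted(xs)",
    }
]

with open("triplets.jsonl", "w", encoding="utf-8") as f:
    for t in triplets:
        f.write(json.dumps(t, ensure_ascii=False) + "\n")

# One plausible invocation from the project root (module path assumed):
#   python -m scripts.core.training.train --data_path triplets.jsonl --epochs 1 --batch_size 2
```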
scripts/core/training/trainer.py ADDED
@@ -0,0 +1,118 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.optim import AdamW
4
+ from torch.utils.data import DataLoader
5
+ from tqdm import tqdm
6
+ import os
7
+ import logging
8
+ from .model import CodeEmbedder
9
+
10
+ # Setup Logger
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+ class CodeTrainer:
15
+ def __init__(
16
+ self,
17
+ model: CodeEmbedder,
18
+ train_loader: DataLoader,
19
+ val_loader: DataLoader = None,
20
+ epochs: int = 3,
21
+ learning_rate: float = 2e-5,
22
+ accumulation_steps: int = 1,
23
+ mixed_precision: bool = True,
24
+ output_dir: str = "./output",
25
+ device: str = "cuda" if torch.cuda.is_available() else "cpu"
26
+ ):
27
+ self.model = model.to(device)
28
+ self.train_loader = train_loader
29
+ self.val_loader = val_loader
30
+ self.epochs = epochs
31
+ self.lr = learning_rate
32
+ self.accumulation_steps = accumulation_steps
33
+ self.mixed_precision = mixed_precision
34
+ self.output_dir = output_dir
35
+ self.device = device
36
+
37
+ # Optimizer
38
+ self.optimizer = AdamW(self.model.parameters(), lr=self.lr)
39
+
40
+ # Scheduler (Optional: constant for now, can transform to Linear later)
41
+ # self.scheduler = ...
42
+
43
+ # Mixed Precision Scaler
44
+ self.scaler = torch.cuda.amp.GradScaler(enabled=self.mixed_precision)
45
+
46
+ # Loss Function: Triplet Margin Loss (Standard for Sentence Embeddings)
47
+ # Tries to maximize distance between Anchor-Negative and minimize Anchor-Positive
48
+ self.criterion = nn.TripletMarginLoss(margin=1.0, p=2)
49
+
50
+ def train_step(self, batch):
51
+ """
52
+ Runs one training step. Returns loss.
53
+ """
54
+ # Unpack the Triplet Batch
55
+ # We assume the Dataset returns keys: 'anchor_input_ids', 'anchor_attention_mask', etc.
56
+
57
+ # Helper to move dict to device
58
+ to_device = lambda x: x.to(self.device)
59
+
60
+ # Autocast for Mixed Precision
61
+ with torch.cuda.amp.autocast(enabled=self.mixed_precision):
62
+ # 1. Forward Pass for all 3 components
63
+ anchor_emb = self.model(to_device(batch['anchor_input_ids']), to_device(batch['anchor_attention_mask']))
64
+ positive_emb = self.model(to_device(batch['positive_input_ids']), to_device(batch['positive_attention_mask']))
65
+ negative_emb = self.model(to_device(batch['negative_input_ids']), to_device(batch['negative_attention_mask']))
66
+
67
+ # 2. Compute Triplet Loss
68
+ loss = self.criterion(anchor_emb, positive_emb, negative_emb)
69
+
70
+ return loss
71
+
72
+ def train(self):
73
+ logger.info(f"Starting training on {self.device}...")
74
+ logger.info(f"Batch Size: {self.train_loader.batch_size}, Accumulation Steps: {self.accumulation_steps}")
75
+ logger.info(f"Effective Batch Size: {self.train_loader.batch_size * self.accumulation_steps}")
76
+
77
+ self.model.train()
78
+
79
+ for epoch in range(self.epochs):
80
+ total_loss = 0
81
+ self.optimizer.zero_grad()
82
+
83
+ progress_bar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{self.epochs}")
84
+
85
+ for step, batch in enumerate(progress_bar):
86
+
87
+ # Forward + Loss Calculation
88
+ loss = self.train_step(batch)
89
+
90
+ # Gradient Accumulation: Normalize loss
91
+ loss = loss / self.accumulation_steps
92
+
93
+ # Backward Pass (Scaled)
94
+ self.scaler.scale(loss).backward()
95
+
96
+ if (step + 1) % self.accumulation_steps == 0:
97
+ # Update Weights
98
+ self.scaler.step(self.optimizer)
99
+ self.scaler.update()
100
+ self.optimizer.zero_grad()
101
+
102
+ total_loss += loss.item() * self.accumulation_steps
103
+ progress_bar.set_postfix({'loss': total_loss / (step + 1)})
104
+
105
+ # Save Checkpoint
106
+ self.save_model(epoch+1)
107
+
108
+ def save_model(self, epoch):
109
+ save_path = os.path.join(self.output_dir, f"checkpoint-{epoch}")
110
+ os.makedirs(save_path, exist_ok=True)
111
+
112
+ logger.info(f"Saving model to {save_path}...")
113
+
114
+ # Save explicitly as safetensors via transformers API
115
+ self.model.encoder.save_pretrained(save_path, safe_serialization=True)
116
+ self.model.config.save_pretrained(save_path)
117
+ # Note: We save the 'encoder' which is the AutoModel,
118
+ # so it can be loaded easily by others.
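To make the loss and accumulation arithmetic concrete: `TripletMarginLoss` only penalises a triplet when the anchor-positive distance plus the margin exceeds the anchor-negative distance, and with `batch_size=8` and `accumulation_steps=4` the optimizer steps on an effective batch of 32. A small self-contained sketch with random embeddings:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

criterion = nn.TripletMarginLoss(margin=1.0, p=2)

anchor = F.normalize(torch.randn(8, 768), dim=1)
positive = F.normalize(anchor + 0.005 * torch.randn(8, 768), dim=1)  # tiny perturbation of the anchor
negative = F.normalize(torch.randn(8, 768), dim=1)                   # unrelated vectors

print("d(anchor, positive):", torch.norm(anchor - positive, dim=1).mean().item())
print("d(anchor, negative):", torch.norm(anchor - negative, dim=1).mean().item())
print("triplet loss       :", criterion(anchor, positive, negative).item())  # ~0 when d(a,p) + margin <= d(a,n)

batch_size, accumulation_steps = 8, 4
print("effective batch size:", batch_size * accumulation_steps)  # 32, as logged by CodeTrainer
```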
scripts/core/utils/__init__.py ADDED
File without changes
scripts/core/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (183 Bytes). View file
 
scripts/core/utils/__pycache__/id_utils.cpython-311.pyc ADDED
Binary file (3.18 kB). View file
 
scripts/core/utils/id_utils.py ADDED
@@ -0,0 +1,91 @@
1
+ """
2
+ Deterministic ID generation for code chunks.
3
+
4
+ This module provides deterministic hashing for chunk IDs, ensuring that
5
+ identical code chunks receive the same ID across runs. This is crucial for:
6
+ 1. Version tracking and change detection
7
+ 2. Cache consistency
8
+ 3. Reproducible datasets
9
+ 4. Efficient deduplication
10
+
11
+ ID GENERATION STRATEGY:
12
+ Hash = SHA256(file_path + chunk_type + name + parent +
13
+ start_line + end_line + code + byte_spans)
14
+
15
+ Result: prefix_hash (e.g., "primary_5c442008")
16
+
17
+ KEY PROPERTIES:
18
+ 1. Deterministic: Same input → same ID
19
+ 2. Content-aware: Code changes → ID changes
20
+ 3. Position-aware: Line/byte changes → ID changes
21
+ 4. Hierarchical: Parent relationships affect ID
22
+
23
+ USE CASE:
24
+ Ensures that during RAG operations, identical code chunks are
25
+ recognized as the same entity, improving retrieval accuracy.
26
+
27
+ EXAMPLE:
28
+ deterministic_chunk_id(
29
+ file_path="src/module.py",
30
+ chunk_type="class",
31
+ name="MyClass",
32
+ parent="module",
33
+ start_line=10,
34
+ end_line=50,
35
+ code="class MyClass: ...",
36
+ start_byte=100,
37
+ end_byte=500
38
+ )
39
+ → "primary_a1b2c3d4"
40
+ """
41
+
42
+ import hashlib
43
+ from typing import Optional
44
+
45
+ def deterministic_chunk_id(
46
+ *,
47
+ file_path: str,
48
+ chunk_type: str,
49
+ name: Optional[str],
50
+ parent: Optional[str],
51
+ start_line: Optional[int],
52
+ end_line: Optional[int],
53
+ code: str,
54
+ prefix: str = "primary",
55
+ start_byte: Optional[int] = None,
56
+ end_byte: Optional[int] = None,
57
+ ) -> str:
58
+ """
59
+ Generate deterministic chunk ID that includes code content.
60
+
61
+ Args:
62
+ file_path: Path to source file
63
+ chunk_type: Type of chunk (function, class, method, etc.)
64
+ name: Name of the symbol
65
+ parent: Parent symbol name
66
+ start_line: Starting line number
67
+ end_line: Ending line number
68
+ code: Actual code content
69
+ prefix: ID prefix (primary/secondary)
70
+ start_byte: Starting byte offset
71
+ end_byte: Ending byte offset
72
+
73
+ Returns:
74
+ Deterministic chunk ID
75
+ """
76
+ # Create a payload that uniquely identifies this chunk
77
+ payload = f"""
78
+ {file_path}
79
+ {chunk_type}
80
+ {name}
81
+ {parent}
82
+ {start_line}
83
+ {end_line}
84
+ {start_byte}
85
+ {end_byte}
86
+ {code}
87
+ """.strip()
88
+
89
+ # Generate hash and use first 8 chars for readability
90
+ hash_digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()[:8]
91
+ return f"{prefix}_{hash_digest}"
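A quick check of the determinism property: identical inputs reproduce the same ID, while changing the code changes it. The values shown in comments are illustrative, since the hash depends on the exact payload; the `scripts` package is assumed importable.

```python
from scripts.core.utils.id_utils import deterministic_chunk_id

kwargs = dict(
    file_path="src/module.py",
    chunk_type="function",
    name="load_data",
    parent="module",
    start_line=10,
    end_line=25,
    code="def load_data():\n    return []",
    start_byte=120,
    end_byte=320,
)

first = deterministic_chunk_id(**kwargs)
second = deterministic_chunk_id(**kwargs)
changed = deterministic_chunk_id(**{**kwargs, "code": "def load_data():\n    return [1]"})

print(first == second)   # True: same payload, same ID
print(first == changed)  # False: a content change shifts the hash
print(first)             # e.g. "primary_3f9a1c2e" (prefix + 8-char SHA-256 digest)
```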
scripts/generate_all_frameworks.py ADDED
@@ -0,0 +1,228 @@
1
+ """
2
+ Generate training datasets for ALL frameworks automatically.
3
+
4
+ This script auto-discovers all chunk files and processes them,
5
+ generating separate datasets for each framework PLUS a combined dataset.
6
+
7
+ Usage:
8
+ python scripts/generate_all_frameworks.py
9
+
10
+ Output Structure:
11
+ data/processed/training_crewai/
12
+ - positive_pairs.json
13
+ - triplets.json
14
+ data/processed/training_langgraph/
15
+ - positive_pairs.json
16
+ - triplets.json
17
+ data/processed/training_combined/
18
+ - positive_pairs.json (ALL frameworks merged)
19
+ - triplets.json (ALL frameworks merged)
20
+ """
21
+
22
+ import sys
23
+ import json
24
+ from pathlib import Path
25
+ from typing import List, Tuple
26
+ from dataclasses import asdict
27
+
28
+ # Add project root to path
29
+ PROJECT_ROOT = Path(__file__).parent.parent
30
+ sys.path.insert(0, str(PROJECT_ROOT))
31
+
32
+ from src.task_3_data_engineering.export.pairs_triplets_generator import (
33
+ generate_pairs_and_triplets,
34
+ PositivePair,
35
+ Triplet
36
+ )
37
+
38
+
39
+ def discover_all_chunk_files() -> List[Tuple[Path, str]]:
40
+ """
41
+ Discover all chunk files in the workspace.
42
+
43
+ Returns:
44
+ List of (chunk_path, framework_name) tuples
45
+ """
46
+ chunk_files = []
47
+
48
+ # Check local chunks
49
+ local_paths = [
50
+ PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl",
51
+ PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl",
52
+ ]
53
+
54
+ for path in local_paths:
55
+ if path.exists():
56
+ # Extract framework from parent directory or use "local"
57
+ if "Local_saved_files" in str(path):
58
+ framework = "crewai"
59
+ elif "sample_code" in str(path):
60
+ framework = "sample"
61
+ else:
62
+ framework = path.parent.name
63
+ chunk_files.append((path, framework))
64
+
65
+ # Check repository chunks
66
+ repos_dir = PROJECT_ROOT / "data" / "processed" / "repos"
67
+ if repos_dir.exists():
68
+ for repo_dir in repos_dir.iterdir():
69
+ if repo_dir.is_dir():
70
+ for jsonl_file in repo_dir.glob("*_chunks.jsonl"):
71
+ # Extract framework from filename or directory
72
+ framework = jsonl_file.stem.replace("_chunks", "").split("_")[0]
73
+ chunk_files.append((jsonl_file, framework))
74
+
75
+ return chunk_files
76
+
77
+
78
+ def merge_datasets(all_pairs: List[List[PositivePair]],
79
+ all_triplets: List[List[Triplet]],
80
+ output_dir: Path) -> Tuple[int, int]:
81
+ """Merge all framework datasets into combined files (JSON + JSONL) and return (pair_count, triplet_count)."""
82
+ output_dir.mkdir(parents=True, exist_ok=True)
83
+
84
+ # Flatten lists
85
+ combined_pairs = []
86
+ for pairs in all_pairs:
87
+ combined_pairs.extend(pairs)
88
+
89
+ combined_triplets = []
90
+ for triplets in all_triplets:
91
+ combined_triplets.extend(triplets)
92
+
93
+ # Export combined positive pairs - JSON
94
+ pairs_json_path = output_dir / "positive_pairs.json"
95
+ with open(pairs_json_path, "w", encoding="utf-8") as f:
96
+ json.dump([asdict(p) for p in combined_pairs], f, indent=2, ensure_ascii=False)
97
+ print(f"✅ Combined positive pairs (JSON): {pairs_json_path}")
98
+
99
+ # Export combined positive pairs - JSONL
100
+ pairs_jsonl_path = output_dir / "positive_pairs.jsonl"
101
+ with open(pairs_jsonl_path, "w", encoding="utf-8") as f:
102
+ for p in combined_pairs:
103
+ f.write(json.dumps(asdict(p), ensure_ascii=False) + "\n")
104
+ print(f"✅ Combined positive pairs (JSONL): {pairs_jsonl_path}")
105
+
106
+ # Export combined triplets - JSON
107
+ triplets_json_path = output_dir / "triplets.json"
108
+ with open(triplets_json_path, "w", encoding="utf-8") as f:
109
+ json.dump([asdict(t) for t in combined_triplets], f, indent=2, ensure_ascii=False)
110
+ print(f"✅ Combined triplets (JSON): {triplets_json_path}")
111
+
112
+ # Export combined triplets - JSONL
113
+ triplets_jsonl_path = output_dir / "triplets.jsonl"
114
+ with open(triplets_jsonl_path, "w", encoding="utf-8") as f:
115
+ for t in combined_triplets:
116
+ f.write(json.dumps(asdict(t), ensure_ascii=False) + "\n")
117
+ print(f"✅ Combined triplets (JSONL): {triplets_jsonl_path}")
118
+
119
+ return len(combined_pairs), len(combined_triplets)
120
+
121
+
122
+ def main():
123
+ """Generate datasets for all discovered frameworks + combined dataset."""
124
+ print("=" * 80)
125
+ print("🚀 MULTI-FRAMEWORK TRAINING DATA GENERATOR")
126
+ print("=" * 80)
127
+
128
+ # Discover all chunk files
129
+ print("\n🔍 Discovering chunk files...")
130
+ chunk_files = discover_all_chunk_files()
131
+
132
+ if not chunk_files:
133
+ print("❌ No chunk files found!")
134
+ print("\nPlease ensure chunks exist in:")
135
+ print(" - data/processed/chunks/Local_saved_files/")
136
+ print(" - data/processed/repos/*/")
137
+ return
138
+
139
+ print(f"✅ Found {len(chunk_files)} chunk file(s):\n")
140
+ for path, framework in chunk_files:
141
+ print(f" 📦 {framework}: {path.name}")
142
+
143
+ # Process each framework
144
+ print("\n" + "=" * 80)
145
+ print("🔄 PROCESSING INDIVIDUAL FRAMEWORKS")
146
+ print("=" * 80 + "\n")
147
+
148
+ results = []
149
+ all_pairs = []
150
+ all_triplets = []
151
+
152
+ for i, (chunks_path, framework) in enumerate(chunk_files, 1):
153
+ print(f"\n[{i}/{len(chunk_files)}] Processing {framework.upper()}...")
154
+ print("-" * 60)
155
+
156
+ output_dir = PROJECT_ROOT / "data" / "processed" / f"training_{framework}"
157
+
158
+ try:
159
+ pairs, triplets = generate_pairs_and_triplets(
160
+ chunks_path=chunks_path,
161
+ output_dir=output_dir,
162
+ num_pairs=100,
163
+ num_triplets=100,
164
+ variance=5,
165
+ export_format="both" # JSON + JSONL
166
+ )
167
+
168
+ # Collect for combined dataset
169
+ all_pairs.append(pairs)
170
+ all_triplets.append(triplets)
171
+
172
+ results.append({
173
+ "framework": framework,
174
+ "status": "✅ SUCCESS",
175
+ "pairs": len(pairs),
176
+ "variations": sum(len(p.variations) for p in pairs),
177
+ "triplets": len(triplets),
178
+ "output": output_dir
179
+ })
180
+
181
+ except Exception as e:
182
+ results.append({
183
+ "framework": framework,
184
+ "status": f"❌ FAILED: {str(e)}",
185
+ "output": output_dir
186
+ })
187
+
188
+ # Create combined dataset
189
+ print("\n" + "=" * 80)
190
+ print("🔗 CREATING COMBINED DATASET (ALL FRAMEWORKS)")
191
+ print("=" * 80 + "\n")
192
+
193
+ combined_dir = PROJECT_ROOT / "data" / "processed" / "training_combined"
194
+ total_pairs, total_triplets = merge_datasets(all_pairs, all_triplets, combined_dir)
195
+
196
+ # Final summary
197
+ print("\n" + "=" * 80)
198
+ print("📊 FINAL SUMMARY")
199
+ print("=" * 80 + "\n")
200
+
201
+ print("INDIVIDUAL FRAMEWORK DATASETS:")
202
+ print("-" * 40)
203
+ for result in results:
204
+ print(f"\n📦 {result['framework'].upper()}")
205
+ print(f" Status: {result['status']}")
206
+ if "pairs" in result:
207
+ print(f" - positive_pairs.json: {result['pairs']} docs ({result['variations']} variations)")
208
+ print(f" - triplets.json: {result['triplets']} docs")
209
+ print(f" 📁 {result['output']}")
210
+
211
+ print("\n\nCOMBINED DATASET (ALL FRAMEWORKS):")
212
+ print("-" * 40)
213
+ print(f"📁 {combined_dir}")
214
+ print(f" - positive_pairs.json: {total_pairs} docs")
215
+ print(f" - triplets.json: {total_triplets} docs")
216
+
217
+ # File count summary
218
+ successful = sum(1 for r in results if "SUCCESS" in r["status"])
219
+ total_files = (successful * 4) + 4 # 4 per framework + 4 combined
220
+
221
+ print(f"\n\n📄 TOTAL FILES GENERATED: {total_files}")
222
+ print(f" - {successful} frameworks × 4 files = {successful * 4} files")
223
+ print(f" - Combined dataset = 4 files")
224
+ print("=" * 80)
225
+
226
+
227
+ if __name__ == "__main__":
228
+ main()
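A small sanity check on the merged output, assuming the script has been run from the project root; only line counts are inspected here because the record fields come from the `PositivePair` and `Triplet` dataclasses defined elsewhere.

```python
import json
from pathlib import Path

combined = Path("data/processed/training_combined")

for name in ("positive_pairs.jsonl", "triplets.jsonl"):
    path = combined / name
    if not path.exists():
        print(f"missing: {path}")
        continue
    with path.open(encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    print(f"{name}: {len(records)} records")
```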
scripts/run_pairs_triplets_pipeline.py ADDED
@@ -0,0 +1,120 @@
1
+ """
2
+ Script to generate positive pairs and triplets from code chunks.
3
+
4
+ This script loads code chunks and generates:
5
+ 1. Positive Pairs: (question, code) with 4-5 variations per sample
6
+ 2. Triplets: (anchor_question, positive_code, negative_code)
7
+
8
+ Usage:
9
+ python -m scripts.run_pairs_triplets_pipeline --chunks <path> --output <dir>
10
+ python -m scripts.run_pairs_triplets_pipeline --help
11
+
12
+ Examples:
13
+ # Generate from local chunks with default settings
14
+ python -m scripts.run_pairs_triplets_pipeline \\
15
+ --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
16
+ --output data/processed/training
17
+
18
+ # Generate from repository chunks
19
+ python -m scripts.run_pairs_triplets_pipeline \\
20
+ --chunks data/processed/repos/langgraph_20260116_123638/langgraph_chunks.jsonl \\
21
+ --output data/processed/training/langgraph
22
+
23
+ # Custom settings
24
+ python -m scripts.run_pairs_triplets_pipeline \\
25
+ --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
26
+ --output data/processed/training \\
27
+ --pairs 100 --triplets 100 --variance 5
28
+ """
29
+
30
+ import sys
31
+ from pathlib import Path
32
+
33
+ # Add project root to path
34
+ PROJECT_ROOT = Path(__file__).parent.parent
35
+ sys.path.insert(0, str(PROJECT_ROOT))
36
+
37
+ from src.task_3_data_engineering.export.pairs_triplets_generator import (
38
+ generate_pairs_and_triplets,
39
+ main as cli_main
40
+ )
41
+
42
+
43
+ def run_default_pipeline():
44
+ """Run with default settings for the available chunks."""
45
+
46
+ # Try multiple possible chunk locations
47
+ possible_paths = [
48
+ PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl",
49
+ PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl",
50
+ ]
51
+
52
+ # Find all chunks.jsonl files in chunks folder subdirectories
53
+ chunks_dir = PROJECT_ROOT / "data" / "processed" / "chunks"
54
+ if chunks_dir.exists():
55
+ for subdir in chunks_dir.iterdir():
56
+ if subdir.is_dir():
57
+ chunks_file = subdir / "chunks.jsonl"
58
+ if chunks_file.exists() and chunks_file not in possible_paths:
59
+ possible_paths.append(chunks_file)
60
+
61
+ # Find repository chunks
62
+ repos_dir = PROJECT_ROOT / "data" / "processed" / "repos"
63
+ if repos_dir.exists():
64
+ for repo_dir in repos_dir.iterdir():
65
+ if repo_dir.is_dir():
66
+ for jsonl_file in repo_dir.glob("*_chunks.jsonl"):
67
+ possible_paths.append(jsonl_file)
68
+
69
+ chunks_path = None
70
+ for path in possible_paths:
71
+ if path.exists():
72
+ chunks_path = path
73
+ break
74
+
75
+ if chunks_path is None:
76
+ print("❌ No chunks files found. Please specify a chunks file with --chunks")
77
+ print("\nPossible locations checked:")
78
+ for p in possible_paths[:5]:
79
+ print(f" - {p}")
80
+ return
81
+
82
+ output_dir = PROJECT_ROOT / "data" / "processed" / "training"
83
+
84
+ print("=" * 60)
85
+ print("🚀 Positive Pairs & Triplets Generator")
86
+ print("=" * 60)
87
+ print(f"\n📂 Chunks Path: {chunks_path}")
88
+ print(f"📁 Output Dir: {output_dir}")
89
+ print(f"📊 Settings: pairs=100, triplets=100, variance=5")
90
+ print("\n" + "-" * 60)
91
+
92
+ pairs, triplets = generate_pairs_and_triplets(
93
+ chunks_path=chunks_path,
94
+ output_dir=output_dir,
95
+ num_pairs=100,
96
+ num_triplets=100,
97
+ variance=5,
98
+ export_format="both"
99
+ )
100
+
101
+ print("\n" + "=" * 60)
102
+ print("✅ Pipeline Complete!")
103
+ print("=" * 60)
104
+ print(f"\n📁 Output files saved to: {output_dir}")
105
+ print(" - positive_pairs.jsonl")
106
+ print(" - positive_pairs.json")
107
+ print(" - triplets.jsonl")
108
+ print(" - triplets.json")
109
+
110
+
111
+ if __name__ == "__main__":
112
+ import argparse
113
+
114
+ # Check if any arguments provided
115
+ if len(sys.argv) > 1:
116
+ # Use CLI with provided arguments
117
+ cli_main()
118
+ else:
119
+ # Run with defaults
120
+ run_default_pipeline()
scripts/run_python_pipeline.py ADDED
@@ -0,0 +1,131 @@
1
+ """
2
+ Local Codebase Pipeline Runner - Processes local codebases for dataset creation.
3
+
4
+ This is the main entry point for processing LOCAL CODEBASES (not Git repos).
5
+ It orchestrates the entire chunking pipeline for local files, handling both
6
+ code files and documentation with intelligent fallback strategies.
7
+
8
+ ARCHITECTURE POSITION:
9
+ - Local Pipeline Orchestrator: Coordinates local file processing
10
+ - Fallback Handler: Intelligent fallback from code to documentation
11
+ - Dataset Exporter: Creates final JSONL datasets with statistics
12
+
13
+ KEY FEATURES:
14
+ 1. Unified processing of Python files and documentation
15
+ 2. Intelligent fallback (failed code chunking → documentation chunking)
16
+ 3. Hierarchical chunking for Python files
17
+ 4. Documentation-aware chunking for markdown/text files
18
+ 5. Dataset statistics and metadata generation
19
+
20
+ DATA FLOW:
21
+ Local files → Type detection → Python chunking (or fallback) →
22
+ Documentation chunking → JSONL export → Statistics
23
+
24
+ USE CASES:
25
+ - Processing locally saved code examples
26
+ - Creating datasets from example repositories
27
+ - Testing chunking strategies on local files
28
+
29
+ USAGE:
30
+ python run_python_pipeline.py --name crewai_examples --include crewai
31
+ python run_python_pipeline.py --name test_dataset --exclude large_repos
32
+ """
33
+
34
+ from pathlib import Path
35
+ import json
36
+ import argparse
37
+
38
+ from src.task_3_data_engineering.chunking.hierarchical_chunker import HierarchicalChunker
39
+ from src.task_3_data_engineering.export.jsonl_exporter import export_chunks_jsonl
40
+ from src.task_3_data_engineering.analysis.dataset_stats import compute_dataset_stats
41
+ from src.task_3_data_engineering.export.dataset_metadata import write_dataset_metadata
42
+ from src.task_3_data_engineering.chunking.doc_chunker import chunk_document, wrap_doc_chunks
43
+
44
+
45
+ INPUT_DIR = Path("data/raw/codebases")
46
+ BASE_OUTPUT_DIR = Path("data/processed/chunks")
47
+
48
+ DOC_EXTS = {".md", ".txt", ".rst"}
49
+
50
+
51
+ def run(dataset_name: str, include: list[str] | None, exclude: list[str] | None):
52
+ output_dir = BASE_OUTPUT_DIR / dataset_name
53
+ output_dir.mkdir(parents=True, exist_ok=True)
54
+
55
+ chunker = HierarchicalChunker()
56
+ all_chunks = []
57
+
58
+ files = [p for p in INPUT_DIR.rglob("*") if p.is_file()]
59
+
60
+ for file_path in files:
61
+ rel = file_path.relative_to(INPUT_DIR).parts
62
+ if include and rel[0] not in include:
63
+ continue
64
+ if exclude and rel[0] in exclude:
65
+ continue
66
+
67
+ print(f"Processing: {file_path}")
68
+
69
+ # ---- Python files ----
70
+ if file_path.suffix == ".py":
71
+ try:
72
+ code_chunks = chunker.chunk_file(file_path)
73
+ if code_chunks:
74
+ all_chunks.extend(code_chunks)
75
+ continue
76
+ except Exception:
77
+ pass # fallback to doc mode
78
+
79
+ # ---- Documentation / text ----
80
+ if file_path.suffix.lower() in DOC_EXTS or file_path.suffix == ".py":
81
+ try:
82
+ raw_text = file_path.read_text(encoding="utf-8", errors="ignore")
83
+ except Exception:
84
+ continue
85
+
86
+ if not raw_text.strip():
87
+ continue
88
+
89
+ doc_chunks = chunk_document(
90
+ raw_text=raw_text,
91
+ source_name=str(file_path),
92
+ source_url=None,
93
+ )
94
+
95
+ all_chunks.extend(wrap_doc_chunks(doc_chunks))
96
+
97
+ # ---- Export ----
98
+ export_chunks_jsonl(all_chunks, output_dir / "chunks.jsonl", print_stats=True)
99
+
100
+ stats = compute_dataset_stats(all_chunks)
101
+
102
+ primary = [c for c in all_chunks if c.hierarchy.is_primary]
103
+ stats["hierarchy"] = {
104
+ "primary_chunks": len(primary),
105
+ "secondary_chunks": len(all_chunks) - len(primary),
106
+ }
107
+
108
+ with (output_dir / "dataset_stats.json").open("w", encoding="utf-8") as f:
109
+ json.dump(stats, f, indent=2)
110
+
111
+ write_dataset_metadata(
112
+ chunks=all_chunks,
113
+ output_path=output_dir / "dataset_metadata.json",
114
+ dataset_name=dataset_name,
115
+ dataset_version="v1",
116
+ )
117
+
118
+ print("\n✅ Dataset built successfully")
119
+ print(f" - Files: {len({c.file_path for c in all_chunks})}")
120
+ print(f" - Chunks: {len(all_chunks)}")
121
+ print(f" - Output: {output_dir}")
122
+
123
+
124
+ if __name__ == "__main__":
125
+ parser = argparse.ArgumentParser()
126
+ parser.add_argument("--name", required=True)
127
+ parser.add_argument("--include", nargs="+")
128
+ parser.add_argument("--exclude", nargs="+")
129
+ args = parser.parse_args()
130
+
131
+ run(args.name, args.include, args.exclude)
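The local pipeline above can also be driven programmatically rather than via the CLI. A minimal sketch (not part of the commit), assuming the script and its `src.task_3_data_engineering` dependencies are importable; the dataset name and include list are hypothetical:

```python
# Hypothetical programmatic invocation of the local pipeline
# (assumes scripts/ is on PYTHONPATH and the imported packages are installed).
from run_python_pipeline import run  # hypothetical import path

run(
    dataset_name="crewai_examples",
    include=["crewai"],   # keep only this top-level folder under data/raw/codebases
    exclude=None,         # nothing excluded
)
# Output lands in data/processed/chunks/crewai_examples/
# (chunks.jsonl, dataset_stats.json, dataset_metadata.json)
```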
scripts/run_repo_pipeline.py ADDED
@@ -0,0 +1,289 @@
1
+ """
2
+ Git Repository Pipeline Runner - Processes Git repositories at scale.
3
+
4
+ This is the main entry point for processing GIT REPOSITORIES. It provides
5
+ enhanced features for repository analysis, including git metadata extraction,
6
+ agentic framework detection, and comprehensive statistics generation.
7
+
8
+ ARCHITECTURE POSITION:
9
+ - Repository Pipeline Orchestrator: Coordinates Git repo processing
10
+ - Enhanced Metadata Collector: Extracts git history and agentic patterns
11
+ - Production Pipeline: Handles large repositories with performance tracking
12
+
13
+ KEY FEATURES:
14
+ 1. Complete repository processing with git metadata
15
+ 2. Extension-aware filtering (None = full repository)
16
+ 3. Performance tracking (files/sec, chunks/sec)
17
+ 4. Agentic framework detection (via RepoMetadataExtractor)
18
+ 5. Comprehensive output (JSONL chunks + metadata + statistics)
19
+
20
+ DATA FLOW:
21
+ Repo URL → Clone → Metadata extraction → File listing → Chunking →
22
+ Enhanced export → Statistics → Comprehensive output package
23
+
24
+ USE CASES:
25
+ - Processing complete Git repositories for training data
26
+ - Creating agentic-aware datasets
27
+ - Benchmarking chunking performance
28
+ - Production dataset generation
29
+
30
+ USAGE:
31
+ python run_repo_pipeline.py single https://github.com/crewAIInc/crewAI
32
+ python run_repo_pipeline.py single https://github.com/microsoft/autogen --extensions .py .md
33
+ python run_repo_pipeline.py single https://github.com/langchain-ai/langchain --max-files 1000
34
+ """
35
+
36
+ from pathlib import Path
37
+ import json
38
+ from typing import Dict, Any, Optional, Set, List
39
+ import argparse
40
+ import time
41
+ from datetime import datetime
42
+
43
+ # Import enhanced components
44
+ from src.task_3_data_engineering.ingestion.git_crawler import GitCrawler
45
+ from src.task_3_data_engineering.ingestion.repo_metadata import RepoMetadataExtractor
46
+ from src.task_3_data_engineering.chunking.repo_chunker import RepoChunker
47
+ from src.task_3_data_engineering.analysis.dataset_stats import compute_dataset_stats
48
+ from src.task_3_data_engineering.export.enhanced_jsonl_exporter import export_repo_chunks_jsonl
49
+
50
+
51
+ class EnhancedRepoPipeline:
52
+ """Enhanced pipeline with agentic focus"""
53
+
54
+ def __init__(
55
+ self,
56
+ output_base: Path = Path("data/processed/repos"),
57
+ use_hierarchical: bool = True,
58
+ ):
59
+ self.crawler = GitCrawler()
60
+ self.chunker = RepoChunker(use_hierarchical=use_hierarchical)
61
+ self.output_base = output_base
62
+ self.output_base.mkdir(parents=True, exist_ok=True)
63
+
64
+ def process_repository(
65
+ self,
66
+ repo_url: str,
67
+ extensions: Optional[Set[str]] = None,
68
+ output_name: Optional[str] = None,
69
+ include_binary: bool = False,
70
+ max_files: Optional[int] = None,
71
+ skip_git_metadata: bool = False,
72
+ ) -> Dict[str, Any]:
73
+ """
74
+ Process repository with enhanced features
75
+
76
+ IMPORTANT FIX:
77
+ - extensions=None => FULL repository (no filtering)
78
+ - extensions={".py", ...} => filtered repository (only files with those extensions)
79
+ """
80
+
81
+ start_time = time.time()
82
+ print(f"🚀 Processing repository: {repo_url}")
83
+ print("-" * 60)
84
+
85
+ # 1. Clone repository
86
+ repo_path = self.crawler.clone_repository(repo_url)
87
+ if not repo_path:
88
+ raise RuntimeError(f"Failed to clone {repo_url}")
89
+
90
+ # 2. Determine output name
91
+ if not output_name:
92
+ output_name = repo_path.name
93
+
94
+ # 3. Log extension behavior (FIXED)
95
+ if extensions:
96
+ print(f"📁 Extension filter enabled: {sorted(extensions)}")
97
+ else:
98
+ print("📁 No extension filter → processing FULL repository")
99
+
100
+ # 4. Extract repository metadata
101
+ print("📊 Extracting repository metadata...")
102
+ metadata = {}
103
+
104
+ if not skip_git_metadata:
105
+ extractor = RepoMetadataExtractor(repo_path)
106
+ metadata = extractor.extract_comprehensive_metadata()
107
+
108
+ # 5. List files (CORE LOGIC UNCHANGED)
109
+ print("📁 Listing repository files...")
110
+ file_infos, file_stats = self.crawler.list_files_with_info(
111
+ repo_path,
112
+ extensions=extensions, # None => full repo
113
+ skip_binary=not include_binary,
114
+ )
115
+
116
+ # 6. Optional file limiting
117
+ if max_files and len(file_infos) > max_files:
118
+ print(f"⚠️ Limiting to {max_files} files (out of {len(file_infos)})")
119
+ file_infos = file_infos[:max_files]
120
+
121
+ print(f"📊 Found {len(file_infos)} files to process")
122
+
123
+ # 7. Create output directory
124
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
125
+ output_dir = self.output_base / f"{output_name}_{timestamp}"
126
+ output_dir.mkdir(parents=True, exist_ok=True)
127
+
128
+ # 8. Repository-level metadata
129
+ # Get actual repo name from metadata
130
+ actual_repo_name = metadata.get("basic", {}).get("repo_name", output_name)
131
+
132
+ repo_metadata = {
133
+ "repo_url": repo_url,
134
+ "repo_name": actual_repo_name, # ✅ Use actual repo name
135
+ "folder_name": output_name, # ✅ Track user's folder
136
+ "local_path": str(repo_path),
137
+ "extensions_included": list(extensions) if extensions else "ALL",
138
+ "timestamp": timestamp,
139
+ **metadata,
140
+ }
141
+
142
+ metadata_file = output_dir / "repository_metadata.json"
143
+ with open(metadata_file, "w", encoding="utf-8") as f:
144
+ json.dump(repo_metadata, f, indent=2, default=str)
145
+
146
+ # 9. Chunk processing
147
+ all_chunks = []
148
+ processing_stats = {
149
+ "total_files": len(file_infos),
150
+ "processed": 0,
151
+ "failed": 0,
152
+ "file_types": {},
153
+ "chunk_types": {},
154
+ }
155
+
156
+ print("\n🔧 Processing files...")
157
+ print("-" * 60)
158
+
159
+ for idx, file_info in enumerate(file_infos, start=1):
160
+ try:
161
+ if idx % 10 == 0:
162
+ print(f" [{idx}/{len(file_infos)}] Processing...")
163
+
164
+ file_metadata = {
165
+ **repo_metadata,
166
+ "file_info": {
167
+ "relative_path": file_info.relative_path,
168
+ "size_bytes": file_info.size,
169
+ "extension": file_info.extension,
170
+ "is_binary": file_info.is_binary,
171
+ },
172
+ }
173
+
174
+ chunks = self.chunker.chunk_file(
175
+ file_info.path,
176
+ file_metadata,
177
+ )
178
+
179
+ all_chunks.extend(chunks)
180
+ processing_stats["processed"] += 1
181
+ processing_stats["file_types"][file_info.extension] = (
182
+ processing_stats["file_types"].get(file_info.extension, 0) + 1
183
+ )
184
+
185
+ for chunk in chunks:
186
+ ct = chunk.chunk_type
187
+ processing_stats["chunk_types"][ct] = (
188
+ processing_stats["chunk_types"].get(ct, 0) + 1
189
+ )
190
+
191
+ except Exception as e:
192
+ print(f"⚠️ Error processing {file_info.relative_path}: {str(e)[:120]}")
193
+ processing_stats["failed"] += 1
194
+
195
+ # 10. Export chunks
196
+ print("\n💾 Exporting chunks...")
197
+ output_file = output_dir / f"{output_name}_chunks.jsonl"
198
+
199
+ export_repo_chunks_jsonl(
200
+ chunks=all_chunks,
201
+ output_path=output_file,
202
+ repo_metadata=repo_metadata,
203
+ print_stats=True,
204
+ )
205
+
206
+ # 11. Compute statistics
207
+ print("📈 Computing statistics...")
208
+ chunk_stats = compute_dataset_stats(all_chunks)
209
+
210
+ total_time = time.time() - start_time
211
+
212
+ final_stats = {
213
+ "repository_info": {
214
+ "name": actual_repo_name, # ✅ USE actual_repo_name
215
+ "folder_name": output_name, # ✅ ADD folder_name field
216
+ "url": repo_url,
217
+ "path": str(repo_path),
218
+ "timestamp": timestamp,
219
+ },
220
+ "processing_stats": processing_stats,
221
+ "chunk_statistics": chunk_stats,
222
+ "performance": {
223
+ "total_time_seconds": round(total_time, 2),
224
+ "files_per_second": round(len(file_infos) / total_time, 2),
225
+ "chunks_per_second": round(len(all_chunks) / total_time, 2),
226
+ },
227
+ "output_files": {
228
+ "chunks": str(output_file),
229
+ "metadata": str(metadata_file),
230
+ },
231
+ }
232
+
233
+ stats_file = output_dir / f"{output_name}_stats.json"
234
+ with open(stats_file, "w", encoding="utf-8") as f:
235
+ json.dump(final_stats, f, indent=2)
236
+
237
+ # 12. Summary
238
+ print("\n" + "=" * 70)
239
+ print("✅ REPOSITORY PROCESSING COMPLETE")
240
+ print("=" * 70)
241
+ print(f"📁 Repository: {output_name}")
242
+ print(f"📄 Files: {len(file_infos)}")
243
+ print(f"🧩 Chunks: {len(all_chunks)}")
244
+ print(f"⏱️ Time: {final_stats['performance']['total_time_seconds']}s")
245
+ print(f"💾 Output: {output_dir}")
246
+ print("=" * 70)
247
+
248
+ return final_stats
249
+
250
+
251
+ def main():
252
+ """Enhanced CLI for repository pipeline (FIXED)"""
253
+
254
+ parser = argparse.ArgumentParser(
255
+ description="Process Git repositories for agentic datasets"
256
+ )
257
+
258
+ subparsers = parser.add_subparsers(dest="command", required=True)
259
+
260
+ # ---- Single repo ----
261
+ single = subparsers.add_parser("single", help="Process single repository")
262
+ single.add_argument("repo_url", help="Git repository URL")
263
+ single.add_argument("--name", help="Custom output name")
264
+ single.add_argument(
265
+ "--extensions",
266
+ nargs="+",
267
+ default=None,
268
+ help="Optional file extensions (.py .md). If omitted, FULL repo is processed.",
269
+ )
270
+ single.add_argument("--max-files", type=int, help="Limit number of files")
271
+ single.add_argument("--skip-git-metadata", action="store_true")
272
+ single.add_argument("--include-binary", action="store_true")
273
+
274
+ args = parser.parse_args()
275
+ pipeline = EnhancedRepoPipeline()
276
+
277
+ if args.command == "single":
278
+ pipeline.process_repository(
279
+ repo_url=args.repo_url,
280
+ output_name=args.name,
281
+ extensions=set(args.extensions) if args.extensions else None,
282
+ max_files=args.max_files,
283
+ skip_git_metadata=args.skip_git_metadata,
284
+ include_binary=args.include_binary,
285
+ )
286
+
287
+
288
+ if __name__ == "__main__":
289
+ main()
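The `single` CLI command wraps `EnhancedRepoPipeline.process_repository`. A minimal sketch of calling it directly (not part of the commit), assuming the module and its dependencies are importable; the extension set, file cap, and flags are hypothetical:

```python
# Hypothetical programmatic use of EnhancedRepoPipeline.
from pathlib import Path
from run_repo_pipeline import EnhancedRepoPipeline  # hypothetical import path

pipeline = EnhancedRepoPipeline(output_base=Path("data/processed/repos"))

# extensions=None -> the FULL repository is processed (no filtering)
full_stats = pipeline.process_repository("https://github.com/crewAIInc/crewAI")

# extensions={...} -> only matching files; max_files caps the run for quick tests
filtered_stats = pipeline.process_repository(
    "https://github.com/crewAIInc/crewAI",
    extensions={".py", ".md"},
    max_files=500,
    skip_git_metadata=True,  # skip git-history extraction for a faster dry run
)
print(filtered_stats["performance"])  # total_time_seconds, files_per_second, chunks_per_second
```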
scripts/triplets_synthesis.py ADDED
@@ -0,0 +1,259 @@
1
+ """Synthesize triplet and positive pair datasets from chunked code files."""
2
+
3
+
4
+ import argparse
5
+ import json
6
+ import random
7
+ import hashlib
8
+ from pathlib import Path
9
+ from typing import Dict, List
10
+ from datetime import datetime
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+
14
+
15
+ # ============================
16
+ # CONFIG
17
+ # ============================
18
+
19
+ MAX_DOCUMENTS = 200
20
+ POSITIVE_VARIANTS = 5
21
+ TFIDF_MAX_FEATURES = 5000
22
+ RANDOM_SEED = 42
23
+
24
+ BASE_OUTPUT_DIR = Path("data/synthetic")
25
+
26
+ random.seed(RANDOM_SEED)
27
+
28
+
29
+ # ============================
30
+ # UTILITIES
31
+ # ============================
32
+
33
+ def load_chunks(file_path):
34
+ path = Path(file_path)
35
+
36
+ if path.suffix == ".jsonl":
37
+ chunks = []
38
+ with open(path, "r", encoding="utf-8") as f:
39
+ for line_no, line in enumerate(f, 1):
40
+ line = line.strip()
41
+ if not line:
42
+ continue
43
+ try:
44
+ chunks.append(json.loads(line))
45
+ except json.JSONDecodeError as e:
46
+ raise ValueError(
47
+ f"Invalid JSON on line {line_no} in {path}"
48
+ ) from e
49
+ return chunks
50
+
51
+ elif path.suffix == ".json":
52
+ with open(path, "r", encoding="utf-8") as f:
53
+ data = json.load(f)
54
+ if not isinstance(data, list):
55
+ raise ValueError(f"{path} must contain a list of chunks")
56
+ return data
57
+
58
+ else:
59
+ raise ValueError(
60
+ f"Unsupported file format {path.suffix}. Use .json or .jsonl"
61
+ )
62
+
63
+
64
+
65
+ def save_jsonl(path: Path, records: List[Dict]):
66
+ path.parent.mkdir(parents=True, exist_ok=True)
67
+ with path.open("w", encoding="utf-8") as f:
68
+ for r in records:
69
+ f.write(json.dumps(r, ensure_ascii=False) + "\n")
70
+
71
+
72
+ def save_json(path: Path, data):
73
+ path.parent.mkdir(parents=True, exist_ok=True)
74
+ with path.open("w", encoding="utf-8") as f:
75
+ json.dump(data, f, indent=2)
76
+
77
+
78
+ def stable_document_id(chunk: Dict, idx: int) -> str:
79
+ """
80
+ Generate a canonical, stable document_id.
81
+ """
82
+ base = f"{chunk.get('file_path','unknown')}::{idx}"
83
+ return "doc_" + hashlib.sha1(base.encode()).hexdigest()
84
+
85
+
86
+ def infer_framework(input_path: Path) -> str:
87
+ """
88
+ Infer framework from path (fallback-safe).
89
+ """
90
+ parts = [p.lower() for p in input_path.parts]
91
+ for fw in ["crewai", "langchain", "langgraph", "autogen"]:
92
+ if fw in parts:
93
+ return fw
94
+ return "unknown"
95
+
96
+
97
+ # ============================
98
+ # ANCHOR GENERATION (LLM PLACEHOLDER)
99
+ # ============================
100
+
101
+ def generate_anchor_questions(code: str, n: int) -> List[str]:
102
+ """
103
+ Deterministic placeholder (LLM-ready).
104
+ """
105
+ symbol = code.split("(")[0].replace("def ", "").replace("class ", "").strip()
106
+
107
+ templates = [
108
+ f"How does {symbol} work in Python?",
109
+ f"How to implement {symbol}?",
110
+ f"Example usage of {symbol}",
111
+ f"Explain the {symbol} logic",
112
+ f"Best practices for {symbol}",
113
+ ]
114
+
115
+ random.shuffle(templates)
116
+ return templates[:n]
117
+
118
+
119
+ # ============================
120
+ # NEGATIVE MINING
121
+ # ============================
122
+
123
+ def build_tfidf(chunks: List[Dict]):
124
+ corpus = [c["code"] for c in chunks]
125
+ vectorizer = TfidfVectorizer(
126
+ stop_words="english",
127
+ max_features=TFIDF_MAX_FEATURES
128
+ )
129
+ matrix = vectorizer.fit_transform(corpus)
130
+ return vectorizer, matrix
131
+
132
+
133
+ def mine_hard_negative(
134
+ anchor: str,
135
+ positive_idx: int,
136
+ chunks: List[Dict],
137
+ vectorizer,
138
+ matrix,
139
+ ) -> Dict:
140
+ query_vec = vectorizer.transform([anchor])
141
+ scores = cosine_similarity(query_vec, matrix)[0]
142
+
143
+ ranked = sorted(
144
+ [(i, s) for i, s in enumerate(scores)],
145
+ key=lambda x: x[1],
146
+ reverse=True,
147
+ )
148
+
149
+ for idx, _ in ranked:
150
+ if idx != positive_idx:
151
+ return chunks[idx]
152
+
153
+ raise RuntimeError("No negative candidate found")
154
+
155
+
156
+ # ============================
157
+ # MAIN PIPELINE
158
+ # ============================
159
+
160
+ def generate_datasets(input_path: Path, run_name: str):
161
+ output_dir = BASE_OUTPUT_DIR / run_name
162
+ framework = infer_framework(input_path)
163
+
164
+ chunks = load_chunks(input_path)
165
+ # Filter only semantic code chunks
166
+ chunks = [
167
+ c for c in chunks
168
+ if c.get("chunk_type") in {"class", "method", "function"}
169
+ and "code" in c
170
+ ]
171
+
172
+ random.shuffle(chunks)
173
+ chunks = chunks[:MAX_DOCUMENTS]
174
+
175
+ # Assign canonical document_id
176
+ for idx, c in enumerate(chunks):
177
+ c["document_id"] = stable_document_id(c, idx)
178
+
179
+ vectorizer, matrix = build_tfidf(chunks)
180
+
181
+ positive_pairs = []
182
+ triplets = []
183
+
184
+ for idx, chunk in enumerate(chunks):
185
+ code = chunk["code"]
186
+ doc_id = chunk["document_id"]
187
+
188
+ # -------- POSITIVE PAIRS --------
189
+ anchors = generate_anchor_questions(code, POSITIVE_VARIANTS)
190
+ for a in anchors:
191
+ positive_pairs.append({
192
+ "document_id": doc_id,
193
+ "anchor": a,
194
+ "positive": code,
195
+ "framework": framework,
196
+ "source": "synthetic_positive_v2",
197
+ })
198
+
199
+ # -------- TRIPLET --------
200
+ anchor = anchors[0]
201
+ negative_chunk = mine_hard_negative(
202
+ anchor, idx, chunks, vectorizer, matrix
203
+ )
204
+
205
+ triplets.append({
206
+ "document_id": doc_id,
207
+ "anchor": anchor,
208
+ "positive": code,
209
+ "negative": negative_chunk["code"],
210
+ "framework": framework,
211
+ "source": "synthetic_triplet_v2",
212
+ })
213
+
214
+ # -------- SAVE --------
215
+ save_jsonl(output_dir / "positive_pairs.jsonl", positive_pairs)
216
+ save_jsonl(output_dir / "triplets.jsonl", triplets)
217
+
218
+ save_json(output_dir / "positive_pairs.json", positive_pairs)
219
+ save_json(output_dir / "triplets.json", triplets)
220
+
221
+ metadata = {
222
+ "name": run_name,
223
+ "framework": framework,
224
+ "input_file": str(input_path),
225
+ "num_chunks": len(chunks),
226
+ "positive_pairs": len(positive_pairs),
227
+ "triplets": len(triplets),
228
+ "created_at": datetime.utcnow().isoformat(),
229
+ "random_seed": RANDOM_SEED,
230
+ }
231
+
232
+ save_json(output_dir / "metadata.json", metadata)
233
+
234
+ print(f"✅ Dataset generated at: {output_dir}")
235
+
236
+
237
+ # ============================
238
+ # ENTRY POINT
239
+ # ============================
240
+
241
+ if __name__ == "__main__":
242
+ parser = argparse.ArgumentParser()
243
+ parser.add_argument("--input", required=True, help="Chunked JSONL file")
244
+ parser.add_argument("--name", required=True, help="Synthetic dataset name")
245
+
246
+ args = parser.parse_args()
247
+
248
+ generate_datasets(
249
+ input_path=Path(args.input),
250
+ run_name=args.name,
251
+ )
252
+
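The saved `triplets.jsonl` is line-delimited JSON with the fields assembled in `generate_datasets()`. A minimal sketch of a downstream consumer (not part of the commit); the run name is hypothetical:

```python
# Hypothetical consumer of the generated files; field names match the
# records built above (document_id / anchor / positive / negative / framework / source).
import json
from pathlib import Path

run_dir = Path("data/synthetic/crewai_run")  # hypothetical --name value
with (run_dir / "triplets.jsonl").open(encoding="utf-8") as f:
    triplets = [json.loads(line) for line in f if line.strip()]

for t in triplets[:3]:
    print(t["document_id"], "|", t["anchor"][:60], "| framework:", t["framework"])
```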
253
+ # Proposed canonical scheme for document_id:
254
+
255
+ # document_id := sha1(
256
+ # normalized_repo_path +
257
+ # file_path +
258
+ # top_level_symbol
259
+ # )
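The comment above proposes a more canonical `document_id` than the index-based `stable_document_id()` used in this script. A minimal sketch of that proposal (not part of the commit); the inputs `normalized_repo_path` and `top_level_symbol` are assumed to be supplied by the caller:

```python
# Hypothetical helper implementing the proposed scheme above; the script
# currently uses stable_document_id(chunk, idx) instead.
import hashlib

def canonical_document_id(normalized_repo_path: str, file_path: str, top_level_symbol: str) -> str:
    """Stable ID derived from repo path, file path, and top-level symbol."""
    base = f"{normalized_repo_path}::{file_path}::{top_level_symbol}"
    return "doc_" + hashlib.sha1(base.encode("utf-8")).hexdigest()

# Example (hypothetical values):
# canonical_document_id("crewAIInc/crewAI", "src/crewai/agent.py", "Agent")
# -> "doc_<40-hex-sha1>", stable across re-chunking, unlike a positional index.
```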