CodeMode Agent committed
Commit 463fc7e · Parent: 17cc505

Deploy CodeMode via Agent

Files changed (42)
  1. README.md +20 -5
  2. app.py +430 -0
  3. requirements.txt +9 -0
  4. scripts/__init__.py +0 -0
  5. scripts/__pycache__/__init__.cpython-311.pyc +0 -0
  6. scripts/aggregate_datasets.py +77 -0
  7. scripts/core/README.md +37 -0
  8. scripts/core/__init__.py +0 -0
  9. scripts/core/__pycache__/__init__.cpython-311.pyc +0 -0
  10. scripts/core/ingestion/__init__.py +0 -0
  11. scripts/core/ingestion/__pycache__/__init__.cpython-311.pyc +0 -0
  12. scripts/core/ingestion/__pycache__/ast_chunker.cpython-311.pyc +0 -0
  13. scripts/core/ingestion/__pycache__/chunk.cpython-311.pyc +0 -0
  14. scripts/core/ingestion/__pycache__/chunk_schema.cpython-311.pyc +0 -0
  15. scripts/core/ingestion/__pycache__/doc_chunker.cpython-311.pyc +0 -0
  16. scripts/core/ingestion/__pycache__/hierarchical_chunker.cpython-311.pyc +0 -0
  17. scripts/core/ingestion/__pycache__/ingest.cpython-311.pyc +0 -0
  18. scripts/core/ingestion/__pycache__/repo_metadata.cpython-311.pyc +0 -0
  19. scripts/core/ingestion/__pycache__/ts_chunker.cpython-311.pyc +0 -0
  20. scripts/core/ingestion/ast_chunker.py +390 -0
  21. scripts/core/ingestion/chunk.py +497 -0
  22. scripts/core/ingestion/chunk_schema.py +112 -0
  23. scripts/core/ingestion/doc_chunker.py +446 -0
  24. scripts/core/ingestion/generate_data.py +658 -0
  25. scripts/core/ingestion/hierarchical_chunker.py +182 -0
  26. scripts/core/ingestion/ingest.py +380 -0
  27. scripts/core/ingestion/repo_metadata.py +408 -0
  28. scripts/core/ingestion/ts_chunker.py +155 -0
  29. scripts/core/training/__init__.py +0 -0
  30. scripts/core/training/model.py +47 -0
  31. scripts/core/training/test_model.py +64 -0
  32. scripts/core/training/train.py +145 -0
  33. scripts/core/training/trainer.py +118 -0
  34. scripts/core/utils/__init__.py +0 -0
  35. scripts/core/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  36. scripts/core/utils/__pycache__/id_utils.cpython-311.pyc +0 -0
  37. scripts/core/utils/id_utils.py +91 -0
  38. scripts/generate_all_frameworks.py +228 -0
  39. scripts/run_pairs_triplets_pipeline.py +120 -0
  40. scripts/run_python_pipeline.py +131 -0
  41. scripts/run_repo_pipeline.py +289 -0
  42. scripts/triplets_synthesis.py +259 -0
README.md CHANGED
@@ -1,12 +1,27 @@
1
  ---
2
  title: CodeMode
3
- emoji: 🏢
4
- colorFrom: purple
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: CodeMode
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.19.2
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # CodeMode: Agentic RAG Engine
14
+
15
+ This is the official demo for CodeMode, an advanced RAG engine for codebases.
16
+
17
+ ## Features
18
+ - **Ingest**: Clone and index any public GitHub repository.
19
+ - **Semantic Search**: Find relevant code using natural language.
20
+ - **Code-to-Code**: Find similar functions using code snippets.
21
+ - **MLOps**: Analyze embedding quality and diversity.
22
+
23
+ ## Local Setup
24
+ ```bash
25
+ pip install -r requirements.txt
26
+ python app.py
27
+ ```
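For readers who want to call the Space's embedding model outside the Gradio UI, a minimal sketch follows. The model name, mean pooling over the last hidden state, and L2 normalization mirror `compute_embeddings()` in the `app.py` added by this commit; the snippet itself is illustrative and not part of the committed files.

```python
# Minimal sketch: embed a query with the model this Space loads in app.py.
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "shubharuidas/codebert-base-code-embed-mrl-langchain-langgraph"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).eval()

def embed(texts):
    # Truncate to 512 tokens, mean-pool the last hidden state, then L2-normalize,
    # matching compute_embeddings() in app.py.
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        out = model(**inputs)
    return F.normalize(out.last_hidden_state.mean(dim=1), p=2, dim=1)

print(embed(["how to create a state graph"]).shape)  # e.g. torch.Size([1, 768]) for a BERT-base encoder
```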
app.py ADDED
@@ -0,0 +1,430 @@
1
+ import gradio as gr
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from transformers import AutoTokenizer, AutoModel
5
+ import pandas as pd
6
+ import sys
7
+ import os
8
+ import shutil
9
+ from pathlib import Path
10
+ import chromadb
11
+ from chromadb.config import Settings
12
+ import uuid
13
+
14
+ # --- Add scripts to path so we can import ingestion modules ---
16
+ sys.path.append(os.path.dirname(__file__))
17
+ from scripts.core.ingestion.ingest import GitCrawler
18
+ from scripts.core.ingestion.chunk import RepoChunker
19
+
20
+ # --- Configuration ---
21
+ MODEL_NAME = "shubharuidas/codebert-base-code-embed-mrl-langchain-langgraph"
22
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
23
+ DB_DIR = Path("data/chroma_db")
24
+ DB_DIR.mkdir(parents=True, exist_ok=True)
25
+
26
+ print(f"Loading model: {MODEL_NAME} on {DEVICE}...")
27
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
28
+ model = AutoModel.from_pretrained(MODEL_NAME)
29
+ model.to(DEVICE)
30
+ model.eval()
31
+ print("Model loaded!")
32
+
33
+ # --- Vector Database Setup ---
34
+ # Initialize ChromaDB Client (Persistent)
35
+ chroma_client = chromadb.PersistentClient(path=str(DB_DIR))
36
+
37
+ # Create or Get Collection
38
+ # We use cosine similarity space
39
+ collection = chroma_client.get_or_create_collection(name="codemode_rag", metadata={"hnsw:space": "cosine"})
40
+
41
+ # --- Helper Functions ---
42
+ def compute_embeddings(text_list):
43
+ """Batch compute embeddings"""
44
+ if not text_list: return None
45
+ # Truncate to 512 tokens to avoid errors
46
+ inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
47
+ with torch.no_grad():
48
+ out = model(**inputs)
49
+ emb = out.last_hidden_state.mean(dim=1)
50
+ return F.normalize(emb, p=2, dim=1)
51
+
52
+ def reset_db():
53
+ """Clear database"""
54
+ try:
55
+ chroma_client.delete_collection("codemode_rag")
56
+ chroma_client.get_or_create_collection(name="codemode_rag", metadata={"hnsw:space": "cosine"})
57
+ return "Database reset (All embeddings deleted)."
58
+ except Exception as e:
59
+ return f"Error resetting DB: {e}"
60
+
61
+ def search_codebase(query, top_k=5):
62
+ """Semantic Search using ChromaDB"""
63
+ if collection.count() == 0: return []
64
+
65
+ query_emb = compute_embeddings([query])
66
+ if query_emb is None: return []
67
+
68
+ # Convert tensor to list for Chroma
69
+ query_vec = query_emb.cpu().numpy().tolist()[0]
70
+
71
+ results = collection.query(
72
+ query_embeddings=[query_vec],
73
+ n_results=min(top_k, collection.count()),
74
+ include=["metadatas", "documents", "distances"]
75
+ )
76
+
77
+ # Parse items
78
+ output = []
79
+ if results['ids']:
80
+ for i in range(len(results['ids'][0])):
81
+ meta = results['metadatas'][0][i]
82
+ code = results['documents'][0][i]
83
+ dist = results['distances'][0][i]
84
+ score = 1 - dist # Cosine distance to similarity
85
+
86
+ link_icon = "[Link]" if score > 0.7 else ""
87
+ output.append([meta.get("file_name", "unknown"), f"{score:.4f} {link_icon}", code[:300] + "..."])
88
+
89
+ return output
90
+
91
+ def fn_ingest(repo_url):
92
+ """
93
+ 1. Clone Repo
94
+ 2. Chunk Files
95
+ 3. Compute Embeddings (Batched)
96
+ 4. Store in ChromaDB
97
+ """
98
+ if not repo_url.startswith("http"):
99
+ return "Invalid URL"
100
+
101
+ DATA_DIR = Path(os.path.abspath("data/raw_ingest"))
102
+ import stat
103
+ def remove_readonly(func, path, _):
104
+ os.chmod(path, stat.S_IWRITE)
105
+ func(path)
106
+
107
+ try:
108
+ # Clean up old raw data
109
+ if DATA_DIR.exists():
110
+ shutil.rmtree(DATA_DIR, onerror=remove_readonly)
111
+
112
+ # 1. Clone
113
+ yield f"Cloning {repo_url}..."
114
+ crawler = GitCrawler(cache_dir=DATA_DIR)
115
+ repo_path = crawler.clone_repository(repo_url)
116
+
117
+ if not repo_path:
118
+ return "Failed to clone repository."
119
+
120
+ # 2. Chunk
121
+ yield "Listing files..."
122
+ files = crawler.list_files(repo_path, extensions={'.py', '.md', '.json', '.js', '.ts', '.java', '.cpp'})
123
+ if isinstance(files, tuple): files = [f.path for f in files[0]]
124
+
125
+ total_files = len(files)
126
+ yield f"Found {total_files} files. Chunking..."
127
+
128
+ chunker = RepoChunker()
129
+ all_chunks = []
130
+
131
+ for i, file_path in enumerate(files):
132
+ yield f"Chunking: {i+1}/{total_files} ({file_path.name})"
133
+ try:
134
+ meta = {"file_name": file_path.name, "url": repo_url}
135
+ file_chunks = chunker.chunk_file(file_path, repo_metadata=meta)
136
+ all_chunks.extend(file_chunks)
137
+ except Exception as e:
138
+ print(f"Skipping {file_path}: {e}")
139
+
140
+ if not all_chunks:
141
+ return "No valid chunks found."
142
+
143
+ # 3. Indexing Loop (Batched)
144
+ total_chunks = len(all_chunks)
145
+ yield f"Generated {total_chunks} chunks. Embedding & Indexing into ChromaDB..."
146
+
147
+ batch_size = 64
148
+ for i in range(0, total_chunks, batch_size):
149
+ batch = all_chunks[i:i+batch_size]
150
+
151
+ # Prepare data
152
+ texts = [c.code for c in batch]
153
+ ids = [str(uuid.uuid4()) for _ in batch]
154
+ metadatas = [{"file_name": Path(c.file_path).name, "url": repo_url} for c in batch]
155
+
156
+ # Compute Embeddings
157
+ embeddings = compute_embeddings(texts)
158
+ if embeddings is not None:
159
+ # Add to Chroma
160
+ collection.add(
161
+ ids=ids,
162
+ embeddings=embeddings.cpu().numpy().tolist(),
163
+ metadatas=metadatas,
164
+ documents=texts
165
+ )
166
+
167
+ progress = int((i / total_chunks) * 100)
168
+ yield f"Indexed {min(i+batch_size, total_chunks)}/{total_chunks} ({progress}%)"
169
+
170
+ count = collection.count()
171
+ yield f"Success! Database now has {count} code chunks. Ready for search."
172
+
173
+ except Exception as e:
174
+ import traceback
175
+ traceback.print_exc()
176
+ yield f"Error: {str(e)}"
177
+
178
+ # --- Analysis Functions ---
179
+ def fn_analyze_embeddings():
180
+ count = collection.count()
181
+ if count < 5:
182
+ return "Not enough data (Need > 5 chunks).", None
183
+
184
+ try:
185
+ # Fetch all embeddings (Limit to 2000 for visualization speed)
186
+ limit = min(count, 2000)
187
+ data = collection.get(limit=limit, include=["embeddings", "metadatas"])
188
+
189
+ X = torch.tensor(data['embeddings'])
190
+
191
+ # PCA
192
+ X_mean = torch.mean(X, 0)
193
+ X_centered = X - X_mean
194
+ U, S, V = torch.pca_lowrank(X_centered, q=2)
195
+ projected = torch.matmul(X_centered, V[:, :2]).numpy()
196
+
197
+ # Diversity
198
+ indices = torch.randint(0, len(X), (min(100, len(X)),))
199
+ sample = X[indices]
200
+ sim_matrix = torch.mm(sample, sample.t())
201
+ mask = ~torch.eye(len(sample), dtype=bool)
202
+ avg_sim = sim_matrix[mask].mean().item()
203
+ diversity_score = 1.0 - avg_sim
204
+
205
+ metrics = (
206
+ f"Total Chunks: {count}\n"
207
+ f"Analyzed: {len(X)} (Sampled)\n"
208
+ f"Diversity Score: {diversity_score:.4f}\n"
209
+ f"Est. Avg Similarity: {avg_sim:.4f}"
210
+ )
211
+
212
+ plot_df = pd.DataFrame({
213
+ "x": projected[:, 0],
214
+ "y": projected[:, 1],
215
+ "topic": [m.get("file_name", "unknown") for m in data['metadatas']]
216
+ })
217
+
218
+ return metrics, gr.ScatterPlot(value=plot_df, x="x", y="y", color="topic", title="Semantic Space", tooltip="topic")
219
+
220
+ except Exception as e:
221
+ import traceback
222
+ traceback.print_exc()
223
+ return f"Analysis Error: {e}", None
224
+
225
+ def fn_evaluate_retrieval(sample_limit):
226
+ count = collection.count()
227
+ if count < 10: return "Not enough data for evaluation (Need > 10 chunks)."
228
+
229
+ try:
230
+ # Sample random chunks
231
+ # Chroma doesn't support random sample easily, so we get a larger batch and pick random
232
+ fetch_limit = min(count, 2000) # Fetch up to 2k to sample from
233
+ data = collection.get(limit=fetch_limit, include=["documents", "ids"])
234
+
235
+ import random
236
+ actual_sample_size = min(sample_limit, len(data['ids']))
237
+ sample_indices = random.sample(range(len(data['ids'])), actual_sample_size)
238
+
239
+ hits_at_1 = 0
240
+ hits_at_5 = 0
241
+ mrr_sum = 0
242
+
243
+ # Generator for progress updates
244
+ yield f"Running evaluation on {actual_sample_size} chunks..."
245
+
246
+ for i, idx in enumerate(sample_indices):
247
+ target_id = data['ids'][idx]
248
+ code = data['documents'][idx]
249
+
250
+ # Synthetic Query
251
+ query = "\n".join(code.split("\n")[:3])
252
+ query_emb = compute_embeddings([query]).cpu().numpy().tolist()[0]
253
+
254
+ # Query DB
255
+ results = collection.query(query_embeddings=[query_emb], n_results=10)
256
+
257
+ # Check results
258
+ found_ids = results['ids'][0]
259
+ if target_id in found_ids:
260
+ rank = found_ids.index(target_id) + 1
261
+ mrr_sum += 1.0 / rank
262
+ if rank == 1: hits_at_1 += 1
263
+ if rank <= 5: hits_at_5 += 1
264
+
265
+ if i % 10 == 0:
266
+ yield f"Evaluated {i}/{actual_sample_size}..."
267
+
268
+ recall_1 = hits_at_1 / actual_sample_size
269
+ recall_5 = hits_at_5 / actual_sample_size
270
+ mrr = mrr_sum / actual_sample_size
271
+
272
+ report = (
273
+ f"Evaluation on {actual_sample_size} random chunks:\n"
274
+ f"--------------------------------------------\n"
275
+ f"Recall@1: {recall_1:.4f}\n"
276
+ f"Recall@5: {recall_5:.4f}\n"
277
+ f"MRR: {mrr:.4f}\n"
278
+ f"\n(Note: Using ChromaDB for retrieval)"
279
+ )
280
+ yield report
281
+ except Exception as e:
282
+ import traceback
283
+ traceback.print_exc()
284
+ yield f"Eval Error: {e}"
285
+
286
+
287
+ # --- UI Layout ---
288
+ theme = gr.themes.Soft(
289
+ primary_hue="slate",
290
+ neutral_hue="slate",
291
+ spacing_size="sm",
292
+ radius_size="md"
293
+ ).set(
294
+ body_background_fill="*neutral_50",
295
+ block_background_fill="white",
296
+ block_border_width="1px",
297
+ block_title_text_weight="600"
298
+ )
299
+
300
+ css = """
301
+ h1 {
302
+ text-align: center;
303
+ font-family: 'Inter', sans-serif;
304
+ margin-bottom: 1rem;
305
+ color: #1e293b;
306
+ }
307
+ .gradio-container {
308
+ max-width: 1200px !important;
309
+ margin: auto;
310
+ }
311
+ """
312
+
313
+ with gr.Blocks(theme=theme, css=css, title="CodeMode") as demo:
314
+ gr.Markdown("# CodeMode")
315
+
316
+ with gr.Tabs():
317
+ # --- TAB 1: INGEST ---
318
+ with gr.Tab("1. Ingest GitHub Repo"):
319
+ gr.Markdown("### Connect a Repository")
320
+ with gr.Row():
321
+ repo_input = gr.Textbox(label="GitHub URL", placeholder="https://github.com/fastapi/fastapi", value="https://github.com/langchain-ai/langgraph")
322
+ ingest_btn = gr.Button("Ingest & Index", variant="primary")
323
+
324
+ with gr.Row():
325
+ reset_btn = gr.Button("Reset Database", variant="stop")
326
+ ingest_status = gr.Textbox(label="Status")
327
+
328
+ with gr.Accordion("Database Inspector", open=False):
329
+ list_files_btn = gr.Button("Refresh File List")
330
+ files_df = gr.Dataframe(
331
+ headers=["File Name", "Chunks", "Source URL"],
332
+ datatype=["str", "number", "str"],
333
+ interactive=False
334
+ )
335
+
336
+ def fn_list_files():
337
+ count = collection.count()
338
+ if count == 0: return [["Database Empty", 0, "-"]]
339
+
340
+ try:
341
+ # Fetch all metadata (limit to 10k to prevent UI freeze)
342
+ limit = min(count, 10000)
343
+ data = collection.get(limit=limit, include=["metadatas"])
344
+
345
+ if not data or 'metadatas' not in data or data['metadatas'] is None:
346
+ return [["Error: No metadata found", 0, "-"]]
347
+
348
+ # Aggregate stats
349
+ file_counts = {} # filename -> count
350
+ file_urls = {} # filename -> url
351
+
352
+ for meta in data['metadatas']:
353
+ if meta is None: continue # Skip None entries
354
+ fname = meta.get("file_name", "unknown")
355
+ url = meta.get("url", "-")
356
+ file_counts[fname] = file_counts.get(fname, 0) + 1
357
+ file_urls[fname] = url
358
+
359
+ # Convert to list
360
+ output = []
361
+ for fname, count in file_counts.items():
362
+ output.append([fname, count, file_urls[fname]])
363
+
364
+ if not output:
365
+ return [["No files found in metadata", 0, "-"]]
366
+
367
+ # Sort by chunk count (descending)
368
+ output.sort(key=lambda x: x[1], reverse=True)
369
+ return output
370
+ except Exception as e:
371
+ import traceback
372
+ traceback.print_exc()
373
+ return [[f"Error: {str(e)}", 0, "-"]]
374
+
375
+ ingest_btn.click(fn_ingest, inputs=repo_input, outputs=[ingest_status])
376
+ reset_btn.click(fn=reset_db, inputs=[], outputs=[ingest_status])
377
+ list_files_btn.click(fn_list_files, inputs=[], outputs=[files_df])
378
+
379
+ # --- TAB 2: SEARCH ---
380
+ with gr.Tab("2. Semantic Search"):
381
+ gr.Markdown("### Search the Ingested Code")
382
+ with gr.Row():
383
+ search_box = gr.Textbox(label="Search Query", placeholder="e.g., 'how to create a state graph'")
384
+ search_btn = gr.Button("Search", variant="primary")
385
+
386
+ results_df = gr.Dataframe(
387
+ headers=["File Name", "Score", "Code Snippet"],
388
+ datatype=["str", "str", "str"],
389
+ interactive=False,
390
+ wrap=True
391
+ )
392
+ search_btn.click(fn=search_codebase, inputs=search_box, outputs=results_df)
393
+
394
+ # --- TAB 3: CODE SEARCH ---
395
+ with gr.Tab("3. Find Similar Code"):
396
+ gr.Markdown("### Code-to-Code Retrieval")
397
+ with gr.Row():
398
+ code_input = gr.Code(label="Reference Code", language="python")
399
+ code_search_btn = gr.Button("Find Matches", variant="primary")
400
+
401
+ code_results_df = gr.Dataframe(
402
+ headers=["File Name", "Score", "Matched Code"],
403
+ datatype=["str", "str", "str"],
404
+ interactive=False,
405
+ wrap=True
406
+ )
407
+ code_search_btn.click(fn=search_codebase, inputs=code_input, outputs=code_results_df)
408
+
409
+ # --- TAB 4: MLOps MONITORING ---
410
+ with gr.Tab("4. Deployment Monitoring"):
411
+ gr.Markdown("### Embedding Quality Analysis")
412
+ analyze_btn = gr.Button("Analyze Embeddings", variant="secondary")
413
+
414
+ with gr.Row():
415
+ quality_metrics = gr.Textbox(label="Quality Metrics")
416
+ plot_output = gr.ScatterPlot(label="Semantic Space (PCA)")
417
+
418
+ analyze_btn.click(fn_analyze_embeddings, inputs=[], outputs=[quality_metrics, plot_output])
419
+
420
+ gr.Markdown("### Extrinsic Evaluation (Retrieval Performance)")
421
+ with gr.Row():
422
+ eval_size = gr.Slider(minimum=10, maximum=1000, value=50, step=10, label="Sample Size (Chunks)")
423
+ eval_btn = gr.Button("Run Retrieval Evaluation", variant="primary")
424
+
425
+ eval_output = gr.Textbox(label="Evaluation Report")
426
+
427
+ eval_btn.click(fn_evaluate_retrieval, inputs=[eval_size], outputs=eval_output)
428
+
429
+ if __name__ == "__main__":
430
+ demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=False)
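For reference, the report produced by the "Deployment Monitoring" tab reduces to the Recall@k and MRR arithmetic used in `fn_evaluate_retrieval` above. A standalone sketch with made-up ranks (example values only, not data from the Space):

```python
# The metric arithmetic from fn_evaluate_retrieval, isolated for reference.
# ranks holds the 1-based rank of each target chunk in its top-10 result list,
# or None when the target was not retrieved (example values only).
ranks = [1, 3, None, 2, 7, 1]

n = len(ranks)
recall_at_1 = sum(r == 1 for r in ranks if r is not None) / n
recall_at_5 = sum(r <= 5 for r in ranks if r is not None) / n
mrr = sum(1.0 / r for r in ranks if r is not None) / n
print(f"Recall@1={recall_at_1:.4f}  Recall@5={recall_at_5:.4f}  MRR={mrr:.4f}")
```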
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ gradio>=4.0.0
2
+ chromadb>=0.4.0
3
+ torch
4
+ transformers
5
+ pandas
6
+ scikit-learn
7
+ tree-sitter==0.21.3
8
+ tree-sitter-languages
9
+ gitpython
scripts/__init__.py ADDED
File without changes
scripts/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes).
scripts/aggregate_datasets.py ADDED
@@ -0,0 +1,77 @@
1
+ '''
2
+
3
+ Aggregate the synthetic datasets generated by triplets_synthesis.py across multiple runs into a single combined dataset.
4
+
5
+ '''
6
+
7
+ import json
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+ from typing import List, Dict
11
+
12
+ BASE_SYNTHETIC_DIR = Path("data/synthetic")
13
+ OUTPUT_DIR = BASE_SYNTHETIC_DIR / "combined"
14
+
15
+
16
+ def load_jsonl(path: Path) -> List[Dict]:
17
+ with path.open("r", encoding="utf-8") as f:
18
+ return [json.loads(line) for line in f]
19
+
20
+
21
+ def save_jsonl(path: Path, records: List[Dict]):
22
+ path.parent.mkdir(parents=True, exist_ok=True)
23
+ with path.open("w", encoding="utf-8") as f:
24
+ for r in records:
25
+ f.write(json.dumps(r, ensure_ascii=False) + "\n")
26
+
27
+
28
+ def save_json(path: Path, records: List[Dict]):
29
+ path.parent.mkdir(parents=True, exist_ok=True)
30
+ with path.open("w", encoding="utf-8") as f:
31
+ json.dump(records, f, indent=2)
32
+
33
+
34
+ def aggregate():
35
+ positive_pairs_all = []
36
+ triplets_all = []
37
+ included_runs = []
38
+
39
+ for run_dir in BASE_SYNTHETIC_DIR.iterdir():
40
+ if not run_dir.is_dir():
41
+ continue
42
+ if run_dir.name == "combined":
43
+ continue
44
+
45
+ pos_path = run_dir / "positive_pairs.jsonl"
46
+ tri_path = run_dir / "triplets.jsonl"
47
+
48
+ if pos_path.exists() and tri_path.exists():
49
+ positive_pairs_all.extend(load_jsonl(pos_path))
50
+ triplets_all.extend(load_jsonl(tri_path))
51
+ included_runs.append(run_dir.name)
52
+
53
+ # Save JSONL (training)
54
+ save_jsonl(OUTPUT_DIR / "positive_pairs.jsonl", positive_pairs_all)
55
+ save_jsonl(OUTPUT_DIR / "triplets.jsonl", triplets_all)
56
+
57
+ # Save JSON (inspection / upload)
58
+ save_json(OUTPUT_DIR / "positive_pairs.json", positive_pairs_all)
59
+ save_json(OUTPUT_DIR / "triplets.json", triplets_all)
60
+
61
+ # Metadata
62
+ metadata = {
63
+ "type": "combined_dataset",
64
+ "included_runs": included_runs,
65
+ "total_positive_pairs": len(positive_pairs_all),
66
+ "total_triplets": len(triplets_all),
67
+ "created_at": datetime.utcnow().isoformat(),
68
+ }
69
+
70
+ with (OUTPUT_DIR / "metadata.json").open("w", encoding="utf-8") as f:
71
+ json.dump(metadata, f, indent=2)
72
+
73
+ print("✅ Combined dataset created at:", OUTPUT_DIR)
74
+
75
+
76
+ if __name__ == "__main__":
77
+ aggregate()
scripts/core/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # CodeMode Core Scripts 🚀
2
+
3
+ This directory contains the **modular core logic** for the CodeMode pipeline. It is designed to be cleaner and more production-ready than the experimental notebooks.
4
+
5
+ ## Structure
6
+
7
+ ### 1. Ingestion (`scripts/core/ingestion`)
8
+ Handles data collection and processing.
9
+ - `ingest.py`: The Git Crawler (formerly `git_crawler.py`).
10
+ - `chunk.py`: The Universal Chunker (formerly `repo_chunker.py`).
11
+ - `generate_data.py`: Creates training triplets (formerly `pairs_triplets_generator.py`).
12
+
13
+ **Usage:**
14
+ ```bash
15
+ # Example: Ingest a repo
16
+ python -m scripts.core.ingestion.ingest --url https://github.com/crewAIInc/crewAI
17
+
18
+ # Example: Generate Triplets
19
+ python -m scripts.core.ingestion.generate_data --chunks data/processed/chunks.jsonl --output data/training
20
+ ```
21
+
22
+ ### 2. Training (`scripts/core/training`)
23
+ Handles model training and embedding generation.
24
+ - `train.py`: Main training loop.
25
+ - `model.py`: The CodeEmbedder model architecture.
26
+ - `trainer.py`: The training loop logic.
27
+
28
+ **Usage:**
29
+ ```bash
30
+ # Example: Train the model
31
+ python -m scripts.core.training.train --data_path data/training/triplets.jsonl --epochs 3
32
+ ```
33
+
34
+ ## Why this structure?
35
+ - **Separation of Concerns:** Training logic doesn't depend on web scraping libraries.
36
+ - **Reusability:** You can import `CodeEmbedder` or `RepoChunker` in other projects easily.
37
+ - **Production Ready:** Direct python scripts instead of notebooks.
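As a sketch of the "Reusability" point above, importing `RepoChunker` from another project might look like the following. The `chunk_file(file_path, repo_metadata)` interface comes from `scripts/core/ingestion/chunk.py` later in this commit; the target path and metadata dict are illustrative.

```python
# Minimal sketch: reuse RepoChunker outside the CodeMode pipeline.
from pathlib import Path
from scripts.core.ingestion.chunk import RepoChunker

chunker = RepoChunker()
chunks = chunker.chunk_file(
    Path("scripts/core/ingestion/ingest.py"),      # any file in a checkout; illustrative
    repo_metadata={"repo": "CodeMode"},            # attached to every chunk's metadata
)
for chunk in chunks[:3]:
    print(chunk.chunk_type, chunk.ast.name, chunk.span.start_line, chunk.span.end_line)
```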
scripts/core/__init__.py ADDED
File without changes
scripts/core/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (177 Bytes).
scripts/core/ingestion/__init__.py ADDED
File without changes
scripts/core/ingestion/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (187 Bytes).
scripts/core/ingestion/__pycache__/ast_chunker.cpython-311.pyc ADDED
Binary file (14.9 kB).
scripts/core/ingestion/__pycache__/chunk.cpython-311.pyc ADDED
Binary file (20.4 kB).
scripts/core/ingestion/__pycache__/chunk_schema.cpython-311.pyc ADDED
Binary file (4.74 kB).
scripts/core/ingestion/__pycache__/doc_chunker.cpython-311.pyc ADDED
Binary file (14.8 kB).
scripts/core/ingestion/__pycache__/hierarchical_chunker.cpython-311.pyc ADDED
Binary file (8.04 kB).
scripts/core/ingestion/__pycache__/ingest.cpython-311.pyc ADDED
Binary file (18 kB).
scripts/core/ingestion/__pycache__/repo_metadata.cpython-311.pyc ADDED
Binary file (21.7 kB).
scripts/core/ingestion/__pycache__/ts_chunker.cpython-311.pyc ADDED
Binary file (5.77 kB).
scripts/core/ingestion/ast_chunker.py ADDED
@@ -0,0 +1,390 @@
1
+ """
2
+ AST-based semantic code chunker - Primary source of truth for code structure.
3
+
4
+ This module implements the core AST-based chunking strategy that forms the
5
+ authority layer of our hybrid chunking pipeline. It uses Python's built-in
6
+ AST parser to extract semantic chunks (modules, classes, functions, methods)
7
+ while preserving hierarchical relationships.
8
+
9
+ ARCHITECTURE POSITION:
10
+ - Authority Layer: Source of truth for semantic structure
11
+ - Primary Chunker: Generates all primary chunks
12
+ - Hierarchy Builder: Establishes parent-child relationships
13
+
14
+ KEY FEATURES:
15
+ 1. AST-first parsing for semantic accuracy
16
+ 2. Hierarchical chunk generation with depth tracking
17
+ 3. Byte-level span calculation for precise positioning
18
+ 4. Import and decorator extraction per node
19
+ 5. Deterministic chunk ID generation
20
+
21
+ FLOW:
22
+ File → Python AST → ASTChunker visitor → Semantic chunks with hierarchy
23
+
24
+ USAGE:
25
+ from ast_chunker import extract_ast_chunks
26
+ chunks = extract_ast_chunks(Path("file.py"))
27
+ """
28
+
29
+ import ast
30
+ from pathlib import Path
31
+ from typing import List, Optional, Union, Dict, Tuple
32
+ import hashlib
33
+
34
+ from ..utils.id_utils import deterministic_chunk_id
35
+ from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ASTSymbolType, ChunkType
36
+
37
+ DocNode = Union[
38
+ ast.Module,
39
+ ast.ClassDef,
40
+ ast.FunctionDef,
41
+ ast.AsyncFunctionDef,
42
+ ]
43
+
44
+
45
+ class ASTChunker(ast.NodeVisitor):
46
+ def __init__(self, source: str, file_path: str):
47
+ self.source = source
48
+ self.file_path = file_path
49
+ self.source_bytes = source.encode('utf-8')
50
+ self.chunks: List[CodeChunk] = []
51
+ self.tree = ast.parse(source)
52
+
53
+ # Track hierarchy
54
+ self.current_class: Optional[str] = None
55
+ self.imports_list: List[str] = []
56
+
57
+ # For hierarchy tracking
58
+ self.parent_stack: List[CodeChunk] = []
59
+ self.sibling_counters: Dict[str, int] = {}
60
+
61
+ # Attach parents to nodes
62
+ for node in ast.walk(self.tree):
63
+ for child in ast.iter_child_nodes(node):
64
+ setattr(child, "parent", node)
65
+
66
+ # ---------------- utilities ----------------
67
+
68
+ def _get_code(self, node: ast.AST) -> str:
69
+ code = ast.get_source_segment(self.source, node)
70
+ return code.strip() if code else ""
71
+
72
+ def _get_byte_span(self, start_line: int, end_line: int) -> Tuple[int, int]:
73
+ """Convert line numbers to byte positions"""
74
+ lines = self.source.split('\n')
75
+
76
+ # Calculate start byte
77
+ start_byte = sum(len(line.encode()) + 1 for line in lines[:start_line-1])
78
+
79
+ # Calculate end byte (up to end_line)
80
+ end_byte = sum(len(line.encode()) + 1 for line in lines[:end_line])
81
+
82
+ return start_byte, end_byte
83
+
84
+ def _extract_node_imports(self, node: ast.AST) -> List[str]:
85
+ """Extract imports specific to this node (not all module imports)"""
86
+ imports: List[str] = []
87
+
88
+ # Walk through this node's body
89
+ for child in ast.walk(node):
90
+ if isinstance(child, (ast.Import, ast.ImportFrom)):
91
+ try:
92
+ imports.append(ast.unparse(child))
93
+ except Exception:
94
+ imports.append(str(child))
95
+ return imports
96
+
97
+ def _extract_decorators(self, node: ast.AST) -> List[str]:
98
+ decorators: List[str] = []
99
+ if hasattr(node, "decorator_list"):
100
+ for d in node.decorator_list: # type: ignore[attr-defined]
101
+ try:
102
+ decorators.append(ast.unparse(d))
103
+ except Exception:
104
+ decorators.append(str(d))
105
+ return decorators
106
+
107
+ # ---------------- chunk creation ----------------
108
+
109
+ def _create_chunk(
110
+ self,
111
+ node: DocNode,
112
+ chunk_type: ChunkType,
113
+ name: str,
114
+ parent: Optional[str] = None,
115
+ parent_chunk: Optional[CodeChunk] = None,
116
+ ) -> CodeChunk:
117
+ code = self._get_code(node)
118
+
119
+ # Get line numbers
120
+ start_line = getattr(node, "lineno", None)
121
+ end_line = getattr(node, "end_lineno", None)
122
+
123
+ # Calculate byte span
124
+ start_byte, end_byte = None, None
125
+ if start_line and end_line:
126
+ start_byte, end_byte = self._get_byte_span(start_line, end_line)
127
+
128
+ # Determine parent if not provided
129
+ if parent is None and chunk_type == "method":
130
+ parent = self.current_class
131
+
132
+ decorators: List[str] = []
133
+ if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
134
+ decorators = self._extract_decorators(node)
135
+
136
+ # Get imports specific to this node (not all module imports)
137
+ node_imports = self._extract_node_imports(node)
138
+
139
+ # Get docstring only for nodes that can have one
140
+ docstring: Optional[str] = None
141
+ if hasattr(node, 'body'):
142
+ docstring = ast.get_docstring(node)
143
+
144
+ # Determine hierarchy depth
145
+ depth = 0
146
+ lineage: List[str] = []
147
+ sibling_index = 0
148
+
149
+ if parent_chunk:
150
+ depth = parent_chunk.hierarchy.depth + 1
151
+ lineage = parent_chunk.hierarchy.lineage.copy()
152
+ lineage.append(parent_chunk.chunk_id)
153
+
154
+ # Update sibling counter
155
+ parent_key = parent_chunk.chunk_id
156
+ self.sibling_counters[parent_key] = self.sibling_counters.get(parent_key, 0) + 1
157
+ sibling_index = self.sibling_counters[parent_key] - 1
158
+
159
+ ast_info = ChunkAST(
160
+ symbol_type=chunk_type,
161
+ name=name,
162
+ parent=parent,
163
+ docstring=docstring,
164
+ decorators=decorators,
165
+ imports=node_imports,
166
+ )
167
+
168
+ span = ChunkSpan(
169
+ start_byte=start_byte,
170
+ end_byte=end_byte,
171
+ start_line=start_line,
172
+ end_line=end_line,
173
+ )
174
+
175
+ # Generate chunk ID
176
+ chunk_id = deterministic_chunk_id(
177
+ file_path=self.file_path,
178
+ chunk_type=chunk_type,
179
+ name=name,
180
+ parent=parent,
181
+ start_line=start_line,
182
+ end_line=end_line,
183
+ code=code,
184
+ )
185
+
186
+ chunk = CodeChunk(
187
+ chunk_id=chunk_id,
188
+ file_path=self.file_path,
189
+ language="python",
190
+ chunk_type=chunk_type,
191
+ code=code,
192
+ ast=ast_info,
193
+ span=span,
194
+ hierarchy=ChunkHierarchy(
195
+ parent_id=parent_chunk.chunk_id if parent_chunk else None,
196
+ children_ids=[],
197
+ depth=depth,
198
+ is_primary=True,
199
+ is_extracted=False,
200
+ lineage=lineage,
201
+ sibling_index=sibling_index,
202
+ ),
203
+ )
204
+
205
+ # Add to parent's children if parent exists
206
+ if parent_chunk:
207
+ parent_chunk.hierarchy.children_ids.append(chunk_id)
208
+
209
+ self.chunks.append(chunk)
210
+ return chunk
211
+
212
+ def _create_module_chunk(self) -> CodeChunk:
213
+ """Create module chunk with all imports"""
214
+ module_name = Path(self.file_path).stem
215
+ start_line = 1
216
+ end_line = len(self.source.split('\n'))
217
+ start_byte, end_byte = self._get_byte_span(start_line, end_line)
218
+
219
+ # Module code - entire file
220
+ module_code = self.source
221
+
222
+ # Extract ALL imports for module
223
+ module_imports: List[str] = []
224
+ for node in ast.walk(self.tree):
225
+ if isinstance(node, (ast.Import, ast.ImportFrom)):
226
+ try:
227
+ module_imports.append(ast.unparse(node))
228
+ except Exception:
229
+ pass
230
+
231
+ chunk_id = deterministic_chunk_id(
232
+ file_path=self.file_path,
233
+ chunk_type="module",
234
+ name=module_name,
235
+ parent=None,
236
+ start_line=start_line,
237
+ end_line=end_line,
238
+ code=module_code,
239
+ )
240
+
241
+ ast_info = ChunkAST(
242
+ symbol_type="module",
243
+ name=module_name,
244
+ parent=None,
245
+ docstring=ast.get_docstring(self.tree),
246
+ decorators=[],
247
+ imports=module_imports, # ALL imports in module
248
+ )
249
+
250
+ span = ChunkSpan(
251
+ start_byte=start_byte,
252
+ end_byte=end_byte,
253
+ start_line=start_line,
254
+ end_line=end_line,
255
+ )
256
+
257
+ chunk = CodeChunk(
258
+ chunk_id=chunk_id,
259
+ file_path=self.file_path,
260
+ language="python",
261
+ chunk_type="module",
262
+ code=module_code,
263
+ ast=ast_info,
264
+ span=span,
265
+ hierarchy=ChunkHierarchy(
266
+ parent_id=None,
267
+ children_ids=[],
268
+ depth=0,
269
+ is_primary=True,
270
+ is_extracted=False,
271
+ lineage=[],
272
+ sibling_index=0,
273
+ ),
274
+ )
275
+
276
+ self.chunks.append(chunk)
277
+ return chunk
278
+
279
+ # ---------------- visitors ----------------
280
+
281
+ def visit_Import(self, node: ast.Import) -> None:
282
+ try:
283
+ self.imports_list.append(ast.unparse(node))
284
+ except Exception:
285
+ pass
286
+ self.generic_visit(node)
287
+
288
+ def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
289
+ try:
290
+ self.imports_list.append(ast.unparse(node))
291
+ except Exception:
292
+ pass
293
+ self.generic_visit(node)
294
+
295
+ def visit_ClassDef(self, node: ast.ClassDef) -> None:
296
+ # Create class chunk
297
+ class_chunk = self._create_chunk(
298
+ node,
299
+ "class",
300
+ node.name,
301
+ parent="module",
302
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
303
+ )
304
+
305
+ # Save current class context
306
+ previous_class = self.current_class
307
+ self.current_class = node.name
308
+
309
+ # Push class to stack
310
+ self.parent_stack.append(class_chunk)
311
+
312
+ # Visit class body
313
+ self.generic_visit(node)
314
+
315
+ # Restore previous context
316
+ self.current_class = previous_class
317
+ self.parent_stack.pop()
318
+
319
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
320
+ parent = getattr(node, "parent", None)
321
+
322
+ if isinstance(parent, ast.Module):
323
+ # Top-level function
324
+ self._create_chunk(
325
+ node,
326
+ "function",
327
+ node.name,
328
+ parent="module",
329
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
330
+ )
331
+ elif isinstance(parent, ast.ClassDef):
332
+ # Method inside class
333
+ self._create_chunk(
334
+ node,
335
+ "method",
336
+ node.name,
337
+ parent=parent.name,
338
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
339
+ )
340
+
341
+ self.generic_visit(node)
342
+
343
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
344
+ parent = getattr(node, "parent", None)
345
+
346
+ if isinstance(parent, ast.Module):
347
+ # Top-level async function
348
+ self._create_chunk(
349
+ node,
350
+ "function",
351
+ node.name,
352
+ parent="module",
353
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
354
+ )
355
+ elif isinstance(parent, ast.ClassDef):
356
+ # Async method inside class
357
+ self._create_chunk(
358
+ node,
359
+ "method",
360
+ node.name,
361
+ parent=parent.name,
362
+ parent_chunk=self.parent_stack[-1] if self.parent_stack else None,
363
+ )
364
+
365
+ self.generic_visit(node)
366
+
367
+ def visit_Module(self, node: ast.Module) -> None:
368
+ # Create module chunk first (root)
369
+ module_chunk = self._create_module_chunk()
370
+
371
+ # Push module to stack
372
+ self.parent_stack.append(module_chunk)
373
+
374
+ # Visit children to create classes and functions
375
+ self.generic_visit(node)
376
+
377
+ # Pop module from stack
378
+ self.parent_stack.pop()
379
+
380
+
381
+ # ---------------- public API ----------------
382
+
383
+ def extract_ast_chunks(file_path: Path) -> List[CodeChunk]:
384
+ source = file_path.read_text(encoding="utf-8")
385
+ chunker = ASTChunker(source, str(file_path))
386
+
387
+ # Visit the tree (creates all chunks with relationships)
388
+ chunker.visit(chunker.tree)
389
+
390
+ return chunker.chunks
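A short, illustrative walk over the output of `extract_ast_chunks` (the public API named in the module docstring above), using the hierarchy fields defined in `chunk_schema.py`. The file being chunked is an arbitrary example.

```python
# Illustrative use of extract_ast_chunks: print the chunk hierarchy of one file.
from pathlib import Path
from scripts.core.ingestion.ast_chunker import extract_ast_chunks

chunks = extract_ast_chunks(Path("scripts/core/ingestion/chunk.py"))
for chunk in chunks:
    indent = "  " * chunk.hierarchy.depth
    print(f"{indent}{chunk.chunk_type}: {chunk.ast.name} "
          f"(lines {chunk.span.start_line}-{chunk.span.end_line}, "
          f"{len(chunk.hierarchy.children_ids)} children)")
```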
scripts/core/ingestion/chunk.py ADDED
@@ -0,0 +1,497 @@
1
+ """
2
+ Repository File Type Chunker - Universal chunker for all file types.
3
+
4
+ This module provides file-type-aware chunking for repositories, handling
5
+ everything from Python code to configuration files, documentation, and
6
+ special files. It's the universal interface that delegates to specialized
7
+ chunkers based on file type.
8
+
9
+ ARCHITECTURE POSITION:
10
+ - File Type Dispatcher: Routes files to appropriate chunkers
11
+ - Universal Interface: Single entry point for all file types
12
+ - Metadata Enricher: Adds repository context to all chunks
13
+
14
+ KEY FEATURES:
15
+ 1. File type detection and intelligent routing
16
+ 2. Hierarchical chunking for Python files
17
+ 3. Documentation chunking for markdown/RST
18
+ 4. Configuration file handling (JSON/YAML/TOML)
19
+ 5. Special file handling (README, requirements.txt, Dockerfile)
20
+ 6. Binary file detection and skipping
21
+
22
+ FILE TYPE SUPPORT:
23
+ - .py: HierarchicalChunker (AST + Tree-sitter)
24
+ - .md/.mdx/.rst: Documentation chunker
25
+ - .json/.yaml/.toml: Configuration chunker
26
+ - requirements.txt/Dockerfile: Special chunker
27
+ - .txt/.cfg/.ini: Text chunker
28
+ - README/LICENSE: Documentation chunker
29
+ - Others: Text chunker with binary detection
30
+
31
+ DATA FLOW:
32
+ File → Type detection → Route to specialized chunker →
33
+ Add repo metadata → Return chunks
34
+
35
+ USAGE:
36
+ chunker = RepoChunker()
37
+ chunks = chunker.chunk_file(Path("file.py"), repo_metadata)
38
+ """
39
+
40
+ from pathlib import Path
41
+ from typing import List, Dict, Optional, cast
42
+ import json
43
+ import yaml
44
+ import re
45
+ import hashlib
46
+ from .hierarchical_chunker import HierarchicalChunker
47
+ from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType, ASTSymbolType
48
+ from .doc_chunker import chunk_document as chunk_markdown_file
49
+
50
+
51
+ class RepoChunker:
52
+ """
53
+ Repository chunker that handles ALL file types with proper structure
54
+ """
55
+
56
+ def __init__(self, use_hierarchical: bool = True):
57
+ if use_hierarchical:
58
+ self.hierarchical_chunker = HierarchicalChunker()
59
+ self.use_hierarchical = use_hierarchical
60
+
61
+ def _generate_stable_id(self, content: str, prefix: str = "stable") -> str:
62
+ """
63
+ Generate deterministic chunk ID using SHA256.
64
+
65
+ IMPORTANT: This ensures IDs are stable across runs, processes,
66
+ and Python versions - crucial for RAG reproducibility.
67
+
68
+ Args:
69
+ content: The text content to hash
70
+ prefix: ID prefix (config, doc, text, etc.)
71
+
72
+ Returns:
73
+ Deterministic ID like "config_8a3b2c1d"
74
+ """
75
+ # Use SHA256 for consistency with id_utils.py
76
+ hash_digest = hashlib.sha256(content.encode("utf-8")).hexdigest()[:8]
77
+ return f"{prefix}_{hash_digest}"
78
+
79
+ def chunk_file(self, file_path: Path, repo_metadata: Optional[Dict] = None) -> List[CodeChunk]:
80
+ """
81
+ Chunk ANY file type with repository context
82
+
83
+ Args:
84
+ file_path: Path to the file
85
+ repo_metadata: Optional dict with repo metadata
86
+ """
87
+ suffix = file_path.suffix.lower()
88
+
89
+ # Python files - use your advanced hierarchical chunker
90
+ if suffix == '.py':
91
+ return self._chunk_python_file(file_path, repo_metadata)
92
+
93
+ # Markdown/RST documentation
94
+ elif suffix in ['.md', '.mdx', '.rst']:
95
+ return self._chunk_markdown_file_wrapper(file_path, repo_metadata)
96
+
97
+ # JSON config files
98
+ elif suffix == '.json':
99
+ return self._chunk_json_file(file_path, repo_metadata)
100
+
101
+ # YAML/TOML config files
102
+ elif suffix in ['.yaml', '.yml', '.toml']:
103
+ return self._chunk_config_file(file_path, repo_metadata)
104
+
105
+ # Requirements/Docker files
106
+ elif file_path.name.lower() in ['requirements.txt', 'dockerfile', 'docker-compose.yml']:
107
+ return self._chunk_special_file(file_path, repo_metadata)
108
+
109
+ # Text files
110
+ elif suffix in ['.txt', '.cfg', '.ini', '.conf']:
111
+ return self._chunk_text_file(file_path, repo_metadata)
112
+
113
+ # README/LICENSE files
114
+ elif file_path.name.lower() in ['readme', 'readme.md', 'license', 'license.txt', 'license.md']:
115
+ return self._chunk_readme_file(file_path, repo_metadata)
116
+
117
+ # All other files
118
+ else:
119
+ return self._chunk_other_file(file_path, repo_metadata)
120
+
121
+ def _chunk_python_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
122
+ """Use our hierarchical chunker for Python files"""
123
+ try:
124
+ if self.use_hierarchical:
125
+ chunks = self.hierarchical_chunker.chunk_file(file_path)
126
+ else:
127
+ # Fallback to basic text chunking instead of hybrid
128
+ return self._chunk_text_file(file_path, repo_metadata)
129
+
130
+ # Add repository metadata
131
+ if repo_metadata:
132
+ for chunk in chunks:
133
+ if "repo_info" not in chunk.metadata:
134
+ chunk.metadata["repo_info"] = {}
135
+ chunk.metadata["repo_info"].update(repo_metadata)
136
+
137
+ return chunks
138
+
139
+ except Exception as e:
140
+ print(f"[ERROR] Error chunking Python file {file_path}: {e}")
141
+ return self._chunk_text_file(file_path, repo_metadata)
142
+
143
+ def _chunk_markdown_file_wrapper(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
144
+ """Chunk markdown files using our doc_chunker"""
145
+ try:
146
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
147
+
148
+ # Use your existing doc_chunker
149
+ doc_chunks = chunk_markdown_file(
150
+ content,
151
+ source_name=file_path.name,
152
+ source_url=f"file://{file_path}"
153
+ )
154
+
155
+ # Convert to CodeChunk schema
156
+ code_chunks = []
157
+ for doc_chunk in doc_chunks:
158
+ code_chunk = CodeChunk(
159
+ chunk_id=doc_chunk["chunk_id"], # Deterministic SHA256 ID from doc_chunker.py
160
+ file_path=str(file_path),
161
+ language=doc_chunk.get("language", "markdown"),
162
+ chunk_type="documentation",
163
+ code=doc_chunk["content"],
164
+ ast=ChunkAST(
165
+ symbol_type="documentation",
166
+ name=file_path.name,
167
+ parent=None,
168
+ docstring=None
169
+ ),
170
+ span=ChunkSpan(
171
+ start_line=doc_chunk.get("metadata", {}).get("line_start", 1),
172
+ end_line=doc_chunk.get("metadata", {}).get("line_end", 1)
173
+ ),
174
+ metadata={
175
+ "doc_chunk_type": doc_chunk.get("chunk_type", "text"),
176
+ "repo_info": repo_metadata or {},
177
+ **doc_chunk.get("metadata", {})
178
+ },
179
+ hierarchy=ChunkHierarchy(
180
+ is_primary=True,
181
+ is_extracted=False,
182
+ depth=0
183
+ )
184
+ )
185
+ code_chunks.append(code_chunk)
186
+
187
+ return code_chunks
188
+
189
+ except Exception as e:
190
+ print(f"[ERROR] Error chunking markdown file {file_path}: {e}")
191
+ return self._chunk_text_file(file_path, repo_metadata)
192
+
193
+ def _chunk_json_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
194
+ """Chunk JSON config files"""
195
+ try:
196
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
197
+ data = json.loads(content)
198
+
199
+ pretty_content = json.dumps(data, indent=2)
200
+
201
+ # FIXED: Use deterministic SHA256 instead of hash()
202
+ chunk = CodeChunk(
203
+ chunk_id=self._generate_stable_id(pretty_content, "config"),
204
+ file_path=str(file_path),
205
+ language="json",
206
+ chunk_type="configuration",
207
+ code=pretty_content,
208
+ ast=ChunkAST(
209
+ symbol_type="configuration",
210
+ name=file_path.name,
211
+ parent=None,
212
+ docstring=None
213
+ ),
214
+ span=ChunkSpan(
215
+ start_line=1,
216
+ end_line=len(pretty_content.split('\n'))
217
+ ),
218
+ metadata={
219
+ "file_type": "json_config",
220
+ "config_keys": list(data.keys()) if isinstance(data, dict) else [],
221
+ "repo_info": repo_metadata or {}
222
+ },
223
+ hierarchy=ChunkHierarchy(
224
+ is_primary=True,
225
+ is_extracted=False,
226
+ depth=0
227
+ )
228
+ )
229
+
230
+ return [chunk]
231
+
232
+ except Exception as e:
233
+ print(f"[ERROR] Error chunking JSON file {file_path}: {e}")
234
+ return self._chunk_text_file(file_path, repo_metadata)
235
+
236
+ def _chunk_config_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
237
+ """Chunk YAML/TOML config files"""
238
+ try:
239
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
240
+ suffix = file_path.suffix.lower()
241
+
242
+ language = "yaml" if suffix in ['.yaml', '.yml'] else "toml"
243
+
244
+ # FIXED: Use deterministic SHA256 instead of hash()
245
+ chunk = CodeChunk(
246
+ chunk_id=self._generate_stable_id(content, "config"),
247
+ file_path=str(file_path),
248
+ language=language,
249
+ chunk_type="configuration",
250
+ code=content,
251
+ ast=ChunkAST(
252
+ symbol_type="configuration",
253
+ name=file_path.name,
254
+ parent=None,
255
+ docstring=None
256
+ ),
257
+ span=ChunkSpan(
258
+ start_line=1,
259
+ end_line=len(content.split('\n'))
260
+ ),
261
+ metadata={
262
+ "file_type": f"{language}_config",
263
+ "repo_info": repo_metadata or {}
264
+ },
265
+ hierarchy=ChunkHierarchy(
266
+ is_primary=True,
267
+ is_extracted=False,
268
+ depth=0
269
+ )
270
+ )
271
+
272
+ return [chunk]
273
+
274
+ except Exception as e:
275
+ print(f"[ERROR] Error chunking config file {file_path}: {e}")
276
+ return self._chunk_text_file(file_path, repo_metadata)
277
+
278
+ def _chunk_special_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
279
+ """Chunk special files (requirements.txt, Dockerfile, etc.)"""
280
+ try:
281
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
282
+ file_name = file_path.name.lower()
283
+
284
+ if 'requirements' in file_name:
285
+ language = "requirements"
286
+ chunk_type = "configuration"
287
+ prefix = "config"
288
+ elif 'docker' in file_name:
289
+ language = "dockerfile"
290
+ chunk_type = "script"
291
+ prefix = "script"
292
+ else:
293
+ language = "text"
294
+ chunk_type = "text"
295
+ prefix = "text"
296
+
297
+ # FIXED: Use deterministic SHA256 instead of hash()
298
+ chunk = CodeChunk(
299
+ chunk_id=self._generate_stable_id(content, prefix),
300
+ file_path=str(file_path),
301
+ language=language,
302
+ chunk_type=chunk_type,
303
+ code=content,
304
+ ast=ChunkAST(
305
+ symbol_type=chunk_type,
306
+ name=file_path.name,
307
+ parent=None,
308
+ docstring=None
309
+ ),
310
+ span=ChunkSpan(
311
+ start_line=1,
312
+ end_line=len(content.split('\n'))
313
+ ),
314
+ metadata={
315
+ "file_type": file_name,
316
+ "repo_info": repo_metadata or {},
317
+ "dependencies": self._extract_dependencies(content) if "requirements" in file_name else []
318
+ },
319
+ hierarchy=ChunkHierarchy(
320
+ is_primary=True,
321
+ is_extracted=False,
322
+ depth=0
323
+ )
324
+ )
325
+
326
+ return [chunk]
327
+
328
+ except Exception as e:
329
+ print(f"[ERROR] Error chunking special file {file_path}: {e}")
330
+ return self._chunk_text_file(file_path, repo_metadata)
331
+
332
+ def _chunk_text_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
333
+ """Chunk plain text files"""
334
+ try:
335
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
336
+
337
+ # Create a single chunk for small files, multiple for large ones
338
+ if len(content.split('\n')) <= 200:
339
+ chunks = [self._create_text_chunk(content, file_path, repo_metadata)]
340
+ else:
341
+ # Split large text files into reasonable chunks
342
+ chunks = []
343
+ lines = content.split('\n')
344
+ chunk_size = 100
345
+
346
+ for i in range(0, len(lines), chunk_size):
347
+ chunk_lines = lines[i:i + chunk_size]
348
+ chunk_content = '\n'.join(chunk_lines)
349
+
350
+ chunk = self._create_text_chunk(
351
+ chunk_content,
352
+ file_path,
353
+ repo_metadata,
354
+ chunk_index=i // chunk_size
355
+ )
356
+ chunks.append(chunk)
357
+
358
+ return chunks
359
+
360
+ except Exception as e:
361
+ print(f"[ERROR] Error reading text file {file_path}: {e}")
362
+ return []
363
+
364
+ def _chunk_readme_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
365
+ """Special handling for README/LICENSE files"""
366
+ try:
367
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
368
+ file_name_lower = file_path.name.lower()
369
+
370
+ # Determine appropriate prefix
371
+ if 'readme' in file_name_lower:
372
+ prefix = "doc"
373
+ elif 'license' in file_name_lower:
374
+ prefix = "license"
375
+ else:
376
+ prefix = "doc"
377
+
378
+ # FIXED: Use deterministic SHA256 instead of hash()
379
+ chunk = CodeChunk(
380
+ chunk_id=self._generate_stable_id(content, prefix),
381
+ file_path=str(file_path),
382
+ language="markdown" if file_path.suffix in ['.md', '.mdx'] else "text",
383
+ chunk_type="documentation",
384
+ code=content,
385
+ ast=ChunkAST(
386
+ symbol_type="documentation",
387
+ name=file_path.name,
388
+ parent=None,
389
+ docstring=None
390
+ ),
391
+ span=ChunkSpan(
392
+ start_line=1,
393
+ end_line=len(content.split('\n'))
394
+ ),
395
+ metadata={
396
+ "file_type": "readme_license",
397
+ "is_readme": "readme" in file_name_lower,
398
+ "is_license": "license" in file_name_lower,
399
+ "repo_info": repo_metadata or {}
400
+ },
401
+ hierarchy=ChunkHierarchy(
402
+ is_primary=True,
403
+ is_extracted=False,
404
+ depth=0
405
+ )
406
+ )
407
+
408
+ return [chunk]
409
+
410
+ except Exception as e:
411
+ print(f"[ERROR] Error chunking README file {file_path}: {e}")
412
+ return self._chunk_text_file(file_path, repo_metadata)
413
+
414
+ def _chunk_other_file(self, file_path: Path, repo_metadata: Optional[Dict]) -> List[CodeChunk]:
415
+ """Fallback for unknown file types (binary or unsupported)"""
416
+ try:
417
+ # Try to read as text first
418
+ content = file_path.read_text(encoding='utf-8', errors='ignore')
419
+
420
+ # If it looks like binary (mostly non-printable characters)
421
+ if self._looks_like_binary(content):
422
+ print(f"[SKIPPED] Skipping binary file: {file_path}")
423
+ return []
424
+
425
+ # If readable text, treat as text file
426
+ return self._chunk_text_file(file_path, repo_metadata)
427
+
428
+ except UnicodeDecodeError:
429
+ print(f"[SKIPPED] Skipping binary file: {file_path}")
430
+ return []
431
+ except Exception as e:
432
+ print(f"[ERROR] Error with file {file_path}: {e}")
433
+ return []
434
+
435
+ def _create_text_chunk(self, content: str, file_path: Path,
436
+ repo_metadata: Optional[Dict], chunk_index: int = 0) -> CodeChunk:
437
+ """Helper to create a text chunk"""
438
+ lines = content.split('\n')
439
+
440
+ # ENHANCED: Use deterministic ID that includes chunk_index for uniqueness
441
+ id_payload = f"{content}_{chunk_index}"
442
+
443
+ return CodeChunk(
444
+ chunk_id=self._generate_stable_id(id_payload, "text"),
445
+ file_path=str(file_path),
446
+ language="text",
447
+ chunk_type="text",
448
+ code=content,
449
+ ast=ChunkAST(
450
+ symbol_type="text",
451
+ name=file_path.name,
452
+ parent=None,
453
+ docstring=None
454
+ ),
455
+ span=ChunkSpan(
456
+ start_line=1,
457
+ end_line=len(lines)
458
+ ),
459
+ metadata={
460
+ "file_type": "text",
461
+ "chunk_index": chunk_index,
462
+ "total_lines": len(lines),
463
+ "repo_info": repo_metadata or {}
464
+ },
465
+ hierarchy=ChunkHierarchy(
466
+ is_primary=True,
467
+ is_extracted=False,
468
+ depth=0
469
+ )
470
+ )
471
+
472
+ def _extract_dependencies(self, requirements_content: str) -> List[str]:
473
+ """Extract package names from requirements.txt"""
474
+ dependencies = []
475
+ for line in requirements_content.split('\n'):
476
+ line = line.strip()
477
+ if line and not line.startswith('#'):
478
+ # Extract package name (before version specifiers)
479
+ package = line.split('==')[0].split('>=')[0].split('<=')[0].strip()
480
+ if package:
481
+ dependencies.append(package)
482
+ return dependencies
483
+
484
+ def _looks_like_binary(self, content: str, threshold: float = 0.3) -> bool:
485
+ """Check if content looks like binary data"""
486
+ if not content:
487
+ return False
488
+
489
+ # Count printable vs non-printable characters
490
+ printable = sum(1 for c in content if 32 <= ord(c) <= 126 or c in '\n\r\t')
491
+ total = len(content)
492
+
493
+ if total == 0:
494
+ return False
495
+
496
+ ratio = printable / total
497
+ return ratio < threshold
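The reproducibility guarantee documented in `_generate_stable_id` above ("stable across runs, processes, and Python versions") reduces to a content hash. A standalone sketch mirroring that helper (the inputs are invented):

```python
# Sketch of the deterministic chunk-ID scheme used by RepoChunker._generate_stable_id:
# prefix + first 8 hex chars of the SHA-256 of the content.
import hashlib

def stable_id(content: str, prefix: str = "config") -> str:
    return f"{prefix}_{hashlib.sha256(content.encode('utf-8')).hexdigest()[:8]}"

assert stable_id('{"a": 1}') == stable_id('{"a": 1}')  # deterministic across runs
assert stable_id('{"a": 1}') != stable_id('{"a": 2}')  # content-sensitive
print(stable_id('{"a": 1}'))
```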
scripts/core/ingestion/chunk_schema.py ADDED
@@ -0,0 +1,112 @@
1
+ """
2
+ chunk_schema.py - UPDATED with enhanced hierarchy
3
+ """
4
+
5
+ from typing import Dict, List, Optional, Literal, Union
6
+ from dataclasses import dataclass, field
7
+
8
+
9
+ # ✅ EXPANDED ChunkType to support ALL file types
10
+ ChunkType = Literal[
11
+ "module", # Python module
12
+ "class", # Python class
13
+ "function", # Python function
14
+ "method", # Python method
15
+ "context", # General context
16
+ "documentation", # Markdown/RST docs
17
+ "configuration", # Config files (JSON, YAML, TOML)
18
+ "notebook", # Jupyter notebook
19
+ "script", # Shell scripts
20
+ "dockerfile", # Docker files
21
+ "typescript", # TypeScript files
22
+ "javascript", # JavaScript files
23
+ "text", # Plain text
24
+ "imports", # Import statements
25
+ "unknown" # Unknown file type
26
+ ]
27
+
28
+ # For AST symbol types
29
+ ASTSymbolType = Literal[
30
+ "module", "class", "function", "method", "context",
31
+ "documentation", "configuration", "notebook", "script",
32
+ "dockerfile", "typescript", "javascript", "text",
33
+ "imports",
34
+ "unknown"
35
+ ]
36
+
37
+
38
+ # @dataclass
39
+ # class ChunkHierarchy:
40
+ # """Enhanced hierarchical relationship metadata"""
41
+ # parent_id: Optional[str] = None
42
+ # children_ids: List[str] = field(default_factory=list)
43
+ # depth: int = 0
44
+ # is_primary: bool = True
45
+ # is_extracted: bool = False
46
+ # lineage: List[str] = field(default_factory=list) # Path from root
47
+ # sibling_index: int = 0 # Position among siblings
48
+
49
+ @dataclass
50
+ class ChunkHierarchy:
51
+ """Enhanced hierarchical relationship metadata"""
52
+ parent_id: Optional[str] = None
53
+ children_ids: List[str] = field(default_factory=list)
54
+ depth: int = 0
55
+ is_primary: bool = True
56
+ is_extracted: bool = False
57
+ lineage: List[str] = field(default_factory=list) # Path from root
58
+ sibling_index: int = 0 # Position among siblings
59
+
60
+ # Optional: Add methods for type-safe operations
61
+ def add_child(self, child_id: str) -> None:
62
+ """Type-safe method to add child"""
63
+ if child_id not in self.children_ids:
64
+ self.children_ids.append(child_id)
65
+
66
+ def remove_child(self, child_id: str) -> None:
67
+ """Type-safe method to remove child"""
68
+ if child_id in self.children_ids:
69
+ self.children_ids.remove(child_id)
70
+
71
+ def set_parent(self, parent_id: Optional[str]) -> None:
72
+ """Type-safe method to set parent"""
73
+ self.parent_id = parent_id
74
+
75
+ def increment_depth(self) -> None:
76
+ """Increment depth by 1"""
77
+ self.depth += 1
78
+
79
+
80
+ @dataclass
81
+ class ChunkAST:
82
+ symbol_type: Optional[ASTSymbolType] = None
83
+ name: Optional[str] = None
84
+ parent: Optional[str] = None
85
+ docstring: Optional[str] = None
86
+ decorators: List[str] = field(default_factory=list)
87
+ imports: List[str] = field(default_factory=list)
88
+ node_type: Optional[str] = None # Original AST node type
89
+
90
+
91
+ @dataclass
92
+ class ChunkSpan:
93
+ start_byte: Optional[int] = None
94
+ end_byte: Optional[int] = None
95
+ start_line: Optional[int] = None
96
+ end_line: Optional[int] = None
97
+ char_count: Optional[int] = None # Character count for quick reference
98
+
99
+
100
+
101
+ @dataclass
102
+ class CodeChunk:
103
+ chunk_id: str
104
+ file_path: str
105
+ language: str
106
+ chunk_type: ChunkType # ✅ Now accepts ALL types
107
+ code: str
108
+ ast: ChunkAST
109
+ span: ChunkSpan
110
+ metadata: Dict = field(default_factory=dict)
111
+ hierarchy: ChunkHierarchy = field(default_factory=ChunkHierarchy)
112
+
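Taken together, these dataclasses describe one chunk and its place in the hierarchy. A minimal construction sketch, assuming the package is importable as `scripts.core.ingestion` (adjust the import to your layout); the IDs, paths, and spans below are invented for illustration:

```python
from scripts.core.ingestion.chunk_schema import (
    CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy,
)

# A module-level chunk acting as the parent.
parent = CodeChunk(
    chunk_id="mod_ab12cd34", file_path="pkg/mod.py", language="python",
    chunk_type="module", code="...module source...",
    ast=ChunkAST(symbol_type="module", name="mod"),
    span=ChunkSpan(start_line=1, end_line=120),
)

# A function chunk that points back to the module.
child = CodeChunk(
    chunk_id="fn_9f8e7d6c", file_path="pkg/mod.py", language="python",
    chunk_type="function", code="def load(path): ...",
    ast=ChunkAST(symbol_type="function", name="load", parent="mod"),
    span=ChunkSpan(start_line=10, end_line=20),
    hierarchy=ChunkHierarchy(parent_id="mod_ab12cd34", depth=1),
)

parent.hierarchy.add_child(child.chunk_id)  # type-safe helper defined above
print(parent.hierarchy.children_ids)        # ['fn_9f8e7d6c']
```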
scripts/core/ingestion/doc_chunker.py ADDED
@@ -0,0 +1,446 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import re
5
+ from typing import List, Dict, Optional
6
+ from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy
7
+
8
+ def _hash_id(text: str, prefix: str) -> str:
9
+ """
10
+ Generate deterministic ID using SHA256 (standardized).
11
+
12
+ Previously used SHA1, now standardized to SHA256 for consistency
13
+ with repo_chunker.py and id_utils.py.
14
+ """
15
+ # CHANGED: sha1 → sha256
16
+ h = hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]
17
+ return f"{prefix}_{h}"
18
+
19
+
20
+ def _is_actual_code(text: str) -> bool:
21
+ """
22
+ Check if text inside a fenced block is actual executable code
23
+ or just formatted text.
24
+ """
25
+ text = text.strip()
26
+
27
+ # Common patterns that indicate formatted text, not code
28
+ formatted_text_patterns = [
29
+ # Lines with many = or - characters (dividers)
30
+ r'^=+\s*[A-Za-z\s]+\s*=+$',
31
+ r'^-+\s*[A-Za-z\s]+\s*-+$',
32
+ # Lines that look like headers/separators
33
+ r'^[=_-]{20,}$',
34
+ # Contains natural language sentences
35
+ r'\b(the|and|that|this|with|for|are|is|was|were|have|has|had)\b',
36
+ r'[.!?]\s+[A-Z]', # Sentence boundaries
37
+ # Message-like patterns
38
+ r'^\s*(Human|AI|Tool|System|User|Assistant)\s+(Message|Response|Input|Output)?\s*[:=-]',
39
+ r'^\s*[A-Z][a-z]+\s*:', # "Reasoning:", "Acting:", etc.
40
+ ]
41
+
42
+ # Check if it looks like formatted text
43
+ lines = text.split('\n')
44
+ formatted_line_count = 0
45
+ code_line_count = 0
46
+
47
+ # Patterns that indicate actual code
48
+ code_patterns = [
49
+ r'^\s*(def|class|import|from|async|await|return|if|for|while|try|except|with)\b',
50
+ r'^\s*@\w+',
51
+ r'^\s*\w+\s*=\s*.+',
52
+ r'^\s*\w+\(.+\)',
53
+ r'^\s*print\(.+\)',
54
+ r'^\s*\{.*\}', # JSON/dict
55
+ r'^\s*\[.*\]', # List
56
+ ]
57
+
58
+ for line in lines:
59
+ line = line.strip()
60
+ if not line:
61
+ continue
62
+
63
+ # Check for formatted text patterns
64
+ is_formatted = any(re.search(pattern, line, re.IGNORECASE) for pattern in formatted_text_patterns)
65
+
66
+ # Check for code patterns
67
+ is_code = any(re.search(pattern, line) for pattern in code_patterns)
68
+
69
+ if is_formatted:
70
+ formatted_line_count += 1
71
+ if is_code:
72
+ code_line_count += 1
73
+
74
+ # If it has many formatted text lines and few/no code lines, it's not actual code
75
+ if formatted_line_count > 1 and code_line_count == 0:
76
+ return False
77
+
78
+ # Default to treating fenced blocks as code (original behavior)
79
+ return True
80
+
81
+
82
+ def _looks_like_code_block(lines: List[str]) -> bool:
83
+ """
84
+ Heuristic to recover code blocks when Markdown fences are missing
85
+ (common after HTML → MD conversion).
86
+ """
87
+ if not lines:
88
+ return False
89
+
90
+ # Join lines and check for minimum length
91
+ joined = "\n".join(lines)
92
+ text = joined.strip()
93
+
94
+ # Too short? Probably not code
95
+ if len(text) < 50:
96
+ return False
97
+
98
+ # Check for code patterns
99
+ code_patterns = [
100
+ # Python keywords at line start
101
+ r'^\s*(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|from\s+\w+\s+import)',
102
+ # Function calls or assignments
103
+ r'^\s*\w+\s*=\s*.+|^\s*\w+\s*\(.+\)',
104
+ # Control structures
105
+ r'^\s*(if|for|while|with|try|except|finally|async|await)\s+',
106
+ # Decorators
107
+ r'^\s*@\w+',
108
+ # Return statements
109
+ r'^\s*return\b',
110
+ # Print statements
111
+ r'^\s*print\(',
112
+ # Indented blocks (common in Python)
113
+ r'^\s{4,}\S',
114
+ ]
115
+
116
+ # Check for prose indicators (if these are present, it's likely text)
117
+ prose_indicators = [
118
+ # Common English words in prose
119
+ r'\b(the|and|that|this|with|for|are|is|was|were|have|has|had)\b',
120
+ # Sentence endings followed by capital
121
+ r'[.!?]\s+[A-Z]',
122
+ # Articles
123
+ r'\b(a|an|the)\s+\w+',
124
+ ]
125
+
126
+ lines_list = text.split('\n')
127
+ code_line_count = 0
128
+ prose_line_count = 0
129
+
130
+ for line in lines_list:
131
+ line = line.strip()
132
+ if not line:
133
+ continue
134
+
135
+ # Check if line looks like code
136
+ is_code = any(re.search(pattern, line) for pattern in code_patterns)
137
+
138
+ # Check if line looks like prose (but only if it's not empty/short)
139
+ is_prose = len(line) > 20 and any(re.search(pattern, line, re.IGNORECASE) for pattern in prose_indicators)
140
+
141
+ if is_code:
142
+ code_line_count += 1
143
+ if is_prose:
144
+ prose_line_count += 1
145
+
146
+ # Need strong evidence for code
147
+ total_non_empty_lines = len([l for l in lines_list if l.strip()])
148
+
149
+ # If more than 2 lines look like code and not many look like prose
150
+ if code_line_count >= 2 and prose_line_count <= code_line_count // 2:
151
+ return True
152
+
153
+ # Special case: single strong code line in short text
154
+ if total_non_empty_lines <= 3 and code_line_count >= 1 and prose_line_count == 0:
155
+ return True
156
+
157
+ # Check for specific code-only patterns
158
+ code_only_patterns = [
159
+ r'^\s*from langchain\.',
160
+ r'^\s*import langchain',
161
+ r'^\s*@tool\b', # Decorator
162
+ r'^\s*agent = create_agent\(',
163
+ r'^\s*result = agent\.invoke\(',
164
+ ]
165
+
166
+ if any(re.search(pattern, text) for pattern in code_only_patterns):
167
+ return True
168
+
169
+ return False
170
+
171
+
172
+ def _looks_like_executable_code(text: str) -> bool:
173
+ """Check if code looks like it could be executed"""
174
+ # First check if it's actually code (not formatted text)
175
+ if not _is_actual_code(text):
176
+ return False
177
+
178
+ # Check for actual Python syntax patterns
179
+ patterns = [
180
+ r'\bdef\s+\w+\s*\([^)]*\)\s*:',
181
+ r'\bclass\s+\w+\s*\(?[^:]*\)?\s*:',
182
+ r'^\s*from\s+\w+\s+import\s+\w+',
183
+ r'^\s*import\s+\w+',
184
+ r'\breturn\b',
185
+ r'\bprint\(',
186
+ r'^\s*\w+\s*=\s*[^=\n]+$', # Variable assignment
187
+ ]
188
+
189
+ lines = text.split('\n')
190
+ executable_lines = 0
191
+
192
+ for line in lines:
193
+ line = line.strip()
194
+ if not line or line.startswith('#') or line.startswith('"""'):
195
+ continue
196
+ if any(re.search(pattern, line) for pattern in patterns):
197
+ executable_lines += 1
198
+
199
+ # Need at least 2 executable lines or 1 strong executable line
200
+ return executable_lines >= 2 or (
201
+ executable_lines >= 1 and len([l for l in lines if l.strip()]) <= 3
202
+ )
203
+
204
+
205
+ def chunk_document(
206
+ raw_text: str,
207
+ source_name: str,
208
+ source_url: Optional[str] = None,
209
+ ) -> List[Dict]:
210
+ """
211
+ Chunk documentation text containing headings, prose, and code examples.
212
+
213
+ Design goals:
214
+ - Preserve document hierarchy
215
+ - Separate prose vs code
216
+ - Recover code even if Markdown fences are lost
217
+ - Deterministic chunk IDs
218
+ """
219
+
220
+ chunks: List[Dict] = []
221
+
222
+ heading_stack: List[str] = []
223
+ current_heading: Optional[str] = None
224
+ current_heading_level: Optional[int] = None
225
+
226
+ buffer: List[str] = []
227
+
228
+ code_block = False
229
+ code_language: Optional[str] = None
230
+ code_lines: List[str] = []
231
+
232
+ lines = raw_text.splitlines()
233
+ chunk_index = 0
234
+ line_cursor = 0
235
+
236
+ def heading_path() -> Optional[str]:
237
+ return " > ".join(heading_stack) if heading_stack else None
238
+
239
+ def flush_text(start_line: int, end_line: int):
240
+ nonlocal buffer, chunk_index
241
+ if not buffer:
242
+ return
243
+
244
+ text = "\n".join(buffer).strip()
245
+ buffer = []
246
+
247
+ if not text:
248
+ return
249
+
250
+ lines_local = text.splitlines()
251
+
252
+ # 🔹 Recover unfenced code blocks - use stricter heuristic
253
+ # Only mark as code if it's very clearly code
254
+ if _looks_like_code_block(lines_local) and len(text) > 30:
255
+ # Double-check: make sure it doesn't look like prose
256
+ looks_like_prose = any(word in text.lower() for word in
257
+ ['the', 'and', 'that', 'this', 'with', 'for', 'are', 'is', 'was'])
258
+
259
+ if not looks_like_prose:
260
+ chunks.append(
261
+ {
262
+ "chunk_id": _hash_id(text, "doc_code"),
263
+ "source": "documentation",
264
+ "source_name": source_name,
265
+ "source_url": source_url,
266
+ "language": "python",
267
+ "chunk_type": "code",
268
+ "content": text,
269
+ "chunk_index": chunk_index,
270
+ "metadata": {
271
+ "heading": current_heading,
272
+ "heading_level": current_heading_level,
273
+ "heading_path": heading_path(),
274
+ "line_start": start_line,
275
+ "line_end": end_line,
276
+ "inferred_block": True,
277
+ },
278
+ }
279
+ )
280
+ chunk_index += 1
281
+ return
282
+
283
+ # Default to text
284
+ chunks.append(
285
+ {
286
+ "chunk_id": _hash_id(text, "doc_text"),
287
+ "source": "documentation",
288
+ "source_name": source_name,
289
+ "source_url": source_url,
290
+ "language": "markdown",
291
+ "chunk_type": "text",
292
+ "content": text,
293
+ "chunk_index": chunk_index,
294
+ "metadata": {
295
+ "heading": current_heading,
296
+ "heading_level": current_heading_level,
297
+ "heading_path": heading_path(),
298
+ "line_start": start_line,
299
+ "line_end": end_line,
300
+ },
301
+ }
302
+ )
303
+ chunk_index += 1
304
+
305
+ def flush_code(start_line: int, end_line: int):
306
+ nonlocal code_lines, code_language, chunk_index
307
+ if not code_lines:
308
+ return
309
+
310
+ code = "\n".join(code_lines)
311
+ code_lines = []
312
+
313
+ # Check if this is actually code or just formatted text
314
+ is_actual_code = _is_actual_code(code)
315
+
316
+ if is_actual_code:
317
+ chunks.append(
318
+ {
319
+ "chunk_id": _hash_id(code, "doc_code"),
320
+ "source": "documentation",
321
+ "source_name": source_name,
322
+ "source_url": source_url,
323
+ "language": code_language or "unknown",
324
+ "chunk_type": "code",
325
+ "content": code,
326
+ "chunk_index": chunk_index,
327
+ "metadata": {
328
+ "heading": current_heading,
329
+ "heading_level": current_heading_level,
330
+ "heading_path": heading_path(),
331
+ "fenced_block": True,
332
+ "line_start": start_line,
333
+ "line_end": end_line,
334
+ "looks_executable": _looks_like_executable_code(code),
335
+ },
336
+ }
337
+ )
338
+ else:
339
+ # It's formatted text, not actual code
340
+ chunks.append(
341
+ {
342
+ "chunk_id": _hash_id(code, "doc_text"),
343
+ "source": "documentation",
344
+ "source_name": source_name,
345
+ "source_url": source_url,
346
+ "language": "markdown",
347
+ "chunk_type": "text",
348
+ "content": code,
349
+ "chunk_index": chunk_index,
350
+ "metadata": {
351
+ "heading": current_heading,
352
+ "heading_level": current_heading_level,
353
+ "heading_path": heading_path(),
354
+ "line_start": start_line,
355
+ "line_end": end_line,
356
+ "was_fenced_block": True, # Note: was in ``` but isn't code
357
+ },
358
+ }
359
+ )
360
+
361
+ chunk_index += 1
362
+ code_language = None
363
+
364
+ buffer_start_line = 0
365
+ code_start_line = 0
366
+
367
+ for i, line in enumerate(lines):
368
+ line_cursor = i + 1
369
+
370
+ # ---- Heading detection ----
371
+ m = re.match(r"^(#{2,6})\s+(.*)", line)
372
+ if not code_block and m:
373
+ flush_text(buffer_start_line, line_cursor - 1)
374
+
375
+ level = len(m.group(1))
376
+ title = m.group(2).strip()
377
+
378
+ # Maintain heading stack
379
+ heading_stack[:] = heading_stack[: level - 2]
380
+ heading_stack.append(title)
381
+
382
+ current_heading = title
383
+ current_heading_level = level
384
+ buffer_start_line = line_cursor
385
+ continue
386
+
387
+ # ---- Code fence detection ----
388
+ if line.strip().startswith("```"):
389
+ if not code_block:
390
+ flush_text(buffer_start_line, line_cursor - 1)
391
+ code_block = True
392
+ code_language = line.strip().replace("```", "").strip() or None
393
+ code_start_line = line_cursor + 1
394
+ else:
395
+ code_block = False
396
+ flush_code(code_start_line, line_cursor - 1)
397
+ buffer_start_line = line_cursor + 1
398
+ continue
399
+
400
+ if code_block:
401
+ code_lines.append(line)
402
+ else:
403
+ if not buffer:
404
+ buffer_start_line = line_cursor
405
+ buffer.append(line)
406
+
407
+ flush_text(buffer_start_line, line_cursor)
408
+ flush_code(code_start_line, line_cursor)
409
+
410
+ return chunks
411
+
412
+
413
+ def wrap_doc_chunks(doc_chunks: List[dict]) -> List[CodeChunk]:
414
+ """
415
+ Adapter: convert doc_chunker output (dict)
416
+ into CodeChunk(documentation).
417
+ Does NOT affect core doc_chunker parsing logic.
418
+ """
419
+ wrapped: List[CodeChunk] = []
420
+
421
+ for d in doc_chunks:
422
+ wrapped.append(
423
+ CodeChunk(
424
+ chunk_id=d["chunk_id"],
425
+ file_path=d["source_name"],
426
+ language=d.get("language", "markdown"),
427
+ chunk_type="documentation",
428
+ code=d["content"],
429
+ ast=ChunkAST(
430
+ symbol_type="documentation",
431
+ name=d.get("metadata", {}).get("heading"),
432
+ parent=d.get("metadata", {}).get("heading_path"),
433
+ ),
434
+ span=ChunkSpan(
435
+ start_line=d.get("metadata", {}).get("line_start"),
436
+ end_line=d.get("metadata", {}).get("line_end"),
437
+ ),
438
+ hierarchy=ChunkHierarchy(
439
+ is_primary=True,
440
+ is_extracted=True,
441
+ ),
442
+ metadata=d.get("metadata", {}),
443
+ )
444
+ )
445
+
446
+ return wrapped
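A quick way to exercise `chunk_document` and `wrap_doc_chunks` end to end. The sample Markdown is assembled line by line only so the fenced block stays readable here, and the source name is illustrative:

```python
from scripts.core.ingestion.doc_chunker import chunk_document, wrap_doc_chunks

sample_md = "\n".join([
    "## Quickstart",
    "Install the package, then run the snippet below.",
    "```python",
    "from mylib import hello",
    "hello()",
    "```",
])

raw = chunk_document(sample_md, source_name="quickstart.md")
for c in raw:
    # Expect one "text" chunk and one "code" chunk under the "Quickstart" heading.
    print(c["chunk_type"], c["metadata"]["heading"], len(c["content"]))

wrapped = wrap_doc_chunks(raw)  # -> List[CodeChunk] with chunk_type="documentation"
print(wrapped[0].ast.name, wrapped[0].span.start_line)
```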
scripts/core/ingestion/generate_data.py ADDED
@@ -0,0 +1,658 @@
1
+ """
2
+ Positive Pairs and Triplets Generator for Training Data
3
+
4
+ This module generates positive pairs and triplets from code chunks for
5
+ contrastive learning and similarity-based model training.
6
+
7
+ ARCHITECTURE POSITION:
8
+ - Training Data Generator: Creates pairs/triplets from code chunks
9
+ - Question Generator: Creates natural language queries for code
10
+ - Variance Generator: Creates multiple variations of pairs
11
+
12
+ KEY FEATURES:
13
+ 1. Positive Pairs: (question, code) with 4-5 variations per sample
14
+ 2. Triplets: (anchor_question, positive_code, negative_code)
15
+ 3. Global ID tracking via chunk_id
16
+ 4. Supports code-to-question and question-to-code mappings
17
+
18
+ OUTPUT FORMATS:
19
+ Positive Pairs:
20
+ {
+ "document_id": "<chunk_id>",
+ "variations": [
+ {"anchor": "How to create a state graph with conditional edges?", "positive": "<code snippet>"},
+ ...
+ ],
+ "framework": "<framework name>"
+ }
26
+
27
+ Triplets:
28
+ {
+ "document_id": "<chunk_id>",
+ "anchor": "How to create a reusable prompt template?",
+ "positive": "<relevant code>",
+ "negative": "<irrelevant code>",
+ "framework": "<framework name>"
+ }
35
+
36
+ USAGE:
37
+ from scripts.core.ingestion.generate_data import generate_pairs_and_triplets
38
+
39
+ pairs, triplets = generate_pairs_and_triplets(
40
+ chunks_path="data/processed/chunks/chunks.jsonl",
41
+ output_dir="data/processed/training",
42
+ num_pairs=100,
43
+ variance=5
44
+ )
45
+ """
46
+
47
+ import json
48
+ import random
49
+ import hashlib
50
+ from pathlib import Path
51
+ from typing import List, Dict, Any, Optional, Tuple
52
+ from dataclasses import dataclass, field, asdict
53
+
54
+
55
+ @dataclass
56
+ class PositivePairVariation:
57
+ """A single anchor-positive variation."""
58
+ anchor: str # Question (natural language query)
59
+ positive: str # Code snippet
60
+
61
+
62
+ @dataclass
63
+ class PositivePair:
64
+ """A positive pair document with multiple anchor-positive variations.
65
+
66
+ Format:
67
+ {
68
+ "document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
69
+ "variations": [
70
+ {"anchor": "How does async aadd_documents work in Python?", "positive": "<code>"},
71
+ {"anchor": "What is the implementation of aadd_documents?", "positive": "<code>"},
72
+ {"anchor": "How to implement async aadd_documents?", "positive": "<code>"},
73
+ {"anchor": "Show the async aadd_documents code", "positive": "<code>"},
74
+ {"anchor": "Explain async aadd_documents function", "positive": "<code>"}
75
+ ],
76
+ "framework": "crewai"
77
+ }
78
+ """
79
+ document_id: str # Original chunk_id
80
+ variations: List[PositivePairVariation] # List of (anchor, positive) pairs
81
+ framework: str # Framework name from file path
82
+
83
+
84
+ @dataclass
85
+ class Triplet:
86
+ """A triplet for contrastive learning.
87
+
88
+ Format:
89
+ {
90
+ "document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
91
+ "anchor": "Best practices for async aadd_documents",
92
+ "positive": "async def aadd_documents(...)",
93
+ "negative": "async def async_agent(self):...",
94
+ "framework": "crewai"
95
+ }
96
+ """
97
+ document_id: str # Original chunk_id
98
+ anchor: str # Question (natural language query)
99
+ positive: str # Relevant code snippet
100
+ negative: str # Irrelevant/different code snippet
101
+ framework: str # Framework name from file path
102
+
103
+
104
+ # Question templates for different code patterns - IMPROVED for cleaner questions
105
+ QUESTION_TEMPLATES = {
106
+ "class": [
107
+ "How does the {name} class work in Python?",
108
+ "What is the implementation of the {name} class?",
109
+ "How to create a {name} class?",
110
+ "Show me the {name} class implementation",
111
+ "Explain the {name} class structure",
112
+ ],
113
+ "function": [
114
+ "How does {name} function work in Python?",
115
+ "What is the implementation of {name}?",
116
+ "How to implement the {name} function?",
117
+ "Show the code for {name} function",
118
+ "Explain how {name} works",
119
+ ],
120
+ "method": [
121
+ "How does the {name} method work in Python?",
122
+ "What is the implementation of {name} method?",
123
+ "How to implement the {name} method?",
124
+ "Show me the {name} method code",
125
+ "Explain the {name} method",
126
+ ],
127
+ "async_function": [
128
+ "How does async {name} work in Python?",
129
+ "What is the async implementation of {name}?",
130
+ "How to implement async {name}?",
131
+ "Show the async {name} code",
132
+ "Explain async {name} function",
133
+ ],
134
+ "module": [
135
+ "How to implement {name} module?",
136
+ "What's the structure of {name}?",
137
+ "Show the {name} module implementation",
138
+ "Explain the {name} module",
139
+ "How does {name} module work?",
140
+ ],
141
+ "workflow": [
142
+ "How to create a {name} workflow?",
143
+ "What's the pattern for {name}?",
144
+ "Show the {name} workflow implementation",
145
+ "Explain the {name} workflow",
146
+ "How does the {name} workflow work?",
147
+ ],
148
+ }
149
+
150
+ # Variance templates to create multiple questions for the same code
151
+ VARIANCE_TEMPLATES = [
152
+ "How to {action}?",
153
+ "What's the code for {action}?",
154
+ "Show me how to {action}",
155
+ "Implement {action}",
156
+ "Write code that {action}",
157
+ ]
158
+
159
+
160
+ def extract_code_context(code: str, ast_info: Dict, file_path: str) -> Dict[str, str]:
161
+ """Extract contextual information from code for question generation."""
162
+ context = {
163
+ "name": ast_info.get("name", "unknown"),
164
+ "parent": ast_info.get("parent", ""),
165
+ "symbol_type": ast_info.get("symbol_type", "unknown"),
166
+ "docstring": ast_info.get("docstring", ""),
167
+ "file_name": Path(file_path).stem if file_path else "unknown",
168
+ }
169
+
170
+ # Extract purpose/description from docstring or code patterns
171
+ if context["docstring"]:
172
+ # Use first sentence of docstring as description
173
+ desc = context["docstring"].split(".")[0].strip()
174
+ context["description"] = desc[:100] if len(desc) > 100 else desc
175
+ else:
176
+ # Generate description from code patterns
177
+ context["description"] = _infer_description(code, context["name"])
178
+
179
+ context["purpose"] = context["description"].lower()
180
+
181
+ return context
182
+
183
+
184
+ def _infer_description(code: str, name: str) -> str:
185
+ """Infer a description from code patterns when no docstring exists."""
186
+ code_lower = code.lower()
187
+
188
+ # Common patterns
189
+ if "stategraph" in code_lower or "workflow" in code_lower:
190
+ return f"building a stateful workflow"
191
+ elif "agent" in code_lower:
192
+ return f"creating an AI agent"
193
+ elif "tool" in code_lower or "@tool" in code:
194
+ return f"implementing a tool"
195
+ elif "async" in code_lower:
196
+ return f"async operations"
197
+ elif "api" in code_lower or "request" in code_lower:
198
+ return f"API interactions"
199
+ elif "database" in code_lower or "sql" in code_lower:
200
+ return f"database operations"
201
+ elif "parse" in code_lower:
202
+ return f"parsing data"
203
+ elif "format" in code_lower:
204
+ return f"formatting output"
205
+ elif "template" in code_lower:
206
+ return f"creating templates"
207
+ elif "filter" in code_lower:
208
+ return f"filtering data"
209
+ elif "search" in code_lower:
210
+ return f"search functionality"
211
+ elif "create" in code_lower or "build" in code_lower:
212
+ return f"building {name}"
213
+ else:
214
+ return f"implementing {name}"
215
+
216
+
217
+ def generate_question(code: str, ast_info: Dict, file_path: str,
218
+ variation_index: int = 0) -> str:
219
+ """Generate a clean natural language question for a code snippet."""
220
+ name = ast_info.get("name", "unknown")
221
+ symbol_type = ast_info.get("symbol_type", "function")
222
+
223
+ # Clean up the name for display
224
+ clean_name = name.replace("_", " ") if name else "this code"
225
+
226
+ # Check if it's async
227
+ is_async = code.strip().startswith("async ") or "async def" in code[:100]
228
+
229
+ # Determine template category
230
+ if is_async and symbol_type in ("function", "method"):
231
+ template_category = "async_function"
232
+ elif symbol_type in QUESTION_TEMPLATES:
233
+ template_category = symbol_type
234
+ elif "graph" in code.lower() or "workflow" in code.lower() or "state" in code.lower():
235
+ template_category = "workflow"
236
+ else:
237
+ template_category = "function"
238
+
239
+ templates = QUESTION_TEMPLATES[template_category]
240
+
241
+ # Select template based on variation index
242
+ template_idx = variation_index % len(templates)
243
+ template = templates[template_idx]
244
+
245
+ # Fill in template with clean name
246
+ question = template.format(name=name)
247
+
248
+ return question
249
+
250
+
251
+ def generate_question_variations(code: str, ast_info: Dict, file_path: str,
252
+ num_variations: int = 5) -> List[str]:
253
+ """Generate multiple unique question variations for a code snippet."""
254
+ questions = []
255
+ seen_questions = set()
256
+
257
+ # Generate primary variations using templates
258
+ for i in range(num_variations):
259
+ q = generate_question(code, ast_info, file_path, variation_index=i)
260
+ q_lower = q.lower()
261
+ if q_lower not in seen_questions:
262
+ questions.append(q)
263
+ seen_questions.add(q_lower)
264
+
265
+ # Return exactly num_variations (templates should provide enough)
266
+ return questions[:num_variations]
267
+
268
+
269
+ def extract_framework(file_path: str) -> str:
270
+ """Extract framework name from file path.
271
+
272
+ Examples:
273
+ 'data/raw/codebases/crewai/...' -> 'crewai'
274
+ 'data/raw/codebases/langgraph/...' -> 'langgraph'
275
+ 'data/processed/repos/langgraph_20260116/...' -> 'langgraph'
276
+ """
277
+ path_lower = file_path.lower()
278
+
279
+ # Known frameworks to detect
280
+ frameworks = [
281
+ "crewai", "langgraph", "langchain", "autogen", "llamaindex",
282
+ "dspy", "haystack", "semantic_kernel", "fastapi", "flask", "django"
283
+ ]
284
+
285
+ for framework in frameworks:
286
+ if framework in path_lower:
287
+ return framework
288
+
289
+ # Try to extract from path structure
290
+ parts = file_path.replace("\\", "/").split("/")
291
+ for part in parts:
292
+ if "codebases" in parts or "repos" in parts:
293
+ # Get the next part after codebases/repos
294
+ try:
295
+ idx = parts.index("codebases") if "codebases" in parts else parts.index("repos")
296
+ if idx + 1 < len(parts):
297
+ framework_part = parts[idx + 1].split("_")[0] # Handle 'langgraph_20260116'
298
+ if framework_part and framework_part not in ["raw", "processed"]:
299
+ return framework_part
300
+ except (ValueError, IndexError):
301
+ pass
302
+
303
+ return "unknown"
304
+
305
+
306
+ def is_semantically_different(chunk1: Dict, chunk2: Dict) -> bool:
307
+ """Check if two chunks are semantically different (good for negative pairs)."""
308
+ # Different symbol types
309
+ type1 = chunk1.get("ast", {}).get("symbol_type", "")
310
+ type2 = chunk2.get("ast", {}).get("symbol_type", "")
311
+
312
+ # Different purposes (check for different keywords)
313
+ code1 = chunk1.get("code", "").lower()
314
+ code2 = chunk2.get("code", "").lower()
315
+
316
+ # Keywords that indicate different functionality
317
+ keywords = [
318
+ "parse", "format", "create", "delete", "update", "read", "write",
319
+ "input", "output", "agent", "tool", "graph", "state", "workflow",
320
+ "template", "filter", "search", "database", "api", "async"
321
+ ]
322
+
323
+ keywords1 = set(k for k in keywords if k in code1)
324
+ keywords2 = set(k for k in keywords if k in code2)
325
+
326
+ # Consider different if keyword overlap is low
327
+ if not keywords1 or not keywords2:
328
+ return type1 != type2
329
+
330
+ overlap = len(keywords1 & keywords2) / len(keywords1 | keywords2)
331
+ return overlap < 0.3
332
+
333
+
334
+ def select_negative_sample(anchor_chunk: Dict, all_chunks: List[Dict],
335
+ max_attempts: int = 50) -> Optional[Dict]:
336
+ """Select a semantically different chunk as negative sample."""
337
+ anchor_id = anchor_chunk.get("chunk_id", "")
338
+
339
+ # Shuffle chunks for random selection
340
+ candidates = [c for c in all_chunks if c.get("chunk_id") != anchor_id]
341
+ random.shuffle(candidates)
342
+
343
+ for candidate in candidates[:max_attempts]:
344
+ if is_semantically_different(anchor_chunk, candidate):
345
+ return candidate
346
+
347
+ # Fallback: return any different chunk
348
+ if candidates:
349
+ return candidates[0]
350
+ return None
351
+
352
+
353
+ def load_chunks(chunks_path: Path) -> List[Dict]:
354
+ """Load chunks from JSONL file."""
355
+ chunks = []
356
+ with open(chunks_path, "r", encoding="utf-8") as f:
357
+ for line in f:
358
+ line = line.strip()
359
+ if line:
360
+ try:
361
+ chunks.append(json.loads(line))
362
+ except json.JSONDecodeError:
363
+ continue
364
+ return chunks
365
+
366
+
367
+ def filter_valid_chunks(chunks: List[Dict], min_code_length: int = 50) -> List[Dict]:
368
+ """Filter chunks that are suitable for training pairs."""
369
+ valid_chunks = []
370
+
371
+ for chunk in chunks:
372
+ code = chunk.get("code", "")
373
+ chunk_type = chunk.get("chunk_type", "")
374
+ ast_info = chunk.get("ast", {})
375
+
376
+ # Skip empty or very short chunks
377
+ if len(code) < min_code_length:
378
+ continue
379
+
380
+ # Skip pure imports or empty modules
381
+ if chunk_type == "imports" or (chunk_type == "module" and not ast_info.get("docstring")):
382
+ symbol_type = ast_info.get("symbol_type", "")
383
+ if symbol_type == "imports":
384
+ continue
385
+
386
+ # Skip __init__ files without content
387
+ if "__init__" in chunk.get("file_path", "") and len(code) < 100:
388
+ continue
389
+
390
+ valid_chunks.append(chunk)
391
+
392
+ return valid_chunks
393
+
394
+
395
+ def generate_positive_pairs(chunks: List[Dict], num_pairs: int = 100,
396
+ variance: int = 5) -> List[PositivePair]:
397
+ """
398
+ Generate positive pairs from chunks with multiple (anchor, positive) variations per document.
399
+
400
+ Output format:
401
+ {
402
+ "document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
403
+ "variations": [
404
+ {"anchor": "How does async aadd_documents work in Python?", "positive": "<code>"},
405
+ {"anchor": "What is the implementation of aadd_documents?", "positive": "<code>"},
406
+ ...
407
+ ],
408
+ "framework": "crewai"
409
+ }
410
+
411
+ Args:
412
+ chunks: List of code chunks
413
+ num_pairs: Number of documents to generate (each with `variance` variations)
414
+ variance: Number of (anchor, positive) variations per document (4-5 recommended)
415
+
416
+ Returns:
417
+ List of PositivePair objects (one per document, each with multiple variations)
418
+ """
419
+ pairs = []
420
+
421
+ # Filter valid chunks
422
+ valid_chunks = filter_valid_chunks(chunks)
423
+
424
+ # Sample chunks if needed
425
+ if len(valid_chunks) > num_pairs:
426
+ selected_chunks = random.sample(valid_chunks, num_pairs)
427
+ else:
428
+ selected_chunks = valid_chunks
429
+
430
+ for chunk in selected_chunks:
431
+ code = chunk.get("code", "")
432
+ ast_info = chunk.get("ast", {})
433
+ file_path = chunk.get("file_path", "")
434
+ document_id = chunk.get("chunk_id", "")
435
+
436
+ # Extract framework from file path
437
+ framework = extract_framework(file_path)
438
+
439
+ # Generate multiple question variations
440
+ anchors = generate_question_variations(code, ast_info, file_path, variance)
441
+
442
+ # Create variations list with (anchor, positive) pairs
443
+ variations = [
444
+ PositivePairVariation(anchor=anchor, positive=code)
445
+ for anchor in anchors
446
+ ]
447
+
448
+ pair = PositivePair(
449
+ document_id=document_id,
450
+ variations=variations,
451
+ framework=framework
452
+ )
453
+ pairs.append(pair)
454
+
455
+ return pairs
456
+
457
+
458
+ def generate_triplets(chunks: List[Dict], num_triplets: int = 100) -> List[Triplet]:
459
+ """
460
+ Generate triplets from chunks (no variations, flat structure).
461
+
462
+ Output format:
463
+ {
464
+ "document_id": "b8bcf898f9644fc3eb9946092f96ca7a9ba8e6ac",
465
+ "anchor": "Best practices for async aadd_documents",
466
+ "positive": "async def aadd_documents(...)",
467
+ "negative": "async def async_agent(self):...",
468
+ "framework": "crewai"
469
+ }
470
+
471
+ Args:
472
+ chunks: List of code chunks
473
+ num_triplets: Number of triplets to generate (100, no variance)
474
+
475
+ Returns:
476
+ List of Triplet objects
477
+ """
478
+ triplets = []
479
+
480
+ # Filter valid chunks
481
+ valid_chunks = filter_valid_chunks(chunks)
482
+
483
+ if len(valid_chunks) < 2:
484
+ return triplets
485
+
486
+ # Sample chunks if needed
487
+ if len(valid_chunks) > num_triplets:
488
+ selected_chunks = random.sample(valid_chunks, num_triplets)
489
+ else:
490
+ selected_chunks = valid_chunks
491
+
492
+ for anchor_chunk in selected_chunks:
493
+ # Find a semantically different chunk as negative
494
+ negative_chunk = select_negative_sample(anchor_chunk, valid_chunks)
495
+
496
+ if negative_chunk is None:
497
+ continue
498
+
499
+ code = anchor_chunk.get("code", "")
500
+ ast_info = anchor_chunk.get("ast", {})
501
+ file_path = anchor_chunk.get("file_path", "")
502
+ document_id = anchor_chunk.get("chunk_id", "")
503
+
504
+ # Extract framework from file path
505
+ framework = extract_framework(file_path)
506
+
507
+ # Generate question for anchor
508
+ question = generate_question(code, ast_info, file_path)
509
+
510
+ triplet = Triplet(
511
+ document_id=document_id,
512
+ anchor=question,
513
+ positive=code,
514
+ negative=negative_chunk.get("code", ""),
515
+ framework=framework
516
+ )
517
+ triplets.append(triplet)
518
+
519
+ return triplets
520
+
521
+
522
+ def export_pairs_jsonl(pairs: List[PositivePair], output_path: Path) -> None:
523
+ """Export positive pairs to JSONL file."""
524
+ output_path.parent.mkdir(parents=True, exist_ok=True)
525
+
526
+ with open(output_path, "w", encoding="utf-8") as f:
527
+ for pair in pairs:
528
+ f.write(json.dumps(asdict(pair), ensure_ascii=False) + "\n")
529
+
530
+ print(f"Exported {len(pairs)} positive pairs to {output_path}")
531
+
532
+ def export_triplets_jsonl(triplets: List[Triplet], output_path: Path) -> None:
533
+ """Export triplets to JSONL file."""
534
+ output_path.parent.mkdir(parents=True, exist_ok=True)
535
+
536
+ with open(output_path, "w", encoding="utf-8") as f:
537
+ for triplet in triplets:
538
+ f.write(json.dumps(asdict(triplet), ensure_ascii=False) + "\n")
539
+
540
+ print(f"Exported {len(triplets)} triplets to {output_path}")
541
+
542
+ def export_pairs_json(pairs: List[PositivePair], output_path: Path) -> None:
543
+ """Export positive pairs to JSON file (list format for easier inspection)."""
544
+ output_path.parent.mkdir(parents=True, exist_ok=True)
545
+
546
+ data = [asdict(p) for p in pairs]
547
+ with open(output_path, "w", encoding="utf-8") as f:
548
+ json.dump(data, f, indent=2, ensure_ascii=False)
549
+
550
+ print(f"Exported {len(pairs)} positive pairs to {output_path}")
551
+
552
+ def export_triplets_json(triplets: List[Triplet], output_path: Path) -> None:
553
+ """Export triplets to JSON file (flat list format)."""
554
+ output_path.parent.mkdir(parents=True, exist_ok=True)
555
+
556
+ data = [asdict(t) for t in triplets]
557
+
558
+ with open(output_path, "w", encoding="utf-8") as f:
559
+ json.dump(data, f, indent=2, ensure_ascii=False)
560
+
564
+ print(f"Exported {len(triplets)} triplets to {output_path}")
565
+
566
+
567
+ def generate_pairs_and_triplets(
568
+ chunks_path: Path,
569
+ output_dir: Path,
570
+ num_pairs: int = 100,
571
+ num_triplets: int = 100,
572
+ variance: int = 5,
573
+ export_format: str = "both" # "jsonl", "json", or "both"
574
+ ) -> Tuple[List[PositivePair], List[Triplet]]:
575
+ """
576
+ Main function to generate positive pairs and triplets from chunks.
577
+
578
+ Args:
579
+ chunks_path: Path to chunks JSONL file
580
+ output_dir: Directory to save output files
581
+ num_pairs: Number of base pairs (will generate num_pairs * variance total)
582
+ num_triplets: Number of triplets (no variance)
583
+ variance: Number of variations per positive pair (4-5)
584
+ export_format: Output format ("jsonl", "json", or "both")
585
+
586
+ Returns:
587
+ Tuple of (pairs, triplets)
588
+ """
589
+ print(f"Loading chunks from {chunks_path}...")
590
+ chunks = load_chunks(chunks_path)
591
+ print(f" Loaded {len(chunks)} chunks")
592
+
593
+ # Generate positive pairs with variance
594
+ print(f"Generating positive pairs (base={num_pairs}, variance={variance})...")
595
+ pairs = generate_positive_pairs(chunks, num_pairs=num_pairs, variance=variance)
596
+ print(f" Generated {len(pairs)} positive pairs")
597
+
598
+ # Generate triplets (no variance)
599
+ print(f"Generating triplets (count={num_triplets})...")
600
+ triplets = generate_triplets(chunks, num_triplets=num_triplets)
601
+ print(f" Generated {len(triplets)} triplets")
602
+
603
+ # Create output directory
604
+ output_dir = Path(output_dir)
605
+ output_dir.mkdir(parents=True, exist_ok=True)
606
+
607
+ # Export based on format
608
+ if export_format in ("jsonl", "both"):
609
+ export_pairs_jsonl(pairs, output_dir / "positive_pairs.jsonl")
610
+ export_triplets_jsonl(triplets, output_dir / "triplets.jsonl")
611
+
612
+ if export_format in ("json", "both"):
613
+ export_pairs_json(pairs, output_dir / "positive_pairs.json")
614
+ export_triplets_json(triplets, output_dir / "triplets.json")
615
+
616
+ # Print summary statistics
617
+ print("Summary Statistics:")
618
+ print(f" Total Positive Pair Documents: {len(pairs)}")
619
+ print(f" Total Variations: {sum(len(p.variations) for p in pairs)}")
620
+ print(f" Total Triplets: {len(triplets)}")
621
+
622
+ return pairs, triplets
623
+
624
+
625
+
626
+ def main():
627
+ """CLI entry point for generating pairs and triplets."""
628
+ import argparse
629
+
630
+ parser = argparse.ArgumentParser(description="Generate positive pairs and triplets from code chunks")
631
+ parser.add_argument("--chunks", "-c", type=str, required=True,
632
+ help="Path to chunks JSONL file")
633
+ parser.add_argument("--output", "-o", type=str, required=True,
634
+ help="Output directory for generated files")
635
+ parser.add_argument("--pairs", "-p", type=int, default=100,
636
+ help="Number of base positive pairs (default: 100)")
637
+ parser.add_argument("--triplets", "-t", type=int, default=100,
638
+ help="Number of triplets (default: 100)")
639
+ parser.add_argument("--variance", "-v", type=int, default=5,
640
+ help="Number of variations per pair (default: 5)")
641
+ parser.add_argument("--format", "-f", type=str, default="both",
642
+ choices=["jsonl", "json", "both"],
643
+ help="Output format (default: both)")
644
+
645
+ args = parser.parse_args()
646
+
647
+ generate_pairs_and_triplets(
648
+ chunks_path=Path(args.chunks),
649
+ output_dir=Path(args.output),
650
+ num_pairs=args.pairs,
651
+ num_triplets=args.triplets,
652
+ variance=args.variance,
653
+ export_format=args.format
654
+ )
655
+
656
+
657
+ if __name__ == "__main__":
658
+ main()
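Besides the CLI entry point above, the generator can be called directly. A minimal programmatic sketch, assuming the module is importable as `scripts.core.ingestion.generate_data` and that a chunks JSONL file already exists at the path shown (both are assumptions about the local layout):

```python
from pathlib import Path
from scripts.core.ingestion.generate_data import generate_pairs_and_triplets

pairs, triplets = generate_pairs_and_triplets(
    chunks_path=Path("data/processed/chunks/chunks.jsonl"),  # produced by the ingestion pipeline
    output_dir=Path("data/processed/training"),
    num_pairs=50,
    num_triplets=50,
    variance=5,
    export_format="jsonl",
)
print(f"{len(pairs)} pair documents, {len(triplets)} triplets")
```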
scripts/core/ingestion/hierarchical_chunker.py ADDED
@@ -0,0 +1,182 @@
1
+ """
2
+ Hierarchical chunk coordinator - Orchestrates AST and Tree-sitter chunking.
3
+
4
+ This module serves as the coordination layer that integrates AST (semantic)
5
+ and Tree-sitter (syntactic) chunking. It ensures that:
6
+ 1. AST chunks get precise byte spans from Tree-sitter
7
+ 2. Hierarchy relationships are preserved across both sources
8
+ 3. Parent-child relationships are correctly established
9
+ 4. All chunks have consistent metadata and structure
10
+
11
+ ARCHITECTURE POSITION:
12
+ - Coordination Layer: Integrates AST and Tree-sitter
13
+ - Relationship Manager: Maintains parent-child links
14
+ - Quality Enforcer: Ensures consistent chunk structure
15
+
16
+ KEY RESPONSIBILITIES:
17
+ 1. Enrich AST chunks with Tree-sitter byte spans
18
+ 2. Build and verify hierarchy relationships
19
+ 3. Create secondary chunks for extracted content
20
+ 4. Ensure type safety across all chunk operations
21
+
22
+ FLOW:
23
+ File → AST chunks (semantic) + Tree-sitter chunks (spans)
24
+ → HierarchicalChunker.enrich_and_link()
25
+ → Final chunks with hierarchy + precise spans
26
+
27
+ USAGE:
28
+ chunker = HierarchicalChunker()
29
+ chunks = chunker.chunk_file(Path("file.py"))
30
+ """
31
+
32
+ from pathlib import Path
33
+ from typing import List, Dict, Optional, Tuple, Set, cast
34
+ import uuid
35
+
36
+ from .ast_chunker import extract_ast_chunks
37
+ from .ts_chunker import extract_ts_chunks
38
+ from .chunk_schema import CodeChunk, ChunkHierarchy, ChunkType
39
+
40
+
41
+ class HierarchicalChunker:
42
+ def __init__(self):
43
+ self.chunks_by_id: Dict[str, CodeChunk] = {}
44
+ self.imports_by_file: Dict[str, str] = {} # Track imports chunks by file
45
+
46
+ # ---------------- helpers ----------------
47
+
48
+ def _build_ts_span_map(
49
+ self, ts_chunks: List[CodeChunk]
50
+ ) -> Dict[Tuple[int, int], CodeChunk]:
51
+ span_map: Dict[Tuple[int, int], CodeChunk] = {}
52
+
53
+ for c in ts_chunks:
54
+ if c.span.start_line is None or c.span.end_line is None:
55
+ continue
56
+
57
+ span_map[(c.span.start_line, c.span.end_line)] = c
58
+
59
+ return span_map
60
+
61
+ def _enrich_spans_with_tree_sitter(
62
+ self, ast_chunks: List[CodeChunk], ts_chunks: List[CodeChunk]
63
+ ) -> List[CodeChunk]:
64
+ """Enrich AST chunks with Tree-sitter precise byte spans"""
65
+ ts_span_map = self._build_ts_span_map(ts_chunks)
66
+
67
+ for ast_chunk in ast_chunks:
68
+ if ast_chunk.span.start_line is not None and ast_chunk.span.end_line is not None:
69
+ key: Tuple[int, int] = (ast_chunk.span.start_line, ast_chunk.span.end_line)
70
+ ts_match = ts_span_map.get(key)
71
+
72
+ if ts_match:
73
+ # Update byte spans from Tree-sitter
74
+ ast_chunk.span.start_byte = ts_match.span.start_byte
75
+ ast_chunk.span.end_byte = ts_match.span.end_byte
76
+
77
+ return ast_chunks
78
+
79
+ def _preserve_hierarchy_relationships(self, all_chunks: List[CodeChunk]) -> None:
80
+ """Ensure all hierarchy relationships are preserved with proper typing"""
81
+ # Build mapping for quick lookup
82
+ for chunk in all_chunks:
83
+ self.chunks_by_id[chunk.chunk_id] = chunk
84
+
85
+ # Verify and fix parent-child relationships with type safety
86
+ for chunk in all_chunks:
87
+ # Ensure hierarchy exists
88
+ if not hasattr(chunk, 'hierarchy') or chunk.hierarchy is None:
89
+ chunk.hierarchy = ChunkHierarchy()
90
+
91
+ if chunk.hierarchy.parent_id:
92
+ parent = self.chunks_by_id.get(chunk.hierarchy.parent_id)
93
+ if parent:
94
+ # Ensure parent has hierarchy
95
+ if not hasattr(parent, 'hierarchy') or parent.hierarchy is None:
96
+ parent.hierarchy = ChunkHierarchy()
97
+
98
+ # Add child to parent with type safety
99
+ if chunk.chunk_id not in parent.hierarchy.children_ids:
100
+ parent.hierarchy.children_ids.append(chunk.chunk_id)
101
+
102
+ def _create_secondary_chunks_for_extracted_content(
103
+ self, ast_chunks: List[CodeChunk]
104
+ ) -> List[CodeChunk]:
105
+ """Create secondary chunks for extracted content (if needed)"""
106
+ secondary_chunks: List[CodeChunk] = []
107
+
108
+ # Currently, our AST chunker creates everything as primary
109
+ # This method is for future extensions
110
+ return secondary_chunks
111
+
112
+ def _update_hierarchy_relationships(self, all_chunks: List[CodeChunk]) -> None:
113
+ """Update parent-child relationships based on AST parent field with proper typing"""
114
+ # Create mapping from (name, type) to chunk_id
115
+ chunk_map: Dict[Tuple[Optional[str], ChunkType], str] = {}
116
+
117
+ for chunk in all_chunks:
118
+ if chunk.ast and chunk.ast.name:
119
+ key = (chunk.ast.name, chunk.chunk_type)
120
+ chunk_map[key] = chunk.chunk_id
121
+
122
+ # Update parent relationships with type safety
123
+ for chunk in all_chunks:
124
+ # Ensure hierarchy exists
125
+ if not hasattr(chunk, 'hierarchy') or chunk.hierarchy is None:
126
+ chunk.hierarchy = ChunkHierarchy()
127
+
128
+ if chunk.ast and chunk.ast.parent and chunk.ast.parent != "None":
129
+ # Determine parent type based on current chunk type
130
+ parent_type: ChunkType = "class" if chunk.chunk_type == "method" else "module"
131
+
132
+ # Try to find parent chunk
133
+ parent_key = (chunk.ast.parent, parent_type)
134
+ parent_id = chunk_map.get(parent_key)
135
+
136
+ if parent_id and parent_id in self.chunks_by_id:
137
+ chunk.hierarchy.parent_id = parent_id
138
+
139
+ # Add this chunk to parent's children with type safety
140
+ parent_chunk = self.chunks_by_id.get(parent_id)
141
+ if parent_chunk:
142
+ # Ensure parent has hierarchy
143
+ if not hasattr(parent_chunk, 'hierarchy') or parent_chunk.hierarchy is None:
144
+ parent_chunk.hierarchy = ChunkHierarchy()
145
+
146
+ if chunk.chunk_id not in parent_chunk.hierarchy.children_ids:
147
+ parent_chunk.hierarchy.children_ids.append(chunk.chunk_id)
148
+
149
+ # Set depth based on parent relationships
150
+ for chunk in all_chunks:
151
+ if chunk.hierarchy.parent_id:
152
+ parent = self.chunks_by_id.get(chunk.hierarchy.parent_id)
153
+ if parent and hasattr(parent, 'hierarchy') and parent.hierarchy:
154
+ chunk.hierarchy.depth = parent.hierarchy.depth + 1
155
+
156
+ # ---------------- public API ----------------
157
+
158
+ def chunk_file(self, file_path: Path) -> List[CodeChunk]:
159
+ self.chunks_by_id.clear()
160
+ self.imports_by_file.clear()
161
+
162
+ try:
163
+ ast_chunks = extract_ast_chunks(file_path)
164
+ except SyntaxError:
165
+ ast_chunks = []
166
+
167
+ # Get Tree-sitter chunks for byte-level precision
168
+ ts_chunks = extract_ts_chunks(file_path)
169
+
170
+ # Enrich AST chunks with Tree-sitter byte spans
171
+ enriched_chunks = self._enrich_spans_with_tree_sitter(ast_chunks, ts_chunks)
172
+
173
+ # Update hierarchy relationships with proper typing
174
+ self._update_hierarchy_relationships(enriched_chunks)
175
+
176
+ # Preserve any existing relationships
177
+ self._preserve_hierarchy_relationships(enriched_chunks)
178
+
179
+ # Create any needed secondary chunks
180
+ secondary_chunks = self._create_secondary_chunks_for_extracted_content(enriched_chunks)
181
+
182
+ return enriched_chunks + secondary_chunks
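A minimal usage sketch for the coordinator; the target file path is illustrative and the import assumes the `scripts.core.ingestion` package layout:

```python
from pathlib import Path
from scripts.core.ingestion.hierarchical_chunker import HierarchicalChunker

chunker = HierarchicalChunker()
chunks = chunker.chunk_file(Path("scripts/core/ingestion/chunk.py"))

for c in chunks:
    # Each chunk carries its semantic type, AST name, and hierarchy links.
    print(c.chunk_type, c.ast.name, "parent:", c.hierarchy.parent_id, "depth:", c.hierarchy.depth)
```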
scripts/core/ingestion/ingest.py ADDED
@@ -0,0 +1,380 @@
1
+ """
2
+ Git Repository Crawler - Intelligent repository cloning and file listing system.
3
+
4
+ This module serves as the entry point for ingesting Git repositories into our
5
+ dataset pipeline. It handles cloning, file listing, metadata extraction, and
6
+ statistics generation with multiple strategies for different use cases.
7
+
8
+ ARCHITECTURE POSITION:
9
+ - Ingestion Layer: Entry point for Git repositories
10
+ - File Discovery: Finds and filters repository files
11
+ - Metadata Collector: Gathers repo-level information
12
+
13
+ KEY FEATURES:
14
+ 1. Multi-strategy file listing (fast/rich/smart)
15
+ 2. Intelligent binary detection and filtering
16
+ 3. Repository metadata extraction with git history
17
+ 4. Agentic framework detection (through RepoMetadataExtractor)
18
+ 5. Repository statistics and cleanup utilities
19
+
20
+ DATA FLOW:
21
+ Repository URL → Clone → File Discovery → Filtering → File Info/Metadata → Output
22
+
23
+ USE CASES:
24
+ - FAST: When only file paths are needed (performance-critical)
25
+ - RICH: When full metadata is required (dataset building)
26
+ - SMART: Auto-chooses based on needs (balanced approach)
27
+
28
+ USAGE:
29
+ crawler = GitCrawler()
30
+ repo_path = crawler.clone_repository("https://github.com/org/repo.git")
31
+ files_fast = crawler.list_files_fast(repo_path, extensions={'.py'})
32
+ files_rich, stats = crawler.list_files_with_info(repo_path)
33
+ """
34
+
35
+ import subprocess
36
+ from pathlib import Path
37
+ from typing import List, Optional, Set, Dict, Tuple, Union, cast
38
+ import os
39
+ from dataclasses import dataclass
40
+ import time
41
+ from .repo_metadata import RepoMetadataExtractor
42
+
43
+
44
+ @dataclass
45
+ class RepoFileInfo:
46
+ """Lightweight file info - optional for when you need it"""
47
+ path: Path
48
+ relative_path: str
49
+ size: int = 0
50
+ extension: str = ""
51
+ is_binary: Optional[bool] = None
52
+
53
+
54
+ class GitCrawler:
55
+ """
56
+ Optimized Git crawler with fast listing + optional rich info
57
+ """
58
+
59
+ def __init__(self, cache_dir: Path = Path("data/raw/repos")):
60
+ self.cache_dir = cache_dir
61
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
62
+
63
+ # -------- CORE: Cloning (same for both) --------
64
+ def clone_repository(self, repo_url: str) -> Optional[Path]:
65
+ """Clone a repository if not already cloned"""
66
+ repo_name = self._extract_repo_name(repo_url)
67
+ repo_path = self.cache_dir / repo_name
68
+
69
+ if repo_path.exists():
70
+ print(f"Repository already exists: {repo_path}")
71
+ return repo_path
72
+
73
+ print(f"Cloning {repo_url}...")
74
+ cmd = ["git", "clone", "--depth", "1", repo_url, str(repo_path)]
75
+
76
+ try:
77
+ start_time = time.time()
78
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True)
79
+ elapsed = time.time() - start_time
80
+ print(f"Cloned to {repo_path} ({elapsed:.1f}s)")
81
+ return repo_path
82
+ except subprocess.CalledProcessError as e:
83
+ print(f"Failed to clone {repo_url}: {e.stderr}")
84
+ return None
85
+
86
+ def extract_enhanced_metadata(self, repo_path: Path) -> Dict:
87
+ """
88
+ Extract enhanced metadata including agentic framework detection
89
+ """
90
+ extractor = RepoMetadataExtractor(repo_path)
91
+ return extractor.extract_comprehensive_metadata()
92
+
93
+ # -------- OPTION 1: FAST listing (old style) --------
94
+ def list_files_fast(self, repo_path: Path,
95
+ extensions: Optional[Set[str]] = None,
96
+ exclude_dirs: Optional[Set[str]] = None) -> List[Path]:
97
+ """
98
+ FAST file listing - returns just Path objects
99
+
100
+ Use when you need speed and don't need metadata
101
+ """
102
+ if exclude_dirs is None:
103
+ exclude_dirs = {'.git', '__pycache__', 'node_modules',
104
+ 'build', 'dist', '.venv', 'venv'}
105
+
106
+ files = []
107
+
108
+ for root, dirs, filenames in os.walk(repo_path):
109
+ # Filter directories
110
+ dirs[:] = [d for d in dirs if d not in exclude_dirs and not d.startswith('.')]
111
+
112
+ for filename in filenames:
113
+ if filename.startswith('.'):
114
+ continue
115
+
116
+ file_path = Path(root) / filename
117
+
118
+ # Filter by extension if specified
119
+ if extensions:
120
+ if file_path.suffix.lower() in extensions:
121
+ files.append(file_path)
122
+ else:
123
+ files.append(file_path)
124
+
125
+ return sorted(files) # Sort for consistency
126
+
127
+ # -------- OPTION 2: RICH listing with metadata --------
128
+ def list_files_with_info(self, repo_path: Path,
129
+ extensions: Optional[Set[str]] = None,
130
+ exclude_dirs: Optional[Set[str]] = None,
131
+ skip_binary: bool = True) -> Tuple[List[RepoFileInfo], Dict]:
132
+ """
133
+ RICH file listing - returns file info + statistics
134
+
135
+ Use when you need metadata for better chunking
136
+ """
137
+ if exclude_dirs is None:
138
+ exclude_dirs = {'.git', '__pycache__', 'node_modules',
139
+ 'build', 'dist', '.venv', 'venv', '.env'}
140
+
141
+ file_infos = []
142
+ stats = {
143
+ "total_files": 0,
144
+ "total_size": 0,
145
+ "by_extension": {},
146
+ "binary_files": 0,
147
+ "text_files": 0
148
+ }
149
+
150
+ for root, dirs, filenames in os.walk(repo_path):
151
+ # Filter directories
152
+ dirs[:] = [d for d in dirs if d not in exclude_dirs and not d.startswith('.')]
153
+
154
+ for filename in filenames:
155
+ if filename.startswith('.'):
156
+ continue
157
+
158
+ file_path = Path(root) / filename
159
+ relative_path = file_path.relative_to(repo_path)
160
+ extension = file_path.suffix.lower()
161
+
162
+ # Filter by extension
163
+ if extensions and extension not in extensions:
164
+ continue
165
+
166
+ try:
167
+ size = file_path.stat().st_size
168
+ is_binary = None
169
+
170
+ # Check if binary (only when needed)
171
+ if skip_binary:
172
+ is_binary = self._is_binary_file(file_path)
173
+ if is_binary:
174
+ stats["binary_files"] += 1
175
+ continue # Skip binary files
176
+ else:
177
+ stats["text_files"] += 1
178
+
179
+ # Create file info
180
+ file_info = RepoFileInfo(
181
+ path=file_path,
182
+ relative_path=str(relative_path),
183
+ size=size,
184
+ extension=extension,
185
+ is_binary=is_binary
186
+ )
187
+
188
+ file_infos.append(file_info)
189
+
190
+ # Update stats
191
+ stats["total_files"] += 1
192
+ stats["total_size"] += size
193
+ stats["by_extension"][extension] = stats["by_extension"].get(extension, 0) + 1
194
+
195
+ except (OSError, PermissionError) as e:
196
+ print(f"[WARNING] Could not read {file_path}: {e}")
197
+ continue
198
+
199
+ # Sort by relative path
200
+ file_infos.sort(key=lambda x: x.relative_path)
201
+
202
+ return file_infos, stats
203
+
204
+ # -------- OPTION 3: SMART listing (auto-chooses) --------
205
+ def list_files(self, repo_path: Path,
206
+ extensions: Optional[Set[str]] = None,
207
+ exclude_dirs: Optional[Set[str]] = None,
208
+ rich_metadata: bool = False,
209
+ skip_binary: bool = True) -> Union[List[Path], Tuple[List[RepoFileInfo], Dict]]:
210
+ """
211
+ SMART file listing - chooses method based on needs
212
+
213
+ Args:
214
+ rich_metadata: True for RepoFileInfo + stats, False for just Paths
215
+ skip_binary: Skip binary files (only when rich_metadata=True)
216
+ """
217
+ if rich_metadata:
218
+ return self.list_files_with_info(repo_path, extensions, exclude_dirs, skip_binary)
219
+ else:
220
+ return self.list_files_fast(repo_path, extensions, exclude_dirs)
221
+
222
+ # -------- HELPER: Get README --------
223
+ def get_readme_content(self, repo_path: Path) -> Optional[str]:
224
+ """Quickly get README content if exists"""
225
+ for pattern in ['README.md', 'README.rst', 'README.txt', 'README', 'readme.md']:
226
+ readme_path = repo_path / pattern
227
+ if readme_path.exists():
228
+ try:
229
+ return readme_path.read_text(encoding='utf-8', errors='ignore')[:5000] # First 5k chars
230
+ except:
231
+ continue
232
+ return None
233
+
234
+ # -------- HELPER: Get repository stats --------
235
+
236
+ def get_repo_stats(self, repo_path: Path) -> Dict:
237
+ """ACCURATE repository statistics (excludes .git)"""
238
+ try:
239
+ total_files = 0
240
+ total_size = 0
241
+ extensions = set()
242
+
243
+ for root, dirs, files in os.walk(repo_path):
244
+ # ✅ PROPERLY skip .git directory
245
+ root_path = Path(root)
246
+ if '.git' in root_path.parts:
247
+ continue # Skip entire .git directory
248
+
249
+ total_files += len(files)
250
+ for file in files:
251
+ file_path = Path(root) / file
252
+ try:
253
+ size = file_path.stat().st_size
254
+ total_size += size
255
+ if file_path.suffix:
256
+ extensions.add(file_path.suffix.lower())
257
+ except Exception:
258
+ pass
259
+
260
+ return {
261
+ "total_files": total_files,
262
+ "total_size_mb": round(total_size / (1024 * 1024), 2),
263
+ "unique_extensions": sorted(list(extensions))[:20],
264
+ "path": str(repo_path),
265
+ "name": repo_path.name,
266
+ "note": "Size excludes .git directory" # ✅ Add note
267
+ }
268
+ except Exception as e:
269
+ return {"error": str(e)}
270
+
271
+
272
+ # -------- UTILITY METHODS --------
273
+ def _extract_repo_name(self, repo_url: str) -> str:
274
+ """Extract repository name from URL"""
275
+ name = repo_url.rstrip('/').split('/')[-1]
276
+ if name.endswith('.git'):
277
+ name = name[:-4]
278
+ return name
279
+
280
+ def _is_binary_file(self, file_path: Path, sample_size: int = 1024) -> bool:
281
+ """Quick binary detection by sampling"""
282
+ try:
283
+ with open(file_path, 'rb') as f:
284
+ sample = f.read(sample_size)
285
+
286
+ if not sample:
287
+ return False
288
+
289
+ # Check for null bytes (common in binaries)
290
+ if b'\x00' in sample:
291
+ return True
292
+
293
+ # Count printable ASCII
294
+ printable = sum(1 for byte in sample if 32 <= byte <= 126 or byte in (9, 10, 13))
295
+ return (printable / len(sample)) < 0.8 # Less than 80% printable
296
+ except Exception:
297
+ return True # If we can't read, assume binary
298
+
299
+ def cleanup_old_repos(self, max_age_days: int = 7):
300
+ """Cleanup old cached repositories (optional)"""
301
+ import shutil
302
+ from datetime import datetime, timedelta
303
+
304
+ cutoff = datetime.now() - timedelta(days=max_age_days)
305
+
306
+ for repo_dir in self.cache_dir.iterdir():
307
+ if repo_dir.is_dir():
308
+ try:
309
+ mtime = datetime.fromtimestamp(repo_dir.stat().st_mtime)
310
+ if mtime < cutoff:
311
+ print(f"🧹 Cleaning up old repo: {repo_dir.name}")
312
+ shutil.rmtree(repo_dir)
313
+ except Exception:
314
+ pass
315
+
316
+
317
+ # -------- SIMPLE USAGE EXAMPLES --------
318
+ def example_usage():
319
+ """Example of how to use the crawler - FIXED VERSION"""
320
+ crawler = GitCrawler()
321
+
322
+ # 1. Clone a repository
323
+ repo_path = crawler.clone_repository("https://github.com/microsoft/autogen.git")
324
+ if not repo_path:
325
+ print("❌ Failed to clone repository")
326
+ return
327
+
328
+ # 2. OPTION A: Fast listing (just paths)
329
+ print("\n=== FAST LISTING ===")
330
+ python_files = crawler.list_files_fast(repo_path, extensions={'.py'})
331
+ print(f"Found {len(python_files)} Python files")
332
+
333
+ # 3. OPTION B: Rich listing with metadata
334
+ print("\n=== RICH LISTING ===")
335
+ file_infos, stats = crawler.list_files_with_info(
336
+ repo_path,
337
+ extensions={'.py', '.md', '.json', '.yaml'},
338
+ skip_binary=True
339
+ )
340
+ print(f"Total files: {stats['total_files']}")
341
+ print(f"Total size: {stats['total_size'] / 1024 / 1024:.2f} MB")
342
+ print(f"Extensions: {stats['by_extension']}")
343
+
344
+ # 4. OPTION C: Smart listing (auto) - FIXED
345
+ print("\n=== SMART LISTING ===")
346
+ # Returns just paths (fast)
347
+ files_fast = crawler.list_files(repo_path, extensions={'.py'}, rich_metadata=False)
348
+ # Type check for PyLance
349
+ if isinstance(files_fast, list):
350
+ print(f"Fast count: {len(files_fast)}")
351
+ else:
352
+ # This shouldn't happen with rich_metadata=False
353
+ print("Unexpected return type from list_files()")
354
+
355
+ # Returns info + stats (rich) - FIXED
356
+ result = crawler.list_files(repo_path, extensions={'.py'}, rich_metadata=True)
357
+ if isinstance(result, tuple):
358
+ files_rich, stats = result
359
+ print(f"Rich count: {len(files_rich)}")
360
+ else:
361
+ # This shouldn't happen with rich_metadata=True
362
+ print("Unexpected return type from list_files()")
363
+
364
+ # 5. Get README
365
+ readme = crawler.get_readme_content(repo_path)
366
+ if readme:
367
+ print(f"\nREADME preview: {readme[:200]}...")
368
+
369
+ # 6. Get repo stats
370
+ repo_stats = crawler.get_repo_stats(repo_path)
371
+ print(f"\nRepository stats: {repo_stats}")
372
+
373
+
374
+ if __name__ == "__main__":
375
+ example_usage()
376
+
377
+
378
+
379
+
380
+
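For reference, the binary-detection heuristic used by `_is_binary_file` above (a null-byte check plus a printable-ASCII ratio over a 1 KB sample) can be exercised on its own. The following is a minimal standalone sketch that mirrors that logic rather than calling the class; the temporary files are illustrative.

```python
import tempfile
from pathlib import Path

def looks_binary(path: Path, sample_size: int = 1024, threshold: float = 0.8) -> bool:
    """Mirror of GitCrawler._is_binary_file: null bytes or a low printable-ASCII ratio."""
    sample = path.read_bytes()[:sample_size]
    if not sample:
        return False
    if b"\x00" in sample:
        return True
    printable = sum(1 for b in sample if 32 <= b <= 126 or b in (9, 10, 13))
    return printable / len(sample) < threshold

with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
    f.write(b"plain text content\n")
print(looks_binary(Path(f.name)))   # False: all bytes are printable

with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
    f.write(b"\x00\x01\x02 not text")
print(looks_binary(Path(f.name)))   # True: null bytes present
```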
scripts/core/ingestion/repo_metadata.py ADDED
@@ -0,0 +1,408 @@
1
+ """
2
+ Repository Metadata Extractor - Advanced metadata extraction for Git repositories.
3
+
4
+ This module extracts comprehensive metadata from Git repositories with a
5
+ special focus on agentic framework detection. It analyzes repository structure,
6
+ dependencies, git history, and patterns to identify agentic code patterns.
7
+
8
+ ARCHITECTURE POSITION:
9
+ - Repository Analyzer: Deep analysis of Git repositories
10
+ - Agentic Detector: Identifies agentic framework usage
11
+ - Dependency Mapper: Extracts dependency information
12
+
13
+ KEY FEATURES:
14
+ 1. Agentic framework detection across multiple frameworks
15
+ 2. Comprehensive dependency extraction (Python, Node.js, Docker)
16
+ 3. Git metadata extraction (commits, branches, tags)
17
+ 4. Repository structure analysis
18
+ 5. Entry point and configuration file discovery
19
+ """
20
+
21
+ import json
22
+ import re
23
+ import subprocess
24
+ from pathlib import Path
25
+ from typing import Dict, List, Optional
26
+ from datetime import datetime
27
+
28
+
29
+ class RepoMetadataExtractor:
30
+ """Enhanced metadata extractor for agentic codebases"""
31
+
32
+ AGENTIC_FRAMEWORKS = {
33
+ "langchain": ["langchain", "langsmith", "lc", "chain", "agent"],
34
+ "autogen": ["autogen", "agent", "groupchat"],
35
+ "crewai": ["crewai", "crew", "task", "agent"],
36
+ "haystack": ["haystack", "pipeline", "node"],
37
+ "llamaindex": ["llama_index", "query_engine", "index"],
38
+ "semantic_kernel": ["semantic_kernel", "sk"],
39
+ "transformers_agents": ["transformers_agents", "huggingface"],
40
+ "camel": ["camel", "role_playing"],
41
+ "agents": ["agent", "tool", "workflow", "orchestrator"],
42
+ }
43
+
44
+ def __init__(self, repo_path: Path):
45
+ self.repo_path = repo_path
46
+
47
+ # ---------------------------------------------------------------------
48
+ # Public API
49
+ # ---------------------------------------------------------------------
50
+
51
+ def extract_comprehensive_metadata(self) -> Dict:
52
+ return {
53
+ "basic": self.extract_basic_metadata(),
54
+ "git": self.extract_git_metadata(),
55
+ "dependencies": self.extract_dependency_info(),
56
+ "structure": self.extract_structure_info(),
57
+ "agentic_detection": self.detect_agentic_frameworks(),
58
+ "entry_points": self.find_entry_points(),
59
+ "config_files": self.find_config_files(),
60
+ }
61
+
62
+ # 🔧 FIXED: Now returns actual repo name, not folder name
63
+ def extract_basic_metadata(self) -> Dict:
64
+ """Extract basic repository metadata"""
65
+ return {
66
+ "repo_name": self._get_actual_repo_name(), # 🎯 FIXED LINE
67
+ "local_path": str(self.repo_path),
68
+ "size_mb": self._get_repo_size_mb(),
69
+ "file_count": self._count_files(),
70
+ "extracted_at": datetime.now().isoformat(),
71
+ }
72
+
73
+ # 🆕 NEW HELPER METHOD
74
+ def _get_actual_repo_name(self) -> str:
75
+ """
76
+ Get actual repository name from Git remote or folder structure.
77
+ Returns 'crewAI' not 'crewai_test'.
78
+ """
79
+ # 1. Try to get from git remote URL
80
+ try:
81
+ remote_url = self._run_git_command(["config", "--get", "remote.origin.url"])
82
+ if remote_url:
83
+ remote_url = remote_url.strip()
84
+ # Extract repo name from URL
85
+ # github.com/owner/repo.git -> repo
86
+ if '/' in remote_url:
87
+ repo_name = remote_url.split('/')[-1]
88
+ if repo_name.endswith('.git'):
89
+ repo_name = repo_name[:-4]
90
+ return repo_name
91
+ except Exception:
92
+ pass
93
+
94
+ # 2. Fallback: clean folder name
95
+ folder_name = self.repo_path.name
96
+
97
+ # Remove common suffixes
98
+ for suffix in ['_test', '_copy', '_backup', '_temp', '_local']:
99
+ if folder_name.lower().endswith(suffix.lower()):
100
+ return folder_name[:-len(suffix)]
101
+
102
+ return folder_name
103
+
104
+ def extract_git_metadata(self) -> Dict:
105
+ try:
106
+ remote_url = self._run_git_command(
107
+ ["config", "--get", "remote.origin.url"]
108
+ )
109
+
110
+ latest_commit = self._run_git_command(
111
+ ["log", "-1", "--pretty=format:%H|%an|%ae|%ad|%s"]
112
+ )
113
+ commit_parts = latest_commit.split("|") if latest_commit else []
114
+
115
+ branches_raw = self._run_git_command(["branch", "-a"])
116
+ branch_list = (
117
+ [
118
+ b.strip().replace("* ", "")
119
+ for b in branches_raw.split("\n")
120
+ if b.strip()
121
+ ]
122
+ if branches_raw
123
+ else []
124
+ )
125
+
126
+ tags_raw = self._run_git_command(["tag", "-l"])
127
+ tag_list = (
128
+ [t.strip() for t in tags_raw.split("\n") if t.strip()]
129
+ if tags_raw
130
+ else []
131
+ )
132
+
133
+ current_branch = self._run_git_command(["branch", "--show-current"])
134
+
135
+ return {
136
+ "remote_url": remote_url or "",
137
+ "branch": current_branch or "",
138
+ "latest_commit": {
139
+ "hash": commit_parts[0] if len(commit_parts) > 0 else "",
140
+ "author": commit_parts[1] if len(commit_parts) > 1 else "",
141
+ "email": commit_parts[2] if len(commit_parts) > 2 else "",
142
+ "date": commit_parts[3] if len(commit_parts) > 3 else "",
143
+ "message": commit_parts[4] if len(commit_parts) > 4 else "",
144
+ },
145
+ "branch_count": len(branch_list),
146
+ "branches": branch_list[:10],
147
+ "tag_count": len(tag_list),
148
+ "tags": tag_list[:10],
149
+ }
150
+
151
+ except Exception as e:
152
+ return {"error": str(e)}
153
+
154
+ # ---------------------------------------------------------------------
155
+ # Agentic detection
156
+ # ---------------------------------------------------------------------
157
+
158
+ def detect_agentic_frameworks(self) -> Dict:
159
+ detected: Dict[str, str] = {}
160
+
161
+ deps = self.extract_dependency_info()
162
+ python_packages = deps.get("python_packages", [])
163
+
164
+ for framework, keywords in self.AGENTIC_FRAMEWORKS.items():
165
+ for package in python_packages:
166
+ if any(k in package.lower() for k in keywords):
167
+ detected[framework] = "dependency"
168
+ break
169
+ else:
170
+ if self._scan_for_framework(keywords):
171
+ detected[framework] = "usage"
172
+
173
+ if self._has_agent_patterns():
174
+ detected["custom_agents"] = "implementation"
175
+
176
+ return detected
177
+
178
+ def _scan_for_framework(self, keywords: List[str]) -> bool:
179
+ python_files = list(self.repo_path.rglob("*.py"))[:50]
180
+
181
+ for py_file in python_files:
182
+ try:
183
+ content = py_file.read_text(encoding="utf-8", errors="ignore").lower()
184
+
185
+ if any(f"import {k}" in content or f"from {k}" in content for k in keywords):
186
+ return True
187
+
188
+ if any(re.search(rf"class.*{k}", content) for k in keywords):
189
+ return True
190
+
191
+ except Exception:
192
+ continue
193
+
194
+ return False
195
+
196
+ def _has_agent_patterns(self) -> bool:
197
+ patterns = [
198
+ r"class.*Agent",
199
+ r"def.*agent",
200
+ r"class.*Tool",
201
+ r"def.*tool",
202
+ r"class.*Workflow",
203
+ r"def.*workflow",
204
+ r"class.*Orchestrator",
205
+ r"def.*orchestrator",
206
+ r"@tool",
207
+ r"@agent",
208
+ r"@workflow",
209
+ ]
210
+
211
+ python_files = list(self.repo_path.rglob("*.py"))[:20]
212
+
213
+ for py_file in python_files:
214
+ try:
215
+ content = py_file.read_text(encoding="utf-8", errors="ignore")
216
+ if any(re.search(p, content, re.IGNORECASE) for p in patterns):
217
+ return True
218
+ except Exception:
219
+ continue
220
+
221
+ return False
222
+
223
+ # ---------------------------------------------------------------------
224
+ # Dependencies
225
+ # ---------------------------------------------------------------------
226
+
227
+ def extract_dependency_info(self) -> Dict:
228
+ deps = {
229
+ "python_packages": [],
230
+ "nodejs_packages": [],
231
+ "docker": False,
232
+ "other_dependencies": [],
233
+ }
234
+
235
+ req_files = [
236
+ "requirements.txt",
237
+ "pyproject.toml",
238
+ "setup.py",
239
+ "setup.cfg",
240
+ "Pipfile",
241
+ "environment.yml",
242
+ ]
243
+
244
+ for req_file in req_files:
245
+ path = self.repo_path / req_file
246
+ if path.exists():
247
+ try:
248
+ deps["python_packages"].extend(
249
+ self._parse_python_dependencies(path, req_file)
250
+ )
251
+ except Exception as e:
252
+ print(f"⚠️ Error parsing {req_file}: {e}")
253
+
254
+ package_json = self.repo_path / "package.json"
255
+ if package_json.exists():
256
+ try:
257
+ data = json.loads(package_json.read_text())
258
+ deps["nodejs_packages"].extend(data.get("dependencies", {}).keys())
259
+ deps["nodejs_packages"].extend(data.get("devDependencies", {}).keys())
260
+ except Exception:
261
+ pass
262
+
263
+ deps["docker"] = any(
264
+ (self.repo_path / f).exists()
265
+ for f in ["Dockerfile", "docker-compose.yml", "docker-compose.yaml"]
266
+ )
267
+
268
+ return deps
269
+
270
+ def _parse_python_dependencies(self, path: Path, file_name: str) -> List[str]:
271
+ packages: List[str] = []
272
+
273
+ if file_name == "requirements.txt":
274
+ for line in path.read_text().splitlines():
275
+ line = line.strip()
276
+ if line and not line.startswith("#"):
277
+ pkg = (
278
+ line.split("==")[0]
279
+ .split(">=")[0]
280
+ .split("<=")[0]
281
+ .split("~=")[0]
282
+ .strip()
283
+ )
284
+ if pkg and not pkg.startswith("-"):
285
+ packages.append(pkg)
286
+
287
+ elif file_name == "pyproject.toml":
288
+ import toml
289
+
290
+ data = toml.load(path)
291
+ deps = data.get("project", {}).get("dependencies", [])
292
+ for d in deps:
293
+ packages.append(d.split("==")[0].split(">=")[0].strip())
294
+
295
+ return packages
296
+
297
+ # ---------------------------------------------------------------------
298
+ # Structure & utilities
299
+ # ---------------------------------------------------------------------
300
+
301
+ def extract_structure_info(self) -> Dict:
302
+ structure = {
303
+ "directories": [],
304
+ "file_types": {},
305
+ "has_agentic_structure": False,
306
+ }
307
+
308
+ for item in self.repo_path.iterdir():
309
+ if item.is_dir() and item.name != ".git":
310
+ structure["directories"].append(item.name)
311
+
312
+ ext_count: Dict[str, int] = {}
313
+ for f in self.repo_path.rglob("*"):
314
+ if f.is_file():
315
+ ext_count[f.suffix.lower()] = ext_count.get(f.suffix.lower(), 0) + 1
316
+
317
+ structure["file_types"] = dict(
318
+ sorted(ext_count.items(), key=lambda x: x[1], reverse=True)[:10]
319
+ )
320
+
321
+ agentic_dirs = {
322
+ "agent",
323
+ "agents",
324
+ "workflow",
325
+ "workflows",
326
+ "tool",
327
+ "tools",
328
+ "pipeline",
329
+ "pipelines",
330
+ "orchestrator",
331
+ }
332
+
333
+ structure["has_agentic_structure"] = any(
334
+ any(k in d.lower() for k in agentic_dirs)
335
+ for d in structure["directories"]
336
+ )
337
+
338
+ return structure
339
+
340
+ def find_entry_points(self) -> List[str]:
341
+ patterns = [
342
+ "main.py",
343
+ "app.py",
344
+ "run.py",
345
+ "cli.py",
346
+ "server.py",
347
+ "agent.py",
348
+ "pipeline.py",
349
+ "__main__.py",
350
+ ]
351
+
352
+ return [
353
+ str(p.relative_to(self.repo_path))
354
+ for pat in patterns
355
+ for p in self.repo_path.rglob(pat)
356
+ ][:5]
357
+
358
+ def find_config_files(self) -> List[str]:
359
+ patterns = [
360
+ "config*.py",
361
+ "settings*.py",
362
+ ".env*",
363
+ "*.toml",
364
+ "*.yaml",
365
+ "*.yml",
366
+ "*.json",
367
+ "*.cfg",
368
+ "*.ini",
369
+ ]
370
+
371
+ files: List[str] = []
372
+ for pat in patterns:
373
+ for p in self.repo_path.rglob(pat):
374
+ rel = str(p.relative_to(self.repo_path))
375
+ if not any(x in rel for x in [".git", "__pycache__", "node_modules"]):
376
+ files.append(rel)
377
+
378
+ return sorted(files)[:10]
379
+
380
+ # ---------------------------------------------------------------------
381
+ # Internals
382
+ # ---------------------------------------------------------------------
383
+
384
+ def _get_repo_size_mb(self) -> float:
385
+ total = sum(
386
+ f.stat().st_size for f in self.repo_path.rglob("*") if f.is_file()
387
+ )
388
+ return round(total / (1024 * 1024), 2)
389
+
390
+ def _count_files(self) -> int:
391
+ return sum(
392
+ 1
393
+ for f in self.repo_path.rglob("*")
394
+ if f.is_file() and ".git" not in str(f)
395
+ )
396
+
397
+ def _run_git_command(self, args: List[str]) -> Optional[str]:
398
+ try:
399
+ result = subprocess.run(
400
+ ["git", "-C", str(self.repo_path)] + args,
401
+ capture_output=True,
402
+ text=True,
403
+ check=True,
404
+ )
405
+ return result.stdout.strip() or None
406
+ except Exception:
407
+ return None
408
+
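A minimal usage sketch for RepoMetadataExtractor, assuming the `scripts` package is importable from the project root and a repository has already been cloned; the path and printed values are illustrative placeholders.

```python
from pathlib import Path

from scripts.core.ingestion.repo_metadata import RepoMetadataExtractor

# Hypothetical path to a repository cloned earlier by GitCrawler
repo_path = Path("data/raw/repos/crewAI")

extractor = RepoMetadataExtractor(repo_path)
meta = extractor.extract_comprehensive_metadata()

print(meta["basic"]["repo_name"])      # repo name taken from the git remote, not the folder
print(meta["agentic_detection"])       # e.g. {"crewai": "dependency", "custom_agents": "implementation"}
print(meta["dependencies"]["docker"])  # True if a Dockerfile or docker-compose file exists
```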
scripts/core/ingestion/ts_chunker.py ADDED
@@ -0,0 +1,155 @@
1
+ """
2
+ Tree-sitter based syntactic chunker - Span enrichment and fallback parser.
3
+
4
+ This module provides byte-level precise chunking using Tree-sitter, which
5
+ serves as a structural fallback and span enrichment layer. Tree-sitter is
6
+ language-aware and robust against malformed code, making it ideal for
7
+ extracting exact byte spans and as a backup parser.
8
+
9
+ ARCHITECTURE POSITION:
10
+ - Enrichment Layer: Provides byte-level precision
11
+ - Fallback Parser: Robust parsing for malformed code
12
+ - Span Authority: Source of truth for byte positions
13
+
14
+ KEY FEATURES:
15
+ 1. Byte-level accurate spans (exact source positions)
16
+ 2. Language-aware parsing (supports multiple languages)
17
+ 3. Robust against syntax errors
18
+ 4. Extracts structural nodes even from partial code
19
+
20
+ FLOW:
21
+ File → Tree-sitter parser → Structural nodes → Spans for enrichment
22
+
23
+ USAGE:
24
+ from ts_chunker import extract_ts_chunks
25
+ chunks = extract_ts_chunks(Path("file.py"))
26
+
27
+ NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with
28
+ precise byte spans and serve as fallback for syntax errors.
29
+ """
30
+
31
+ from pathlib import Path
32
+ from typing import List, Optional, Literal, Dict, Tuple
33
+
34
+ from tree_sitter import Parser, Language, Node
35
+ import tree_sitter_python as tspython
36
+
37
+ from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType
38
+
39
+ # ----------------------------
40
+ # Types
41
+ # ----------------------------
42
+
43
+ TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = {
44
+ "module": "module",
45
+ "class_definition": "class",
46
+ "function_definition": "function",
47
+ "async_function_definition": "function",
48
+ "import_statement": "imports",
49
+ "import_from_statement": "imports",
50
+ }
51
+
52
+ MAX_TS_DEPTH = 3 # module → imports → class/function → method
53
+
54
+
55
+ # ----------------------------
56
+ # Helpers
57
+ # ----------------------------
58
+
59
+ def _safe_decode(data: bytes) -> str:
60
+ try:
61
+ return data.decode("utf-8")
62
+ except UnicodeDecodeError:
63
+ return data.decode("utf-8", errors="ignore")
64
+
65
+
66
+ def _get_node_name(node: Node) -> Optional[str]:
67
+ """
68
+ Extract identifier name for class / function nodes.
69
+ """
70
+ for child in node.children:
71
+ if child.type == "identifier":
72
+ text = child.text
73
+ if isinstance(text, (bytes, bytearray)):
74
+ return _safe_decode(text)
75
+ return None
76
+
77
+
78
+ # ----------------------------
79
+ # Public API
80
+ # ----------------------------
81
+
82
+ def extract_ts_chunks(file_path: Path) -> List[CodeChunk]:
83
+ source_bytes = file_path.read_bytes()
84
+
85
+ language = Language(tspython.language())
86
+ parser = Parser(language=language)
87
+
88
+ tree = parser.parse(source_bytes)
89
+ root = tree.root_node
90
+
91
+ chunks: List[CodeChunk] = []
92
+
93
+ def walk(node: Node, depth: int = 0, parent_node: Optional[Node] = None) -> None:
94
+ if depth > MAX_TS_DEPTH:
95
+ return
96
+
97
+ node_type = node.type
98
+
99
+ if node_type in TS_TO_CHUNK_TYPE:
100
+ code_bytes = source_bytes[node.start_byte : node.end_byte]
101
+ code = _safe_decode(code_bytes)
102
+
103
+ chunk_type = TS_TO_CHUNK_TYPE[node_type]
104
+ name = _get_node_name(node)
105
+
106
+ # For imports, use the full import as name
107
+ if chunk_type == "imports":
108
+ name = code.strip()
109
+
110
+ # Create chunk with byte-level precision
111
+ chunks.append(
112
+ CodeChunk(
113
+ chunk_id=f"ts_{node.start_byte}_{node.end_byte}",
114
+ file_path=str(file_path),
115
+ language="python",
116
+ chunk_type=chunk_type,
117
+ code=code,
118
+ ast=ChunkAST(
119
+ symbol_type=None, # TS doesn't provide semantic types
120
+ name=name,
121
+ parent=None, # Parent relationships from AST
122
+ docstring=None,
123
+ decorators=[],
124
+ imports=[],
125
+ node_type=node_type,
126
+ ),
127
+ span=ChunkSpan(
128
+ start_byte=node.start_byte,
129
+ end_byte=node.end_byte,
130
+ start_line=node.start_point[0] + 1,
131
+ end_line=node.end_point[0] + 1,
132
+ char_count=len(code),
133
+ ),
134
+ hierarchy=ChunkHierarchy(
135
+ is_primary=False, # Tree-sitter chunks are for span enrichment only
136
+ is_extracted=True,
137
+ depth=depth,
138
+ parent_id=None, # Parent relationships from AST
139
+ ),
140
+ metadata={
141
+ "byte_span": {
142
+ "start": node.start_byte,
143
+ "end": node.end_byte,
144
+ },
145
+ "tree_sitter_node_type": node_type,
146
+ "is_exact_span": True,
147
+ },
148
+ )
149
+ )
150
+
151
+ for child in node.children:
152
+ walk(child, depth + 1, node)
153
+
154
+ walk(root)
155
+ return chunks
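A minimal sketch of the Tree-sitter chunker on a throwaway file, assuming `tree-sitter` and `tree-sitter-python` are installed and the `scripts` package is importable; the attribute access mirrors the `CodeChunk` fields populated above.

```python
import tempfile
from pathlib import Path

from scripts.core.ingestion.ts_chunker import extract_ts_chunks

source = (
    "import os\n"
    "\n"
    "class Greeter:\n"
    "    def hello(self, name):\n"
    "        return 'hi ' + name\n"
)

with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
    f.write(source)
    tmp_path = Path(f.name)

for chunk in extract_ts_chunks(tmp_path):
    # Byte spans come straight from Tree-sitter, so they are exact source offsets.
    print(chunk.chunk_type, chunk.span.start_line, chunk.span.end_line, chunk.metadata["byte_span"])
```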
scripts/core/training/__init__.py ADDED
File without changes
scripts/core/training/model.py ADDED
@@ -0,0 +1,47 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import AutoModel, AutoConfig
4
+
5
+ class CodeEmbedder(nn.Module):
6
+ """
7
+ A wrapper around a Transformer model (default: CodeBERT) to produce
8
+ dense vector embeddings for code snippets using Mean Pooling.
9
+ """
10
+ def __init__(self, model_name_or_path="microsoft/codebert-base", trust_remote_code=False):
11
+ super(CodeEmbedder, self).__init__()
12
+ self.config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
13
+ self.encoder = AutoModel.from_pretrained(model_name_or_path, config=self.config, trust_remote_code=trust_remote_code)
14
+
15
+ def mean_pooling(self, token_embeddings, attention_mask):
16
+ """
17
+ Average the token embeddings, ignoring padding tokens.
18
+ """
19
+ # attention_mask: (batch_size, seq_len)
20
+ # token_embeddings: (batch_size, seq_len, hidden_dim)
21
+
22
+ # Expand mask to match embedding dimensions
23
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
24
+
25
+ # Sum embeddings (ignoring padding)
26
+ sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
27
+
28
+ # Count non-padding tokens (prevent division by zero with clamp)
29
+ sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
30
+
31
+ return sum_embeddings / sum_mask
32
+
33
+ def forward(self, input_ids, attention_mask):
34
+ # Pass through the transformer
35
+ outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
36
+
37
+ # Extract last hidden state
38
+ # Shape: (batch_size, seq_len, hidden_dim)
39
+ last_hidden_state = outputs.last_hidden_state
40
+
41
+ # Perform Mean Pooling (Better than CLS token for sentence similarity)
42
+ embeddings = self.mean_pooling(last_hidden_state, attention_mask)
43
+
44
+ # Normalize embeddings (Optional but recommended for cosine similarity)
45
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
46
+
47
+ return embeddings
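A minimal inference sketch for `CodeEmbedder`, assuming network access to download `microsoft/codebert-base` and that the `scripts` package is importable; because the forward pass L2-normalises its output, the dot product of two embeddings equals their cosine similarity.

```python
import torch
from transformers import AutoTokenizer

from scripts.core.training.model import CodeEmbedder

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = CodeEmbedder("microsoft/codebert-base")
model.eval()

snippets = ["def add(a, b): return a + b", "class Agent:\n    pass"]
batch = tokenizer(snippets, return_tensors="pt", padding=True, truncation=True, max_length=128)

with torch.no_grad():
    embeddings = model(batch["input_ids"], batch["attention_mask"])

print(embeddings.shape)                        # (2, 768) for codebert-base
print((embeddings[0] @ embeddings[1]).item())  # cosine similarity of the two snippets
```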
scripts/core/training/test_model.py ADDED
@@ -0,0 +1,64 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from transformers import AutoTokenizer, AutoModel
4
+
5
+ # 1. Load Model from Hugging Face (Your Team's Checkpoint)
6
+ MODEL_NAME = "shubharuidas/codebert-base-code-embed-mrl-langchain-langgraph"
7
+
8
+ import time
9
+
10
+ print(f"Downloading model: {MODEL_NAME}...")
11
+ MAX_RETRIES = 3
12
+ for attempt in range(MAX_RETRIES):
13
+ try:
14
+ print(f"Attempt {attempt+1}/{MAX_RETRIES}...")
15
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
16
+ model = AutoModel.from_pretrained(MODEL_NAME)
17
+ print("Model loaded successfully!")
18
+ break
19
+ except Exception as e:
20
+ print(f"Attempt {attempt+1} failed: {e}")
21
+ if attempt == MAX_RETRIES - 1:
22
+ print("Failed to load model after multiple attempts.")
23
+ print("Tip: Check internet connection or repo visibility.")
24
+ exit(1)
25
+ time.sleep(5) # Wait before retry
26
+
27
+ # 2. Define Inputs (Query vs Code)
28
+ query = "How to create a state graph in langgraph?"
29
+ code = """
30
+ from langgraph.graph import StateGraph
31
+
32
+ def create_workflow():
33
+ workflow = StateGraph(AgentState)
34
+ workflow.add_node("agent", agent_node)
35
+ return workflow.compile()
36
+ """
37
+ irrelevant_code = "def fast_inverse_sqrt(number): return number ** -0.5"
38
+
39
+ # 3. Embed & Compare
40
+ def embed(text):
41
+ inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
42
+ with torch.no_grad():
43
+ outputs = model(**inputs)
44
+ # Mean pooling for sentence representation
45
+ embeddings = outputs.last_hidden_state.mean(dim=1)
46
+ return F.normalize(embeddings, p=2, dim=1)
47
+
48
+ print("\nRunning Inference Test...")
49
+ query_emb = embed(query)
50
+ code_emb = embed(code)
51
+ irrelevant_emb = embed(irrelevant_code)
52
+
53
+ # 4. Calculate Similarity
54
+ sim_positive = F.cosine_similarity(query_emb, code_emb).item()
55
+ sim_negative = F.cosine_similarity(query_emb, irrelevant_emb).item()
56
+
57
+ print(f"Query: '{query}'")
58
+ print(f"Similarity to Relevant Code: {sim_positive:.4f} (Should be high)")
59
+ print(f"Similarity to Irrelevant Code: {sim_negative:.4f} (Should be low)")
60
+
61
+ if sim_positive > sim_negative:
62
+ print("\nSUCCESS: Model correctly ranks relevant code higher.")
63
+ else:
64
+ print("\n⚠️ WARNING: Model performance might be poor.")
scripts/core/training/train.py ADDED
@@ -0,0 +1,145 @@
1
+ import argparse
2
+ import os
3
+ import torch
4
+ from torch.utils.data import DataLoader, Dataset
5
+ from transformers import AutoTokenizer
6
+
7
+ from scripts.core.training.model import CodeEmbedder
8
+ from scripts.core.training.trainer import CodeTrainer
9
+
10
+ import json
11
+
12
+ # Real Dataset class for Triplet Training
13
+ class RealCodeDataset(Dataset):
14
+ def __init__(self, jsonl_path, tokenizer, max_length=512):
15
+ self.tokenizer = tokenizer
16
+ self.max_length = max_length
17
+ self.data = []
18
+
19
+ print(f"Loading data from {jsonl_path}...")
20
+ with open(jsonl_path, 'r', encoding='utf-8') as f:
21
+ for line in f:
22
+ if line.strip():
23
+ self.data.append(json.loads(line))
24
+ print(f"Loaded {len(self.data)} triplets.")
25
+
26
+ def __len__(self):
27
+ return len(self.data)
28
+
29
+ def __getitem__(self, idx):
30
+ item = self.data[idx]
31
+
32
+ # Helper to tokenize
33
+ def tokenize_text(text):
34
+ return self.tokenizer(
35
+ text,
36
+ return_tensors='pt',
37
+ padding='max_length',
38
+ truncation=True,
39
+ max_length=self.max_length
40
+ )
41
+
42
+ # Tokenize all three parts
43
+ anchor = tokenize_text(item['anchor'])
44
+ positive = tokenize_text(item['positive'])
45
+ negative = tokenize_text(item['negative'])
46
+
47
+ # Return a flat dict with prefixed keys
48
+ return {
49
+ 'anchor_input_ids': anchor['input_ids'].squeeze(0),
50
+ 'anchor_attention_mask': anchor['attention_mask'].squeeze(0),
51
+ 'positive_input_ids': positive['input_ids'].squeeze(0),
52
+ 'positive_attention_mask': positive['attention_mask'].squeeze(0),
53
+ 'negative_input_ids': negative['input_ids'].squeeze(0),
54
+ 'negative_attention_mask': negative['attention_mask'].squeeze(0)
55
+ }
56
+
57
+ # Dummy Dataset class for MVP testing without the robust data pipeline availability
58
+ class DummyCodeDataset(Dataset):
59
+ def __init__(self, tokenizer, size=100):
60
+ self.tokenizer = tokenizer
61
+ self.size = size
62
+ # Generate dummy triplet structure
63
+ self.data = [{"anchor": "def hello(): return 'world'", "positive": "def hi(): return 'earth'", "negative": "class Foo: pass"}] * size
64
+
65
+ def __len__(self):
66
+ return self.size
67
+
68
+ def __getitem__(self, idx):
69
+ item = self.data[idx]
70
+
71
+ # Helper to tokenize
72
+ def tokenize_text(text):
73
+ return self.tokenizer(
74
+ text,
75
+ return_tensors='pt',
76
+ padding='max_length',
77
+ truncation=True,
78
+ max_length=128
79
+ )
80
+
81
+ anchor = tokenize_text(item['anchor'])
82
+ positive = tokenize_text(item['positive'])
83
+ negative = tokenize_text(item['negative'])
84
+
85
+ return {
86
+ 'anchor_input_ids': anchor['input_ids'].squeeze(0),
87
+ 'anchor_attention_mask': anchor['attention_mask'].squeeze(0),
88
+ 'positive_input_ids': positive['input_ids'].squeeze(0),
89
+ 'positive_attention_mask': positive['attention_mask'].squeeze(0),
90
+ 'negative_input_ids': negative['input_ids'].squeeze(0),
91
+ 'negative_attention_mask': negative['attention_mask'].squeeze(0)
92
+ }
93
+
94
+ def main():
95
+ parser = argparse.ArgumentParser(description="Train CodeMode Embeddings")
96
+
97
+ parser.add_argument("--model_name", type=str, default="microsoft/codebert-base", help="Hub model name")
98
+ parser.add_argument("--data_path", type=str, required=False, help="Path to parsed chunks.jsonl")
99
+ parser.add_argument("--output_dir", type=str, default="./output", help="Where to save checkpoints")
100
+ parser.add_argument("--epochs", type=int, default=3)
101
+ parser.add_argument("--batch_size", type=int, default=8)
102
+ parser.add_argument("--accumulation_steps", type=int, default=4, help="Gradient Accumulation Steps")
103
+ parser.add_argument("--lr", type=float, default=2e-5)
104
+ parser.add_argument("--dry_run", action="store_true", help="Run with dummy data for 1 epoch")
105
+
106
+ args = parser.parse_args()
107
+
108
+ print(f"Initializing Training Pipeline...")
109
+ print(f" Model: {args.model_name}")
110
+ print(f" Output: {args.output_dir}")
111
+ print(f" Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
112
+
113
+ # 1. Initialize Tokenizer
114
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name)
115
+
116
+ # 2. Load Dataset (Real or Dummy)
117
+ if args.data_path and os.path.exists(args.data_path):
118
+ train_dataset = RealCodeDataset(args.data_path, tokenizer)
119
+ else:
120
+ print("No data path provided or file missing. Using DUMMY data for verification.")
121
+ train_dataset = DummyCodeDataset(tokenizer, size=100)
122
+
123
+ train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
124
+
125
+ # 3. Initialize Model
126
+ model = CodeEmbedder(model_name_or_path=args.model_name)
127
+
128
+ # 4. Initialize Trainer
129
+ trainer = CodeTrainer(
130
+ model=model,
131
+ train_loader=train_loader,
132
+ epochs=args.epochs,
133
+ learning_rate=args.lr,
134
+ accumulation_steps=args.accumulation_steps,
135
+ mixed_precision=True, # Hardcoded True for the "Zero-Cost" philosophy
136
+ output_dir=args.output_dir
137
+ )
138
+
139
+ # 5. Connect and Train
140
+ trainer.train()
141
+
142
+ print("Training Complete.")
143
+
144
+ if __name__ == "__main__":
145
+ main()
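For reference, `RealCodeDataset` expects `--data_path` to point at a JSONL file where each line is an object with `anchor`, `positive`, and `negative` string fields. A tiny synthetic example (texts and paths are illustrative only):

```python
import json

triplets = [
    {
        "anchor": "How do I add a node to a LangGraph workflow?",
        "positive": "workflow = StateGraph(AgentState)\nworkflow.add_node('agent', agent_node)",
        "negative": "def bubble_sort(xs):\n    return sorted(xs)",
    }
]

with open("triplets.jsonl", "w", encoding="utf-8") as f:
    for t in triplets:
        f.write(json.dumps(t, ensure_ascii=False) + "\n")

# One plausible invocation from the project root (module path assumed):
#   python -m scripts.core.training.train --data_path triplets.jsonl --epochs 1 --batch_size 2
```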
scripts/core/training/trainer.py ADDED
@@ -0,0 +1,118 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.optim import AdamW
4
+ from torch.utils.data import DataLoader
5
+ from tqdm import tqdm
6
+ import os
7
+ import logging
8
+ from .model import CodeEmbedder
9
+
10
+ # Setup Logger
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+ class CodeTrainer:
15
+ def __init__(
16
+ self,
17
+ model: CodeEmbedder,
18
+ train_loader: DataLoader,
19
+ val_loader: DataLoader = None,
20
+ epochs: int = 3,
21
+ learning_rate: float = 2e-5,
22
+ accumulation_steps: int = 1,
23
+ mixed_precision: bool = True,
24
+ output_dir: str = "./output",
25
+ device: str = "cuda" if torch.cuda.is_available() else "cpu"
26
+ ):
27
+ self.model = model.to(device)
28
+ self.train_loader = train_loader
29
+ self.val_loader = val_loader
30
+ self.epochs = epochs
31
+ self.lr = learning_rate
32
+ self.accumulation_steps = accumulation_steps
33
+ self.mixed_precision = mixed_precision
34
+ self.output_dir = output_dir
35
+ self.device = device
36
+
37
+ # Optimizer
38
+ self.optimizer = AdamW(self.model.parameters(), lr=self.lr)
39
+
40
+ # Scheduler (Optional: constant for now, can transform to Linear later)
41
+ # self.scheduler = ...
42
+
43
+ # Mixed Precision Scaler
44
+ self.scaler = torch.cuda.amp.GradScaler(enabled=self.mixed_precision)
45
+
46
+ # Loss Function: Triplet Margin Loss (Standard for Sentence Embeddings)
47
+ # Tries to maximize distance between Anchor-Negative and minimize Anchor-Positive
48
+ self.criterion = nn.TripletMarginLoss(margin=1.0, p=2)
49
+
50
+ def train_step(self, batch):
51
+ """
52
+ Runs one training step. Returns loss.
53
+ """
54
+ # Unpack the Triplet Batch
55
+ # We assume the Dataset returns keys: 'anchor_input_ids', 'anchor_attention_mask', etc.
56
+
57
+ # Helper to move dict to device
58
+ to_device = lambda x: x.to(self.device)
59
+
60
+ # Autocast for Mixed Precision
61
+ with torch.cuda.amp.autocast(enabled=self.mixed_precision):
62
+ # 1. Forward Pass for all 3 components
63
+ anchor_emb = self.model(to_device(batch['anchor_input_ids']), to_device(batch['anchor_attention_mask']))
64
+ positive_emb = self.model(to_device(batch['positive_input_ids']), to_device(batch['positive_attention_mask']))
65
+ negative_emb = self.model(to_device(batch['negative_input_ids']), to_device(batch['negative_attention_mask']))
66
+
67
+ # 2. Compute Triplet Loss
68
+ loss = self.criterion(anchor_emb, positive_emb, negative_emb)
69
+
70
+ return loss
71
+
72
+ def train(self):
73
+ logger.info(f"Starting training on {self.device}...")
74
+ logger.info(f"Batch Size: {self.train_loader.batch_size}, Accumulation Steps: {self.accumulation_steps}")
75
+ logger.info(f"Effective Batch Size: {self.train_loader.batch_size * self.accumulation_steps}")
76
+
77
+ self.model.train()
78
+
79
+ for epoch in range(self.epochs):
80
+ total_loss = 0
81
+ self.optimizer.zero_grad()
82
+
83
+ progress_bar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{self.epochs}")
84
+
85
+ for step, batch in enumerate(progress_bar):
86
+
87
+ # Forward + Loss Calculation
88
+ loss = self.train_step(batch)
89
+
90
+ # Gradient Accumulation: Normalize loss
91
+ loss = loss / self.accumulation_steps
92
+
93
+ # Backward Pass (Scaled)
94
+ self.scaler.scale(loss).backward()
95
+
96
+ if (step + 1) % self.accumulation_steps == 0:
97
+ # Update Weights
98
+ self.scaler.step(self.optimizer)
99
+ self.scaler.update()
100
+ self.optimizer.zero_grad()
101
+
102
+ total_loss += loss.item() * self.accumulation_steps
103
+ progress_bar.set_postfix({'loss': total_loss / (step + 1)})
104
+
105
+ # Save Checkpoint
106
+ self.save_model(epoch+1)
107
+
108
+ def save_model(self, epoch):
109
+ save_path = os.path.join(self.output_dir, f"checkpoint-{epoch}")
110
+ os.makedirs(save_path, exist_ok=True)
111
+
112
+ logger.info(f"Saving model to {save_path}...")
113
+
114
+ # Save explicitly as safetensors via transformers API
115
+ self.model.encoder.save_pretrained(save_path, safe_serialization=True)
116
+ self.model.config.save_pretrained(save_path)
117
+ # Note: We save the 'encoder' which is the AutoModel,
118
+ # so it can be loaded easily by others.
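To make the loss and accumulation arithmetic concrete: `TripletMarginLoss` only penalises a triplet when the anchor-positive distance plus the margin exceeds the anchor-negative distance, and with `batch_size=8` and `accumulation_steps=4` the optimizer steps on an effective batch of 32. A small self-contained sketch with random embeddings:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

criterion = nn.TripletMarginLoss(margin=1.0, p=2)

anchor = F.normalize(torch.randn(8, 768), dim=1)
positive = F.normalize(anchor + 0.005 * torch.randn(8, 768), dim=1)  # tiny perturbation of the anchor
negative = F.normalize(torch.randn(8, 768), dim=1)                   # unrelated vectors

print("d(anchor, positive):", torch.norm(anchor - positive, dim=1).mean().item())
print("d(anchor, negative):", torch.norm(anchor - negative, dim=1).mean().item())
print("triplet loss       :", criterion(anchor, positive, negative).item())  # ~0 when d(a,p) + margin <= d(a,n)

batch_size, accumulation_steps = 8, 4
print("effective batch size:", batch_size * accumulation_steps)  # 32, as logged by CodeTrainer
```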
scripts/core/utils/__init__.py ADDED
File without changes
scripts/core/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (183 Bytes). View file
 
scripts/core/utils/__pycache__/id_utils.cpython-311.pyc ADDED
Binary file (3.18 kB). View file
 
scripts/core/utils/id_utils.py ADDED
@@ -0,0 +1,91 @@
1
+ """
2
+ Deterministic ID generation for code chunks.
3
+
4
+ This module provides deterministic hashing for chunk IDs, ensuring that
5
+ identical code chunks receive the same ID across runs. This is crucial for:
6
+ 1. Version tracking and change detection
7
+ 2. Cache consistency
8
+ 3. Reproducible datasets
9
+ 4. Efficient deduplication
10
+
11
+ ID GENERATION STRATEGY:
12
+ Hash = SHA256(file_path + chunk_type + name + parent +
13
+ start_line + end_line + code + byte_spans)
14
+
15
+ Result: prefix_hash (e.g., "primary_5c442008")
16
+
17
+ KEY PROPERTIES:
18
+ 1. Deterministic: Same input → same ID
19
+ 2. Content-aware: Code changes → ID changes
20
+ 3. Position-aware: Line/byte changes → ID changes
21
+ 4. Hierarchical: Parent relationships affect ID
22
+
23
+ USE CASE:
24
+ Ensures that during RAG operations, identical code chunks are
25
+ recognized as the same entity, improving retrieval accuracy.
26
+
27
+ EXAMPLE:
28
+ deterministic_chunk_id(
29
+ file_path="src/module.py",
30
+ chunk_type="class",
31
+ name="MyClass",
32
+ parent="module",
33
+ start_line=10,
34
+ end_line=50,
35
+ code="class MyClass: ...",
36
+ start_byte=100,
37
+ end_byte=500
38
+ )
39
+ → "primary_a1b2c3d4"
40
+ """
41
+
42
+ import hashlib
43
+ from typing import Optional
44
+
45
+ def deterministic_chunk_id(
46
+ *,
47
+ file_path: str,
48
+ chunk_type: str,
49
+ name: Optional[str],
50
+ parent: Optional[str],
51
+ start_line: Optional[int],
52
+ end_line: Optional[int],
53
+ code: str,
54
+ prefix: str = "primary",
55
+ start_byte: Optional[int] = None,
56
+ end_byte: Optional[int] = None,
57
+ ) -> str:
58
+ """
59
+ Generate deterministic chunk ID that includes code content.
60
+
61
+ Args:
62
+ file_path: Path to source file
63
+ chunk_type: Type of chunk (function, class, method, etc.)
64
+ name: Name of the symbol
65
+ parent: Parent symbol name
66
+ start_line: Starting line number
67
+ end_line: Ending line number
68
+ code: Actual code content
69
+ prefix: ID prefix (primary/secondary)
70
+ start_byte: Starting byte offset
71
+ end_byte: Ending byte offset
72
+
73
+ Returns:
74
+ Deterministic chunk ID
75
+ """
76
+ # Create a payload that uniquely identifies this chunk
77
+ payload = f"""
78
+ {file_path}
79
+ {chunk_type}
80
+ {name}
81
+ {parent}
82
+ {start_line}
83
+ {end_line}
84
+ {start_byte}
85
+ {end_byte}
86
+ {code}
87
+ """.strip()
88
+
89
+ # Generate hash and use first 8 chars for readability
90
+ hash_digest = hashlib.sha256(payload.encode("utf-8")).hexdigest()[:8]
91
+ return f"{prefix}_{hash_digest}"
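A quick check of the determinism property: identical inputs reproduce the same ID, while changing the code changes it. The values shown in comments are illustrative, since the hash depends on the exact payload; the `scripts` package is assumed importable.

```python
from scripts.core.utils.id_utils import deterministic_chunk_id

kwargs = dict(
    file_path="src/module.py",
    chunk_type="function",
    name="load_data",
    parent="module",
    start_line=10,
    end_line=25,
    code="def load_data():\n    return []",
    start_byte=120,
    end_byte=320,
)

first = deterministic_chunk_id(**kwargs)
second = deterministic_chunk_id(**kwargs)
changed = deterministic_chunk_id(**{**kwargs, "code": "def load_data():\n    return [1]"})

print(first == second)   # True: same payload, same ID
print(first == changed)  # False: a content change shifts the hash
print(first)             # e.g. "primary_3f9a1c2e" (prefix + 8-char SHA-256 digest)
```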
scripts/generate_all_frameworks.py ADDED
@@ -0,0 +1,228 @@
1
+ """
2
+ Generate training datasets for ALL frameworks automatically.
3
+
4
+ This script auto-discovers all chunk files and processes them,
5
+ generating separate datasets for each framework PLUS a combined dataset.
6
+
7
+ Usage:
8
+ python scripts/generate_all_frameworks.py
9
+
10
+ Output Structure:
11
+ data/processed/training_crewai/
12
+ - positive_pairs.json
13
+ - triplets.json
14
+ data/processed/training_langgraph/
15
+ - positive_pairs.json
16
+ - triplets.json
17
+ data/processed/training_combined/
18
+ - positive_pairs.json (ALL frameworks merged)
19
+ - triplets.json (ALL frameworks merged)
20
+ """
21
+
22
+ import sys
23
+ import json
24
+ from pathlib import Path
25
+ from typing import List, Tuple
26
+ from dataclasses import asdict
27
+
28
+ # Add project root to path
29
+ PROJECT_ROOT = Path(__file__).parent.parent
30
+ sys.path.insert(0, str(PROJECT_ROOT))
31
+
32
+ from src.task_3_data_engineering.export.pairs_triplets_generator import (
33
+ generate_pairs_and_triplets,
34
+ PositivePair,
35
+ Triplet
36
+ )
37
+
38
+
39
+ def discover_all_chunk_files() -> List[Tuple[Path, str]]:
40
+ """
41
+ Discover all chunk files in the workspace.
42
+
43
+ Returns:
44
+ List of (chunk_path, framework_name) tuples
45
+ """
46
+ chunk_files = []
47
+
48
+ # Check local chunks
49
+ local_paths = [
50
+ PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl",
51
+ PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl",
52
+ ]
53
+
54
+ for path in local_paths:
55
+ if path.exists():
56
+ # Extract framework from parent directory or use "local"
57
+ if "Local_saved_files" in str(path):
58
+ framework = "crewai"
59
+ elif "sample_code" in str(path):
60
+ framework = "sample"
61
+ else:
62
+ framework = path.parent.name
63
+ chunk_files.append((path, framework))
64
+
65
+ # Check repository chunks
66
+ repos_dir = PROJECT_ROOT / "data" / "processed" / "repos"
67
+ if repos_dir.exists():
68
+ for repo_dir in repos_dir.iterdir():
69
+ if repo_dir.is_dir():
70
+ for jsonl_file in repo_dir.glob("*_chunks.jsonl"):
71
+ # Extract framework from filename or directory
72
+ framework = jsonl_file.stem.replace("_chunks", "").split("_")[0]
73
+ chunk_files.append((jsonl_file, framework))
74
+
75
+ return chunk_files
76
+
77
+
78
+ def merge_datasets(all_pairs: List[List[PositivePair]],
79
+ all_triplets: List[List[Triplet]],
80
+ output_dir: Path) -> Tuple[int, int]:
81
+ """Merge all framework datasets into combined files (JSON + JSONL) and return (pair_count, triplet_count)."""
82
+ output_dir.mkdir(parents=True, exist_ok=True)
83
+
84
+ # Flatten lists
85
+ combined_pairs = []
86
+ for pairs in all_pairs:
87
+ combined_pairs.extend(pairs)
88
+
89
+ combined_triplets = []
90
+ for triplets in all_triplets:
91
+ combined_triplets.extend(triplets)
92
+
93
+ # Export combined positive pairs - JSON
94
+ pairs_json_path = output_dir / "positive_pairs.json"
95
+ with open(pairs_json_path, "w", encoding="utf-8") as f:
96
+ json.dump([asdict(p) for p in combined_pairs], f, indent=2, ensure_ascii=False)
97
+ print(f"✅ Combined positive pairs (JSON): {pairs_json_path}")
98
+
99
+ # Export combined positive pairs - JSONL
100
+ pairs_jsonl_path = output_dir / "positive_pairs.jsonl"
101
+ with open(pairs_jsonl_path, "w", encoding="utf-8") as f:
102
+ for p in combined_pairs:
103
+ f.write(json.dumps(asdict(p), ensure_ascii=False) + "\n")
104
+ print(f"✅ Combined positive pairs (JSONL): {pairs_jsonl_path}")
105
+
106
+ # Export combined triplets - JSON
107
+ triplets_json_path = output_dir / "triplets.json"
108
+ with open(triplets_json_path, "w", encoding="utf-8") as f:
109
+ json.dump([asdict(t) for t in combined_triplets], f, indent=2, ensure_ascii=False)
110
+ print(f"✅ Combined triplets (JSON): {triplets_json_path}")
111
+
112
+ # Export combined triplets - JSONL
113
+ triplets_jsonl_path = output_dir / "triplets.jsonl"
114
+ with open(triplets_jsonl_path, "w", encoding="utf-8") as f:
115
+ for t in combined_triplets:
116
+ f.write(json.dumps(asdict(t), ensure_ascii=False) + "\n")
117
+ print(f"✅ Combined triplets (JSONL): {triplets_jsonl_path}")
118
+
119
+ return len(combined_pairs), len(combined_triplets)
120
+
121
+
122
+ def main():
123
+ """Generate datasets for all discovered frameworks + combined dataset."""
124
+ print("=" * 80)
125
+ print("🚀 MULTI-FRAMEWORK TRAINING DATA GENERATOR")
126
+ print("=" * 80)
127
+
128
+ # Discover all chunk files
129
+ print("\n🔍 Discovering chunk files...")
130
+ chunk_files = discover_all_chunk_files()
131
+
132
+ if not chunk_files:
133
+ print("❌ No chunk files found!")
134
+ print("\nPlease ensure chunks exist in:")
135
+ print(" - data/processed/chunks/Local_saved_files/")
136
+ print(" - data/processed/repos/*/")
137
+ return
138
+
139
+ print(f"✅ Found {len(chunk_files)} chunk file(s):\n")
140
+ for path, framework in chunk_files:
141
+ print(f" 📦 {framework}: {path.name}")
142
+
143
+ # Process each framework
144
+ print("\n" + "=" * 80)
145
+ print("🔄 PROCESSING INDIVIDUAL FRAMEWORKS")
146
+ print("=" * 80 + "\n")
147
+
148
+ results = []
149
+ all_pairs = []
150
+ all_triplets = []
151
+
152
+ for i, (chunks_path, framework) in enumerate(chunk_files, 1):
153
+ print(f"\n[{i}/{len(chunk_files)}] Processing {framework.upper()}...")
154
+ print("-" * 60)
155
+
156
+ output_dir = PROJECT_ROOT / "data" / "processed" / f"training_{framework}"
157
+
158
+ try:
159
+ pairs, triplets = generate_pairs_and_triplets(
160
+ chunks_path=chunks_path,
161
+ output_dir=output_dir,
162
+ num_pairs=100,
163
+ num_triplets=100,
164
+ variance=5,
165
+ export_format="both" # JSON + JSONL
166
+ )
167
+
168
+ # Collect for combined dataset
169
+ all_pairs.append(pairs)
170
+ all_triplets.append(triplets)
171
+
172
+ results.append({
173
+ "framework": framework,
174
+ "status": "✅ SUCCESS",
175
+ "pairs": len(pairs),
176
+ "variations": sum(len(p.variations) for p in pairs),
177
+ "triplets": len(triplets),
178
+ "output": output_dir
179
+ })
180
+
181
+ except Exception as e:
182
+ results.append({
183
+ "framework": framework,
184
+ "status": f"❌ FAILED: {str(e)}",
185
+ "output": output_dir
186
+ })
187
+
188
+ # Create combined dataset
189
+ print("\n" + "=" * 80)
190
+ print("🔗 CREATING COMBINED DATASET (ALL FRAMEWORKS)")
191
+ print("=" * 80 + "\n")
192
+
193
+ combined_dir = PROJECT_ROOT / "data" / "processed" / "training_combined"
194
+ total_pairs, total_triplets = merge_datasets(all_pairs, all_triplets, combined_dir)
195
+
196
+ # Final summary
197
+ print("\n" + "=" * 80)
198
+ print("📊 FINAL SUMMARY")
199
+ print("=" * 80 + "\n")
200
+
201
+ print("INDIVIDUAL FRAMEWORK DATASETS:")
202
+ print("-" * 40)
203
+ for result in results:
204
+ print(f"\n📦 {result['framework'].upper()}")
205
+ print(f" Status: {result['status']}")
206
+ if "pairs" in result:
207
+ print(f" - positive_pairs.json: {result['pairs']} docs ({result['variations']} variations)")
208
+ print(f" - triplets.json: {result['triplets']} docs")
209
+ print(f" 📁 {result['output']}")
210
+
211
+ print("\n\nCOMBINED DATASET (ALL FRAMEWORKS):")
212
+ print("-" * 40)
213
+ print(f"📁 {combined_dir}")
214
+ print(f" - positive_pairs.json: {total_pairs} docs")
215
+ print(f" - triplets.json: {total_triplets} docs")
216
+
217
+ # File count summary
218
+ successful = sum(1 for r in results if "SUCCESS" in r["status"])
219
+ total_files = (successful * 4) + 4 # 4 per framework + 4 combined
220
+
221
+ print(f"\n\n📄 TOTAL FILES GENERATED: {total_files}")
222
+ print(f" - {successful} frameworks × 4 files = {successful * 4} files")
223
+ print(f" - Combined dataset = 4 files")
224
+ print("=" * 80)
225
+
226
+
227
+ if __name__ == "__main__":
228
+ main()
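A small sanity check on the merged output, assuming the script has been run from the project root; only line counts are inspected here because the record fields come from the `PositivePair` and `Triplet` dataclasses defined elsewhere.

```python
import json
from pathlib import Path

combined = Path("data/processed/training_combined")

for name in ("positive_pairs.jsonl", "triplets.jsonl"):
    path = combined / name
    if not path.exists():
        print(f"missing: {path}")
        continue
    with path.open(encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    print(f"{name}: {len(records)} records")
```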
scripts/run_pairs_triplets_pipeline.py ADDED
@@ -0,0 +1,120 @@
1
+ """
2
+ Script to generate positive pairs and triplets from code chunks.
3
+
4
+ This script loads code chunks and generates:
5
+ 1. Positive Pairs: (question, code) with 4-5 variations per sample
6
+ 2. Triplets: (anchor_question, positive_code, negative_code)
7
+
8
+ Usage:
9
+ python -m scripts.run_pairs_triplets_pipeline --chunks <path> --output <dir>
10
+ python -m scripts.run_pairs_triplets_pipeline --help
11
+
12
+ Examples:
13
+ # Generate from local chunks with default settings
14
+ python -m scripts.run_pairs_triplets_pipeline \\
15
+ --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
16
+ --output data/processed/training
17
+
18
+ # Generate from repository chunks
19
+ python -m scripts.run_pairs_triplets_pipeline \\
20
+ --chunks data/processed/repos/langgraph_20260116_123638/langgraph_chunks.jsonl \\
21
+ --output data/processed/training/langgraph
22
+
23
+ # Custom settings
24
+ python -m scripts.run_pairs_triplets_pipeline \\
25
+ --chunks data/processed/chunks/Local_saved_files/chunks.jsonl \\
26
+ --output data/processed/training \\
27
+ --pairs 100 --triplets 100 --variance 5
28
+ """
29
+
30
+ import sys
31
+ from pathlib import Path
32
+
33
+ # Add project root to path
34
+ PROJECT_ROOT = Path(__file__).parent.parent
35
+ sys.path.insert(0, str(PROJECT_ROOT))
36
+
37
+ from src.task_3_data_engineering.export.pairs_triplets_generator import (
38
+ generate_pairs_and_triplets,
39
+ main as cli_main
40
+ )
41
+
42
+
43
+ def run_default_pipeline():
44
+ """Run with default settings for the available chunks."""
45
+
46
+ # Try multiple possible chunk locations
47
+ possible_paths = [
48
+ PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl",
49
+ PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl",
50
+ ]
51
+
52
+ # Find all chunks.jsonl files in chunks folder subdirectories
53
+ chunks_dir = PROJECT_ROOT / "data" / "processed" / "chunks"
54
+ if chunks_dir.exists():
55
+ for subdir in chunks_dir.iterdir():
56
+ if subdir.is_dir():
57
+ chunks_file = subdir / "chunks.jsonl"
58
+ if chunks_file.exists() and chunks_file not in possible_paths:
59
+ possible_paths.append(chunks_file)
60
+
61
+ # Find repository chunks
62
+ repos_dir = PROJECT_ROOT / "data" / "processed" / "repos"
63
+ if repos_dir.exists():
64
+ for repo_dir in repos_dir.iterdir():
65
+ if repo_dir.is_dir():
66
+ for jsonl_file in repo_dir.glob("*_chunks.jsonl"):
67
+ possible_paths.append(jsonl_file)
68
+
69
+ chunks_path = None
70
+ for path in possible_paths:
71
+ if path.exists():
72
+ chunks_path = path
73
+ break
74
+
75
+ if chunks_path is None:
76
+ print("❌ No chunks files found. Please specify a chunks file with --chunks")
77
+ print("\nPossible locations checked:")
78
+ for p in possible_paths[:5]:
79
+ print(f" - {p}")
80
+ return
81
+
82
+ output_dir = PROJECT_ROOT / "data" / "processed" / "training"
83
+
84
+ print("=" * 60)
85
+ print("🚀 Positive Pairs & Triplets Generator")
86
+ print("=" * 60)
87
+ print(f"\n📂 Chunks Path: {chunks_path}")
88
+ print(f"📁 Output Dir: {output_dir}")
89
+ print(f"📊 Settings: pairs=100, triplets=100, variance=5")
90
+ print("\n" + "-" * 60)
91
+
92
+ pairs, triplets = generate_pairs_and_triplets(
93
+ chunks_path=chunks_path,
94
+ output_dir=output_dir,
95
+ num_pairs=100,
96
+ num_triplets=100,
97
+ variance=5,
98
+ export_format="both"
99
+ )
100
+
101
+ print("\n" + "=" * 60)
102
+ print("✅ Pipeline Complete!")
103
+ print("=" * 60)
104
+ print(f"\n📁 Output files saved to: {output_dir}")
105
+ print(" - positive_pairs.jsonl")
106
+ print(" - positive_pairs.json")
107
+ print(" - triplets.jsonl")
108
+ print(" - triplets.json")
109
+
110
+
111
+ if __name__ == "__main__":
112
+ import argparse
113
+
114
+ # Check if any arguments provided
115
+ if len(sys.argv) > 1:
116
+ # Use CLI with provided arguments
117
+ cli_main()
118
+ else:
119
+ # Run with defaults
120
+ run_default_pipeline()
scripts/run_python_pipeline.py ADDED
@@ -0,0 +1,131 @@
1
+ """
2
+ Local Codebase Pipeline Runner - Processes local codebases for dataset creation.
3
+
4
+ This is the main entry point for processing LOCAL CODEBASES (not Git repos).
5
+ It orchestrates the entire chunking pipeline for local files, handling both
6
+ code files and documentation with intelligent fallback strategies.
7
+
8
+ ARCHITECTURE POSITION:
9
+ - Local Pipeline Orchestrator: Coordinates local file processing
10
+ - Fallback Handler: Intelligent fallback from code to documentation
11
+ - Dataset Exporter: Creates final JSONL datasets with statistics
12
+
13
+ KEY FEATURES:
14
+ 1. Unified processing of Python files and documentation
15
+ 2. Intelligent fallback (failed code chunking → documentation chunking)
16
+ 3. Hierarchical chunking for Python files
17
+ 4. Documentation-aware chunking for markdown/text files
18
+ 5. Dataset statistics and metadata generation
19
+
20
+ DATA FLOW:
21
+ Local files → Type detection → Python chunking (or fallback) →
22
+ Documentation chunking → JSONL export → Statistics
23
+
24
+ USE CASES:
25
+ - Processing locally saved code examples
26
+ - Creating datasets from example repositories
27
+ - Testing chunking strategies on local files
28
+
29
+ USAGE:
30
+ python run_python_pipeline.py --name crewai_examples --include crewai
31
+ python run_python_pipeline.py --name test_dataset --exclude large_repos
32
+ """
33
+
34
+ from pathlib import Path
35
+ import json
36
+ import argparse
37
+
38
+ from src.task_3_data_engineering.chunking.hierarchical_chunker import HierarchicalChunker
39
+ from src.task_3_data_engineering.export.jsonl_exporter import export_chunks_jsonl
40
+ from src.task_3_data_engineering.analysis.dataset_stats import compute_dataset_stats
41
+ from src.task_3_data_engineering.export.dataset_metadata import write_dataset_metadata
42
+ from src.task_3_data_engineering.chunking.doc_chunker import chunk_document, wrap_doc_chunks
43
+
44
+
45
+ INPUT_DIR = Path("data/raw/codebases")
46
+ BASE_OUTPUT_DIR = Path("data/processed/chunks")
47
+
48
+ DOC_EXTS = {".md", ".txt", ".rst"}
49
+
50
+
51
+ def run(dataset_name: str, include: list[str] | None, exclude: list[str] | None):
52
+ output_dir = BASE_OUTPUT_DIR / dataset_name
53
+ output_dir.mkdir(parents=True, exist_ok=True)
54
+
55
+ chunker = HierarchicalChunker()
56
+ all_chunks = []
57
+
58
+ files = [p for p in INPUT_DIR.rglob("*") if p.is_file()]
59
+
60
+ for file_path in files:
61
+ rel = file_path.relative_to(INPUT_DIR).parts
62
+ if include and rel[0] not in include:
63
+ continue
64
+ if exclude and rel[0] in exclude:
65
+ continue
66
+
67
+ print(f"Processing: {file_path}")
68
+
69
+ # ---- Python files ----
70
+ if file_path.suffix == ".py":
71
+ try:
72
+ code_chunks = chunker.chunk_file(file_path)
73
+ if code_chunks:
74
+ all_chunks.extend(code_chunks)
75
+ continue
76
+ except Exception:
77
+ pass # fallback to doc mode
78
+
79
+ # ---- Documentation / text ----
80
+ if file_path.suffix.lower() in DOC_EXTS or file_path.suffix == ".py":
81
+ try:
82
+ raw_text = file_path.read_text(encoding="utf-8", errors="ignore")
83
+ except Exception:
84
+ continue
85
+
86
+ if not raw_text.strip():
87
+ continue
88
+
89
+ doc_chunks = chunk_document(
90
+ raw_text=raw_text,
91
+ source_name=str(file_path),
92
+ source_url=None,
93
+ )
94
+
95
+ all_chunks.extend(wrap_doc_chunks(doc_chunks))
96
+
97
+ # ---- Export ----
98
+ export_chunks_jsonl(all_chunks, output_dir / "chunks.jsonl", print_stats=True)
99
+
100
+ stats = compute_dataset_stats(all_chunks)
101
+
102
+ primary = [c for c in all_chunks if c.hierarchy.is_primary]
103
+ stats["hierarchy"] = {
104
+ "primary_chunks": len(primary),
105
+ "secondary_chunks": len(all_chunks) - len(primary),
106
+ }
107
+
108
+ with (output_dir / "dataset_stats.json").open("w", encoding="utf-8") as f:
109
+ json.dump(stats, f, indent=2)
110
+
111
+ write_dataset_metadata(
112
+ chunks=all_chunks,
113
+ output_path=output_dir / "dataset_metadata.json",
114
+ dataset_name=dataset_name,
115
+ dataset_version="v1",
116
+ )
117
+
118
+ print("\n✅ Dataset built successfully")
119
+ print(f" - Files: {len({c.file_path for c in all_chunks})}")
120
+ print(f" - Chunks: {len(all_chunks)}")
121
+ print(f" - Output: {output_dir}")
122
+
123
+
124
+ if __name__ == "__main__":
125
+ parser = argparse.ArgumentParser()
126
+ parser.add_argument("--name", required=True)
127
+ parser.add_argument("--include", nargs="+")
128
+ parser.add_argument("--exclude", nargs="+")
129
+ args = parser.parse_args()
130
+
131
+ run(args.name, args.include, args.exclude)
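The local pipeline above can also be driven programmatically rather than via the CLI. A minimal sketch (not part of the commit), assuming the script and its `src.task_3_data_engineering` dependencies are importable; the dataset name and include list are hypothetical:

```python
# Hypothetical programmatic invocation of the local pipeline
# (assumes scripts/ is on PYTHONPATH and the imported packages are installed).
from run_python_pipeline import run  # hypothetical import path

run(
    dataset_name="crewai_examples",
    include=["crewai"],   # keep only this top-level folder under data/raw/codebases
    exclude=None,         # nothing excluded
)
# Output lands in data/processed/chunks/crewai_examples/
# (chunks.jsonl, dataset_stats.json, dataset_metadata.json)
```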
scripts/run_repo_pipeline.py ADDED
@@ -0,0 +1,289 @@
1
+ """
2
+ Git Repository Pipeline Runner - Processes Git repositories at scale.
3
+
4
+ This is the main entry point for processing GIT REPOSITORIES. It provides
5
+ enhanced features for repository analysis, including git metadata extraction,
6
+ agentic framework detection, and comprehensive statistics generation.
7
+
8
+ ARCHITECTURE POSITION:
9
+ - Repository Pipeline Orchestrator: Coordinates Git repo processing
10
+ - Enhanced Metadata Collector: Extracts git history and agentic patterns
11
+ - Production Pipeline: Handles large repositories with performance tracking
12
+
13
+ KEY FEATURES:
14
+ 1. Complete repository processing with git metadata
15
+ 2. Extension-aware filtering (None = full repository)
16
+ 3. Performance tracking (files/sec, chunks/sec)
17
+ 4. Agentic framework detection (via RepoMetadataExtractor)
18
+ 5. Comprehensive output (JSONL chunks + metadata + statistics)
19
+
20
+ DATA FLOW:
21
+ Repo URL → Clone → Metadata extraction → File listing → Chunking →
22
+ Enhanced export → Statistics → Comprehensive output package
23
+
24
+ USE CASES:
25
+ - Processing complete Git repositories for training data
26
+ - Creating agentic-aware datasets
27
+ - Benchmarking chunking performance
28
+ - Production dataset generation
29
+
30
+ USAGE:
31
+ python run_repo_pipeline.py single https://github.com/crewAIInc/crewAI
32
+ python run_repo_pipeline.py single https://github.com/microsoft/autogen --extensions .py .md
33
+ python run_repo_pipeline.py single https://github.com/langchain-ai/langchain --max-files 1000
34
+ """
35
+
36
+ from pathlib import Path
37
+ import json
38
+ from typing import Dict, Any, Optional, Set, List
39
+ import argparse
40
+ import time
41
+ from datetime import datetime
42
+
43
+ # Import enhanced components
44
+ from src.task_3_data_engineering.ingestion.git_crawler import GitCrawler
45
+ from src.task_3_data_engineering.ingestion.repo_metadata import RepoMetadataExtractor
46
+ from src.task_3_data_engineering.chunking.repo_chunker import RepoChunker
47
+ from src.task_3_data_engineering.analysis.dataset_stats import compute_dataset_stats
48
+ from src.task_3_data_engineering.export.enhanced_jsonl_exporter import export_repo_chunks_jsonl
49
+
50
+
51
+ class EnhancedRepoPipeline:
52
+ """Enhanced pipeline with agentic focus"""
53
+
54
+ def __init__(
55
+ self,
56
+ output_base: Path = Path("data/processed/repos"),
57
+ use_hierarchical: bool = True,
58
+ ):
59
+ self.crawler = GitCrawler()
60
+ self.chunker = RepoChunker(use_hierarchical=use_hierarchical)
61
+ self.output_base = output_base
62
+ self.output_base.mkdir(parents=True, exist_ok=True)
63
+
64
+ def process_repository(
65
+ self,
66
+ repo_url: str,
67
+ extensions: Optional[Set[str]] = None,
68
+ output_name: Optional[str] = None,
69
+ include_binary: bool = False,
70
+ max_files: Optional[int] = None,
71
+ skip_git_metadata: bool = False,
72
+ ) -> Dict[str, Any]:
73
+ """
74
+ Process repository with enhanced features
75
+
76
+ IMPORTANT FIX:
77
+ - extensions=None => FULL repository (no filtering)
78
+ - extensions={".py", ...} => filtered repository (only files with those extensions)
79
+ """
80
+
81
+ start_time = time.time()
82
+ print(f"🚀 Processing repository: {repo_url}")
83
+ print("-" * 60)
84
+
85
+ # 1. Clone repository
86
+ repo_path = self.crawler.clone_repository(repo_url)
87
+ if not repo_path:
88
+ raise RuntimeError(f"Failed to clone {repo_url}")
89
+
90
+ # 2. Determine output name
91
+ if not output_name:
92
+ output_name = repo_path.name
93
+
94
+ # 3. Log extension behavior (FIXED)
95
+ if extensions:
96
+ print(f"📁 Extension filter enabled: {sorted(extensions)}")
97
+ else:
98
+ print("📁 No extension filter → processing FULL repository")
99
+
100
+ # 4. Extract repository metadata
101
+ print("📊 Extracting repository metadata...")
102
+ metadata = {}
103
+
104
+ if not skip_git_metadata:
105
+ extractor = RepoMetadataExtractor(repo_path)
106
+ metadata = extractor.extract_comprehensive_metadata()
107
+
108
+ # 5. List files (CORE LOGIC UNCHANGED)
109
+ print("📁 Listing repository files...")
110
+ file_infos, file_stats = self.crawler.list_files_with_info(
111
+ repo_path,
112
+ extensions=extensions, # None => full repo
113
+ skip_binary=not include_binary,
114
+ )
115
+
116
+ # 6. Optional file limiting
117
+ if max_files and len(file_infos) > max_files:
118
+ print(f"⚠️ Limiting to {max_files} files (out of {len(file_infos)})")
119
+ file_infos = file_infos[:max_files]
120
+
121
+ print(f"📊 Found {len(file_infos)} files to process")
122
+
123
+ # 7. Create output directory
124
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
125
+ output_dir = self.output_base / f"{output_name}_{timestamp}"
126
+ output_dir.mkdir(parents=True, exist_ok=True)
127
+
128
+ # 8. Repository-level metadata
129
+ # Get actual repo name from metadata
130
+ actual_repo_name = metadata.get("basic", {}).get("repo_name", output_name)
131
+
132
+ repo_metadata = {
133
+ "repo_url": repo_url,
134
+ "repo_name": actual_repo_name, # ✅ Use actual repo name
135
+ "folder_name": output_name, # ✅ Track user's folder
136
+ "local_path": str(repo_path),
137
+ "extensions_included": list(extensions) if extensions else "ALL",
138
+ "timestamp": timestamp,
139
+ **metadata,
140
+ }
141
+
142
+ metadata_file = output_dir / "repository_metadata.json"
143
+ with open(metadata_file, "w", encoding="utf-8") as f:
144
+ json.dump(repo_metadata, f, indent=2, default=str)
145
+
146
+ # 9. Chunk processing
147
+ all_chunks = []
148
+ processing_stats = {
149
+ "total_files": len(file_infos),
150
+ "processed": 0,
151
+ "failed": 0,
152
+ "file_types": {},
153
+ "chunk_types": {},
154
+ }
155
+
156
+ print("\n🔧 Processing files...")
157
+ print("-" * 60)
158
+
159
+ for idx, file_info in enumerate(file_infos, start=1):
160
+ try:
161
+ if idx % 10 == 0:
162
+ print(f" [{idx}/{len(file_infos)}] Processing...")
163
+
164
+ file_metadata = {
165
+ **repo_metadata,
166
+ "file_info": {
167
+ "relative_path": file_info.relative_path,
168
+ "size_bytes": file_info.size,
169
+ "extension": file_info.extension,
170
+ "is_binary": file_info.is_binary,
171
+ },
172
+ }
173
+
174
+ chunks = self.chunker.chunk_file(
175
+ file_info.path,
176
+ file_metadata,
177
+ )
178
+
179
+ all_chunks.extend(chunks)
180
+ processing_stats["processed"] += 1
181
+ processing_stats["file_types"][file_info.extension] = (
182
+ processing_stats["file_types"].get(file_info.extension, 0) + 1
183
+ )
184
+
185
+ for chunk in chunks:
186
+ ct = chunk.chunk_type
187
+ processing_stats["chunk_types"][ct] = (
188
+ processing_stats["chunk_types"].get(ct, 0) + 1
189
+ )
190
+
191
+ except Exception as e:
192
+ print(f"⚠️ Error processing {file_info.relative_path}: {str(e)[:120]}")
193
+ processing_stats["failed"] += 1
194
+
195
+ # 10. Export chunks
196
+ print("\n💾 Exporting chunks...")
197
+ output_file = output_dir / f"{output_name}_chunks.jsonl"
198
+
199
+ export_repo_chunks_jsonl(
200
+ chunks=all_chunks,
201
+ output_path=output_file,
202
+ repo_metadata=repo_metadata,
203
+ print_stats=True,
204
+ )
205
+
206
+ # 11. Compute statistics
207
+ print("📈 Computing statistics...")
208
+ chunk_stats = compute_dataset_stats(all_chunks)
209
+
210
+ total_time = time.time() - start_time
211
+
212
+ final_stats = {
213
+ "repository_info": {
214
+ "name": actual_repo_name, # ✅ USE actual_repo_name
215
+ "folder_name": output_name, # ✅ ADD folder_name field
216
+ "url": repo_url,
217
+ "path": str(repo_path),
218
+ "timestamp": timestamp,
219
+ },
220
+ "processing_stats": processing_stats,
221
+ "chunk_statistics": chunk_stats,
222
+ "performance": {
223
+ "total_time_seconds": round(total_time, 2),
224
+ "files_per_second": round(len(file_infos) / total_time, 2),
225
+ "chunks_per_second": round(len(all_chunks) / total_time, 2),
226
+ },
227
+ "output_files": {
228
+ "chunks": str(output_file),
229
+ "metadata": str(metadata_file),
230
+ },
231
+ }
232
+
233
+ stats_file = output_dir / f"{output_name}_stats.json"
234
+ with open(stats_file, "w", encoding="utf-8") as f:
235
+ json.dump(final_stats, f, indent=2)
236
+
237
+ # 12. Summary
238
+ print("\n" + "=" * 70)
239
+ print("✅ REPOSITORY PROCESSING COMPLETE")
240
+ print("=" * 70)
241
+ print(f"📁 Repository: {output_name}")
242
+ print(f"📄 Files: {len(file_infos)}")
243
+ print(f"🧩 Chunks: {len(all_chunks)}")
244
+ print(f"⏱️ Time: {final_stats['performance']['total_time_seconds']}s")
245
+ print(f"💾 Output: {output_dir}")
246
+ print("=" * 70)
247
+
248
+ return final_stats
249
+
250
+
251
+ def main():
252
+ """Enhanced CLI for repository pipeline (FIXED)"""
253
+
254
+ parser = argparse.ArgumentParser(
255
+ description="Process Git repositories for agentic datasets"
256
+ )
257
+
258
+ subparsers = parser.add_subparsers(dest="command", required=True)
259
+
260
+ # ---- Single repo ----
261
+ single = subparsers.add_parser("single", help="Process single repository")
262
+ single.add_argument("repo_url", help="Git repository URL")
263
+ single.add_argument("--name", help="Custom output name")
264
+ single.add_argument(
265
+ "--extensions",
266
+ nargs="+",
267
+ default=None,
268
+ help="Optional file extensions (.py .md). If omitted, FULL repo is processed.",
269
+ )
270
+ single.add_argument("--max-files", type=int, help="Limit number of files")
271
+ single.add_argument("--skip-git-metadata", action="store_true")
272
+ single.add_argument("--include-binary", action="store_true")
273
+
274
+ args = parser.parse_args()
275
+ pipeline = EnhancedRepoPipeline()
276
+
277
+ if args.command == "single":
278
+ pipeline.process_repository(
279
+ repo_url=args.repo_url,
280
+ output_name=args.name,
281
+ extensions=set(args.extensions) if args.extensions else None,
282
+ max_files=args.max_files,
283
+ skip_git_metadata=args.skip_git_metadata,
284
+ include_binary=args.include_binary,
285
+ )
286
+
287
+
288
+ if __name__ == "__main__":
289
+ main()
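The `single` CLI command wraps `EnhancedRepoPipeline.process_repository`. A minimal sketch of calling it directly (not part of the commit), assuming the module and its dependencies are importable; the extension set, file cap, and flags are hypothetical:

```python
# Hypothetical programmatic use of EnhancedRepoPipeline.
from pathlib import Path
from run_repo_pipeline import EnhancedRepoPipeline  # hypothetical import path

pipeline = EnhancedRepoPipeline(output_base=Path("data/processed/repos"))

# extensions=None -> the FULL repository is processed (no filtering)
full_stats = pipeline.process_repository("https://github.com/crewAIInc/crewAI")

# extensions={...} -> only matching files; max_files caps the run for quick tests
filtered_stats = pipeline.process_repository(
    "https://github.com/crewAIInc/crewAI",
    extensions={".py", ".md"},
    max_files=500,
    skip_git_metadata=True,  # skip git-history extraction for a faster dry run
)
print(filtered_stats["performance"])  # total_time_seconds, files_per_second, chunks_per_second
```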
scripts/triplets_synthesis.py ADDED
@@ -0,0 +1,259 @@
1
+ """Synthesize triplet and positive pair datasets from chunked code files."""
2
+
3
+
4
+ import argparse
5
+ import json
6
+ import random
7
+ import hashlib
8
+ from pathlib import Path
9
+ from typing import Dict, List
10
+ from datetime import datetime
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+
14
+
15
+ # ============================
16
+ # CONFIG
17
+ # ============================
18
+
19
+ MAX_DOCUMENTS = 200
20
+ POSITIVE_VARIANTS = 5
21
+ TFIDF_MAX_FEATURES = 5000
22
+ RANDOM_SEED = 42
23
+
24
+ BASE_OUTPUT_DIR = Path("data/synthetic")
25
+
26
+ random.seed(RANDOM_SEED)
27
+
28
+
29
+ # ============================
30
+ # UTILITIES
31
+ # ============================
32
+
33
+ def load_chunks(file_path):
34
+ path = Path(file_path)
35
+
36
+ if path.suffix == ".jsonl":
37
+ chunks = []
38
+ with open(path, "r", encoding="utf-8") as f:
39
+ for line_no, line in enumerate(f, 1):
40
+ line = line.strip()
41
+ if not line:
42
+ continue
43
+ try:
44
+ chunks.append(json.loads(line))
45
+ except json.JSONDecodeError as e:
46
+ raise ValueError(
47
+ f"Invalid JSON on line {line_no} in {path}"
48
+ ) from e
49
+ return chunks
50
+
51
+ elif path.suffix == ".json":
52
+ with open(path, "r", encoding="utf-8") as f:
53
+ data = json.load(f)
54
+ if not isinstance(data, list):
55
+ raise ValueError(f"{path} must contain a list of chunks")
56
+ return data
57
+
58
+ else:
59
+ raise ValueError(
60
+ f"Unsupported file format {path.suffix}. Use .json or .jsonl"
61
+ )
62
+
63
+
64
+
65
+ def save_jsonl(path: Path, records: List[Dict]):
66
+ path.parent.mkdir(parents=True, exist_ok=True)
67
+ with path.open("w", encoding="utf-8") as f:
68
+ for r in records:
69
+ f.write(json.dumps(r, ensure_ascii=False) + "\n")
70
+
71
+
72
+ def save_json(path: Path, data):
73
+ path.parent.mkdir(parents=True, exist_ok=True)
74
+ with path.open("w", encoding="utf-8") as f:
75
+ json.dump(data, f, indent=2)
76
+
77
+
78
+ def stable_document_id(chunk: Dict, idx: int) -> str:
79
+ """
80
+ Generate a canonical, stable document_id.
81
+ """
82
+ base = f"{chunk.get('file_path','unknown')}::{idx}"
83
+ return "doc_" + hashlib.sha1(base.encode()).hexdigest()
84
+
85
+
86
+ def infer_framework(input_path: Path) -> str:
87
+ """
88
+ Infer framework from path (fallback-safe).
89
+ """
90
+ parts = [p.lower() for p in input_path.parts]
91
+ for fw in ["crewai", "langchain", "langgraph", "autogen"]:
92
+ if fw in parts:
93
+ return fw
94
+ return "unknown"
95
+
96
+
97
+ # ============================
98
+ # ANCHOR GENERATION (LLM PLACEHOLDER)
99
+ # ============================
100
+
101
+ def generate_anchor_questions(code: str, n: int) -> List[str]:
102
+ """
103
+ Deterministic placeholder (LLM-ready).
104
+ """
105
+ symbol = code.split("(")[0].replace("def ", "").replace("class ", "").strip()
106
+
107
+ templates = [
108
+ f"How does {symbol} work in Python?",
109
+ f"How to implement {symbol}?",
110
+ f"Example usage of {symbol}",
111
+ f"Explain the {symbol} logic",
112
+ f"Best practices for {symbol}",
113
+ ]
114
+
115
+ random.shuffle(templates)
116
+ return templates[:n]
117
+
118
+
119
+ # ============================
120
+ # NEGATIVE MINING
121
+ # ============================
122
+
123
+ def build_tfidf(chunks: List[Dict]):
124
+ corpus = [c["code"] for c in chunks]
125
+ vectorizer = TfidfVectorizer(
126
+ stop_words="english",
127
+ max_features=TFIDF_MAX_FEATURES
128
+ )
129
+ matrix = vectorizer.fit_transform(corpus)
130
+ return vectorizer, matrix
131
+
132
+
133
+ def mine_hard_negative(
134
+ anchor: str,
135
+ positive_idx: int,
136
+ chunks: List[Dict],
137
+ vectorizer,
138
+ matrix,
139
+ ) -> Dict:
140
+ query_vec = vectorizer.transform([anchor])
141
+ scores = cosine_similarity(query_vec, matrix)[0]
142
+
143
+ ranked = sorted(
144
+ [(i, s) for i, s in enumerate(scores)],
145
+ key=lambda x: x[1],
146
+ reverse=True,
147
+ )
148
+
149
+ for idx, _ in ranked:
150
+ if idx != positive_idx:
151
+ return chunks[idx]
152
+
153
+ raise RuntimeError("No negative candidate found")
154
+
155
+
156
+ # ============================
157
+ # MAIN PIPELINE
158
+ # ============================
159
+
160
+ def generate_datasets(input_path: Path, run_name: str):
161
+ output_dir = BASE_OUTPUT_DIR / run_name
162
+ framework = infer_framework(input_path)
163
+
164
+ chunks = load_chunks(input_path)
165
+ # Filter only semantic code chunks
166
+ chunks = [
167
+ c for c in chunks
168
+ if c.get("chunk_type") in {"class", "method", "function"}
169
+ and "code" in c
170
+ ]
171
+
172
+ random.shuffle(chunks)
173
+ chunks = chunks[:MAX_DOCUMENTS]
174
+
175
+ # Assign canonical document_id
176
+ for idx, c in enumerate(chunks):
177
+ c["document_id"] = stable_document_id(c, idx)
178
+
179
+ vectorizer, matrix = build_tfidf(chunks)
180
+
181
+ positive_pairs = []
182
+ triplets = []
183
+
184
+ for idx, chunk in enumerate(chunks):
185
+ code = chunk["code"]
186
+ doc_id = chunk["document_id"]
187
+
188
+ # -------- POSITIVE PAIRS --------
189
+ anchors = generate_anchor_questions(code, POSITIVE_VARIANTS)
190
+ for a in anchors:
191
+ positive_pairs.append({
192
+ "document_id": doc_id,
193
+ "anchor": a,
194
+ "positive": code,
195
+ "framework": framework,
196
+ "source": "synthetic_positive_v2",
197
+ })
198
+
199
+ # -------- TRIPLET --------
200
+ anchor = anchors[0]
201
+ negative_chunk = mine_hard_negative(
202
+ anchor, idx, chunks, vectorizer, matrix
203
+ )
204
+
205
+ triplets.append({
206
+ "document_id": doc_id,
207
+ "anchor": anchor,
208
+ "positive": code,
209
+ "negative": negative_chunk["code"],
210
+ "framework": framework,
211
+ "source": "synthetic_triplet_v2",
212
+ })
213
+
214
+ # -------- SAVE --------
215
+ save_jsonl(output_dir / "positive_pairs.jsonl", positive_pairs)
216
+ save_jsonl(output_dir / "triplets.jsonl", triplets)
217
+
218
+ save_json(output_dir / "positive_pairs.json", positive_pairs)
219
+ save_json(output_dir / "triplets.json", triplets)
220
+
221
+ metadata = {
222
+ "name": run_name,
223
+ "framework": framework,
224
+ "input_file": str(input_path),
225
+ "num_chunks": len(chunks),
226
+ "positive_pairs": len(positive_pairs),
227
+ "triplets": len(triplets),
228
+ "created_at": datetime.utcnow().isoformat(),
229
+ "random_seed": RANDOM_SEED,
230
+ }
231
+
232
+ save_json(output_dir / "metadata.json", metadata)
233
+
234
+ print(f"✅ Dataset generated at: {output_dir}")
235
+
236
+
237
+ # ============================
238
+ # ENTRY POINT
239
+ # ============================
240
+
241
+ if __name__ == "__main__":
242
+ parser = argparse.ArgumentParser()
243
+ parser.add_argument("--input", required=True, help="Chunked JSONL file")
244
+ parser.add_argument("--name", required=True, help="Synthetic dataset name")
245
+
246
+ args = parser.parse_args()
247
+
248
+ generate_datasets(
249
+ input_path=Path(args.input),
250
+ run_name=args.name,
251
+ )
252
+
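The saved `triplets.jsonl` is line-delimited JSON with the fields assembled in `generate_datasets()`. A minimal sketch of a downstream consumer (not part of the commit); the run name is hypothetical:

```python
# Hypothetical consumer of the generated files; field names match the
# records built above (document_id / anchor / positive / negative / framework / source).
import json
from pathlib import Path

run_dir = Path("data/synthetic/crewai_run")  # hypothetical --name value
with (run_dir / "triplets.jsonl").open(encoding="utf-8") as f:
    triplets = [json.loads(line) for line in f if line.strip()]

for t in triplets[:3]:
    print(t["document_id"], "|", t["anchor"][:60], "| framework:", t["framework"])
```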
253
+ # Proposed canonical scheme for document_id:
254
+
255
+ # document_id := sha1(
256
+ # normalized_repo_path +
257
+ # file_path +
258
+ # top_level_symbol
259
+ # )
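The comment above proposes a more canonical `document_id` than the index-based `stable_document_id()` used in this script. A minimal sketch of that proposal (not part of the commit); the inputs `normalized_repo_path` and `top_level_symbol` are assumed to be supplied by the caller:

```python
# Hypothetical helper implementing the proposed scheme above; the script
# currently uses stable_document_id(chunk, idx) instead.
import hashlib

def canonical_document_id(normalized_repo_path: str, file_path: str, top_level_symbol: str) -> str:
    """Stable ID derived from repo path, file path, and top-level symbol."""
    base = f"{normalized_repo_path}::{file_path}::{top_level_symbol}"
    return "doc_" + hashlib.sha1(base.encode("utf-8")).hexdigest()

# Example (hypothetical values):
# canonical_document_id("crewAIInc/crewAI", "src/crewai/agent.py", "Agent")
# -> "doc_<40-hex-sha1>", stable across re-chunking, unlike a positional index.
```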