sayedM committed
Commit f9618f8 · verified · 1 Parent(s): 95d7faf

Upload 9 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ code_faiss.index filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,170 @@
+ # app.py
+ import gradio as gr
+ import requests
+ import json
+ import os
+ from retrive_docs import load_faiss_index_and_metadata, retrieve_relevant_chunks, print_results
+
+ # --- CONFIGURATION ---
+ INDEX_PATH = "code_faiss.index"
+ METADATA_PATH = "code_metadata.json"
+ CHUNKS_JSON_PATH = "code_chunks.json"
+ EMBEDDING_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
+ # TOP_K has been removed from here; it is now a user input in the UI
+
+ # --- SYSTEM PROMPT ---
+ # This prompt is crucial for guiding the LLM's behavior.
+ SYSTEM_PROMPT = """
+ You are an expert software developer and technical analyst. Your task is to help a user understand a codebase and debug potential issues.
+
+ You have been provided with a user's question and a set of the most relevant code chunks retrieved from the codebase based on their query.
+
+ Your mission is to synthesize this information and provide a clear, accurate, and helpful response.
+
+ Follow these instructions carefully:
+ 1. **Analyze the Goal:** First, understand the user's primary goal. Are they reporting a bug, asking for an explanation, or trying to understand how something works?
+ 2. **Base Your Answer on Provided Context:** Your primary source of truth is the retrieved code chunks. Ground your entire analysis in the code provided. Do not invent functionality or assume the existence of code that is not present in the context.
+ 3. **Directly Address the Query:** Directly answer the user's question. If the context contains a definitive answer (e.g., a warning message about a known bug), state it clearly and quote the relevant code.
+ 4. **Synthesize and Hypothesize:** If the answer is not immediately obvious, synthesize information from multiple chunks. Form a hypothesis about the cause of the bug or the functionality in question, explaining your reasoning by referencing specific lines of code.
+ 5. **Provide Actionable Recommendations:** Conclude with clear, actionable advice. This could be a suggested code change, a command to run, or a recommendation to avoid a specific feature based on the evidence in the code.
+ 6. **Acknowledge Limitations:** If the provided code chunks are insufficient to fully answer the question, state this clearly. Explain what additional information would be needed.
+ 7. **Structure Your Response:** Format your response using Markdown for readability. Use code blocks for code snippets and bold text to highlight key findings.
+ 8. **Cite Your Sources at the End:** To maintain trust, list where each key finding came from (for example, the file, function, or line range of the supporting chunk), but only when that information is available in the provided context.
+ """
+
+ # --- LOAD DATA ON STARTUP ---
+ print("--- Initializing Application ---")
+ # Check if all required files exist before launching the UI
+ if not all(os.path.exists(p) for p in [INDEX_PATH, METADATA_PATH, CHUNKS_JSON_PATH]):
+     print("ERROR: One or more required data files are missing.")
+     print("Please make sure 'code_faiss.index', 'code_metadata.json', and 'code_chunks.json' are in the same directory.")
+     # Gradio doesn't have a clean way to exit, so we'll show an error in the UI
+     index, metadata, chunks_dict = None, None, None
+ else:
+     index, metadata, chunks_dict = load_faiss_index_and_metadata(
+         index_path=INDEX_PATH,
+         metadata_path=METADATA_PATH,
+         chunks_json_path=CHUNKS_JSON_PATH
+     )
+ print("--- Initialization Complete ---")
+
+
+ def get_expert_analysis(api_key, api_url, llm_model_name, top_k, user_query):
+     """
+     The main function that orchestrates the RAG pipeline.
+     """
+     if not all([api_key, api_url, llm_model_name, user_query]):
+         return "Error: API Key, API URL, Model Name, and Question are all required."
+
+     if index is None:
+         return "Error: FAISS index and data could not be loaded. Please check the console for errors and restart."
+
+     # 1. RETRIEVAL: Get relevant code chunks
+     print("\n--- Starting Retrieval ---")
+     retrieved_results = retrieve_relevant_chunks(
+         query=user_query,
+         model_name=EMBEDDING_MODEL_NAME,
+         index=index,
+         metadata=metadata,
+         chunks_dict=chunks_dict,
+         top_k=top_k  # Use the value from the UI
+     )
+
+     if not retrieved_results:
+         return "Could not find any relevant code chunks for your query. Please try rephrasing it."
+
+     context_str = print_results(retrieved_results)
+
+     print("--- Starting Generation ---")
+     final_user_prompt = f"""
+ {context_str}
+
+ --- User's Question ---
+ {user_query}
+
+ --- Analysis and Answer ---
+ Based on the provided code context, here is the analysis of your question:
+ """
+
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json",
+     }
+     payload = {
+         "model": llm_model_name,
+         "messages": [
+             {"role": "system", "content": SYSTEM_PROMPT},
+             {"role": "user", "content": final_user_prompt}
+         ]
+     }
+
+     try:
+         print(f"Sending request to LLM: {llm_model_name} at {api_url}")
+         response = requests.post(api_url, headers=headers, data=json.dumps(payload))
+         response.raise_for_status()
+
+         response_json = response.json()
+         llm_answer = response_json['choices'][0]['message']['content']
+         print("--- Generation Complete ---")
+
+         full_response = f"## Expert Analysis\n\n{llm_answer}\n\n---\n\n### Retrieved Context\n\nThis analysis was based on the following retrieved code chunks:\n\n{context_str}"
+         return full_response
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error calling LLM API: {e}")
+         return f"Error: Failed to connect to the LLM API. Please check your API URL, API key, and network connection.\n\nDetails: {e}"
+     except (KeyError, IndexError) as e:
+         print(f"Error parsing LLM response: {e}")
+         return f"Error: Received an unexpected response from the LLM API. Please check the model name and try again.\n\nResponse: {response.text}"
+
+
+ # --- GRADIO UI ---
+ with gr.Blocks(theme=gr.themes.Soft(), title="RAG Code Assistant") as demo:
+     gr.Markdown("# RAG-Powered Code Assistant")
+     gr.Markdown("This tool uses a local code database (FAISS) and a Large Language Model (LLM) to answer questions about your codebase.")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             api_key_input = gr.Textbox(
+                 label="API Key",
+                 type="password",
+                 placeholder="Enter your API key here"
+             )
+             api_url_input = gr.Textbox(
+                 label="API Endpoint URL",
+                 value="https://openrouter.ai/api/v1/chat/completions",
+                 placeholder="Enter the chat completions endpoint URL"
+             )
+             llm_model_input = gr.Textbox(
+                 label="LLM Model Name",
+                 value="moonshotai/kimi-k2:free",
+                 placeholder="e.g., moonshotai/kimi-k2:free"
+             )
+             # New Dropdown for Top-K selection
+             top_k_input = gr.Dropdown(
+                 label="Number of Chunks to Retrieve (Top K)",
+                 choices=[5, 6, 7, 8, 9, 10],
+                 value=10,
+             )
+             user_query_input = gr.Textbox(
+                 label="Your Question / Bug Report",
+                 lines=8,
+                 placeholder="e.g., 'When I use cache=True, my RAM usage explodes. Why?'"
+             )
+             submit_button = gr.Button("Get Analysis", variant="primary")
+
+         with gr.Column(scale=2):
+             gr.Markdown("## Analysis Result")
+             output_text = gr.Markdown()
+
+     # Update the inputs list for the click event
+     submit_button.click(
+         fn=get_expert_analysis,
+         inputs=[api_key_input, api_url_input, llm_model_input, top_k_input, user_query_input],
+         outputs=output_text
+     )
+
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
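
For quick testing, the same pipeline can be driven without the Gradio UI by importing `get_expert_analysis` directly. A minimal sketch: the API key and query below are placeholders, while the URL and model name are simply the UI defaults from app.py.

```python
# Smoke-test the RAG pipeline from a Python shell; importing app also loads the FAISS index.
from app import get_expert_analysis

answer = get_expert_analysis(
    api_key="YOUR_API_KEY",  # placeholder, not a real key
    api_url="https://openrouter.ai/api/v1/chat/completions",
    llm_model_name="moonshotai/kimi-k2:free",
    top_k=5,
    user_query="Why does RAM usage grow every epoch when cache=True?",
)
print(answer)
```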
code_chunks.json ADDED
The diff for this file is too large to render. See raw diff
 
code_faiss.index ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0969bcdcff8b67e7a45cae54ce8b7a26f38829fa139e2bcabbb56c0310c469e
+ size 3682349
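
code_faiss.index is tracked by Git LFS, so only this pointer file is committed; the ~3.7 MB binary appears after an LFS checkout. A quick sanity check that the fetched index loads (sketch; assumes faiss-cpu from requirements.txt is installed):

```python
# Verify the LFS-tracked FAISS index is the real binary, not the pointer file.
import faiss

index = faiss.read_index("code_faiss.index")
print(f"{index.ntotal} vectors of dimension {index.d}")
```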
code_metadata.json ADDED
The diff for this file is too large to render. See raw diff
 
create_chunks.py ADDED
@@ -0,0 +1,380 @@
+ import ast
+ import os
+ import json
+ from typing import Dict, List, Any, Optional, Union
+ from dataclasses import dataclass, asdict
+ from pathlib import Path
+ import hashlib
+
+ @dataclass
+ class CodeChunk:
+     """Represents a chunk of code with metadata"""
+     content: str
+     chunk_type: str  # 'function', 'class', 'method', 'import', 'variable', 'comment', 'module'
+     name: str
+     file_path: str
+     start_line: int
+     end_line: int
+     start_col: int
+     end_col: int
+     parent_name: Optional[str] = None
+     docstring: Optional[str] = None
+     parameters: Optional[List[str]] = None
+     return_type: Optional[str] = None
+     decorators: Optional[List[str]] = None
+     complexity_score: Optional[int] = None
+     dependencies: Optional[List[str]] = None
+     chunk_id: Optional[str] = None
+
+     def __post_init__(self):
+         # Generate unique ID based on content and location
+         content_hash = hashlib.md5(
+             f"{self.file_path}:{self.start_line}:{self.end_line}:{self.content}".encode()
+         ).hexdigest()[:8]
+         self.chunk_id = f"{self.chunk_type}_{self.name}_{content_hash}"
+
+ class CodeChunker:
+     """Main class for chunking code using AST"""
+
+     def __init__(self, supported_extensions: List[str] = None):
+         self.supported_extensions = supported_extensions or ['.py', '.js', '.ts', '.java', '.cpp', '.c', '.h']
+         self.chunks: List[CodeChunk] = []
+
+     def chunk_file(self, file_path: str) -> List[CodeChunk]:
+         """Chunk a single file and return list of CodeChunk objects"""
+         file_path = Path(file_path)
+
+         if file_path.suffix not in self.supported_extensions:
+             return []
+
+         try:
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 content = f.read()
+         except Exception as e:
+             print(f"Error reading file {file_path}: {e}")
+             return []
+
+         if file_path.suffix == '.py':
+             return self._chunk_python_file(str(file_path), content)
+         else:
+             # For other languages, use simpler text-based chunking for now
+             return self._chunk_generic_file(str(file_path), content)
+
+     def _chunk_python_file(self, file_path: str, content: str) -> List[CodeChunk]:
+         """Chunk Python file using AST"""
+         chunks = []
+         lines = content.split('\n')
+
+         try:
+             tree = ast.parse(content)
+         except SyntaxError as e:
+             print(f"Syntax error in {file_path}: {e}")
+             return []
+
+         # Track imports at module level
+         imports = []
+
+         for node in ast.walk(tree):
+             if isinstance(node, (ast.Import, ast.ImportFrom)):
+                 imports.extend(self._extract_imports(node))
+
+         # Process top-level nodes
+         for node in tree.body:
+             chunk = self._process_node(node, file_path, lines, imports)
+             if chunk:
+                 chunks.append(chunk)
+
+         return chunks
+
+     def _process_node(self, node: ast.AST, file_path: str, lines: List[str],
+                       imports: List[str], parent_name: str = None) -> Optional[CodeChunk]:
+         """Process an AST node and create a CodeChunk"""
+
+         if isinstance(node, ast.FunctionDef):
+             return self._create_function_chunk(node, file_path, lines, imports, parent_name)
+
+         elif isinstance(node, ast.AsyncFunctionDef):
+             return self._create_function_chunk(node, file_path, lines, imports, parent_name, is_async=True)
+
+         elif isinstance(node, ast.ClassDef):
+             return self._create_class_chunk(node, file_path, lines, imports)
+
+         elif isinstance(node, ast.Assign):
+             return self._create_variable_chunk(node, file_path, lines, parent_name)
+
+         elif isinstance(node, (ast.Import, ast.ImportFrom)):
+             return self._create_import_chunk(node, file_path, lines)
+
+         return None
+
+     def _create_function_chunk(self, node: Union[ast.FunctionDef, ast.AsyncFunctionDef],
+                                file_path: str, lines: List[str], imports: List[str],
+                                parent_name: str = None, is_async: bool = False) -> CodeChunk:
+         """Create a chunk for a function or method"""
+
+         # Extract function content
+         start_line = node.lineno
+         end_line = node.end_lineno or start_line
+         content = '\n'.join(lines[start_line-1:end_line])
+
+         # Extract parameters
+         parameters = []
+         for arg in node.args.args:
+             param_str = arg.arg
+             if arg.annotation:
+                 param_str += f": {ast.unparse(arg.annotation)}"
+             parameters.append(param_str)
+
+         # Extract return type
+         return_type = None
+         if node.returns:
+             return_type = ast.unparse(node.returns)
+
+         # Extract decorators
+         decorators = []
+         for decorator in node.decorator_list:
+             decorators.append(ast.unparse(decorator))
+
+         # Extract docstring
+         docstring = ast.get_docstring(node)
+
+         # Calculate complexity (simple metric based on control flow)
+         complexity = self._calculate_complexity(node)
+
+         chunk_type = "method" if parent_name else "function"
+         if is_async:
+             chunk_type = "async_" + chunk_type
+
+         return CodeChunk(
+             content=content,
+             chunk_type=chunk_type,
+             name=node.name,
+             file_path=file_path,
+             start_line=start_line,
+             end_line=end_line,
+             start_col=node.col_offset,
+             end_col=node.end_col_offset or 0,
+             parent_name=parent_name,
+             docstring=docstring,
+             parameters=parameters,
+             return_type=return_type,
+             decorators=decorators,
+             complexity_score=complexity,
+             dependencies=imports
+         )
+
+     def _create_class_chunk(self, node: ast.ClassDef, file_path: str,
+                             lines: List[str], imports: List[str]) -> CodeChunk:
+         """Create a chunk for a class"""
+
+         start_line = node.lineno
+         end_line = node.end_lineno or start_line
+         content = '\n'.join(lines[start_line-1:end_line])
+
+         # Extract base classes
+         base_classes = []
+         for base in node.bases:
+             base_classes.append(ast.unparse(base))
+
+         # Extract decorators
+         decorators = []
+         for decorator in node.decorator_list:
+             decorators.append(ast.unparse(decorator))
+
+         # Extract docstring
+         docstring = ast.get_docstring(node)
+
+         return CodeChunk(
+             content=content,
+             chunk_type="class",
+             name=node.name,
+             file_path=file_path,
+             start_line=start_line,
+             end_line=end_line,
+             start_col=node.col_offset,
+             end_col=node.end_col_offset or 0,
+             docstring=docstring,
+             decorators=decorators,
+             dependencies=imports + base_classes
+         )
+
+     def _create_variable_chunk(self, node: ast.Assign, file_path: str,
+                                lines: List[str], parent_name: str = None) -> Optional[CodeChunk]:
+         """Create a chunk for variable assignments"""
+
+         # Only process simple assignments at module level
+         if len(node.targets) == 1 and isinstance(node.targets[0], ast.Name):
+             var_name = node.targets[0].id
+             start_line = node.lineno
+             end_line = node.end_lineno or start_line
+             content = '\n'.join(lines[start_line-1:end_line])
+
+             return CodeChunk(
+                 content=content,
+                 chunk_type="variable",
+                 name=var_name,
+                 file_path=file_path,
+                 start_line=start_line,
+                 end_line=end_line,
+                 start_col=node.col_offset,
+                 end_col=node.end_col_offset or 0,
+                 parent_name=parent_name
+             )
+
+         return None
+
+     def _create_import_chunk(self, node: Union[ast.Import, ast.ImportFrom],
+                              file_path: str, lines: List[str]) -> CodeChunk:
+         """Create a chunk for import statements"""
+
+         start_line = node.lineno
+         end_line = node.end_lineno or start_line
+         content = '\n'.join(lines[start_line-1:end_line])
+
+         # Extract imported names
+         imported_names = []
+         if isinstance(node, ast.Import):
+             for alias in node.names:
+                 imported_names.append(alias.name)
+         else:  # ImportFrom
+             for alias in node.names:
+                 imported_names.append(alias.name)
+
+         return CodeChunk(
+             content=content,
+             chunk_type="import",
+             name=", ".join(imported_names),
+             file_path=file_path,
+             start_line=start_line,
+             end_line=end_line,
+             start_col=node.col_offset,
+             end_col=node.end_col_offset or 0
+         )
+
+     def _extract_imports(self, node: Union[ast.Import, ast.ImportFrom]) -> List[str]:
+         """Extract import names from import nodes"""
+         imports = []
+         if isinstance(node, ast.Import):
+             for alias in node.names:
+                 imports.append(alias.name)
+         else:  # ImportFrom
+             module = node.module or ""
+             for alias in node.names:
+                 imports.append(f"{module}.{alias.name}" if module else alias.name)
+         return imports
+
+     def _calculate_complexity(self, node: ast.AST) -> int:
+         """Calculate cyclomatic complexity of a function"""
+         complexity = 1  # Base complexity
+
+         for child in ast.walk(node):
+             if isinstance(child, (ast.If, ast.While, ast.For, ast.AsyncFor)):
+                 complexity += 1
+             elif isinstance(child, ast.ExceptHandler):
+                 complexity += 1
+             elif isinstance(child, (ast.ListComp, ast.SetComp, ast.DictComp, ast.GeneratorExp)):
+                 complexity += 1
+
+         return complexity
+
+     def _chunk_generic_file(self, file_path: str, content: str) -> List[CodeChunk]:
+         """Generic chunking for non-Python files"""
+         chunks = []
+         lines = content.split('\n')
+
+         # Simple function detection for C/C++/Java/JavaScript
+         function_patterns = {
+             '.js': r'function\s+(\w+)',
+             '.ts': r'function\s+(\w+)',
+             '.java': r'(public|private|protected)?\s*(static)?\s*\w+\s+(\w+)\s*\(',
+             '.cpp': r'\w+\s+(\w+)\s*\(',
+             '.c': r'\w+\s+(\w+)\s*\(',
+             '.h': r'\w+\s+(\w+)\s*\('
+         }
+
+         # This is a simplified implementation - you'd want more sophisticated parsing
+         # for production use
+
+         return chunks
+
+     def chunk_directory(self, directory_path: str, recursive: bool = True) -> List[CodeChunk]:
+         """Chunk all supported files in a directory"""
+         all_chunks = []
+         directory_path = Path(directory_path)
+
+         if recursive:
+             pattern = "**/*"
+         else:
+             pattern = "*"
+
+         for file_path in directory_path.glob(pattern):
+             if file_path.is_file() and file_path.suffix in self.supported_extensions:
+                 chunks = self.chunk_file(str(file_path))
+                 all_chunks.extend(chunks)
+
+         self.chunks = all_chunks
+         return all_chunks
+
+     def save_chunks(self, output_file: str):
+         """Save chunks to JSON file"""
+         chunks_data = [asdict(chunk) for chunk in self.chunks]
+
+         with open(output_file, 'w', encoding='utf-8') as f:
+             json.dump(chunks_data, f, indent=2, ensure_ascii=False)
+
+     def load_chunks(self, input_file: str) -> List[CodeChunk]:
+         """Load chunks from JSON file"""
+         with open(input_file, 'r', encoding='utf-8') as f:
+             chunks_data = json.load(f)
+
+         self.chunks = [CodeChunk(**chunk_data) for chunk_data in chunks_data]
+         return self.chunks
+
+     def get_chunks_by_type(self, chunk_type: str) -> List[CodeChunk]:
+         """Filter chunks by type"""
+         return [chunk for chunk in self.chunks if chunk.chunk_type == chunk_type]
+
+     def get_chunks_by_file(self, file_path: str) -> List[CodeChunk]:
+         """Filter chunks by file path"""
+         return [chunk for chunk in self.chunks if chunk.file_path == file_path]
+
+     def search_chunks(self, query: str) -> List[CodeChunk]:
+         """Simple text search in chunks"""
+         results = []
+         query_lower = query.lower()
+
+         for chunk in self.chunks:
+             if (query_lower in chunk.content.lower() or
+                     query_lower in chunk.name.lower() or
+                     (chunk.docstring and query_lower in chunk.docstring.lower())):
+                 results.append(chunk)
+
+         return results
+
+ # Example usage
+ if __name__ == "__main__":
+     # Initialize chunker
+     chunker = CodeChunker()
+
+     # Example: Chunk a single Python file
+     # chunks = chunker.chunk_file("example.py")
+
+     # Example: Chunk entire directory
+     chunks = chunker.chunk_directory("ultralytics", recursive=True)
+
+     # Example: Save chunks to file
+     chunker.save_chunks("code_chunks.json")
+
+     # Example: Search chunks
+     # results = chunker.search_chunks("database")
+
+     # Example: Get all functions
+     # functions = chunker.get_chunks_by_type("function")
+
+     print("Code chunking system initialized!")
+     print("Supported file extensions:", chunker.supported_extensions)
+     print("\nExample usage:")
+     print("1. chunker.chunk_file('path/to/file.py')")
+     print("2. chunker.chunk_directory('path/to/project', recursive=True)")
+     print("3. chunker.save_chunks('output.json')")
+     print("4. chunker.search_chunks('query')")
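
As a usage note, a previously saved chunk dump can be reloaded through the same CodeChunker API instead of re-parsing the source tree. A small sketch, using the default file name from the script above:

```python
# Reload previously saved chunks and filter them without re-running the AST pass.
from create_chunks import CodeChunker

chunker = CodeChunker()
chunks = chunker.load_chunks("code_chunks.json")
functions = chunker.get_chunks_by_type("function")
classes = chunker.get_chunks_by_type("class")
print(f"{len(chunks)} chunks total: {len(functions)} top-level functions, {len(classes)} classes")
```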
create_faiss.py ADDED
@@ -0,0 +1,127 @@
+ import json
+ import faiss
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from typing import List, Dict, Any
+
+ def create_code_vector_db(json_file_path: str, model_name: str, output_index_path: str, output_metadata_path: str):
+     """
+     Loads code chunks, filters them, generates embeddings, and saves a FAISS index
+     along with corresponding metadata.
+
+     Args:
+         json_file_path (str): Path to the code_chunks.json file.
+         model_name (str): The name of the SentenceTransformer model to use.
+         output_index_path (str): Path to save the FAISS index file.
+         output_metadata_path (str): Path to save the chunk metadata JSON file.
+     """
+     # 1. Load and Filter Chunks
+     print(f"Loading chunks from '{json_file_path}'...")
+     try:
+         with open(json_file_path, 'r', encoding='utf-8') as f:
+             all_chunks = json.load(f)
+     except FileNotFoundError:
+         print(f"Error: The file '{json_file_path}' was not found.")
+         return
+
+     # Filter for chunks that contain meaningful semantic information for a RAG system
+     target_types = {'function', 'class', 'method', 'async_function', 'async_method'}
+     filtered_chunks = [chunk for chunk in all_chunks if chunk.get('chunk_type') in target_types]
+
+     if not filtered_chunks:
+         print("No chunks of target types found. Exiting.")
+         return
+
+     print(f"Filtered chunks: Kept {len(filtered_chunks)} out of {len(all_chunks)} total chunks.")
+
+     # 2. Prepare Text for Embedding
+     # Combine code with metadata for richer semantic representation.
+     texts_to_embed = []
+     for chunk in filtered_chunks:
+         # A good practice is to create a descriptive text for each chunk
+         docstring = chunk.get('docstring', '') or "No docstring."
+         name = chunk.get('name', '')
+         chunk_type = chunk.get('chunk_type', '')
+
+         # Create a descriptive header for the code content
+         header = f"Type: {chunk_type}, Name: {name}\nDocstring: {docstring}\n---\n"
+         prepared_text = header + chunk['content']
+         texts_to_embed.append(prepared_text)
+
+     # 3. Generate Embeddings
+     print(f"Loading SentenceTransformer model: '{model_name}'...")
+     # Using a model well-suited for code is beneficial, but a general one works too.
+     # Consider models like 'microsoft/codebert-base' or 'all-MiniLM-L6-v2' for a start.
+     model = SentenceTransformer(model_name).half()  # Convert the model to half precision for faster inference
+     # model = SentenceTransformer(model_name, device='cpu').half()  # CPU alternative
+
+     # Define a batch size; adjust based on available VRAM
+     batch_size = 2
+
+     print("Generating embeddings for filtered chunks... (This may take a while)")
+     embeddings = model.encode(
+         texts_to_embed,
+         batch_size=batch_size,
+         show_progress_bar=True
+     )
+
+     # Convert to float32 for FAISS
+     embeddings = np.array(embeddings).astype('float32')
+     dimension = embeddings.shape[1]
+     print(f"Embeddings generated with dimension: {dimension}")
+
+     # 4. Build and Save FAISS Index
+     print("Building FAISS index...")
+     index = faiss.IndexFlatL2(dimension)
+     index.add(embeddings)
+
+     print(f"Saving FAISS index to '{output_index_path}'...")
+     faiss.write_index(index, output_index_path)
+
+     # 5. Save Metadata for Mapping
+     # We need to save the original chunk info to map FAISS results back to the source code
+     metadata_to_save = [
+         {
+             "chunk_id": chunk.get("chunk_id"),
+             "file_path": chunk.get("file_path"),
+             "start_line": chunk.get("start_line"),
+             "end_line": chunk.get("end_line"),
+             "name": chunk.get("name"),
+             "chunk_type": chunk.get("chunk_type")
+         }
+         for chunk in filtered_chunks
+     ]
+
+     print(f"Saving metadata mapping to '{output_metadata_path}'...")
+     with open(output_metadata_path, 'w', encoding='utf-8') as f:
+         json.dump(metadata_to_save, f, indent=2)
+
+     print("\nProcess complete!")
+     print("FAISS index and metadata have been successfully saved.")
+
+
+ if __name__ == "__main__":
+     # --- CONFIGURATION ---
+     CHUNKS_JSON_PATH = "code_chunks.json"
+
+     # Recommended model for general purpose, good balance of speed and quality.
+     # For more code-specific tasks, you might explore models like 'microsoft/codebert-base'.
+     MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
+
+     OUTPUT_INDEX_PATH = "code_faiss.index"
+     OUTPUT_METADATA_PATH = "code_metadata.json"
+
+     # --- EXECUTION ---
+     create_code_vector_db(
+         json_file_path=CHUNKS_JSON_PATH,
+         model_name=MODEL_NAME,
+         output_index_path=OUTPUT_INDEX_PATH,
+         output_metadata_path=OUTPUT_METADATA_PATH
+     )
+
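
Note that IndexFlatL2 ranks results by raw Euclidean distance over unnormalized embeddings. If cosine similarity is preferred, a common alternative (not what create_faiss.py does) is to L2-normalize the vectors and use an inner-product index; a minimal sketch, where the random array merely stands in for the `embeddings` produced by `model.encode(...)` above:

```python
# Cosine-similarity variant: normalize vectors in place, then rank by inner product.
import faiss
import numpy as np

embeddings = np.random.rand(8, 1024).astype("float32")  # stand-in for model.encode(...) output
faiss.normalize_L2(embeddings)                  # in-place L2 normalization
index = faiss.IndexFlatIP(embeddings.shape[1])  # inner product == cosine after normalization
index.add(embeddings)
# Query vectors must be normalized the same way before calling index.search(...).
```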
llm_calling.py ADDED
@@ -0,0 +1,103 @@
+ import retrive_docs
+
+
+ import json
+ from retrive_docs import load_faiss_index_and_metadata, retrieve_relevant_chunks, print_results
+
+ INDEX_PATH = "code_faiss.index"
+ METADATA_PATH = "code_metadata.json"
+ CHUNKS_JSON_PATH = "code_chunks.json"
+ MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"  # Must match the model used in create_faiss.py
+ TOP_K = 5  # Number of results to retrieve
+
+ # --- EXECUTION ---
+ # Load FAISS index and metadata
+ index, metadata, chunks_dict = load_faiss_index_and_metadata(
+     index_path=INDEX_PATH,
+     metadata_path=METADATA_PATH,
+     chunks_json_path=CHUNKS_JSON_PATH
+ )
+
+ if index is None or metadata is None or chunks_dict is None:
+     print("Failed to load index, metadata, or chunks. Exiting.")
+     exit(1)
+
+ # Get user query
+ print("\nEnter your query (e.g., 'function to process text data'):")
+ # query = input("> ")
+ query = '''
+ Bug
+ when i add (cache=True)in Classification Training , the Ram using is increasing every epoch , until it crash the training , start like from 3 to 6 to 11 to 15 ....... 50 , GB
+ but if i don't add it , the ram using work fine , it be like 4 GB and all training is fixed
+
+ i work on colab
+ !yolo task=classify mode=train cache=True model=yolov8n-cls.pt data='/content/Classification-1' epochs=5 batch=265 imgsz=128
+
+ Environment
+ No response
+
+ Minimal Reproducible Example
+ No response
+
+ Additional
+ No response'''
+ # Retrieve and display results
+ results = retrieve_relevant_chunks(
+     query=query,
+     model_name=MODEL_NAME,
+     index=index,
+     metadata=metadata,
+     chunks_dict=chunks_dict,
+     top_k=TOP_K
+ )
+
+
+ print(print_results(results))
+ # call llm
+ # import requests
+ # import json
+ # import time
+ # import os
+
+ # sys_prompt = "You ar "
+ # # Set API key and API base for the custom API server
+ # api_key = os.getenv("API_KEY")  # Replace with your actual API key
+ # api_base_url = os.getenv("API_BASE_URL")  # Replace with your API base URL
+
+ # # Setup headers for the request
+ # headers = {
+ #     "Authorization": f"Bearer {api_key}",
+ #     "Content-Type": "application/json"
+ # }
+
+ # # System message and query
+ # # sys_msg = "you are a helpful assistant"
+ # # query = "what is machine learning?"
+
+ # # Prepare the data payload for the POST request
+ # data = json.dumps({
+ #     "model": "Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ #     "messages": [
+ #         {"role": "system", "content": sys_prompt},
+ #         {"role": "user", "content": query}
+ #     ],
+ #     "temperature": 0.2
+ # })
+
+ # # Measure request execution time
+ # t1 = time.time()
+
+ # # Perform the POST request
+ # response = requests.post(f"{api_base_url}/chat/completions", headers=headers, data=data)
+ # print("Request time:", time.time() - t1)
+
+ # # Check the response and handle errors
+ # if response.status_code == 200:
+ #     # Parse response if request was successful
+ #     chat_response = response.json()
+ #     print("Chat response:", chat_response['choices'][0]['message']['content'])
+ # else:
+ #     # Print error information if something went wrong
+ #     print("Failed to fetch response:", response.status_code, response.text)
+
+ # print("this output based on this query :", query)
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio==4.31.5
+ requests==2.31.0
+ sentence-transformers==2.7.0
+ faiss-cpu==1.8.0
+ numpy==1.26.4
+ torch==2.3.0
retrive_docs.py ADDED
@@ -0,0 +1,200 @@
+ import json
+ import faiss
+ import numpy as np
+ import torch
+ from sentence_transformers import SentenceTransformer
+ from typing import List, Dict, Any
+
+ model_name = "Qwen/Qwen3-Embedding-0.6B"
+ print(f"Loading SentenceTransformer model: '{model_name}'...")
+ # Use torch to detect GPU availability; faiss-cpu (pinned in requirements.txt) does not expose get_num_gpus().
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ print(f"Using device: {device}")
+ model = SentenceTransformer(model_name, device=device)
+ if device == 'cuda':
+     model = model.half()  # Use FP16 for GPU
+
+
+ def load_faiss_index_and_metadata(index_path: str, metadata_path: str, chunks_json_path: str) -> tuple:
+     """
+     Loads the FAISS index, metadata, and original code chunks.
+
+     Args:
+         index_path (str): Path to the FAISS index file.
+         metadata_path (str): Path to the metadata JSON file.
+         chunks_json_path (str): Path to the original code_chunks.json file.
+
+     Returns:
+         tuple: (FAISS index, metadata list, chunks dictionary)
+     """
+     print(f"Loading FAISS index from '{index_path}'...")
+     try:
+         index = faiss.read_index(index_path)
+     except Exception as e:
+         print(f"Error loading FAISS index: {e}")
+         return None, None, None
+
+     print(f"Loading metadata from '{metadata_path}'...")
+     try:
+         with open(metadata_path, 'r', encoding='utf-8') as f:
+             metadata = json.load(f)
+     except FileNotFoundError:
+         print(f"Error: The file '{metadata_path}' was not found.")
+         return None, None, None
+
+     print(f"Loading code chunks from '{chunks_json_path}'...")
+     try:
+         with open(chunks_json_path, 'r', encoding='utf-8') as f:
+             chunks = json.load(f)
+         # Create a dictionary for quick lookup by chunk_id
+         chunks_dict = {chunk['chunk_id']: chunk for chunk in chunks}
+     except FileNotFoundError:
+         print(f"Error: The file '{chunks_json_path}' was not found.")
+         return None, None, None
+
+     return index, metadata, chunks_dict
+
+ def retrieve_relevant_chunks(query: str, model_name: str, index: faiss.IndexFlatL2, metadata: List[Dict], chunks_dict: Dict, top_k: int = 5) -> List[Dict]:
+     """
+     Encodes the query and retrieves the top-k most relevant code chunks.
+
+     Args:
+         query (str): The user's input query.
+         model_name (str): The SentenceTransformer model to use.
+         index (faiss.IndexFlatL2): The loaded FAISS index.
+         metadata (List[Dict]): The metadata for the indexed chunks.
+         chunks_dict (Dict): Dictionary mapping chunk_id to chunk details.
+         top_k (int): Number of top results to return.
+
+     Returns:
+         List[Dict]: List of dictionaries containing the retrieved chunks and their metadata.
+     """
+     # The module-level model loaded above is reused here; model_name is kept for signature compatibility.
+
+     # Prepare query text (mimic the chunk format used during indexing)
+     query_text = f"Type: query\nDocstring: {query}\n---\n{query}"
+     query_embedding = model.encode([query_text], show_progress_bar=False).astype('float32')
+
+     # Perform FAISS search
+     print(f"Searching for top {top_k} relevant chunks...")
+     distances, indices = index.search(query_embedding, top_k)
+
+     # Collect results
+     results = []
+     for idx, distance in zip(indices[0], distances[0]):
+         if idx < len(metadata):
+             meta = metadata[idx]
+             chunk_id = meta['chunk_id']
+             chunk = chunks_dict.get(chunk_id, {})
+             results.append({
+                 'chunk_id': chunk_id,
+                 'file_path': meta.get('file_path', 'Unknown'),
+                 'start_line': meta.get('start_line', -1),
+                 'end_line': meta.get('end_line', -1),
+                 'name': meta.get('name', 'Unknown'),
+                 'chunk_type': meta.get('chunk_type', 'Unknown'),
+                 'docstring': chunk.get('docstring', 'No docstring.'),
+                 'content': chunk.get('content', 'No content available.'),
+                 'distance': float(distance)  # L2 distance (lower is more similar)
+             })
+         else:
+             print(f"Warning: Index {idx} out of range for metadata.")
+
+     return results
+
+ # def print_results(results: List[Dict]):
+ #     """
+ #     Prints the retrieved results in a readable format.
+ #
+ #     Args:
+ #         results (List[Dict]): List of retrieved chunk details.
+ #     """
+ #     if not results:
+ #         print("No relevant chunks found.")
+ #         return
+ #
+ #     print("\n=== Retrieved Chunks ===")
+ #     returned_text = ""
+ #     for i, result in enumerate(results, 1):
+ #         # print(f"\nResult {i}:")
+ #         # print(f"Chunk ID: {result['chunk_id']}")
+ #         # print(f"Type: {result['chunk_type']}")
+ #         # print(f"Name: {result['name']}")
+ #         # print(f"File: {result['file_path']} (Lines {result['start_line']}–{result['end_line']})")
+ #         # print(f"Distance: {result['distance']:.4f}")
+ #         # print(f"Docstring: {result['docstring']}")
+ #         # print("\nCode:")
+ #         # print(result['content'])
+ #         # print("-" * 80)
+ #         returned_text = returned_text + "\n" + "chunk_id: " + "\n" + f"File: {result['file_path']} (Lines {result['start_line']}–{result['end_line']})" + "\n" + result['chunk_id'] + "\n" + "code: " + result['content']
+ #     # return in style
+ #     return returned_text
+ #     # return { {'results': for result in results }
+ # # In retrive_docs.py
+
+ def print_results(results: List[Dict]):
+     """
+     Formats the retrieved results into a Markdown string with GitHub links
+     and syntax highlighting.
+
+     Args:
+         results (List[Dict]): List of retrieved chunk details.
+     """
+     if not results:
+         return "No relevant chunks found."
+
+     GITHUB_BASE_URL = "https://github.com/ultralytics/ultralytics/blob/main/"
+     markdown_output = ""
+
+     for i, result in enumerate(results, 1):
+         file_path = result.get('file_path', 'Unknown')
+         start_line = result.get('start_line', -1)
+         end_line = result.get('end_line', -1)
+
+         # Construct a direct link to the code on GitHub
+         if file_path != 'Unknown' and start_line != -1:
+             github_link = f"{GITHUB_BASE_URL}{file_path}#L{start_line}-L{end_line}"
+             header = f"### {i}. [{file_path}]({github_link}) (Lines {start_line}–{end_line})"
+         else:
+             header = f"### {i}. {file_path} (Lines {start_line}–{end_line})"
+
+         markdown_output += f"{header}\n"
+         markdown_output += f"**Type:** `{result.get('chunk_type', 'N/A')}` **Name:** `{result.get('name', 'N/A')}`\n\n"
+         markdown_output += "```python\n"
+         markdown_output += result.get('content', '# No content available.') + "\n"
+         markdown_output += "```\n---\n"
+
+     return markdown_output
+
+ # if __name__ == "__main__":
+ #     # --- CONFIGURATION ---
+ #     INDEX_PATH = "code_faiss.index"
+ #     METADATA_PATH = "code_metadata.json"
+ #     CHUNKS_JSON_PATH = "code_chunks.json"
+ #     MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"  # Must match the model used in create_faiss.py
+ #     TOP_K = 5  # Number of results to retrieve
+
+ #     # --- EXECUTION ---
+ #     # Load FAISS index and metadata
+ #     index, metadata, chunks_dict = load_faiss_index_and_metadata(
+ #         index_path=INDEX_PATH,
+ #         metadata_path=METADATA_PATH,
+ #         chunks_json_path=CHUNKS_JSON_PATH
+ #     )
+
+ #     if index is None or metadata is None or chunks_dict is None:
+ #         print("Failed to load index, metadata, or chunks. Exiting.")
+ #         exit(1)
+
+ #     # Get user query
+ #     print("\nEnter your query (e.g., 'function to process text data'):")
+ #     query = input("> ")
+
+ #     # Retrieve and display results
+ #     results = retrieve_relevant_chunks(
+ #         query=query,
+ #         model_name=MODEL_NAME,
+ #         index=index,
+ #         metadata=metadata,
+ #         chunks_dict=chunks_dict,
+ #         top_k=TOP_K
+ #     )
+
+ #     print_results(results)
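
Putting the pieces together, the offline pipeline implied by these files is: chunk the target repo, build the FAISS index, then retrieve. A sketch stitched from the functions above; the "ultralytics" directory is simply the example path used in create_chunks.py, and importing retrive_docs loads the embedding model once at import time.

```python
# End-to-end rebuild of the retrieval data, reusing the scripts in this commit.
from create_chunks import CodeChunker
from create_faiss import create_code_vector_db
from retrive_docs import load_faiss_index_and_metadata, retrieve_relevant_chunks, print_results

chunker = CodeChunker()
chunker.chunk_directory("ultralytics", recursive=True)   # path to the codebase to index
chunker.save_chunks("code_chunks.json")

create_code_vector_db(
    json_file_path="code_chunks.json",
    model_name="Qwen/Qwen3-Embedding-0.6B",
    output_index_path="code_faiss.index",
    output_metadata_path="code_metadata.json",
)

index, metadata, chunks_dict = load_faiss_index_and_metadata(
    index_path="code_faiss.index",
    metadata_path="code_metadata.json",
    chunks_json_path="code_chunks.json",
)
results = retrieve_relevant_chunks(
    query="Why does RAM usage grow every epoch when cache=True?",
    model_name="Qwen/Qwen3-Embedding-0.6B",
    index=index,
    metadata=metadata,
    chunks_dict=chunks_dict,
    top_k=5,
)
print(print_results(results))
```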