Spaces:
Sleeping
Sleeping
Daniel Nichols
Mazin Karjikar
committed on
Add function parsing (#3)
Browse files
* added source code function parsing functionality - still needs to be tested
* added parameter to slowest function parser, can now test multiple prompt formatter functionality
* cleaned up function parsing formatter
* now importing function grabber function instead of starting subprocess
---------
Co-authored-by: Mazin Karjikar <mazin@pssg-mordor.umiacs.umd.edu>
- requirements.txt +2 -1
- src/function_grabber.py +100 -0
- src/rag.py +63 -8
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ hatchet==1.4.0
|
|
3 |
google-generativeai==0.7.2
|
4 |
openai==1.37.0
|
5 |
tiktoken==0.7.0
|
6 |
-
llama-cpp-python==0.2.90
|
|
|
|
3 |
google-generativeai==0.7.2
|
4 |
openai==1.37.0
|
5 |
tiktoken==0.7.0
|
6 |
+
llama-cpp-python==0.2.90
|
7 |
+
clang==17.0.6
|
src/function_grabber.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" This script extracts the function that contains a specified line number in a C++ file.
|
2 |
+
"""
|
3 |
+
from argparse import ArgumentParser
|
4 |
+
import os
|
5 |
+
from typing import List, Tuple
|
6 |
+
import clang.cindex
|
7 |
+
|
8 |
+
|
9 |
+
def get_functions_at_lines(fpath: os.PathLike, lines: List[int], clang_path: os.PathLike = None) -> List[Tuple[str, Tuple[int, int]]]:
    """Find all the distinct functions that contain the specified lines in a file.

    Args:
        fpath: Path of the source file to analyze.
        lines: Line numbers to look up; each one is forwarded unchanged to
            get_function_at_line.
        clang_path: Optional path to the libclang shared library, forwarded to
            get_function_at_line.

    Returns:
        A list of (function_source, (start_line, end_line)) tuples, in the
        order the functions were first matched. Lines for which no function
        is found contribute nothing to the result.
    """
    functions = []
    for line in lines:
        # Skip lines that fall inside a function we have already extracted, so
        # each function appears once and the file is not re-parsed needlessly.
        if any(start <= line <= end for _, (start, end) in functions):
            continue

        # `line_range` rather than `range`, to avoid shadowing the builtin.
        function_body, line_range = get_function_at_line(fpath, line, clang_path=clang_path)
        if function_body:
            functions.append((function_body, line_range))
    return functions
|
23 |
+
|
24 |
+
|
25 |
+
def remove_macros(filename: str, line_numbers: List[int]) -> List[int]:
    """Strip preprocessor directive lines from a file, rewriting it in place.

    A line is removed when its first non-whitespace character is '#', or when
    it continues a removed directive (the previous removed line ended with a
    backslash).

    Args:
        filename: Path of the file to rewrite. The file is overwritten.
        line_numbers: 0-based indices into the original file's lines (they are
            compared against ``enumerate`` indices) that should be translated
            to their index in the rewritten file.
            NOTE(review): libclang line numbers are 1-based -- confirm that
            callers convert before calling.

    Returns:
        The translated indices, in file order (not in the order given).
        Indices past the end of the file are dropped. An index whose own line
        was removed maps to the index of the last kept line before it (-1 if
        there is none).
    """
    with open(filename, 'r') as f:
        lines = f.readlines()

    wanted = set(line_numbers)  # O(1) membership instead of a list scan per line
    new_line_numbers = []
    new_lines = []
    num_removed = 0
    in_directive = False  # True while inside a backslash-continued directive
    for i, line in enumerate(lines):
        if in_directive or line.lstrip().startswith('#'):
            num_removed += 1
            # A trailing backslash means the directive continues on the next line.
            in_directive = line.rstrip('\r\n').endswith('\\')
        else:
            new_lines.append(line)

        if i in wanted:
            new_line_numbers.append(i - num_removed)

    with open(filename, 'w') as f:
        f.write(''.join(new_lines))

    return new_line_numbers
|
48 |
+
|
49 |
+
|
50 |
+
def get_function_at_line(filename, line_number, clang_path=None):
    """Return the source of the function in `filename` that contains a line.

    The file is parsed with libclang and the AST is searched depth-first for
    the first function-like declaration, located in `filename` itself, whose
    extent covers `line_number`.

    Args:
        filename: Path of the C/C++ source file to parse.
        line_number: 1-based line number to look up. Anything accepted by
            ``int()`` is allowed, so numeric strings work too.
        clang_path: Optional path to the libclang shared library. Only applied
            if the library has not been loaded yet.

    Returns:
        A tuple ``(source_text, (start_line, end_line))`` for the matching
        function, or ``(None, None)`` if the file could not be parsed or no
        function contains the line.

    Raises:
        ValueError, TypeError: if `line_number` cannot be converted to int.
    """
    # Extents are compared as ints below; normalize once so that callers that
    # pass a numeric string do not hit an int/str comparison error.
    line_number = int(line_number)

    if clang_path and not clang.cindex.Config.loaded:
        clang.cindex.Config.set_library_file(clang_path)
    index = clang.cindex.Index.create()

    try:
        translation_unit = index.parse(filename)
    except clang.cindex.TranslationUnitLoadError:
        return None, None

    function_kinds = (
        clang.cindex.CursorKind.FUNCTION_DECL,
        clang.cindex.CursorKind.CXX_METHOD,
        clang.cindex.CursorKind.CONSTRUCTOR,
        clang.cindex.CursorKind.DESTRUCTOR,
        clang.cindex.CursorKind.CONVERSION_FUNCTION,
        clang.cindex.CursorKind.FUNCTION_TEMPLATE,
    )
    target_path = os.path.abspath(filename)

    def is_in_target_file(node):
        # The translation unit also contains every declaration pulled in via
        # #include; their line numbers refer to other files and must not match.
        node_file = node.location.file
        return node_file is not None and os.path.abspath(node_file.name) == target_path

    def find_function(node):
        # Check if node is function-like, lives in our file and contains the line
        if node.kind in function_kinds and is_in_target_file(node):
            if node.extent.start.line <= line_number <= node.extent.end.line:
                return node

        for child in node.get_children():
            result = find_function(child)
            if result:
                return result
        return None

    # Start from the root node (translation unit) and find the function
    function_node = find_function(translation_unit.cursor)
    if not function_node:
        return None, None

    start_line = function_node.extent.start.line
    end_line = function_node.extent.end.line
    with open(filename, 'r') as f:
        lines = f.readlines()
    # Extent lines are 1-based and inclusive; list slicing is 0-based, end-exclusive.
    return ''.join(lines[start_line - 1:end_line]), (start_line, end_line)
|
88 |
+
|
89 |
+
|
90 |
+
if __name__ == "__main__":
    # Command-line entry point: print the function containing the given line.
    parser = ArgumentParser(description="Extract the function that contains a specified line number in a C++ file.")
    parser.add_argument("filename", help="The C++ file to analyze")
    parser.add_argument("line_number", type=int, help="The line number to search for")
    parser.add_argument("--clang_path", help="Path to libclang.so if necessary")
    args = parser.parse_args()

    result, rnge = get_function_at_line(args.filename, args.line_number, clang_path=args.clang_path)
    if result is None:
        # Print only the message; there is no range to show in this case.
        print(f"No function found at line {args.line_number}")
    else:
        print(result, rnge)
|
src/rag.py
CHANGED
@@ -8,6 +8,7 @@ from typing import Optional, List, Mapping
|
|
8 |
from os import PathLike
|
9 |
from os.path import basename
|
10 |
import random
|
|
|
11 |
|
12 |
from profiles import Profile
|
13 |
|
@@ -63,8 +64,9 @@ class BasicPromptFormatter(PerfGuruPromptFormatter):
|
|
63 |
|
64 |
class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
|
65 |
|
66 |
-
def __init__(self):
|
67 |
super().__init__("slowest_function")
|
|
|
68 |
|
69 |
def format_prompt(self, prompt: str, code_paths: List[PathLike], profile_path: Optional[PathLike] = None, profile_type: Optional[str] = None, error_fn: Optional[callable] = None) -> str:
|
70 |
if not code_paths:
|
@@ -83,11 +85,10 @@ class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
|
|
83 |
if error_fn:
|
84 |
error_fn("Profile type must be provided if a profile file is provided.")
|
85 |
return None
|
86 |
-
k = 1
|
87 |
profile = self._read_profile(profile_path, profile_type)
|
88 |
-
slowest = profile.gf.dataframe.nlargest(k, 'time')
|
89 |
-
function_names = [slowest['name'].values[i] for i in range(k) if i < len(slowest['name'].values)]
|
90 |
-
execution_times = [slowest['time'].values[i] for i in range(k) if i < len(slowest['name'].values)]
|
91 |
# print(profile_content)
|
92 |
hot_path = profile.gf.hot_path()
|
93 |
hot_path_functions = []
|
@@ -95,7 +96,7 @@ class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
|
|
95 |
for node in hot_path:
|
96 |
if "name" in node.frame.attrs:
|
97 |
hot_path_functions.append(node.frame["name"])
|
98 |
-
hot_path_functions = hot_path_functions[:k]
|
99 |
|
100 |
profile_content = (f"The slowest functions are {function_names} and they took {execution_times} seconds, respectively." +
|
101 |
f" Also, these functions were in the hot path: {hot_path_functions}.")
|
@@ -106,9 +107,63 @@ class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
|
|
106 |
|
107 |
return f"Code:\n{concatenated_code}\n\n{profile_type} Profile:\n{profile_content}\n\n{prompt}"
|
108 |
|
109 |
-
|
110 |
-
# AVAILABLE_FORMATTERS.append(BasicPromptFormatter())
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
def select_random_formatter() -> PerfGuruPromptFormatter:
|
114 |
return random.choice(AVAILABLE_FORMATTERS)
|
|
|
8 |
from os import PathLike
|
9 |
from os.path import basename
|
10 |
import random
|
11 |
+
from function_grabber import get_function_at_line
|
12 |
|
13 |
from profiles import Profile
|
14 |
|
|
|
64 |
|
65 |
class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
|
66 |
|
67 |
+
def __init__(self, k):
|
68 |
super().__init__("slowest_function")
|
69 |
+
self.k = k
|
70 |
|
71 |
def format_prompt(self, prompt: str, code_paths: List[PathLike], profile_path: Optional[PathLike] = None, profile_type: Optional[str] = None, error_fn: Optional[callable] = None) -> str:
|
72 |
if not code_paths:
|
|
|
85 |
if error_fn:
|
86 |
error_fn("Profile type must be provided if a profile file is provided.")
|
87 |
return None
|
|
|
88 |
profile = self._read_profile(profile_path, profile_type)
|
89 |
+
slowest = profile.gf.dataframe.nlargest(self.k, 'time')
|
90 |
+
function_names = [slowest['name'].values[i] for i in range(self.k) if i < len(slowest['name'].values)]
|
91 |
+
execution_times = [slowest['time'].values[i] for i in range(self.k) if i < len(slowest['name'].values)]
|
92 |
# print(profile_content)
|
93 |
hot_path = profile.gf.hot_path()
|
94 |
hot_path_functions = []
|
|
|
96 |
for node in hot_path:
|
97 |
if "name" in node.frame.attrs:
|
98 |
hot_path_functions.append(node.frame["name"])
|
99 |
+
hot_path_functions = hot_path_functions[:self.k]
|
100 |
|
101 |
profile_content = (f"The slowest functions are {function_names} and they took {execution_times} seconds, respectively." +
|
102 |
f" Also, these functions were in the hot path: {hot_path_functions}.")
|
|
|
107 |
|
108 |
return f"Code:\n{concatenated_code}\n\n{profile_type} Profile:\n{profile_content}\n\n{prompt}"
|
109 |
|
110 |
+
class SlowestFunctionParsedPromptFormatter(PerfGuruPromptFormatter):
    """Prompt formatter that includes only the source of the slowest function.

    When a profile is supplied, the row with the largest 'time' value is looked
    up and, if it carries a usable 'line' value, the enclosing function is
    extracted from the code files with get_function_at_line. If no function
    can be extracted, the full contents of all code files are used instead.
    """

    def __init__(self):
        super().__init__("slowest_function_parsed")

    def format_prompt(self, prompt: str, code_paths: List[PathLike], profile_path: Optional[PathLike] = None, profile_type: Optional[str] = None, error_fn: Optional[callable] = None) -> str:
        """Build the prompt text from the code, the profile summary and the user prompt.

        Args:
            prompt: The user's question, appended at the end of the result.
            code_paths: Paths of the code files; at least one is required.
            profile_path: Optional path of a profile file.
            profile_type: Profile format; required when profile_path is given.
            error_fn: Optional callback invoked with a message on invalid input.

        Returns:
            The formatted prompt, or None when the inputs are invalid.
        """
        if not code_paths:
            if error_fn:
                error_fn("No code files provided. At least one code file must be provided.")
            return None

        if profile_path and not profile_type:
            if error_fn:
                error_fn("Profile type must be provided if a profile file is provided.")
            return None

        # Read the code files once; used both for function extraction and for
        # the full-code fallback below.
        code_file_contents = self._read_code_files(code_paths)

        concatenated_code = ""
        profile_content = ""

        if profile_path:
            profile = self._read_profile(profile_path, profile_type)
            slowest = profile.gf.dataframe.nlargest(1, 'time')

            function_name = slowest['name'].values[0] if len(slowest['name'].values) > 0 else None

            # Not every profile necessarily provides a 'line' column, and the
            # value may be missing (NaN); treat both as "no line information".
            line_number = None
            if 'line' in slowest.columns and len(slowest['line'].values) > 0:
                try:
                    line_number = int(slowest['line'].values[0])
                except (TypeError, ValueError):
                    line_number = None

            if line_number:
                # NOTE(review): the line number is tried against every code file
                # and the first match wins -- confirm whether the profile also
                # records which file the line belongs to.
                for code_path in code_file_contents:
                    # Pass the real path (not its basename) so the parser can
                    # open the file regardless of the working directory.
                    code, _ = get_function_at_line(code_path, line_number)
                    if code:
                        concatenated_code = f"{basename(code_path)}:\n{code}\n\n"
                        break

            if function_name is not None:
                profile_content = (f"The slowest function is {function_name}.")

        if concatenated_code == "":
            # Fallback: no function could be isolated, so send all the code.
            for code_path, content in code_file_contents.items():
                fname = basename(code_path)
                concatenated_code += f"{fname}:\n{content}\n\n"

        return f"Code:\n{concatenated_code}\n\n{profile_type} Profile:\n{profile_content}\n\n{prompt}"
|
160 |
+
|
161 |
+
# Formatter instances that select_random_formatter() chooses from.
AVAILABLE_FORMATTERS = [
    SlowestFunctionPromptFormatter(k=1),
    SlowestFunctionPromptFormatter(k=5),
    SlowestFunctionPromptFormatter(k=10),
    # BasicPromptFormatter(),  # currently disabled
    SlowestFunctionParsedPromptFormatter(),
]
|
167 |
|
168 |
def select_random_formatter() -> PerfGuruPromptFormatter:
    """Return one entry of AVAILABLE_FORMATTERS, picked with random.choice."""
    chosen = random.choice(AVAILABLE_FORMATTERS)
    return chosen
|