Spaces:
Sleeping
Sleeping
Mazin Karjikar
committed on
Quickstarting llama.cpp (#2)
Browse files* added functionality to use local models in .gguf file format
* made hpctoolkit work and fixed truncated output by setting context limit
* new log file per interaction, named by current timestamp down to the millisecond
- code_samples/p1.py +7 -0
- local_models/README.md +3 -0
- requirements.txt +2 -1
- src/models.py +37 -3
- src/perf_guru_logs/README.md +4 -0
- src/perfguru.py +6 -3
- src/profiles.py +7 -1
- src/rag.py +46 -2
- token_limits.json +2 -1
code_samples/p1.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
def sort(arr):
    """Sort ``arr`` in place in ascending order using bubble sort.

    Args:
        arr: A mutable sequence of mutually comparable items.

    Returns:
        The same ``arr`` object, now sorted.
    """
    for i in range(len(arr)):
        swapped = False
        # Each pass bubbles the largest remaining item to the end, so the
        # last ``i`` positions are already final and need not be re-scanned.
        for j in range(1, len(arr) - i):
            if arr[j] < arr[j-1]:
                arr[j], arr[j-1] = arr[j-1], arr[j]
                swapped = True
        # A pass without swaps means the sequence is already sorted.
        if not swapped:
            break
    return arr
|
local_models/README.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Local Models
|
2 |
+
|
3 |
+
### This folder stores the local models used by PerfGuru. On GitHub, this folder will be empty because of the size of the models. When PerfGuru is run on a machine, local models such as Meta-Llama-3 should be placed here and used with llama.cpp.
|
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ gradio==4.39.0
|
|
2 |
hatchet==1.4.0
|
3 |
google-generativeai==0.7.2
|
4 |
openai==1.37.0
|
5 |
-
tiktoken==0.7.0
|
|
|
|
2 |
hatchet==1.4.0
|
3 |
google-generativeai==0.7.2
|
4 |
openai==1.37.0
|
5 |
+
tiktoken==0.7.0
|
6 |
+
llama-cpp-python==0.2.90
|
src/models.py
CHANGED
@@ -6,6 +6,7 @@ import os
|
|
6 |
import random
|
7 |
import openai
|
8 |
import google.generativeai as genai
|
|
|
9 |
|
10 |
class ChatModel(ABC):
|
11 |
def __init__(self, name):
|
@@ -78,9 +79,43 @@ class GeminiModel(ChatModel):
|
|
78 |
yield response
|
79 |
|
80 |
|
81 |
-
|
82 |
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
if os.environ.get("OPENAI_API_KEY"):
|
86 |
openai_client = openai.OpenAI()
|
@@ -91,7 +126,6 @@ if os.environ.get("GOOGLE_API_KEY"):
|
|
91 |
AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-flash") )
|
92 |
AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-pro") )
|
93 |
|
94 |
-
|
95 |
if not AVAILABLE_MODELS:
|
96 |
raise ValueError("No models available. Please set OPENAI_API_KEY or GOOGLE_API_KEY environment variables.")
|
97 |
|
|
|
6 |
import random
|
7 |
import openai
|
8 |
import google.generativeai as genai
|
9 |
+
from llama_cpp import Llama
|
10 |
|
11 |
class ChatModel(ABC):
|
12 |
def __init__(self, name):
|
|
|
79 |
yield response
|
80 |
|
81 |
|
82 |
+
class LocalModel(ChatModel):
    """Chat model backed by a local ``.gguf`` file served through llama.cpp."""

    SYSTEM_PROMPT = "You are PerfGuru, a helpful assistant for assisting developers in identifying performance bottlenecks in their code and optimizing them."

    def __init__(self, model: str, model_path: str, n_ctx: int = 8000, max_tokens: int = 4000):
        """Load the model weights.

        Args:
            model: Display name, forwarded to ``ChatModel``.
            model_path: Path to the ``.gguf`` weights file.
            n_ctx: Context window size, in tokens, passed to llama.cpp.
            max_tokens: Upper bound on tokens generated per response.
        """
        super().__init__(model)
        self.max_tokens = max_tokens
        self.llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
        )

    def get_response(self, prompt) -> Generator[str, None, None]:
        """Stream the model's answer to ``prompt``.

        Yields:
            The response accumulated so far, once per generated chunk, so the
            last yielded value is the complete answer.
        """
        chunks = self.llm.create_chat_completion(
            messages=[
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            max_tokens=self.max_tokens,
            stream=True,
        )

        response = ""
        for chunk in chunks:
            # Streamed deltas may carry only a role (first chunk) or be empty
            # (final chunk); only text-bearing deltas extend the response.
            piece = chunk["choices"][0]["delta"].get("content")
            if piece:
                response += piece
                yield response
|
107 |
+
|
108 |
+
|
109 |
+
# Names of the local models to offer; each is looked up as
# ../local_models/<name>.gguf relative to the working directory.
LOCAL_MODELS = [
    "Meta-Llama-3-8B-Instruct.Q4_K_S",
]

# Only load models whose weights file is actually present. The local_models
# folder ships empty, and trying to load a missing file would abort the import
# before the API-key based models below get a chance to register.
AVAILABLE_MODELS = [
    LocalModel(model_name, f"../local_models/{model_name}.gguf")
    for model_name in LOCAL_MODELS
    if os.path.isfile(f"../local_models/{model_name}.gguf")
]
|
117 |
+
|
118 |
+
# AVAILABLE_MODELS.append( DummyModel() )
|
119 |
|
120 |
if os.environ.get("OPENAI_API_KEY"):
|
121 |
openai_client = openai.OpenAI()
|
|
|
126 |
AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-flash") )
|
127 |
AVAILABLE_MODELS.append( GeminiModel("gemini-1.5-pro") )
|
128 |
|
|
|
129 |
if not AVAILABLE_MODELS:
|
130 |
raise ValueError("No models available. Please set OPENAI_API_KEY or GOOGLE_API_KEY environment variables.")
|
131 |
|
src/perf_guru_logs/README.md
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Logging Interactions
|
2 |
+
|
3 |
+
### This folder stores a JSON log of each interaction with PerfGuru.
|
4 |
+
|
src/perfguru.py
CHANGED
@@ -21,7 +21,7 @@ def code_upload(code_file_select):
|
|
21 |
|
22 |
|
23 |
def token_limit_getter(model: str) -> int:
|
24 |
-
with open("token_limits.json", "r") as f:
|
25 |
token_limits = json.load(f)
|
26 |
if model in token_limits:
|
27 |
return token_limits[model]
|
@@ -37,7 +37,7 @@ def check_length(text, model):
|
|
37 |
token_limit = token_limit_getter(model.name)
|
38 |
|
39 |
if token_length >= token_limit:
|
40 |
-
error_helper(
|
41 |
|
42 |
|
43 |
def chat_with_llms(prompt, code_files, profile_file, profile_type):
|
@@ -93,7 +93,9 @@ def log_interaction(prompt, vote, response1, model1, formatter1, full_prompt1, r
|
|
93 |
"timestamp": datetime.datetime.now().isoformat()
|
94 |
}
|
95 |
|
96 |
-
|
|
|
|
|
97 |
if os.path.exists(log_file_path):
|
98 |
with open(log_file_path, "r") as log_file:
|
99 |
logs = json.load(log_file)
|
@@ -105,6 +107,7 @@ def log_interaction(prompt, vote, response1, model1, formatter1, full_prompt1, r
|
|
105 |
# Write updated logs to file
|
106 |
with open(log_file_path, "w") as log_file:
|
107 |
json.dump(logs, log_file, indent=4)
|
|
|
108 |
|
109 |
def handle_vote(prompt, vote, response1, source1, full_prompt1, response2, source2, full_prompt2):
|
110 |
model1, formatter1 = source1.split(" + ")
|
|
|
21 |
|
22 |
|
23 |
def token_limit_getter(model: str) -> int:
|
24 |
+
with open("../token_limits.json", "r") as f:
|
25 |
token_limits = json.load(f)
|
26 |
if model in token_limits:
|
27 |
return token_limits[model]
|
|
|
37 |
token_limit = token_limit_getter(model.name)
|
38 |
|
39 |
if token_length >= token_limit:
|
40 |
+
error_helper("Prompt is too long. Please try reducing the size of the prompt or code uploaded.")
|
41 |
|
42 |
|
43 |
def chat_with_llms(prompt, code_files, profile_file, profile_type):
|
|
|
93 |
"timestamp": datetime.datetime.now().isoformat()
|
94 |
}
|
95 |
|
96 |
+
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")[:-3]
|
97 |
+
|
98 |
+
log_file_path = f"perf_guru_logs/log_{timestamp}.json"
|
99 |
if os.path.exists(log_file_path):
|
100 |
with open(log_file_path, "r") as log_file:
|
101 |
logs = json.load(log_file)
|
|
|
107 |
# Write updated logs to file
|
108 |
with open(log_file_path, "w") as log_file:
|
109 |
json.dump(logs, log_file, indent=4)
|
110 |
+
|
111 |
|
112 |
def handle_vote(prompt, vote, response1, source1, full_prompt1, response2, source2, full_prompt2):
|
113 |
model1, formatter1 = source1.split(" + ")
|
src/profiles.py
CHANGED
@@ -2,6 +2,8 @@
|
|
2 |
"""
|
3 |
import json
|
4 |
import os
|
|
|
|
|
5 |
from typing import Literal, Optional
|
6 |
|
7 |
import hatchet as ht
|
@@ -14,7 +16,11 @@ class Profile:
|
|
14 |
|
15 |
def _load(self, profile_path: os.PathLike, profile_type: Literal["HPCToolkit", "CProfile", "Caliper"]) -> ht.GraphFrame:
|
16 |
if profile_type == "HPCToolkit":
|
17 |
-
|
|
|
|
|
|
|
|
|
18 |
elif profile_type == "CProfile":
|
19 |
return ht.GraphFrame.from_cprofile(profile_path)
|
20 |
elif profile_type == "Caliper":
|
|
|
2 |
"""
|
3 |
import json
|
4 |
import os
|
5 |
+
import tempfile as tf
|
6 |
+
import zipfile as zf
|
7 |
from typing import Literal, Optional
|
8 |
|
9 |
import hatchet as ht
|
|
|
16 |
|
17 |
def _load(self, profile_path: os.PathLike, profile_type: Literal["HPCToolkit", "CProfile", "Caliper"]) -> ht.GraphFrame:
|
18 |
if profile_type == "HPCToolkit":
|
19 |
+
toolkit_dir = profile_path[profile_path.rfind("/")+1:-4] # last dir in path, without ".zip" [:-4]
|
20 |
+
with tf.TemporaryDirectory() as temp_dir:
|
21 |
+
with zf.ZipFile(profile_path, 'r') as zip_ref:
|
22 |
+
zip_ref.extractall(temp_dir)
|
23 |
+
return ht.GraphFrame.from_hpctoolkit(os.path.join(temp_dir, toolkit_dir))
|
24 |
elif profile_type == "CProfile":
|
25 |
return ht.GraphFrame.from_cprofile(profile_path)
|
26 |
elif profile_type == "Caliper":
|
src/rag.py
CHANGED
@@ -61,9 +61,53 @@ class BasicPromptFormatter(PerfGuruPromptFormatter):
|
|
61 |
|
62 |
return f"Code:\n{concatenated_code}\n\n{profile_type} Profile:\n{profile_content}\n\n{prompt}"
|
63 |
|
|
|
64 |
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
|
69 |
def select_random_formatter() -> PerfGuruPromptFormatter:
|
|
|
61 |
|
62 |
return f"Code:\n{concatenated_code}\n\n{profile_type} Profile:\n{profile_content}\n\n{prompt}"
|
63 |
|
64 |
+
class SlowestFunctionPromptFormatter(PerfGuruPromptFormatter):
    """Formatter that replaces the raw profile with a short summary.

    The summary names the function(s) with the largest ``time`` value in the
    profile and the function(s) found on the profile's hot path.
    """

    def __init__(self):
        super().__init__("slowest_function")

    def format_prompt(self, prompt: str, code_paths: List[PathLike], profile_path: Optional[PathLike] = None, profile_type: Optional[str] = None, error_fn: Optional[callable] = None) -> str:
        """Build the full prompt from the code files, profile summary and user prompt.

        Args:
            prompt: The user's question.
            code_paths: Paths of the code files to include; must be non-empty.
            profile_path: Optional path to a profile file.
            profile_type: Profile format; required when ``profile_path`` is given.
            error_fn: Optional callback invoked with a message on invalid input.

        Returns:
            The formatted prompt, or ``None`` when the input is invalid.
        """
        if not code_paths:
            if error_fn:
                error_fn("No code files provided. At least one code file must be provided.")
            return None

        code_file_contents = self._read_code_files(code_paths)
        concatenated_code = "".join(
            f"{basename(code_path)}:\n{content}\n\n"
            for code_path, content in code_file_contents.items()
        )

        profile_content = ""
        if profile_path:
            if not profile_type:
                if error_fn:
                    error_fn("Profile type must be provided if a profile file is provided.")
                return None
            profile_content = self._summarize_profile(profile_path, profile_type)

        return f"Code:\n{concatenated_code}\n\n{profile_type} Profile:\n{profile_content}\n\n{prompt}"

    def _summarize_profile(self, profile_path, profile_type, k: int = 1) -> str:
        """Describe the ``k`` slowest functions and the first ``k`` hot-path functions."""
        profile = self._read_profile(profile_path, profile_type)

        # nlargest returns at most k rows, so no extra bounds checks are needed.
        slowest = profile.gf.dataframe.nlargest(k, 'time')
        # tolist() yields plain Python values, keeping numpy scalar reprs out
        # of the prompt text.
        function_names = slowest['name'].tolist()
        execution_times = slowest['time'].tolist()

        hot_path_functions = [
            node.frame["name"]
            for node in profile.gf.hot_path()
            if "name" in node.frame.attrs
        ][:k]

        return (f"The slowest functions are {function_names} and they took {execution_times} seconds, respectively." +
                f" Also, these functions were in the hot path: {hot_path_functions}.")
|
108 |
+
|
109 |
+
AVAILABLE_FORMATTERS = [SlowestFunctionPromptFormatter()]
|
110 |
+
# AVAILABLE_FORMATTERS.append(BasicPromptFormatter())
|
111 |
|
112 |
|
113 |
def select_random_formatter() -> PerfGuruPromptFormatter:
|
token_limits.json
CHANGED
@@ -5,5 +5,6 @@
|
|
5 |
"gpt-4": 8192,
|
6 |
"gpt-3.5-turbo": 16385,
|
7 |
"gemini-1.5-flash": 1048576,
|
8 |
-
"gemini-1.5-pro": 2097152
|
|
|
9 |
}
|
|
|
5 |
"gpt-4": 8192,
|
6 |
"gpt-3.5-turbo": 16385,
|
7 |
"gemini-1.5-flash": 1048576,
|
8 |
+
"gemini-1.5-pro": 2097152,
|
9 |
+
"Meta-Llama-3-8B-Instruct.Q4_K_S": 8000
|
10 |
}
|