lukestanley committed
Commit: e01e28e
Parent: 0945e5b

Add env vars to set GPU layer count and context size, make verbose

Files changed (1):
  1. utils.py (+9 −6)
utils.py CHANGED
@@ -19,9 +19,12 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
 
-
+N_GPU_LAYERS = env.get("N_GPU_LAYERS", 10)
+CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 4096))
 LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
 USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
+MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
+TEMPERATURE = float(env.get("TEMPERATURE", 0.7))
 
 if LLM_MODEL_PATH and len(LLM_MODEL_PATH) > 0:
     print(f"Using local model from {LLM_MODEL_PATH}")
@@ -35,7 +38,7 @@ else:
 
 if in_memory_llm is None and USE_HTTP_SERVER is False:
     print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
-    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=4096, n_gpu_layers=20)
+    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
 
 def llm_streaming(
     prompt: str, pydantic_model_class, return_pydantic_object=False
@@ -51,9 +54,9 @@ def llm_streaming(
 
     payload = {
         "stream": True,
-        "max_tokens": 1000,
+        "max_tokens": MAX_TOKENS,
         "grammar": grammar,
-        "temperature": 0.7,
+        "temperature": TEMPERATURE,
         "messages": [{"role": "user", "content": prompt}],
     }
     headers = {
@@ -117,8 +120,8 @@ def llm_stream_sans_network(
 
     stream = in_memory_llm(
         prompt,
-        max_tokens=1000,
-        temperature=0.7,
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
         grammar=grammar,
         stream=True
    )
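
For reference, a minimal usage sketch (not part of the commit) of how the new environment variables could be set before importing utils.py, so the in-memory Llama is created with those values instead of the defaults; the model path and numbers below are illustrative assumptions:

# Hypothetical usage sketch: configure the new settings before importing utils.
import os

os.environ["LLM_MODEL_PATH"] = "/models/model.gguf"  # assumed path to a GGUF model
os.environ["CONTEXT_SIZE"] = "8192"   # read with int() in utils.py
os.environ["MAX_TOKENS"] = "512"      # read with int() in utils.py
os.environ["TEMPERATURE"] = "0.2"     # read with float() in utils.py
# N_GPU_LAYERS is read without an int() cast in this commit, so an
# env-supplied value would reach Llama() as a string; leaving it unset
# keeps the integer default of 10.

import utils  # module-level code loads the model with the values above

Any variable left unset falls back to the defaults shown in the diff: 10 GPU layers, a 4096-token context, 1000 max tokens, and a temperature of 0.7.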