Luke Stanley committed
Commit 56e785c
Parent: 434144a

Introduces worker mode env var

Files changed (1): utils.py (+46 -34)
utils.py CHANGED
@@ -1,12 +1,14 @@
 import json
 from os import environ as env
 from typing import Any, Dict, Union
+# TODO: Make imports conditional on type of worker being used:
 import requests
 
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
-# There are two ways to use the LLM model currently used:
+
+# There are 3 ways to use the LLM model currently used:
 # 1. Use the HTTP server (USE_HTTP_SERVER=True), this is good for development
 # when you want to change the logic of the translator without restarting the server.
 # 2. Load the model into memory
@@ -15,18 +17,24 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 # to the OpenAI API but adds a unique "grammar" parameter.
 # The real OpenAI API has other ways to set the output format.
 # It's possible to switch to another LLM API by changing the llm_streaming function.
+# 3. Use the RunPod API, which is a paid service with serverless GPU functions.
+# TODO: Update README with instructions on how to use the RunPod API and options.
 
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
+worker_options = ["runpod", "http", "in_memory"]
 
-N_GPU_LAYERS = int(env.get("N_GPU_LAYERS", 20))  # Default to -1, which means use all layers if available
-CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 4096))
+LLM_WORKER = env.get("LLM_WORKER", "runpod")
+if LLM_WORKER not in worker_options:
+    raise ValueError(f"Invalid worker: {LLM_WORKER}")
+N_GPU_LAYERS = int(env.get("N_GPU_LAYERS", -1))  # Default to -1: use all layers if available
+CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 2048))
 LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
-USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
+
 MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
 TEMPERATURE = float(env.get("TEMPERATURE", 0.3))
 
-if LLM_MODEL_PATH and len(LLM_MODEL_PATH) > 0:
+if LLM_MODEL_PATH and len(LLM_MODEL_PATH) > 0 and (LLM_WORKER == "in_memory" or LLM_WORKER == "http"):
     print(f"Using local model from {LLM_MODEL_PATH}")
 else:
     print("No local LLM_MODEL_PATH environment variable set. We need a model, downloading model from HuggingFace Hub")
@@ -36,7 +44,7 @@ else:
     )
     print(f"Model downloaded to {LLM_MODEL_PATH}")
 
-if in_memory_llm is None and USE_HTTP_SERVER is False:
+if in_memory_llm is None and LLM_WORKER == "in_memory":
     print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
     in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
 
@@ -141,33 +149,37 @@ def llm_stream_sans_network(
     json_output = json.loads(output_text)
     return json_output
 
-def query_ai_prompt(prompt, replacements, model_class, in_memory=True):
-    prompt = replace_text(prompt, replacements)
-    if in_memory:
-        return llm_stream_sans_network(prompt, model_class)
-    else:
-        return llm_streaming(prompt, model_class)
-
 
-def llm_stream_sans_network_simple(
-    prompt: str, json_schema: str
-):
-    grammar = LlamaGrammar.from_json_schema(json_schema)
-
-    stream = in_memory_llm(
-        prompt,
-        max_tokens=MAX_TOKENS,
-        temperature=TEMPERATURE,
-        grammar=grammar,
-        stream=True
-    )
-
-    output_text = ""
-    for chunk in stream:
-        result = chunk["choices"][0]
-        print(result["text"], end='', flush=True)
-        output_text = output_text + result["text"]
-        #yield result["text"]
-
-    print('\n')
-    return output_text
+# Call the RunPod serverless endpoint with a prompt and a Pydantic model whose JSON schema constrains the output.
+def llm_stream_serverless(prompt, model):
+    RUNPOD_ENDPOINT_ID = env["RUNPOD_ENDPOINT_ID"]
+    RUNPOD_API_KEY = env["RUNPOD_API_KEY"]
+    url = f"https://api.runpod.ai/v2/{RUNPOD_ENDPOINT_ID}/runsync"
+
+    headers = {
+        'Content-Type': 'application/json',
+        'Authorization': f'Bearer {RUNPOD_API_KEY}'
+    }
+
+    schema = model.schema()
+    data = {
+        'input': {
+            'schema': json.dumps(schema),
+            'prompt': prompt
+        }
+    }
+
+    response = requests.post(url, json=data, headers=headers)
+    result = response.json()
+    output = result.get('output', '').replace("model:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf\n", "")
+    print(output)
+    return json.loads(output)
+
+def query_ai_prompt(prompt, replacements, model_class):
+    prompt = replace_text(prompt, replacements)
+    if LLM_WORKER == "runpod":
+        return llm_stream_serverless(prompt, model_class)
+    if LLM_WORKER == "http":
+        return llm_streaming(prompt, model_class)
+    if LLM_WORKER == "in_memory":
+        return llm_stream_sans_network(prompt, model_class)
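
For reference, a minimal usage sketch of the new worker selection (not part of this commit). The env vars must be set before utils.py is imported, because the module reads them at import time; the Translation model, the prompt, and the replacements dict shape are hypothetical, while query_ai_prompt and replace_text are the helpers defined in utils.py.

import os

# Hypothetical example: pick the in-memory worker and a local GGUF model path.
os.environ["LLM_WORKER"] = "in_memory"  # or "http" / "runpod"
os.environ["LLM_MODEL_PATH"] = "/models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"

from pydantic import BaseModel

import utils  # the module changed in this diff


class Translation(BaseModel):  # illustrative schema only
    text: str
    language: str


result = utils.query_ai_prompt(
    "Translate {sentence} into French and answer as JSON.",
    {"{sentence}": "Hello, world"},  # assumed replace_text() substitution format
    Translation,
)
print(result)  # dict parsed from the model's JSON output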
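The diff only adds the client side of the RunPod path; the serverless endpoint itself is not in this repository. Purely as a sketch of what such a worker could look like (every name below is an assumption, not the project's actual endpoint code), assuming the runpod Python SDK and the same llama_cpp grammar approach; note that llm_stream_serverless strips a "model:...gguf" prefix from the output, so the real endpoint apparently echoes the model name first.

import runpod  # RunPod serverless SDK, assumed to be installed in the worker image
from llama_cpp import Llama, LlamaGrammar

# Assumed model location baked into the worker image.
llm = Llama(model_path="/model.gguf", n_ctx=2048, n_gpu_layers=-1)


def handler(job):
    payload = job["input"]  # matches the client's {'input': {'schema': ..., 'prompt': ...}}
    grammar = LlamaGrammar.from_json_schema(payload["schema"])
    result = llm(payload["prompt"], max_tokens=1000, temperature=0.3, grammar=grammar)
    return result["choices"][0]["text"]  # the client json.loads() this string


runpod.serverless.start({"handler": handler})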
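For the "http" worker, the comments above describe a local llama-cpp-python server that is mostly OpenAI-compatible but accepts an extra "grammar" field; llm_streaming (unchanged by this commit) is the function that talks to it. A rough sketch of such a request, building the grammar with json_schema_to_gbnf (the exact body llm_streaming sends is an assumption):

import requests
from llama_cpp import json_schema_to_gbnf

URL = "http://localhost:5834/v1/chat/completions"


def chat_with_grammar(prompt: str, json_schema: str) -> str:
    body = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1000,
        "temperature": 0.3,
        # Non-standard field understood by llama-cpp-python's server:
        "grammar": json_schema_to_gbnf(json_schema),
    }
    response = requests.post(URL, json=body)
    return response.json()["choices"][0]["message"]["content"]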