Luke Stanley committed
Commit 74d6e52
1 Parent(s): a0f49a0

Auto-downloads model if env var is not set

Files changed (1): utils.py (+26, -5)
utils.py CHANGED
```diff
@@ -1,9 +1,16 @@
 import json
+from os import environ as env
 from typing import Any, Dict, Union
 import requests
 
+from huggingface_hub import hf_hub_download
 from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
+# There are two ways to use the LLM model currently used:
+# 1. Use the HTTP server (USE_HTTP_SERVER=True), this is good for development
+# when you want to change the logic of the translator without restarting the server.
+# 2. Load the model into memory
+# When using the HTTP server, it must be ran separately. See the README for instructions.
 # The llama_cpp Python HTTP server communicates with the AI model, similar
 # to the OpenAI API but adds a unique "grammar" parameter.
 # The real OpenAI API has other ways to set the output format.
@@ -11,8 +18,24 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
-IN_MEMORY_LLM_PATH = "/fast/mistral-7b-instruct-v0.1.Q4_K_M.gguf"
-# TODO: Have a good way to set the model path
+
+
+LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
+USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
+
+if len(LLM_MODEL_PATH) > 0:
+    print(f"Using local model from {LLM_MODEL_PATH}")
+else:
+    print("No local LLM_MODEL_PATH environment variable set. We need a model, downloading model from HuggingFace Hub")
+    LLM_MODEL_PATH = hf_hub_download(
+        repo_id=env.get("REPO_ID", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"),
+        filename=env.get("MODEL_FILE", "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"),
+    )
+    print(f"Model downloaded to {LLM_MODEL_PATH}")
+
+if in_memory_llm is None and USE_HTTP_SERVER is False:
+    print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
+    in_memory_llm = Llama(model_path=LLM_MODEL_PATH)
 
 def llm_streaming(
     prompt: str, pydantic_model_class, return_pydantic_object=False
@@ -83,9 +106,6 @@ def calculate_overall_score(faithfulness, spiciness):
 def llm_stream_sans_network(
     prompt: str, pydantic_model_class, return_pydantic_object=False
 ) -> Union[str, Dict[str, Any]]:
-    global in_memory_llm
-    if in_memory_llm is None:
-        in_memory_llm = Llama(model_path=IN_MEMORY_LLM_PATH)
     schema = pydantic_model_class.model_json_schema()
 
     # Optional example field from schema, is not needed for the grammar generation
@@ -97,6 +117,7 @@ def llm_stream_sans_network(
 
     stream = in_memory_llm(
         prompt,
+        n_ctx=4096,
        max_tokens=1000,
        temperature=0.7,
        grammar=grammar,
```
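
One caution on the new start-up logic: `env.get("LLM_MODEL_PATH", None)` returns `None` when the variable is unset, so `len(LLM_MODEL_PATH) > 0` raises a `TypeError` in exactly the case this commit targets. A minimal sketch of the same resolution with an empty-string default, so the auto-download branch is actually reachable (the repo and file defaults are copied from the diff; the rest is illustrative):

```python
from os import environ as env

from huggingface_hub import hf_hub_download

# An empty-string default keeps the truthiness check safe; a None default
# would make len(LLM_MODEL_PATH) raise TypeError when the variable is unset.
LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", "")

if LLM_MODEL_PATH:
    print(f"Using local model from {LLM_MODEL_PATH}")
else:
    print("LLM_MODEL_PATH is not set, downloading a model from the HuggingFace Hub")
    # Defaults mirror the diff; hf_hub_download returns the local cache path.
    LLM_MODEL_PATH = hf_hub_download(
        repo_id=env.get("REPO_ID", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"),
        filename=env.get("MODEL_FILE", "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"),
    )
    print(f"Model downloaded to {LLM_MODEL_PATH}")
```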
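
The last hunk also passes `n_ctx=4096` to the completion call itself. In llama-cpp-python, `n_ctx` is normally a `Llama()` constructor parameter, since the context window is fixed when the weights load, so a version whose completion call rejects unknown keyword arguments will fail there. A sketch of the in-memory grammar-constrained path under that assumption, using the `json_schema_to_gbnf` and `LlamaGrammar` names the module already imports; the `Translation` class and prompt are made up for illustration:

```python
import json

from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
from pydantic import BaseModel


class Translation(BaseModel):  # illustrative stand-in for the real model class
    text: str


# Context size is set at load time here, not per completion call;
# LLM_MODEL_PATH is the path resolved by the start-up logic above.
llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=4096)

# Turn the pydantic JSON schema into a GBNF grammar that constrains decoding,
# assuming json_schema_to_gbnf accepts the schema as a JSON string.
gbnf = json_schema_to_gbnf(json.dumps(Translation.model_json_schema()))
grammar = LlamaGrammar.from_string(gbnf)

for chunk in llm(
    "Translate 'bonjour' to English. Reply as JSON.",
    max_tokens=1000,
    temperature=0.7,
    grammar=grammar,
    stream=True,
):
    print(chunk["choices"][0]["text"], end="", flush=True)
```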
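
For the HTTP-server mode the new comments describe, the llama_cpp server exposes an OpenAI-style /v1/chat/completions endpoint that, per the module's own comment, additionally accepts a "grammar" parameter. A rough sketch of the request `llm_streaming` presumably builds; the helper name is hypothetical and the payload shape beyond the "grammar" key is assumed to follow the usual OpenAI chat format:

```python
import requests

URL = "http://localhost:5834/v1/chat/completions"


def ask_server(prompt: str, gbnf_grammar: str) -> str:
    # Hypothetical helper: an OpenAI-style chat payload plus the
    # llama_cpp-specific "grammar" field mentioned in the module comments.
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1000,
        "temperature": 0.7,
        "grammar": gbnf_grammar,
    }
    response = requests.post(URL, json=payload)
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
```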