Daniel Marques committed on
Commit
9f067a2
1 Parent(s): b21e4ba

feat: add websocket

Files changed (2)
  1. constants.py +3 -3
  2. load_models.py +4 -5
constants.py CHANGED
@@ -32,13 +32,13 @@ CHROMA_SETTINGS = Settings(
 )
 
 # Context Window and Max New Tokens
-CONTEXT_WINDOW_SIZE = 2048
+CONTEXT_WINDOW_SIZE = 4096
 MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
 
-N_GPU_LAYERS = 83 # Llama-2-70B has 83 layers
-N_BATCH = 2048
+N_GPU_LAYERS = 40 # Llama-2-70B has 83 layers
+N_BATCH = 1024
 
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
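
The commit doubles the context window while halving the GPU offload and batch values, trading throughput for VRAM headroom (the "not enough space in the buffer" comment in the file points at exactly this trade-off). For orientation, a minimal sketch of how these constants are typically consumed by a llama.cpp-backed loader; the LlamaCpp wrapper and the model_path below are illustrative assumptions, not part of this commit:

# Illustrative sketch only: shows where CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS,
# N_GPU_LAYERS and N_BATCH typically land. The LlamaCpp wrapper and
# model_path are assumptions, not taken from this commit.
from langchain.llms import LlamaCpp

from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH

llm = LlamaCpp(
    model_path="models/llama-2-7b-chat.ggmlv3.q4_0.bin",  # placeholder path
    n_ctx=CONTEXT_WINDOW_SIZE,  # context window, raised from 2048 to 4096 here
    max_tokens=MAX_NEW_TOKENS,  # generation budget, tied to the window size
    n_gpu_layers=N_GPU_LAYERS,  # layers offloaded to GPU, lowered from 83 to 40
    n_batch=N_BATCH,            # prompt-eval batch, lowered from 2048 to 1024
)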
load_models.py CHANGED
@@ -215,11 +215,10 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging, stre
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_length=50,
-        temperature=0.15,
-        top_p=0.1,
-        top_k=40,
-        repetition_penalty=1.0,
+        max_length=MAX_NEW_TOKENS,
+        temperature=0.2,
+        # top_p=0.95,
+        repetition_penalty=1.15,
         generation_config=generation_config,
     )
 
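
The old hard-coded max_length=50 truncated generations almost immediately; tying it to MAX_NEW_TOKENS lets responses use the full context window. Reconstructed in place, the call after this commit reads roughly as below; a sketch only, where model_id and the setup lines are placeholder assumptions standing in for the code earlier in load_model, which is not shown in this diff:

# Sketch of the pipeline call as it reads after this commit. model_id and
# the from_pretrained setup are placeholders, not taken from the diff.
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

from constants import MAX_NEW_TOKENS

model_id = "meta-llama/Llama-2-7b-chat-hf"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
generation_config = GenerationConfig.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=MAX_NEW_TOKENS,   # was a hard-coded 50
    temperature=0.2,             # was 0.15; only applies when sampling is enabled
    # top_p=0.95,                # left commented out in the commit
    repetition_penalty=1.15,     # was 1.0, i.e. no penalty
    generation_config=generation_config,
)

Note that temperature and repetition_penalty only take effect when sampling is enabled (do_sample=True, typically via the generation_config), so the behavioral change here depends on how generation_config is set up earlier in load_model.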