ffreemt committed on
Commit
e1bd529
1 Parent(s): 8f1f68d

Fix concurrency_count to avoid OOM

Browse files
Files changed (2) hide show
  1. app.py +23 -7
  2. requirements.txt +1 -1
app.py CHANGED
@@ -2,6 +2,7 @@
2
  # pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
3
  # ruff: noqa: E501
4
  import os
 
5
  import time
6
  from dataclasses import asdict, dataclass
7
  from pathlib import Path
@@ -39,8 +40,10 @@ URL = "https://huggingface.co/TheBloke/Wizard-Vicuna-7B-Uncensored-GGML/raw/main
39
 
40
  url = "https://huggingface.co/savvamadar/ggml-gpt4all-j-v1.3-groovy/blob/main/ggml-gpt4all-j-v1.3-groovy.bin"
41
  url = "https://huggingface.co/TheBloke/Llama-2-13B-GGML/blob/main/llama-2-13b.ggmlv3.q4_K_S.bin" # 7.37G
42
- # url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.binhttps://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin" # 6.93G
 
43
  # url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin" # 7.87G
 
44
  url = "https://huggingface.co/localmodels/Llama-2-13B-Chat-ggml/blob/main/llama-2-13b-chat.ggmlv3.q4_K_S.bin" # 7.37G
45
 
46
  prompt_template="""Below is an instruction that describes a task. Write a response that appropriately completes the request.
@@ -88,11 +91,20 @@ logger.info("load llm")
88
  _ = Path(model_loc).absolute().as_posix()
89
  logger.debug(f"model_file: {_}, exists: {Path(_).exists()}")
90
  LLM = None
91
- LLM = AutoModelForCausalLM.from_pretrained(
92
- model_loc,
93
- model_type="llama",
94
- threads=cpu_count,
95
- )
 
 
 
 
 
 
 
 
 
96
 
97
  logger.info("done load llm")
98
 
@@ -458,4 +470,8 @@ with gr.Blocks(
458
 
459
  # concurrency_count=5, max_size=20
460
  # max_size=36, concurrency_count=14
461
- block.queue(concurrency_count=5, max_size=20).launch(debug=True)
 
 
 
 
 
2
  # pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
3
  # ruff: noqa: E501
4
  import os
5
+ import platform
6
  import time
7
  from dataclasses import asdict, dataclass
8
  from pathlib import Path
 
40
 
41
  url = "https://huggingface.co/savvamadar/ggml-gpt4all-j-v1.3-groovy/blob/main/ggml-gpt4all-j-v1.3-groovy.bin"
42
  url = "https://huggingface.co/TheBloke/Llama-2-13B-GGML/blob/main/llama-2-13b.ggmlv3.q4_K_S.bin" # 7.37G
43
+ # url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin"
44
+ url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin" # 6.93G
45
  # url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q4_K_M.bin" # 7.87G
46
+
47
  url = "https://huggingface.co/localmodels/Llama-2-13B-Chat-ggml/blob/main/llama-2-13b-chat.ggmlv3.q4_K_S.bin" # 7.37G
48
 
49
  prompt_template="""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 
91
  _ = Path(model_loc).absolute().as_posix()
92
  logger.debug(f"model_file: {_}, exists: {Path(_).exists()}")
93
  LLM = None
94
+
95
+ if "okteto" in platform.node():
96
+ # url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q2_K.bin"
97
+ LLM = AutoModelForCausalLM.from_pretrained(
98
+ "models/llama-2-13b-chat.ggmlv3.q2_K.bin",
99
+ model_type="llama",
100
+ threads=cpu_count,
101
+ )
102
+ else:
103
+ LLM = AutoModelForCausalLM.from_pretrained(
104
+ model_loc,
105
+ model_type="llama",
106
+ threads=cpu_count,
107
+ )
108
 
109
  logger.info("done load llm")
110
 
 
470
 
471
  # concurrency_count=5, max_size=20
472
  # max_size=36, concurrency_count=14
473
+ # CPU cpu_count=2 16G, model 7G
474
+ # CPU UPGRADE cpu_count=8 32G, model 7G
475
+
476
+ concurrency_count = max(psutil.virtual_memory().total / 10**9 // file_size - 1, 1)
477
+ block.queue(concurrency_count=concurrency_count, max_size=5).launch(debug=True)
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- ctransformers # ==0.2.10
2
  transformers # ==4.30.2
3
  # huggingface_hub
4
  gradio
 
1
+ ctransformers # ==0.2.10 0.2.13
2
  transformers # ==4.30.2
3
  # huggingface_hub
4
  gradio