Update app.py
app.py CHANGED
@@ -9,7 +9,8 @@ from threading import Thread
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 MODEL_ID = "CohereForAI/aya-23-8B"
-
+MODEL_ID2 = "CohereForAI/aya-23-35B"
+MODEL_NAME = MODEL_ID2.split("/")[-1]
 
 TITLE = "<h1><center>Aya-23-Chatbox</center></h1>"
 
@@ -34,26 +35,27 @@ USE_FLASH_ATTENTION = False
 GRAD_ACC_STEPS = 16
 
 quantization_config = None
+
 if QUANTIZE_4BIT:
-
-
-
-
-
-
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
 
 attn_implementation = None
 if USE_FLASH_ATTENTION:
-
+    attn_implementation="flash_attention_2"
 
 model = AutoModelForCausalLM.from_pretrained(
-
+    MODEL_ID2,
     quantization_config=quantization_config,
     attn_implementation=attn_implementation,
     torch_dtype=torch.bfloat16,
     device_map="auto",
 )
-tokenizer = AutoTokenizer.from_pretrained(
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID2)
 
 @spaces.GPU
 def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int):
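With QUANTIZE_4BIT enabled, the NF4 config added above is what makes the jump from the 8B to the 35B checkpoint practical on a single GPU. A back-of-the-envelope sketch of the weight footprint alone (approximate figures; activations, KV cache, and quantization constants not counted):

# Rough weight-memory estimate for a ~35B-parameter model.
params = 35e9
bf16_gb = params * 2 / 1024**3    # 2 bytes per weight in bfloat16 -> ~65 GB
nf4_gb = params * 0.5 / 1024**3   # ~0.5 bytes per weight in 4-bit NF4 -> ~16 GB
print(f"bf16: ~{bf16_gb:.0f} GB  nf4: ~{nf4_gb:.0f} GB")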
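The hunk context shows "from threading import Thread", and the new side keeps the @spaces.GPU-decorated stream_chat signature, but the function body falls outside this diff. Below is a minimal sketch of how such a streaming handler is commonly wired up with transformers' TextIteratorStreamer; the body is an assumption about the usual pattern, not this Space's actual code, and the (user, assistant) pair format for history is likewise assumed.

from threading import Thread
from transformers import TextIteratorStreamer

@spaces.GPU
def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int):
    # Rebuild the running conversation in the role/content format
    # expected by the tokenizer's chat template.
    conversation = []
    for user_turn, assistant_turn in history:  # assumed (user, assistant) pairs
        conversation.append({"role": "user", "content": user_turn})
        conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    # Emit tokens as they are produced instead of waiting for the full reply.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=temperature > 0,
    )
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial  # Gradio re-renders each progressively longer string.

Running model.generate on a background Thread is what lets the handler iterate over the streamer concurrently: generate pushes decoded text into the streamer while the generator yields partial output back to the UI.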