Upload 2 files
- app.py +10 -3
- requirements.txt +0 -1
app.py
CHANGED
@@ -6,8 +6,8 @@ import gradio as gr
 import spaces
 import torch
 
-
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, AwqConfig
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -16,12 +16,19 @@ ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
 
 model_id = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
 
-model = AutoModelForCausalLM.from_pretrained(
+quantization_config = AwqConfig(
+    bits=4,
+    fuse_max_seq_len=512,  # Note: Update this as per your use-case
+    do_fuse=True,
+)
+
+model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
     device_map="auto",
     trust_remote_code=True,
     low_cpu_mem_usage=True,
+    quantization_config=quantization_config,
     token=ACCESS_TOKEN)
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
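For context: passing do_fuse=True in AwqConfig turns on AutoAWQ's fused attention/MLP kernels, which speed up decoding but cap the total sequence length (prompt plus generated tokens) at fuse_max_seq_len. A minimal sketch of how the model and tokenizer loaded above are typically driven with TextIteratorStreamer; the prompt and sampling values below are illustrative, not taken from this commit:

from threading import Thread

from transformers import TextIteratorStreamer

# Illustrative usage of the model/tokenizer defined in app.py (not part of this commit).
messages = [{"role": "user", "content": "Summarize AWQ quantization in one sentence."}]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# Stream decoded text as it is produced, skipping the prompt itself.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    # With do_fuse=True, keep prompt + new tokens under fuse_max_seq_len (512 here).
    max_new_tokens=256,
    do_sample=True,
    temperature=0.6,
)

# generate() blocks, so run it on a worker thread and consume the stream here.
Thread(target=model.generate, kwargs=generation_kwargs).start()
for chunk in streamer:
    print(chunk, end="", flush=True)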
requirements.txt
CHANGED
@@ -246,4 +246,3 @@ einops
 pytest
 gguf>=0.10.0
 autoawq
-awq
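Dropping the awq pin is presumably a redundancy fix rather than a functional change: the autoawq distribution already installs the awq Python module that the AWQ integration imports, so listing awq separately is unnecessary.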