Commit 55aee82 · 1 parent: 618f4ed
Update README.md
README.md CHANGED
@@ -49,13 +49,18 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True,
+    device_map="auto",
+)
+
 prompt = [
     {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
     {"role": "user", "content": "What's Deep Learning?"},
 ]
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
 inputs = tokenizer.apply_chat_template(
     prompt,
     tokenize=True,
@@ -64,13 +69,6 @@ inputs = tokenizer.apply_chat_template(
     return_dict=True,
 ).to("cuda")
 
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True,
-    device_map="auto",
-)
-
 outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
 print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 ```
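For readability, here is the transformers snippet as it reads after the two hunks above are applied. It is reconstructed from the diff: the indentation and the two `apply_chat_template` arguments that fall outside the shown context are assumptions, not part of this commit's visible changes.

```python
# Reconstructed post-commit snippet: loading now happens right after model_id,
# before the prompt is built (net effect of the two hunks above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

prompt = [
    {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
    {"role": "user", "content": "What's Deep Learning?"},
]
inputs = tokenizer.apply_chat_template(
    prompt,
    tokenize=True,
    add_generation_prompt=True,  # assumed: outside the diff context shown above
    return_tensors="pt",         # assumed: outside the diff context shown above
    return_dict=True,
).to("cuda")

outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```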
@@ -92,13 +90,18 @@ from auto_gptq import AutoGPTQForCausalLM
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoGPTQForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True,
+    device_map="auto",
+)
+
 prompt = [
     {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
     {"role": "user", "content": "What's Deep Learning?"},
 ]
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
 inputs = tokenizer.apply_chat_template(
     prompt,
     tokenize=True,
@@ -107,13 +110,6 @@ inputs = tokenizer.apply_chat_template(
     return_dict=True,
 ).to("cuda")
 
-model = AutoGPTQForCausalLM.from_pretrained(
-    model_id,
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True,
-    device_map="auto",
-)
-
 outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
 print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 ```
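The auto_gptq hunks make the same move. As a minimal sketch reconstructed from the diff (imports and indentation assumed), the relocated loading block is identical to the transformers one except for the loader class:

```python
# Relocated loading block in the auto_gptq example, reconstructed from the diff;
# it mirrors the transformers version above, with AutoGPTQForCausalLM as the loader.
import torch
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoGPTQForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)
```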
@@ -135,7 +131,6 @@ Then you just need to run the TGI v2.2.0 (or higher) Docker container as follows
 docker run --gpus all --shm-size 1g -ti -p 8080:80 \
     -v hf_cache:/data \
     -e MODEL_ID=hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 \
-    -e NUM_SHARD=4 \
     -e QUANTIZE=gptq \
     -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \
     -e MAX_INPUT_LENGTH=4000 \
@@ -214,7 +209,6 @@ docker run --runtime nvidia --gpus all --ipc=host -p 8000:8000 \
     vllm/vllm-openai:latest \
     --model hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 \
     --quantization gptq_marlin \
-    --tensor-parallel-size 4 \
     --max-model-len 4096
 ```
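Both containers touched by this commit expose an OpenAI-compatible HTTP API (TGI behind host port 8080, vLLM behind port 8000). As a hedged sketch that is not part of this commit, a quick client-side check could look like the following; the ports and model id come from the docker commands in the diff, while the `openai` client usage is an assumption about the client library rather than anything in this README.

```python
# Minimal smoke test against the servers launched above (not part of this commit).
# TGI was mapped to host port 8080; for the vLLM container use port 8000 instead.
from openai import OpenAI  # assumes `pip install openai`

client = OpenAI(
    base_url="http://localhost:8080/v1",  # vLLM: "http://localhost:8000/v1"
    api_key="-",  # local servers do not check the key, but the field is required
)

response = client.chat.completions.create(
    model="hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4",
    messages=[
        {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
        {"role": "user", "content": "What's Deep Learning?"},
    ],
    max_tokens=256,
)
print(response.choices[0].message.content)
```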