LlamaFinetuneBase committed: "Upload README.md with huggingface_hub"

README.md CHANGED

@@ -8,11 +8,8 @@ extra_gated_prompt: >-
   Google’s usage license. To do this, please ensure you’re logged in to Hugging
   Face and click below. Requests are processed immediately.
 extra_gated_button_content: Acknowledge license
-base_model: google/gemma-2-27b
 ---
 
-
-
 # Gemma 2 model card
 
 **Model Page**: [Gemma](https://ai.google.dev/gemma/docs)
@@ -23,7 +20,7 @@ base_model: google/gemma-2-27b
 * [Gemma on Kaggle][kaggle-gemma]
 * [Gemma on Vertex Model Garden][vertex-mg-gemma]
 
-**Terms of Use**: [Terms](https://www.kaggle.com/models/google/gemma/license/consent/verify/huggingface?returnModelRepoId=google/gemma-2-27b-it)
+**Terms of Use**: [Terms](https://www.kaggle.com/models/google/gemma/license/consent/verify/huggingface?returnModelRepoId=google/gemma-2-27b)
 
 **Authors**: Google
 
@@ -60,19 +57,14 @@ from transformers import pipeline
 
 pipe = pipeline(
     "text-generation",
-    model="google/gemma-2-27b-it",
-    model_kwargs={"torch_dtype": torch.bfloat16},
+    model="google/gemma-2-27b",
     device="cuda",  # replace with "mps" to run on a Mac device
 )
 
-messages = [
-    {"role": "user", "content": "Who are you? Please, answer in pirate-speak."},
-]
-
-outputs = pipe(messages, max_new_tokens=256)
-assistant_response = outputs[0]["generated_text"][-1]["content"].strip()
-print(assistant_response)
-# Ahoy, matey! I be Gemma, a digital scallywag, a language-slingin' parrot of the digital seas. I be here to help ye with yer wordy woes, answer yer questions, and spin ye yarns of the digital world. So, what be yer pleasure, eh? 🦜
+text = "Once upon a time,"
+outputs = pipe(text, max_new_tokens=256)
+response = outputs[0]["generated_text"]
+print(response)
 ```
 
 #### Running the model on a single / multi GPU
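
For convenience, the updated pipeline example in the hunk above can be read as one self-contained script; this sketch just stitches the added lines together with the `from transformers import pipeline` context line referenced in the hunk header.

```python
# Assembled from the "+" lines above plus the surrounding import context.
# Requires: pip install -U transformers
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="google/gemma-2-27b",
    device="cuda",  # replace with "mps" to run on a Mac device
)

text = "Once upon a time,"
outputs = pipe(text, max_new_tokens=256)
response = outputs[0]["generated_text"]
print(response)
```

The plain-text prompt is consistent with the switch away from the chat-style `messages` list used by the removed instruction-tuned example.
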
@@ -82,47 +74,9 @@ print(assistant_response)
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b-it")
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b")
 model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2-27b-it",
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-
-input_text = "Write me a poem about Machine Learning."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-outputs = model.generate(**input_ids, max_new_tokens=32)
-print(tokenizer.decode(outputs[0]))
-```
-
-You can ensure the correct chat template is applied by using `tokenizer.apply_chat_template` as follows:
-```python
-messages = [
-    {"role": "user", "content": "Write me a poem about Machine Learning."},
-]
-input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to("cuda")
-
-outputs = model.generate(**input_ids, max_new_tokens=256)
-print(tokenizer.decode(outputs[0]))
-```
-
-<a name="precisions"></a>
-#### Running the model on a GPU using different precisions
-
-The native weights of this model were exported in `bfloat16` precision.
-
-You can also use `float32` if you skip the dtype, but no precision increase will occur (model weights will just be upcasted to `float32`). See examples below.
-
-* _Upcasting to `torch.float32`_
-
-```python
-# pip install accelerate
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b-it")
-model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2-27b-it",
+    "google/gemma-2-27b",
     device_map="auto",
 )
 
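
The loading code that survives this hunk is followed by generation lines that sit outside the hunk as unchanged context. For reference, a self-contained sketch that pairs the new base-model ID with the prompt and `generate()` call visible among the removed lines; the pairing itself is an assumption, not something the diff states.

```python
# Sketch: the "+" loading lines from this hunk, completed with the prompt and
# generate() call that appear among the removed ("-") lines above.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-27b",
    device_map="auto",
)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```
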
@@ -140,7 +94,7 @@ for running Gemma 2 through a command line interface, or CLI. Follow the [instal
 for getting started, then launch the CLI through the following command:
 
 ```shell
-local-gemma --model 27b --
+local-gemma --model "google/gemma-2-27b" --prompt "What is the capital of Mexico?"
 ```
 
 #### Quantized Versions through `bitsandbytes`
@@ -156,9 +110,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b-it")
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b")
 model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2-27b-it",
+    "google/gemma-2-27b",
     quantization_config=quantization_config,
 )
 
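
The 8-bit hunk only shows loading. Below is a hedged sketch of running a prompt through the quantized model; the prompt, token budget and the `model.get_memory_footprint()` check are illustrative additions, not part of the diff, and `bitsandbytes` plus `accelerate` need to be installed.

```python
# Illustrative continuation of the 8-bit loading shown in the hunk above.
# Requires: pip install -U transformers accelerate bitsandbytes
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-27b",
    quantization_config=quantization_config,
)

# Rough sanity check: 8-bit weights should take roughly half the bfloat16 footprint.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.1f} GB")

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```
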
@@ -181,9 +135,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
 quantization_config = BitsAndBytesConfig(load_in_4bit=True)
 
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b-it")
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b")
 model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2-27b-it",
+    "google/gemma-2-27b",
     quantization_config=quantization_config,
 )
 
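
For the 4-bit path, `BitsAndBytesConfig` accepts additional knobs beyond `load_in_4bit=True`. A hedged sketch follows; the NF4 quantization type and bfloat16 compute dtype are optional settings chosen for illustration, not taken from this diff.

```python
# Optional 4-bit settings layered on top of the plain load_in_4bit=True above.
# Requires: pip install -U transformers accelerate bitsandbytes
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NF4 storage format for the weights
    bnb_4bit_compute_dtype=torch.bfloat16,  # run the matmuls in bfloat16
)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-27b",
    quantization_config=quantization_config,
)
```

These settings only change how the 4-bit weights are stored and how compute is performed; generation afterwards works the same way as in the 8-bit example.
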
@@ -218,8 +172,8 @@ import torch
 torch.set_float32_matmul_precision("high")
 
 # load the model + tokenizer
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b-it")
-model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-27b-it", torch_dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b")
+model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-27b", torch_dtype=torch.bfloat16)
 model.to("cuda")
 
 # apply the torch compile transformation
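
This hunk cuts off just before the compile step. Below is a sketch of one typical way to finish the recipe; the static KV-cache setting and the `torch.compile` flags are common choices for this pattern, not read from the diff.

```python
# Sketch: static KV cache + compiled forward pass (a typical torch.compile recipe).
import torch
from transformers import AutoTokenizer, Gemma2ForCausalLM

torch.set_float32_matmul_precision("high")

# load the model + tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b")
model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-27b", torch_dtype=torch.bfloat16)
model.to("cuda")

# apply the torch compile transformation
model.generation_config.cache_implementation = "static"
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

# the first call pays the compilation cost; later calls with the same shapes are faster
inputs = tokenizer("Write me a poem about Machine Learning.", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```
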
@@ -257,57 +211,6 @@ For more details, refer to the [Transformers documentation](https://huggingface.
 
 </details>
 
-### Chat Template
-
-The instruction-tuned models use a chat template that must be adhered to for conversational use.
-The easiest way to apply it is using the tokenizer's built-in chat template, as shown in the following snippet.
-
-Let's load the model and apply the chat template to a conversation. In this example, we'll start with a single user interaction:
-
-```py
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import transformers
-import torch
-
-model_id = "google/gemma-2-27b-it"
-dtype = torch.bfloat16
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="cuda",
-    torch_dtype=dtype,
-)
-
-chat = [
-    { "role": "user", "content": "Write a hello world program" },
-]
-prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-```
-
-At this point, the prompt contains the following text:
-
-```
-<bos><start_of_turn>user
-Write a hello world program<end_of_turn>
-<start_of_turn>model
-```
-
-As you can see, each turn is preceded by a `<start_of_turn>` delimiter and then the role of the entity
-(either `user`, for content supplied by the user, or `model` for LLM responses). Turns finish with
-the `<end_of_turn>` token.
-
-You can follow this format to build the prompt manually, if you need to do it without the tokenizer's
-chat template.
-
-After the prompt is ready, generation can be performed like this:
-
-```py
-inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
-outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=150)
-print(tokenizer.decode(outputs[0]))
-```
-
 ### Inputs and outputs
 
 * **Input:** Text string, such as a question, a prompt, or a document to be
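
The removed Chat Template section states that the prompt format can be built manually when the tokenizer's template is not used. Here is a minimal sketch of that, reusing the turn delimiters and example message quoted in the removed text; loading the instruction-tuned tokenizer is an assumption carried over from that section.

```python
# Build the Gemma chat prompt by hand, mirroring the format quoted in the
# removed section: <bos><start_of_turn>user ... <end_of_turn>\n<start_of_turn>model
from transformers import AutoTokenizer

# The removed section targets the instruction-tuned checkpoint, so its
# tokenizer is used here (an assumption, not part of the updated README).
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-27b-it")

user_message = "Write a hello world program"
prompt = (
    "<bos><start_of_turn>user\n"
    f"{user_message}<end_of_turn>\n"
    "<start_of_turn>model\n"
)

# <bos> is already in the string, so skip the tokenizer's special tokens,
# exactly as the removed generation snippet does.
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
print(inputs.shape)
```

For real chat use, the tokenizer's `apply_chat_template` shown in the removed snippet remains the safer route, since it tracks any template changes shipped with the tokenizer.
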