Update README.md
README.md CHANGED
@@ -43,7 +43,7 @@ model_kwargs = {}
 # optional quantization
 quantization_config = BitsAndBytesConfig(
     load_in_8bit=True,
-    llm_int8_threshold=
+    llm_int8_threshold=6.0,
 )
 model_kwargs["quantization_config"] = quantization_config
 
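The change fills in the previously dangling `llm_int8_threshold=` with `6.0`, the bitsandbytes default outlier threshold for LLM.int8(). For context, a minimal sketch of how the resulting `model_kwargs` dict is typically consumed; the `pipeline()` call below is an illustrative assumption, not part of this diff:

```python
import torch
from transformers import BitsAndBytesConfig, pipeline

model_kwargs = {}

# optional quantization
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,  # outlier threshold for LLM.int8(); 6.0 is the library default
)
model_kwargs["quantization_config"] = quantization_config

# assumed usage: pipeline() forwards model_kwargs to from_pretrained()
generate_text = pipeline(
    "text-generation",
    model="psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map={"": "cuda:0"},
    model_kwargs=model_kwargs,
)
```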
@@ -93,18 +93,28 @@ Alternatively, if you prefer to not use `trust_remote_code=True` you can downloa
 ```python
 import torch
 from h2oai_pipeline import H2OTextGenerationPipeline
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+quantization_config = None
+# optional quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    llm_int8_threshold=6.0,
+)
 
 tokenizer = AutoTokenizer.from_pretrained(
     "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
     use_fast=False,
-    padding_side="left"
+    padding_side="left",
+    trust_remote_code=True,
 )
 model = AutoModelForCausalLM.from_pretrained(
     "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
+    trust_remote_code=True,
     torch_dtype=torch.float16,
-    device_map={"": "cuda:0"}
-)
+    device_map={"": "cuda:0"},
+    quantization_config=quantization_config
+).eval()
 generate_text = H2OTextGenerationPipeline(model=model, tokenizer=tokenizer)
 
 res = generate_text(
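The new code builds an optional `BitsAndBytesConfig`, passes it to `from_pretrained()`, and puts the model in inference mode with `.eval()`; `padding_side="left"` is the usual choice for decoder-only generation. The hunk cuts off at `res = generate_text(`; a hedged sketch of how such a call commonly continues (the prompt and generation kwargs are assumptions, only `print(res[0]["generated_text"])` is attested by the next hunk's context line):

```python
# hypothetical completion of the truncated call; kwargs are illustrative
res = generate_text(
    "Why is drinking water so healthy?",
    max_new_tokens=256,
    do_sample=False,
    num_beams=1,
    repetition_penalty=1.2,
)
print(res[0]["generated_text"])
```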
@@ -124,16 +134,33 @@ print(res[0]["generated_text"])
 You may also construct the pipeline from the loaded model and tokenizer yourself and consider the preprocessing steps:
 
 ```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
-model_name = "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1" # either local folder or huggingface model name
 # Important: The prompt needs to be in the same format the model was trained with.
 # You can find an example prompt in the experiment logs.
 prompt = "<|prompt|>How are you?<|endoftext|><|answer|>"
 
-
-
-
+quantization_config = None
+# optional quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    llm_int8_threshold=6.0,
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
+    use_fast=False,
+    padding_side="left",
+    trust_remote_code=True,
+)
+model = AutoModelForCausalLM.from_pretrained(
+    "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    device_map={"": "cuda:0"},
+    quantization_config=quantization_config
+).eval()
+
 inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
 
 # generate configuration can be modified to your needs
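This hunk ends at the comment about the generate configuration. A hedged sketch of the `generate`/`decode` continuation that typically follows such a manual-pipeline example; the specific kwargs are assumptions:

```python
# generate configuration can be modified to your needs
# (hypothetical continuation; the diff ends before this point)
tokens = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    min_new_tokens=2,
    max_new_tokens=256,
    do_sample=False,
)[0]

# decode only the newly generated tokens, skipping the prompt portion
answer = tokenizer.decode(
    tokens[inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(answer)
```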