Update README.md
README.md CHANGED
@@ -43,7 +43,7 @@ model_kwargs = {}
 # optional quantization
 quantization_config = BitsAndBytesConfig(
     load_in_8bit=True,
-    llm_int8_threshold=
+    llm_int8_threshold=6.0,
 )
 model_kwargs["quantization_config"] = quantization_config
 
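The change fills in the previously dangling `llm_int8_threshold=` with `6.0`, the bitsandbytes default outlier threshold for LLM.int8(). For context, a minimal sketch of how the resulting `model_kwargs` dict is typically consumed; the `pipeline()` call below is an illustrative assumption, not part of this diff:

```python
import torch
from transformers import BitsAndBytesConfig, pipeline

model_kwargs = {}

# optional quantization
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,  # outlier threshold for LLM.int8(); 6.0 is the library default
)
model_kwargs["quantization_config"] = quantization_config

# assumed usage: pipeline() forwards model_kwargs to from_pretrained()
generate_text = pipeline(
    "text-generation",
    model="psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map={"": "cuda:0"},
    model_kwargs=model_kwargs,
)
```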
@@ -93,18 +93,28 @@ Alternatively, if you prefer to not use `trust_remote_code=True` you can downloa
 ```python
 import torch
 from h2oai_pipeline import H2OTextGenerationPipeline
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+quantization_config = None
+# optional quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    llm_int8_threshold=6.0,
+)
 
 tokenizer = AutoTokenizer.from_pretrained(
     "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
     use_fast=False,
-    padding_side="left"
+    padding_side="left",
+    trust_remote_code=True,
 )
 model = AutoModelForCausalLM.from_pretrained(
     "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
+    trust_remote_code=True,
     torch_dtype=torch.float16,
-    device_map={"": "cuda:0"}
-)
+    device_map={"": "cuda:0"},
+    quantization_config=quantization_config
+).eval()
 generate_text = H2OTextGenerationPipeline(model=model, tokenizer=tokenizer)
 
 res = generate_text(
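The new code builds an optional `BitsAndBytesConfig`, passes it to `from_pretrained()`, and puts the model in inference mode with `.eval()`; `padding_side="left"` is the usual choice for decoder-only generation. The hunk cuts off at `res = generate_text(`; a hedged sketch of how such a call commonly continues (the prompt and generation kwargs are assumptions, only `print(res[0]["generated_text"])` is attested by the next hunk's context line):

```python
# hypothetical completion of the truncated call; kwargs are illustrative
res = generate_text(
    "Why is drinking water so healthy?",
    max_new_tokens=256,
    do_sample=False,
    num_beams=1,
    repetition_penalty=1.2,
)
print(res[0]["generated_text"])
```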
@@ -124,16 +134,33 @@ print(res[0]["generated_text"])
 You may also construct the pipeline from the loaded model and tokenizer yourself and consider the preprocessing steps:
 
 ```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
-model_name = "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1" # either local folder or huggingface model name
 # Important: The prompt needs to be in the same format the model was trained with.
 # You can find an example prompt in the experiment logs.
 prompt = "<|prompt|>How are you?<|endoftext|><|answer|>"
 
-
-
-
+quantization_config = None
+# optional quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    llm_int8_threshold=6.0,
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
+    use_fast=False,
+    padding_side="left",
+    trust_remote_code=True,
+)
+model = AutoModelForCausalLM.from_pretrained(
+    "psinger/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    device_map={"": "cuda:0"},
+    quantization_config=quantization_config
+).eval()
+
 inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
 
 # generate configuration can be modified to your needs
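This hunk ends at the comment about the generate configuration. A hedged sketch of the `generate`/`decode` continuation that typically follows such a manual-pipeline example; the specific kwargs are assumptions:

```python
# generate configuration can be modified to your needs
# (hypothetical continuation; the diff ends before this point)
tokens = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    min_new_tokens=2,
    max_new_tokens=256,
    do_sample=False,
)[0]

# decode only the newly generated tokens, skipping the prompt portion
answer = tokenizer.decode(
    tokens[inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(answer)
```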