Update README.md
Browse files
README.md
CHANGED
@@ -71,10 +71,9 @@ sound_tokens = audio_to_sound_tokens("/path/to/your/audio/file")
|
|
71 |
Then, we can run inference on the model the same as any other LLM.
|
72 |
|
73 |
```python
|
74 |
-
import torch
|
75 |
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
|
76 |
|
77 |
-
def setup_pipeline(model_path, use_4bit=True):
|
78 |
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
79 |
|
80 |
model_kwargs = {"device_map": "auto"}
|
@@ -86,6 +85,12 @@ def setup_pipeline(model_path, use_4bit=True):
|
|
86 |
bnb_4bit_use_double_quant=True,
|
87 |
bnb_4bit_quant_type="nf4",
|
88 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
else:
|
90 |
model_kwargs["torch_dtype"] = torch.bfloat16
|
91 |
|
|
|
71 |
Then, we can run inference on the model the same as any other LLM.
|
72 |
|
73 |
```python
|
|
|
74 |
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
|
75 |
|
76 |
+
def setup_pipeline(model_path, use_4bit=False, use_8bit=False):
|
77 |
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
78 |
|
79 |
model_kwargs = {"device_map": "auto"}
|
|
|
85 |
bnb_4bit_use_double_quant=True,
|
86 |
bnb_4bit_quant_type="nf4",
|
87 |
)
|
88 |
+
elif use_8bit:
|
89 |
+
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
90 |
+
load_in_8bit=True,
|
91 |
+
bnb_8bit_compute_dtype=torch.bfloat16,
|
92 |
+
bnb_8bit_use_double_quant=True,
|
93 |
+
)
|
94 |
else:
|
95 |
model_kwargs["torch_dtype"] = torch.bfloat16
|
96 |
|