pankajmathur committed on
Commit 2ae7263
1 Parent(s): ccb2558

Update README.md

Files changed (1)
  1. README.md +55 -14
README.md CHANGED
@@ -39,36 +39,77 @@ Hello Orca Mini, what can you do for me?<|eot_id|>
 <|start_header_id|>assistant<|end_header_id|>
 ```
 
-Below shows a code example on how to use this model in default(bf16) format
+Below shows a code example on how to use this model in default full precision (bf16) format, it requires around ~130GB VRAM
 
 ```python
-from transformers import AutoModel, AutoTokenizer
+import torch
+from transformers import pipeline
+
+model_slug = "pankajmathur/orca_mini_v8_1_70b"
+pipeline = pipeline(
+    "text-generation",
+    model=model_slug,
+    device_map="auto",
+)
+messages = [
+    {"role": "system", "content": "You are Orca Mini, a helpful AI assistant."},
+    {"role": "user", "content": "Hello Orca Mini, what can you do for me?"}
+]
+outputs = pipeline(messages, max_new_tokens=128, do_sample=True, temperature=0.01, top_k=100, top_p=0.95)
+print(outputs[0]["generated_text"][-1])
+```
+
+Below shows a code example on how to use this model in 4-bit format via bitsandbytes library, it requires around ~39GB VRAM
+
+```python
+import torch
+from transformers import BitsAndBytesConfig, pipeline
+
 model_slug = "pankajmathur/orca_mini_v8_1_70b"
-model = AutoModel.from_pretrained(model_slug)
-tokenizer = AutoTokenizer.from_pretrained(model_slug)
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype="float16",
+    bnb_4bit_use_double_quant=True,
+)
+pipeline = pipeline(
+    "text-generation",
+    model=model_slug,
+    model_kwargs={"quantization_config": quantization_config},
+    device_map="auto",
+)
 messages = [
     {"role": "system", "content": "You are Orca Mini, a helpful AI assistant."},
     {"role": "user", "content": "Hello Orca Mini, what can you do for me?"}
 ]
-gen_input = tokenizer.apply_chat_template(messages, return_tensors="pt")
-model.generate(**gen_input)
+outputs = pipeline(messages, max_new_tokens=128, do_sample=True, temperature=0.01, top_k=100, top_p=0.95)
+print(outputs[0]["generated_text"][-1])
+
 ```
 
-Below shows a code example on how to use this model in 4-bit format via bitsandbytes library
+Below shows a code example on how to use this model in 8-bit format via bitsandbytes library, it requires around ~69GB VRAM
 
 ```python
-from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
+import torch
+from transformers import BitsAndBytesConfig, pipeline
+
 model_slug = "pankajmathur/orca_mini_v8_1_70b"
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    model_slug, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
-tokenizer = AutoTokenizer.from_pretrained(model_slug)
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True
+)
+pipeline = pipeline(
+    "text-generation",
+    model=model_slug,
+    model_kwargs={"quantization_config": quantization_config},
+    device_map="auto",
+)
 messages = [
     {"role": "system", "content": "You are Orca Mini, a helpful AI assistant."},
     {"role": "user", "content": "Hello Orca Mini, what can you do for me?"}
 ]
-gen_input = tokenizer.apply_chat_template(messages, return_tensors="pt")
-quantized_model.generate(**gen_input)
+outputs = pipeline(messages, max_new_tokens=128, do_sample=True, temperature=0.01, top_k=100, top_p=0.95)
+print(outputs[0]["generated_text"][-1])
+
 ```
 
 Below shows a code example on how to do a tool use with this model and tranformer library
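
The tool-use example that last context line refers to falls outside the hunk shown above. As a rough, illustrative sketch only (not the README's own snippet), tool calling with the transformers chat-template API generally looks like the following; the `get_current_weather` helper, the prompt, and the generation settings are assumptions made here for demonstration.

```python
# Illustrative sketch, assuming the standard transformers tool-use flow
# rather than the exact example from the README.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_slug = "pankajmathur/orca_mini_v8_1_70b"
tokenizer = AutoTokenizer.from_pretrained(model_slug)
model = AutoModelForCausalLM.from_pretrained(
    model_slug, torch_dtype=torch.bfloat16, device_map="auto"
)

def get_current_weather(location: str) -> str:
    """
    Get the current weather for a given location.

    Args:
        location: The city to look up, e.g. "Paris"
    """
    return "sunny"  # hypothetical tool; only its signature/docstring feed the prompt

messages = [
    {"role": "system", "content": "You are Orca Mini, a helpful AI assistant."},
    {"role": "user", "content": "What is the weather in Paris right now?"},
]

# The chat template serializes the tool schema into the prompt; the model is
# then expected to reply with a tool call the caller can parse and execute.
inputs = tokenizer.apply_chat_template(
    messages,
    tools=[get_current_weather],
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```

In a full round trip, the caller parses the returned tool call, runs the function, appends the result to `messages` as a `tool`-role turn, and generates again to get the final answer.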