alokabhishek committed on
Commit 773b9bf
1 Parent(s): 83b7d62

Updated Readme

Files changed (1)
  1. README.md +64 -1
README.md CHANGED
@@ -12,7 +12,7 @@ tags:
  - 4.0-bpw
  ---

- # Model Card for alokabhishek/Mistral-7B-Instruct-v0.2-5.0-bpw-exl2
+ # Model Card for alokabhishek/Mistral-7B-Instruct-v0.2-4.0-bpw-exl2

  <!-- Provide a quick summary of what the model is/does. -->
  This repo contains a 4-bit quantized (using ExLlamaV2) version of Mistral AI's Mistral-7B-Instruct-v0.2
@@ -79,6 +79,69 @@ model_name = model_id.split("/")[-1]
  ```shell
  # Run model
  !python exllamav2/test_inference.py -m {model_name}/ -p "Tell me a funny joke about Large Language Models meeting a Blackhole in an intergalactic Bar."
+ ```
+
+ ```python
+ import sys, os
+
+ # Make the local exllamav2 checkout importable (assumes this script lives in its examples folder)
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+ from exllamav2 import (
+     ExLlamaV2,
+     ExLlamaV2Config,
+     ExLlamaV2Cache,
+     ExLlamaV2Tokenizer,
+ )
+
+ from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
+
+ import time
+
+ # Initialize model and cache
+
+ model_directory = "/model_path/Mistral-7B-Instruct-v0.2-4.0-bpw-exl2/"
+ print("Loading model: " + model_directory)
+
+ config = ExLlamaV2Config(model_directory)
+ model = ExLlamaV2(config)
+ cache = ExLlamaV2Cache(model, lazy=True)
+ model.load_autosplit(cache)
+ tokenizer = ExLlamaV2Tokenizer(config)
+
+ # Initialize generator
+
+ generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
+
+ # Generate some text
+
+ settings = ExLlamaV2Sampler.Settings()
+ settings.temperature = 0.85
+ settings.top_k = 50
+ settings.top_p = 0.8
+ settings.token_repetition_penalty = 1.01
+ settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
+
+ prompt = "Tell me a funny joke about Large Language Models meeting a Blackhole in an intergalactic Bar."
+
+ max_new_tokens = 512
+
+ generator.warmup()
+ time_begin = time.time()
+
+ output = generator.generate_simple(prompt, settings, max_new_tokens, seed=1234)
+
+ time_end = time.time()
+ time_total = time_end - time_begin
+
+ print(output)
+ print()
+ print(
+     f"Response generated in {time_total:.2f} seconds, {max_new_tokens} tokens, {max_new_tokens / time_total:.2f} tokens/second"
+ )
+
  ```

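The committed example assumes the exl2 weights are already on disk at `model_directory`. A minimal sketch of one way to fetch them first, using `huggingface_hub`'s `snapshot_download`; the local directory name below is an illustrative assumption, not something the README specifies:

```python
# Sketch only: download the quantized weights before running the inference
# example above. Assumes huggingface_hub is installed; local_dir is an
# arbitrary example path.
from huggingface_hub import snapshot_download

model_directory = snapshot_download(
    repo_id="alokabhishek/Mistral-7B-Instruct-v0.2-4.0-bpw-exl2",
    local_dir="Mistral-7B-Instruct-v0.2-4.0-bpw-exl2",
)
print("Model downloaded to:", model_directory)
```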