mwitiderrick committed
Commit acccbfa • 1 Parent(s): 6094ea3
Update README.md

README.md CHANGED
@@ -23,13 +23,12 @@ Run in a [Python pipeline](https://github.com/neuralmagic/deepsparse/blob/main/d
 ```python
 from deepsparse import TextGeneration
 
-template= "
-
-prompt = "How to get into a good university?"
+template = "<s>[INST] {prompt} [/INST]"
+prompt = "Write a quick sort algorithm in Python"
 
 input_str = template.format(prompt=prompt)
 
-model = TextGeneration(model_path="hf:nm-testing/
+model = TextGeneration(model_path="hf:nm-testing/glaive-coder-7b-pruned50-quant-ds")
 
 print(model(input_str, max_new_tokens=200).generations[0].text)
 """
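For reference, the updated snippet from the `+` lines above, assembled end to end (a sketch: it assumes `deepsparse` is installed and can fetch the model from the Hugging Face Hub via the `hf:` prefix):

```python
from deepsparse import TextGeneration

# Llama-2-style instruction template expected by glaive-coder-7b
template = "<s>[INST] {prompt} [/INST]"
prompt = "Write a quick sort algorithm in Python"

input_str = template.format(prompt=prompt)

# Pull the sparsified, quantized ONNX model from the Hugging Face Hub
model = TextGeneration(model_path="hf:nm-testing/glaive-coder-7b-pruned50-quant-ds")

# Generate up to 200 new tokens and print the completion
print(model(input_str, max_new_tokens=200).generations[0].text)
```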
@@ -51,7 +50,7 @@ For details on how this model was sparsified, see the `recipe.yaml` in this repo
 git clone https://github.com/neuralmagic/sparseml
 pip install -e "sparseml[transformers]"
 python sparseml/src/sparseml/transformers/sparsification/obcq/obcq.py glaiveai/glaive-coder-7b open_platypus --recipe recipe.yaml --save True
-python sparseml/src/sparseml/transformers/sparsification/obcq/export.py --task text-generation --model_path obcq_deployment
+python sparseml/src/sparseml/transformers/sparsification/obcq/export.py --sequence_length 4096 --task text-generation --model_path obcq_deployment
 cp deployment/model.onnx deployment/model-orig.onnx
 ```
 Run this kv-cache injection to speed up the model at inference by caching the Key and Value states:
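The injection snippet itself falls outside this hunk. A minimal sketch of what these Neural Magic model cards typically use, assuming `sparseml`'s `KeyValueCacheInjector` and the `deployment/` paths from the commands above:

```python
import os

import onnx
from sparseml.exporters.kv_cache_injector import KeyValueCacheInjector

input_file = "deployment/model-orig.onnx"
output_file = "deployment/model.onnx"

# Load the exported graph; keep weights external to limit memory use
model = onnx.load(input_file, load_external_data=False)

# Rewrite the graph so Key/Value states are cached between decode steps
model = KeyValueCacheInjector(model_path=os.path.dirname(input_file)).apply(model)

onnx.save(model, output_file)
print(f"Modified model saved to: {output_file}")
```

Caching the Key and Value states lets each decode step attend over stored activations instead of recomputing attention for the whole prefix, which is where the inference speedup comes from.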