!python -m pip install --upgrade pip -q
!pip install -q accelerate safetensors deepspeed
!pip install -q bitsandbytes sentencepiece
!pip install -q scipy ninja -U
!pip install git+https://github.com/mobiusml/hqq/ transformers -U -q
import transformers
print(transformers.__version__)
model_id = 'NickyNicky/Hermes-2-Pro-Mistral-7B-4bit_g128-HQQ'
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load the pre-quantized HQQ checkpoint directly onto the GPU.
model = HQQModelForCausalLM.from_quantized(model_id, device="cuda:0")
model.config.use_cache = True
model.eval();
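# Optional sanity check (not part of the original notebook; a minimal sketch using the
# standard torch.cuda API): report roughly how much GPU memory the 4-bit model occupies.
import torch
print(f"GPU memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GiB")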
system = ""
# User prompt (Spanish): "write only three words that contain the letter 'T'"
contenido = "escribe solo tres palabras que contengan la letra 'T'"

# System prompt (Spanish): "you are an AI model that responds appropriately to the exact
# tasks the user asks of you; the language you must respond in is Spanish."
messages = [
    {"role": "system", "content": system + "eres un modelo de AI que responde adecuadamente a las tareas exactas que te pide el usuario, el idioma a la cual debes de responder es español."},
    {"role": "user", "content": contenido},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
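# Optional: inspect the fully formatted prompt string produced by the chat template.
print(prompt)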
# Tokenize the formatted prompt and move the tensors to the model's device.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
generate_params = dict(
    max_new_tokens=1900,
    do_sample=True,
    top_p=0.90,
    top_k=50,
    temperature=0.6,
    repetition_penalty=1.0,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
output = model.generate(**inputs, **generate_params)
print(tokenizer.decode(output[0], skip_special_tokens=True))
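# Optional (a sketch, assuming the HQQ wrapper forwards standard generate() kwargs):
# decode only the newly generated tokens, skipping the prompt that is echoed at the
# start of output[0].
prompt_len = inputs["input_ids"].shape[-1]
print(tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True))

# Streaming variant (also a sketch): print tokens as they are generated using
# transformers' TextStreamer instead of decoding everything at the end.
from transformers import TextStreamer
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = model.generate(**inputs, streamer=streamer, **generate_params)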
Run this on a GPU runtime. A ready-to-use Colab notebook is available here:
https://colab.research.google.com/drive/1oEoH0qScGzkLV4WLGrMEMgl4qnEsZhTs?usp=sharing