Conversion of https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct into the ```ctranslate2``` format using ```int8``` quantization.
NOTE #1: This requires a version of ```ctranslate2``` GREATER THAN 4.5.0.
NOTE #2: The sample scripts below assume that the necessary ```CUDA``` and ```CUDNN``` libraries were installed with ```pip```. If you rely on a systemwide installation instead, adjust your code accordingly.
Requirements:
- torch 2.4.0+cu124
- nvidia-cublas-cu12 12.4.2.65
- nvidia-cuda-nvrtc-cu12 12.4.99
- nvidia-cuda-runtime-cu12 12.4.99
- nvidia-cudnn-cu12 9.1.0.70
- numpy==1.26.4 (YOU MUST DOWNGRADE FROM THE NUMPY VERSION THAT CTRANSLATE2 INSTALLS BY DEFAULT)
- All other traditional dependencies like ```transformers```, ```accelerate```, etc., plus ```pynvml```, which both sample scripts import.
Sample Script #1 (non-streaming):
```
import sys
import os
# Set before any of the heavier libraries further down are imported.
# NOTE(review): presumably tells the OpenMP runtime to tolerate being loaded
# more than once in the same process - confirm against the OpenMP docs.
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'
from pathlib import Path
def set_cuda_paths():
    """Prepend the pip-installed NVIDIA ``bin`` folders to CUDA_PATH, CUDA_PATH_V12_4 and PATH.

    The folders are located relative to the running interpreter, using the
    hard-coded layout ``<two levels above sys.executable>/Lib/site-packages/nvidia/<package>/bin``.
    Any value already stored in a variable is kept after the new entries.
    """
    nvidia_root = Path(sys.executable).parent.parent / 'Lib' / 'site-packages' / 'nvidia'

    # Order matters for lookup: runtime, cuBLAS, cuDNN, then NVRTC.
    prefix = os.pathsep.join(
        str(nvidia_root / package / 'bin')
        for package in ('cuda_runtime', 'cublas', 'cudnn', 'cuda_nvrtc')
    )

    for name in ('CUDA_PATH', 'CUDA_PATH_V12_4', 'PATH'):
        existing = os.environ.get(name, '')
        if existing:
            os.environ[name] = prefix + os.pathsep + existing
        else:
            # Unset or empty: avoid leaving a trailing separator.
            os.environ[name] = prefix
# Called before ctranslate2 and torch are imported, so the environment
# variables it sets are already in place when those libraries load.
set_cuda_paths()
import ctranslate2
import gc
import torch
from transformers import AutoTokenizer
import pynvml
# Local module that must supply the two strings interpolated by build_prompt().
from constants import user_message, system_message
# NVML is initialised and a handle for device index 0 is obtained; nothing
# else in this script reads `handle` afterwards.
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# Placeholder: replace with the folder the generator and tokenizer are loaded from.
model_dir = r"[INSERT PATH TO FOLDER CONTAINING THE MODEL FILES HERE]"
def build_prompt():
    """Return the chat prompt built from ``system_message`` and ``user_message``.

    Each message is wrapped in ``<|im_start|>role`` / ``<|im_end|>`` markers and
    the prompt ends with an opened assistant turn followed by a newline.
    """
    segments = (
        f"<|im_start|>system\n{system_message}<|im_end|>",
        f"<|im_start|>user\n{user_message}<|im_end|>",
        "<|im_start|>assistant\n",
    )
    return "\n".join(segments)
def main():
    """Generate a single completion for the prompt and print it.

    Loads the model in ``model_dir`` on the GPU with int8 compute, tokenizes
    the text from ``build_prompt()``, runs one ``generate_batch`` call with
    beam size 1 and ``max_length=512``, prints the decoded result, and then
    drops the generator and tokenizer.
    """
    beam_size_value = 1

    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to the floor of 4 threads instead of raising TypeError.
    cpu_total = os.cpu_count()
    intra_threads = max(cpu_total - 4, 4) if cpu_total else 4

    generator = ctranslate2.Generator(
        model_dir,
        device="cuda",
        compute_type="int8",
        intra_threads=intra_threads,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir, add_prefix_space=None)

    prompt = build_prompt()
    # generate_batch is fed token strings, so the encoded ids are mapped back to tokens.
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

    results_batch = generator.generate_batch(
        [tokens],
        include_prompt_in_result=False,
        max_batch_size=4096,
        batch_type="tokens",
        beam_size=beam_size_value,
        num_hypotheses=1,
        max_length=512,
        sampling_temperature=0.0,
    )

    # One prompt and one hypothesis were requested, so take the first of each.
    output = tokenizer.decode(results_batch[0].sequences_ids[0])
    print("\nGenerated response:\n")
    print(output)

    # Drop the references before asking torch and the garbage collector to release memory.
    del generator
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()


if __name__ == "__main__":
    main()
```
Sample Script #2 (streaming):
```
import sys
import os
# Set before any of the heavier libraries further down are imported.
# NOTE(review): presumably tells the OpenMP runtime to tolerate being loaded
# more than once in the same process - confirm against the OpenMP docs.
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'
from pathlib import Path
def set_cuda_paths():
    """Prepend the pip-installed NVIDIA ``bin`` folders to CUDA_PATH, CUDA_PATH_V12_4 and PATH.

    The folders are located relative to the running interpreter, using the
    hard-coded layout ``<two levels above sys.executable>/Lib/site-packages/nvidia/<package>/bin``.
    Any value already stored in a variable is kept after the new entries.
    """
    nvidia_root = Path(sys.executable).parent.parent / 'Lib' / 'site-packages' / 'nvidia'

    # Order matters for lookup: runtime, cuBLAS, cuDNN, then NVRTC.
    prefix = os.pathsep.join(
        str(nvidia_root / package / 'bin')
        for package in ('cuda_runtime', 'cublas', 'cudnn', 'cuda_nvrtc')
    )

    for name in ('CUDA_PATH', 'CUDA_PATH_V12_4', 'PATH'):
        existing = os.environ.get(name, '')
        if existing:
            os.environ[name] = prefix + os.pathsep + existing
        else:
            # Unset or empty: avoid leaving a trailing separator.
            os.environ[name] = prefix
# Called before ctranslate2 and torch are imported, so the environment
# variables it sets are already in place when those libraries load.
set_cuda_paths()
import ctranslate2
import gc
import torch
from transformers import AutoTokenizer
import pynvml
# Local module that must supply the two strings interpolated by build_prompt().
from constants import user_message, system_message
# NVML is initialised and a handle for device index 0 is obtained; nothing
# else in this script reads `handle` afterwards.
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# Placeholder: replace with the folder the generator and tokenizer are loaded from.
model_dir = r"[PATH TO FOLDER CONTAINING THE MODEL FILES]"
def build_prompt():
    """Return the chat prompt built from ``system_message`` and ``user_message``.

    Each message is wrapped in ``<|im_start|>role`` / ``<|im_end|>`` markers and
    the prompt ends with an opened assistant turn followed by a newline.
    """
    segments = (
        f"<|im_start|>system\n{system_message}<|im_end|>",
        f"<|im_start|>user\n{user_message}<|im_end|>",
        "<|im_start|>assistant\n",
    )
    return "\n".join(segments)
def _emit_word(tokenizer, token_ids):
    """Decode ``token_ids`` as one unit, print the text without a newline, and return it."""
    text = tokenizer.decode(token_ids)
    print(text, end='', flush=True)
    return text


def main():
    """Stream a completion for the prompt to stdout, one buffered word at a time.

    Token ids are collected until a token starting with "Ġ" arrives; the
    collected ids are then decoded together and printed. Streaming stops at
    the tokenizer's EOS id, when the iterator is exhausted, or on Ctrl+C.
    """
    generator = ctranslate2.Generator(
        model_dir,
        device="cuda",
        compute_type="int8",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # generate_tokens is fed token strings, so the encoded ids are mapped back to tokens.
    prompt_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(build_prompt()))

    stream = generator.generate_tokens(
        [prompt_tokens],
        max_length=512,
        sampling_temperature=0.0,
    )

    decoded_output = ""
    pending_ids = []

    try:
        for step in stream:
            current_id = step.token_id
            piece = tokenizer.convert_ids_to_tokens(current_id)

            if current_id == tokenizer.eos_token_id:
                break

            # A "Ġ"-prefixed token opens a new word: flush what was buffered so far.
            if piece.startswith("Ġ") and pending_ids:
                decoded_output += _emit_word(tokenizer, pending_ids)
                pending_ids = []

            pending_ids.append(current_id)

        # Flush whatever is still buffered once the stream has ended.
        if pending_ids:
            decoded_output += _emit_word(tokenizer, pending_ids)
    except KeyboardInterrupt:
        print("\nGeneration interrupted")

    # Drop the references before asking torch and the garbage collector to release memory.
    del generator
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()


if __name__ == "__main__":
    main()
```