|
Conversion of https://huggingface.co/Qwen/Qwen2.5-14B-Instruct into the ```ctranslate2``` format using ```int8``` quantization. |
|
|
|
NOTE #1: This requires a version of ```ctranslate2``` GREATER THAN 4.5.0. |
|
|
|
NOTE #2: The sample scripts below assume that the necessary ```CUDA``` and ```cuDNN``` libraries were installed with ```pip``` (see the requirements below). If you rely on a systemwide installation instead, adjust the ```set_cuda_paths()``` function in the scripts accordingly. |
|
|
|
Requirements: |
|
|
|
- torch 2.4.0+cu124 |
|
- nvidia-cublas-cu12 12.4.2.65 |
|
- nvidia-cuda-nvrtc-cu12 12.4.99 |
|
- nvidia-cuda-runtime-cu12 12.4.99 |
|
- nvidia-cudnn-cu12 9.1.0.70 |
|
- numpy==1.26.4 (YOU MUST DOWNGRADE FROM THE NUMPY VERSION THAT CTRANSLATE2 INSTALLS BY DEFAULT) |
|
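- All other usual dependencies such as ```transformers```, ```accelerate```, etc. The sample scripts below additionally import ```pynvml``` and expect a local ```constants``` module that defines ```user_message``` and ```system_message```. |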
|
|
|
<details><summary>Sample Script #1 (non-streaming)</summary> |
|
|
|
```Python |
|
import sys |
|
import os |
|
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE' |
|
from pathlib import Path |
|
|
|
def set_cuda_paths():
    """Prepend the pip-installed NVIDIA library folders to the CUDA environment variables.

    The folders are looked up under ``<venv>/Lib/site-packages/nvidia/<package>/bin``,
    where ``<venv>`` is two levels above the running interpreter
    (``sys.executable``).  NOTE(review): ``Lib/site-packages`` and ``bin`` look
    like the Windows virtual-environment layout -- confirm before using this
    on another platform.

    ``CUDA_PATH``, ``CUDA_PATH_V12_4`` and ``PATH`` are each rewritten so that
    the four folders come first, followed by the variable's previous value
    when it had one.
    """
    nvidia_root = Path(sys.executable).parent.parent / 'Lib' / 'site-packages' / 'nvidia'

    # The tuple order is the order in which the folders end up in each variable.
    library_dirs = [
        str(nvidia_root / package / 'bin')
        for package in ('cuda_runtime', 'cublas', 'cudnn', 'cuda_nvrtc')
    ]

    for name in ('CUDA_PATH', 'CUDA_PATH_V12_4', 'PATH'):
        entries = list(library_dirs)
        previous = os.environ.get(name, '')
        if previous:
            # Keep whatever was already configured, after the new folders.
            entries.append(previous)
        os.environ[name] = os.pathsep.join(entries)
|
|
|
# Extend the CUDA-related environment variables before the imports below run. |
# NOTE(review): presumably this lets ctranslate2/torch find the pip-installed CUDA and cuDNN libraries at import time - confirm. |
set_cuda_paths() |
|
|
|
import ctranslate2 |
|
import gc |
|
import torch |
|
from transformers import AutoTokenizer |
|
import pynvml |
|
# user_message and system_message come from a local constants module that is not part of this listing. |
from constants import user_message, system_message |
|
|
|
# Initialise NVML and fetch a handle for GPU index 0. |
# NOTE(review): `handle` is not referenced anywhere else in this script. |
pynvml.nvmlInit() |
|
handle = pynvml.nvmlDeviceGetHandleByIndex(0) |
|
|
|
# Folder containing the converted model files; replace the placeholder text before running. |
model_dir = r"[INSERT PATH TO FOLDER CONTAINING THE MODEL FILES HERE]" |
|
|
|
def build_prompt():
    """Return the full chat prompt for the model as a single string.

    The prompt has three parts, each opened with ``<|im_start|>``: a system
    turn holding the module-level ``system_message``, a user turn holding the
    module-level ``user_message`` (both closed with ``<|im_end|>``), and an
    opened assistant turn that is left for the model to complete.
    """
    segments = (
        f"<|im_start|>system\n{system_message}<|im_end|>\n",
        f"<|im_start|>user\n{user_message}<|im_end|>\n",
        # Left open on purpose: generation continues from here.
        "<|im_start|>assistant\n",
    )
    return "".join(segments)
|
|
|
def main():
    """Generate one reply for the prompt from ``build_prompt()`` and print it.

    Loads the converted model from the module-level ``model_dir`` on the
    ``cuda`` device with ``int8`` compute, loads the tokenizer from the same
    folder, runs a single ``generate_batch`` call, prints the decoded
    response, and finally drops the model and tokenizer references.
    """
    beam_size_value = 1

    # os.cpu_count() returns None when the CPU count cannot be determined;
    # treat that as 0 so the minimum of 4 threads applies instead of a TypeError.
    cpu_total = os.cpu_count() or 0
    intra_threads = max(cpu_total - 4, 4)

    generator = ctranslate2.Generator(
        model_dir,
        device="cuda",
        compute_type="int8",
        intra_threads=intra_threads,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_dir, add_prefix_space=None)
    prompt = build_prompt()
    # generate_batch is fed token strings here, so encode to ids and map them back to tokens.
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

    results_batch = generator.generate_batch(
        [tokens],
        include_prompt_in_result=False,
        max_batch_size=4096,
        batch_type="tokens",
        beam_size=beam_size_value,
        num_hypotheses=1,
        max_length=512,
        sampling_temperature=0.0,
    )

    # One prompt and one hypothesis were requested, so take the first of each.
    output = tokenizer.decode(results_batch[0].sequences_ids[0])
    print("\nGenerated response:\n")
    print(output)

    # Drop the references before asking torch and the garbage collector to free memory.
    del generator
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()
|
|
|
if __name__ == "__main__": |
|
main() |
|
``` |
|
</details> |
|
|
|
<details><summary>Sample Script #2 (streaming)</summary> |
|
|
|
```Python |
|
import sys |
|
import os |
|
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE' |
|
from pathlib import Path |
|
|
|
def set_cuda_paths():
    """Prepend the pip-installed NVIDIA library folders to the CUDA environment variables.

    The folders are looked up under ``<venv>/Lib/site-packages/nvidia/<package>/bin``,
    where ``<venv>`` is two levels above the running interpreter
    (``sys.executable``).  NOTE(review): ``Lib/site-packages`` and ``bin`` look
    like the Windows virtual-environment layout -- confirm before using this
    on another platform.

    ``CUDA_PATH``, ``CUDA_PATH_V12_4`` and ``PATH`` are each rewritten so that
    the four folders come first, followed by the variable's previous value
    when it had one.
    """
    nvidia_root = Path(sys.executable).parent.parent / 'Lib' / 'site-packages' / 'nvidia'

    # The tuple order is the order in which the folders end up in each variable.
    library_dirs = [
        str(nvidia_root / package / 'bin')
        for package in ('cuda_runtime', 'cublas', 'cudnn', 'cuda_nvrtc')
    ]

    for name in ('CUDA_PATH', 'CUDA_PATH_V12_4', 'PATH'):
        entries = list(library_dirs)
        previous = os.environ.get(name, '')
        if previous:
            # Keep whatever was already configured, after the new folders.
            entries.append(previous)
        os.environ[name] = os.pathsep.join(entries)
|
|
|
# Extend the CUDA-related environment variables before the imports below run. |
# NOTE(review): presumably this lets ctranslate2/torch find the pip-installed CUDA and cuDNN libraries at import time - confirm. |
set_cuda_paths() |
|
|
|
import ctranslate2 |
|
import gc |
|
import torch |
|
from transformers import AutoTokenizer |
|
import pynvml |
|
# user_message and system_message come from a local constants module that is not part of this listing. |
from constants import user_message, system_message |
|
|
|
# Initialise NVML and fetch a handle for GPU index 0. |
# NOTE(review): `handle` is not referenced anywhere else in this script. |
pynvml.nvmlInit() |
|
handle = pynvml.nvmlDeviceGetHandleByIndex(0) |
|
|
|
# Folder containing the converted model files; replace the placeholder text before running. |
model_dir = r"[PATH TO FOLDER CONTAINING THE MODEL FILES]" |
|
|
|
|
|
def build_prompt():
    """Return the full chat prompt for the model as a single string.

    The prompt has three parts, each opened with ``<|im_start|>``: a system
    turn holding the module-level ``system_message``, a user turn holding the
    module-level ``user_message`` (both closed with ``<|im_end|>``), and an
    opened assistant turn that is left for the model to complete.
    """
    segments = (
        f"<|im_start|>system\n{system_message}<|im_end|>\n",
        f"<|im_start|>user\n{user_message}<|im_end|>\n",
        # Left open on purpose: generation continues from here.
        "<|im_start|>assistant\n",
    )
    return "".join(segments)
|
|
|
def main():
    """Stream a reply for the prompt from ``build_prompt()`` to stdout.

    Loads the converted model from the module-level ``model_dir`` on the
    ``cuda`` device with ``int8`` compute and consumes
    ``generator.generate_tokens`` one step at a time.  Token ids are buffered
    and printed as a decoded group each time a token starting with "Ġ"
    arrives; the loop stops at the tokenizer's EOS id.  Ctrl+C ends the
    stream with a short notice.  The model and tokenizer references are
    dropped at the end.
    """
    generator = ctranslate2.Generator(
        model_dir,
        device="cuda",
        compute_type="int8",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    prompt_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(build_prompt()))

    # Step-by-step results for the single prompt.
    stream = generator.generate_tokens(
        [prompt_tokens],
        max_length=512,
        sampling_temperature=0.0,
    )

    printed_chunks = []  # every piece of text already written to stdout
    pending_ids = []     # ids collected since the last flush

    def show(text):
        """Write *text* immediately, without a newline, and remember it."""
        print(text, end='', flush=True)
        printed_chunks.append(text)

    try:
        for step in stream:
            current_id = step.token_id
            piece = tokenizer.convert_ids_to_tokens(current_id)

            if current_id == tokenizer.eos_token_id:
                break

            # A leading "Ġ" is treated as the start of a new word, so whatever
            # was buffered for the previous word is decoded and printed first.
            if piece.startswith("Ġ") and pending_ids:
                show(tokenizer.decode(pending_ids))
                pending_ids = []

            pending_ids.append(current_id)

        # Print whatever is still buffered once the stream has ended.
        if pending_ids:
            show(tokenizer.decode(pending_ids))

    except KeyboardInterrupt:
        print("\nGeneration interrupted")

    # Drop the references before asking torch and the garbage collector to free memory.
    del generator
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()
|
|
|
if __name__ == "__main__": |
|
main() |
|
``` |
|
</details> |