Conversion of https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct into the `ctranslate2` format using `int8` quantization.

NOTE #1: This requires a version of `ctranslate2` GREATER THAN 4.5.0.

NOTE #2: The sample scripts below require the necessary `CUDA` and `CUDNN` libraries to be installed with `pip`. If you rely on a systemwide installation instead, adjust your code accordingly.

Requirements:

- torch 2.4.0+cu124
- nvidia-cublas-cu12 12.4.2.65
- nvidia-cuda-nvrtc-cu12 12.4.99
- nvidia-cuda-runtime-cu12 12.4.99
- nvidia-cudnn-cu12 9.1.0.70
- numpy==1.26.4 (YOU MUST DOWNGRADE FROM THE NUMPY VERSION THAT CTRANSLATE2 INSTALLS BY DEFAULT)
- All other traditional dependencies like `transformers`, `accelerate`, etc.

Sample Script #1 (non-streaming):

```python
import sys
import os
# Set before the heavy imports below; presumably a workaround for the
# "duplicate OpenMP runtime" abort -- confirm it is still needed on your setup.
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'
from pathlib import Path

def set_cuda_paths():
    """Prepend the pip-installed NVIDIA ``bin`` folders to the CUDA env vars.

    The folders are located relative to the running interpreter
    (``<venv>/Lib/site-packages/nvidia/<package>/bin``), which assumes a
    Windows-style virtual environment layout -- adjust for other layouts.
    ``CUDA_PATH``, ``CUDA_PATH_V12_4`` and ``PATH`` are each rewritten so the
    four folders come first, followed by the variable's previous value (if any).
    """
    venv_base = Path(sys.executable).parent.parent
    nvidia_base_path = venv_base / 'Lib' / 'site-packages' / 'nvidia'
    paths_to_add = [
        str(nvidia_base_path / package / 'bin')
        for package in ('cuda_runtime', 'cublas', 'cudnn', 'cuda_nvrtc')
    ]

    for env_var in ('CUDA_PATH', 'CUDA_PATH_V12_4', 'PATH'):
        current_value = os.environ.get(env_var, '')
        # Only append the previous value when there is one, so the result
        # never ends with a dangling path separator.
        entries = (paths_to_add + [current_value]) if current_value else paths_to_add
        os.environ[env_var] = os.pathsep.join(entries)

# Called before ctranslate2/torch are imported so the environment is already
# updated when those packages load.
set_cuda_paths()

import ctranslate2
import gc
import torch
from transformers import AutoTokenizer
import pynvml
# `constants` is a local module that must provide the two prompt strings.
from constants import user_message, system_message

pynvml.nvmlInit()
# Handle for GPU index 0; not used further in this sample.
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

model_dir = r"[INSERT PATH TO FOLDER CONTAINING THE MODEL FILES HERE]"


def build_prompt():
    """Return the chat prompt built from ``system_message`` and ``user_message``."""
    prompt = f"""<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{user_message}<|im_end|>
<|im_start|>assistant
"""
    return prompt


def main():
    """Load the converted model, generate one response and print it."""
    beam_size_value = 1
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # treat that as 0 so the minimum of 4 threads applies instead of raising
    # a TypeError on `None - 4`.
    intra_threads = max((os.cpu_count() or 0) - 4, 4)

    generator = ctranslate2.Generator(
        model_dir,
        device="cuda",
        compute_type="int8",
        intra_threads=intra_threads
    )

    tokenizer = AutoTokenizer.from_pretrained(model_dir, add_prefix_space=None)

    prompt = build_prompt()

    # generate_batch() is given token strings, so encode the prompt to ids
    # and map the ids back to tokens.
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

    results_batch = generator.generate_batch(
        [tokens],
        include_prompt_in_result=False,
        max_batch_size=4096,
        batch_type="tokens",
        beam_size=beam_size_value,
        num_hypotheses=1,
        max_length=512,
        sampling_temperature=0.0,
    )

    # One prompt and one hypothesis were requested, so take the first of each.
    output = tokenizer.decode(results_batch[0].sequences_ids[0])

    print("\nGenerated response:\n")
    print(output)

    # Drop the model objects, then release cached GPU memory.
    del generator
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()


if __name__ == "__main__":
    main()
```

Sample Script #2 (streaming)

```python
import sys
import os
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'
from pathlib import Path

def set_cuda_paths():
    """Put the pip-installed NVIDIA ``bin`` folders first in the CUDA env vars.

    Folders are resolved relative to the running interpreter as
    ``<venv>/Lib/site-packages/nvidia/<package>/bin`` (this assumes a
    Windows-style virtual environment layout). Each of ``CUDA_PATH``,
    ``CUDA_PATH_V12_4`` and ``PATH`` is set to those folders followed by its
    previous value, when it had one.
    """
    nvidia_root = Path(sys.executable).parent.parent / 'Lib' / 'site-packages' / 'nvidia'
    bin_dirs = [
        str(nvidia_root / package / 'bin')
        for package in ('cuda_runtime', 'cublas', 'cudnn', 'cuda_nvrtc')
    ]

    for name in ('CUDA_PATH', 'CUDA_PATH_V12_4', 'PATH'):
        existing = os.environ.get(name, '')
        if existing:
            entries = bin_dirs + [existing]
        else:
            entries = bin_dirs
        os.environ[name] = os.pathsep.join(entries)

# Runs before ctranslate2/torch are imported so the environment is already
# updated when those packages load.
set_cuda_paths()

import ctranslate2
import gc
import torch
from transformers import AutoTokenizer
import pynvml
# `constants` is a local module that must provide the two prompt strings.
from constants import user_message, system_message

pynvml.nvmlInit()
# Handle for GPU index 0; not used further in this sample.
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

model_dir = r"[PATH TO FOLDER CONTAINING THE MODEL FILES]"


def build_prompt():
    """Return the chat prompt built from ``system_message`` and ``user_message``."""
    return f"""<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{user_message}<|im_end|>
<|im_start|>assistant
"""


def _flush_word(tokenizer, token_ids):
    """Decode ``token_ids``, print the text without a newline and return it."""
    text = tokenizer.decode(token_ids)
    print(text, end='', flush=True)
    return text


def main():
    """Load the converted model and stream the response word by word."""
    generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # The generator is given token strings, so encode the prompt to ids and
    # map the ids back to tokens.
    prompt_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(build_prompt()))

    step_results = generator.generate_tokens(
        [prompt_tokens],
        max_length=512,
        sampling_temperature=0.0
    )

    decoded_output = ""
    pending_ids = []  # ids of the word currently being assembled

    try:
        for step in step_results:
            token_id = step.token_id
            token = tokenizer.convert_ids_to_tokens(token_id)

            if token_id == tokenizer.eos_token_id:
                break

            # A token starting with "Ġ" is treated as the start of a new
            # word: emit whatever was buffered before starting the next one.
            starts_new_word = token.startswith("Ġ")
            if starts_new_word and pending_ids:
                decoded_output += _flush_word(tokenizer, pending_ids)
                pending_ids = []

            pending_ids.append(token_id)

        # Emit the last buffered word once the stream ends.
        if pending_ids:
            decoded_output += _flush_word(tokenizer, pending_ids)

    except KeyboardInterrupt:
        print("\nGeneration interrupted")

    # Drop the model objects, then release cached GPU memory.
    del generator
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()


if __name__ == "__main__":
    main()
```