Modified example code using GGUF-quantized models with ctransformers on Apple Silicon

#8 · opened by teneriffa
# Because ctransformers' AutoTokenizer.from_pretrained() is not implemented yet
# for GGML/GGUF models, I load the tokenizers with HF transformers instead.
import torch
from transformers import AutoTokenizer

from ctransformers import AutoModelForCausalLM

# Each model path must contain everything from the original repo except the
# original *.bin weight files, because the remaining tokenizer and config files
# are needed to load the tokenizers with HF transformers.
model_path_actor = "/replace_it_with_your_model_path/migtissera_HelixNet/actor"
model_path_critic = "/replace_it_with_your_model_path/migtissera_HelixNet/critic"
model_path_regenerator = "/replace_it_with_your_model_path/migtissera_HelixNet/regenerator"

# Quantized GGUF files must be in each model path.
model_file_actor = "migtissera_HelixNet_actor-Q4_K_M.gguf"
model_file_critic = "migtissera_HelixNet_critic-Q4_K_M.gguf"
model_file_regenerator = "migtissera_HelixNet_regenerator-Q4_K_M.gguf"


def load_model_gguf(model_path, file_path):
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        model_file=file_path,
        model_type="llama",
        local_files_only=True,
        # The chat loop below grows the prompt every turn, so raise
        # context_length if you run into the context window limit.
        context_length=1024,
        # gpu_layers enables Metal acceleration on Apple Silicon; with the
        # Metal backend any value >= 1 offloads the model to the GPU.
        # Comment out or remove if you do not use Metal acceleration.
        gpu_layers=1,
        hf=True
    )
    return model

def load_tokenizer(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    return tokenizer

model_actor = load_model_gguf(model_path_actor, model_file_actor)
model_critic = load_model_gguf(model_path_critic, model_file_critic)
model_regenerator = load_model_gguf(model_path_regenerator, model_file_regenerator)
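# Note (rough estimate, not measured here): each 7B Q4_K_M GGUF file is about
# 4.4 GB, so loading all three models needs on the order of 14 GB of free memory.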

tokenizer_actor = load_tokenizer(model_path_actor)
tokenizer_critic = load_tokenizer(model_path_critic)
tokenizer_regenerator = load_tokenizer(model_path_regenerator)
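# The tokenizers come from the HF tokenizer files kept in each model path; as
# the three models are fine-tunes of the same base model, they should be identical.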

def generate_text(instruction, model, tokenizer):
    tokens = tokenizer.encode(instruction)
    tokens = torch.LongTensor(tokens).unsqueeze(0)

    # Sampling parameters; "generate_len" caps the number of newly generated tokens.
    instance = {
        "input_ids": tokens,
        "top_p": 1.0,
        "temperature": 0.75,
        "generate_len": 1024,
        "top_k": 50,
    }

    length = len(tokens[0])
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=tokens,
            max_length=length + instance["generate_len"],
            use_cache=True,
            do_sample=True,
            top_p=instance["top_p"],
            temperature=instance["temperature"],
            top_k=instance["top_k"],
            num_return_sequences=1,
        )
    # Strip the prompt tokens and decode only the newly generated text.
    output = output_ids[0][length:]
    string = tokenizer.decode(output, skip_special_tokens=True)
    return string
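# Optional smoke test with a hypothetical prompt before starting the chat loop:
#   print(generate_text("SYSTEM: You are HelixNet. \nUSER: Hi \nASSISTANT: ",
#                       model_actor, tokenizer_actor))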

system_prompt = "You are HelixNet. Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation."
  
conversation = f"SYSTEM: {system_prompt}"

while True:
    user_input = input("You: ")

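    # Stage 1: the ACTOR drafts a first answer, conditioned on the running conversation.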
    prompt_actor = f"{conversation} \nUSER: {user_input} \nASSISTANT: "
    actor_response = generate_text(prompt_actor, model_actor, tokenizer_actor)
    print("Generated ACTOR RESPONSE")

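    # Stage 2: the CRITIC reviews the actor's draft and writes a critique.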
    prompt_critic = f"SYSTEM: {system_prompt} \nUSER: {user_input} \nRESPONSE: {actor_response} \nCRITIQUE:"
    critic_response = generate_text(prompt_critic, model_critic, tokenizer_critic)
    print("Generated CRITIQUE")

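    # Stage 3: the REGENERATOR produces the final answer from the draft and the critique.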
    prompt_regenerator = f"SYSTEM: {system_prompt} \nUSER: {user_input} \nRESPONSE: {actor_response} \nCRITIQUE: {critic_response} \nREGENERATOR: REGENERATED ANSWER:"
    regenerator_response = generate_text(prompt_regenerator, model_regenerator, tokenizer_regenerator)
    print("Generated REGENERATION")

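    # Only the regenerated answer is appended to the conversation history.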
    conversation = f"{conversation} \nUSER: {user_input} \nASSISTANT: {regenerator_response}"
    print(conversation)