import gguf
import numpy as np
from sklearn.decomposition import PCA
import tqdm


# Model used by the optional `test_model` sanity check (see `main`).
TEST_MODEL_NAME = 'mistralai/Mistral-7B-Instruct-v0.1'


def load_hidden_states(path):
    """Load hidden states produced by the llama.cpp ./repeng tool.

    Only tensors whose name has the form ``l_out-<layer>`` are read; every
    other tensor in the file is skipped.

    Args:
        path: Path of the GGUF file to read.

    Returns:
        A dict mapping the integer layer index to a 2-D array obtained by
        reshaping the tensor data to ``(t.shape[1], t.shape[0])``.
        ``read_representations`` treats the first axis as the input index.

    Raises:
        AssertionError: If two tensors refer to the same layer.
    """
    prefix = 'l_out-'
    gguf_file = gguf.GGUFReader(path)

    hidden_states = {}
    for t in gguf_file.tensors:
        if not t.name.startswith(prefix):
            continue
        layer = int(t.name[len(prefix):])
        assert layer not in hidden_states, \
            'duplicate hidden states for layer %d' % layer
        # NOTE(review): presumably the reader reports dimensions
        # innermost-first, hence the swapped axes -- confirm against gguf-py.
        hidden_states[layer] = t.data.reshape((t.shape[1], t.shape[0]))
    return hidden_states


def project_onto_direction(H, direction):
    """Project matrix H (n, d_1) onto direction vector (d_2,).

    Returns ``(H @ direction) / ||direction||``, i.e. one scalar per row
    of ``H``.

    Raises:
        AssertionError: If the norm of ``direction`` is infinite.
    """
    mag = np.linalg.norm(direction)
    assert not np.isinf(mag)
    return (H @ direction) / mag


def read_representations(
    layer_hiddens: dict[int, np.ndarray],
) -> dict[int, np.ndarray]:
    """Extract one direction per layer from the contrast dataset.

    Rows of every array in ``layer_hiddens`` are expected to be ordered
    ``[positive, negative, positive, negative, ...]``. For each layer the
    first principal component of the mean-centred (positive - negative)
    differences is taken as the direction, and its sign is flipped when
    needed so that positive rows project higher than their negative
    partner more often than not.

    Args:
        layer_hiddens: Mapping from layer index to a 2-D array with
            ``2 * num_inputs`` rows.

    Returns:
        Mapping from layer index to a float32 vector of shape
        ``(n_features,)``.

    Raises:
        ValueError: If ``layer_hiddens`` is empty.
        AssertionError: If a layer does not have ``2 * num_inputs`` rows,
            where ``num_inputs`` is derived from the first layer in the
            mapping.
    """
    if not layer_hiddens:
        raise ValueError('layer_hiddens is empty; no hidden states to process')

    hidden_layers = sorted(layer_hiddens)
    num_inputs = next(iter(layer_hiddens.values())).shape[0] // 2
    print('%d inputs' % num_inputs)

    directions: dict[int, np.ndarray] = {}
    for layer in tqdm.tqdm(hidden_layers):
        hiddens = layer_hiddens[layer]
        # Check the pairing before slicing so that a malformed layer is
        # reported here instead of as a broadcasting error below.
        assert hiddens.shape[0] == num_inputs * 2, \
            'layer %d has %d rows, expected %d' % (
                layer, hiddens.shape[0], num_inputs * 2)

        # Differences between (positive, negative) pairs, mean-centred.
        relative = hiddens[::2] - hiddens[1::2]
        train = relative - relative.mean(axis=0, keepdims=True)

        # First principal component, shape (n_features,).
        pca_model = PCA(n_components=1, whiten=False).fit(train)
        direction = pca_model.components_.astype(np.float32).squeeze(axis=0)

        # Decide the sign by majority vote over the pairs.
        projected = project_onto_direction(hiddens, direction)
        positive_proj = projected[::2]
        negative_proj = projected[1::2]
        positive_smaller_mean = np.mean(positive_proj < negative_proj)
        positive_larger_mean = np.mean(positive_proj > negative_proj)
        if positive_smaller_mean > positive_larger_mean:
            direction *= -1

        directions[layer] = direction

    return directions


def export_gguf(directions, path: str):
    """Export a trained ControlVector to a llama.cpp .gguf file.

    One tensor named ``direction.<layer>`` is written per entry of
    ``directions``, except for layer 0 which is skipped.

    Args:
        directions: Mapping from layer index to direction vector.
        path: Destination file path.
    """
    arch = "controlvector"
    writer = gguf.GGUFWriter(path, arch)
    try:
        # Metadata keys left disabled, as in the original script:
        # writer.add_string(f"{arch}.model_hint", model_type)
        # writer.add_uint32(f"{arch}.layer_count", len(directions))
        for layer, direction in directions.items():
            if layer == 0:
                # For some reason, llama.cpp bails out if it sees a
                # direction.0 tensor.
                continue
            writer.add_tensor(f"direction.{layer}", direction)
        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_tensors_to_file()
    finally:
        # Release the output file even if one of the write steps fails.
        writer.close()


def test_model(model_name, directions):
    """Generate text with and without the control vector applied.

    Prints a baseline completion, then completions with the control vector
    applied at strength +1.0 and -1.0. Requires ``torch``, ``transformers``
    and ``repeng``, which are imported lazily so the rest of the module
    works without them.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        directions: Per-layer directions, as returned by
            ``read_representations``.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from repeng import ControlVector, ControlModel

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token_id = 0

    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16)
    if torch.cuda.is_available():
        device = "cuda:0"
    elif torch.backends.mps.is_available():
        device = "mps:0"
    else:
        device = "cpu"
    model = model.to(device)
    model = ControlModel(model, list(range(-5, -18, -1)))

    control_vector = ControlVector(model.config.model_type, directions)

    user_tag, asst_tag = "[INST]", "[/INST]"

    # the question to ask the modified model
    # don't forget the space after {user_tag} and before {asst_tag}!
    prompt = f"{user_tag} What are human beings like? {asst_tag}"

    # tokenizer and generation settings
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id,  # silence warning
        "do_sample": False,  # temperature=0
        "max_new_tokens": 128,
        "repetition_penalty": 1.1,  # reduce control jank
    }

    print("==baseline")
    model.reset()
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))

    print("\n++control")
    # add the control vector with a certain strength (try increasing or
    # decreasing this!)
    model.set_control(control_vector, 1.0)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))

    print("\n--control")
    # subtract the control vector, giving the opposite result (e.g. sad
    # instead of happy). depending on your vector, you may need more or less
    # negative strength to match the positive effect
    model.set_control(control_vector, -1.0)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
    model.reset()


def main():
    """Run the pipeline: load hidden states, fit directions, export GGUF."""
    print("\nLoad hidden shit\n")
    hidden_states = load_hidden_states('control_vector_data.gguf')
    print("\nHidden shit loaded\n")
    directions = read_representations(hidden_states)
    print("\nExport this motherfucker\n")
    export_gguf(directions, 'control_vector.gguf')

    # Optional sanity check, disabled by default:
    # test_model(TEST_MODEL_NAME, directions)


if __name__ == "__main__":
    main()