# ELIA / function_vectors / generate_german_vectors.py
# Page residue from the hosting site (not part of the program):
#   aaron0eidt's picture
#   Deploy static demo
#   5b6c556
import os
import sys
from pathlib import Path
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
# Add root project dir to path
sys.path.append(str(Path(__file__).resolve().parent.parent))
from function_vectors.data.multilingual_function_categories import FUNCTION_CATEGORIES
def _load_model_and_tokenizer(model_path):
    """Load the causal language model and its tokenizer from *model_path*.

    The tokenizer reuses its EOS token as the padding token and pads on the
    left. The model is loaded in float16 and placed by ``device_map="auto"``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        Whatever ``from_pretrained`` raises (e.g. when the checkpoint is
        missing); the caller is responsible for reporting it.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map="auto",
        output_hidden_states=True,
    )
    return model, tokenizer


def _last_token_activation(model, tokenizer, prompt):
    """Return the final-position activation of the last hidden state for *prompt*.

    Args:
        model: Causal LM whose forward pass returns ``hidden_states``.
        tokenizer: Tokenizer matching *model*; expected to pad on the left
            (as configured by ``_load_model_and_tokenizer``).
        prompt: A single prompt string.

    Returns:
        A 1-D ``numpy.float64`` array taken from ``outputs.hidden_states[-1]``
        at the last sequence position of the first (only) batch item.
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Send inputs to wherever the model actually lives. The model is placed by
    # device_map="auto", so a separately guessed device can disagree with it.
    target_device = model.device
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**encoded, output_hidden_states=True)

    # A single prompt is never padded, and the tokenizer pads on the left
    # anyway, so the last real token is always at index -1. (Counting the
    # attention mask would only be right for right-padded batches.)
    final_layer = outputs.hidden_states[-1]
    activation = final_layer[0, -1, :].cpu().numpy()
    return activation.astype(np.float64)


def generate_german_vectors():
    """Generate and save one averaged function vector per category for German prompts.

    For every category in ``FUNCTION_CATEGORIES`` that has prompts under the
    ``'de'`` key, the last-token activation of each prompt is computed and the
    activations are averaged into a single vector. All vectors are written to
    ``data/vectors/de_category_vectors.npz`` next to this file, keyed by
    category.

    The model is read from ``./models/OLMo-2-1124-7B`` relative to the current
    working directory. Failures while loading the model or saving the archive
    are reported on stdout and the function returns ``None`` without raising.
    """
    print("🚀 Starting German function vector generation...")

    # Load the model and tokenizer.
    model_path = "./models/OLMo-2-1124-7B"
    print("🔧 Loading OLMo-2-7B model and tokenizer... (this may take a moment)")
    try:
        model, tokenizer = _load_model_and_tokenizer(model_path)
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        # Report the path that was actually tried (the old hint used a
        # differently-cased directory name).
        print(f"Please ensure the model exists at '{model_path}'")
        return
    print(f"✅ Model loaded successfully on device: {model.device}")

    # Generate vectors for German prompts.
    print("\n🇩🇪 Generating vectors for German prompts...")
    german_category_vectors = {}
    for category_key, data in tqdm(FUNCTION_CATEGORIES.items(), desc="Processing Categories"):
        german_prompts = data.get('de', [])
        if not german_prompts:
            print(f"⚠️ Warning: No German prompts found for category '{category_key}'. Skipping.")
            continue

        # One activation per prompt, averaged into a single category vector.
        activations = np.stack(
            [_last_token_activation(model, tokenizer, p) for p in german_prompts]
        )
        german_category_vectors[category_key] = activations.mean(axis=0)

    # Save the generated vectors.
    if not german_category_vectors:
        print("❌ No vectors were generated. Aborting save.")
        return

    output_dir = Path(__file__).parent / "data" / "vectors"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "de_category_vectors.npz"

    try:
        np.savez_compressed(output_path, **german_category_vectors)
    except Exception as e:
        print(f"❌ Error saving vectors: {e}")
    else:
        print("\n✅ Successfully generated and saved German function vectors to:")
        print(f" {output_path}")
# Script entry point: run the generation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    generate_german_vectors()