In [2]:
%load_ext autoreload
%autoreload 2

import json
import os
import pickle
from datetime import datetime

import evaluate
import torch
from tqdm import tqdm

from eval import *
from superposed.llama.metrics import *
from superposed.llama.generation import Llama
from superposed.llama.superposed_generation import SuperposedLlama
from superposed.llama.tokenizer import Tokenizer
from superposed.ngrams.ngram_models import make_models

 from .autonotebook import tqdm as notebook_tqdm
2024-05-30 03:09:58.230601: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-30 03:09:58.280835: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Device handle that is later handed to SuperposedLlama.build(device=...).
sup_device = torch.device("cuda:0")

# Standalone tokenizer used for encoding the prompts and decoding generations.
tokenizer_model_file = '../../7B/tokenizer.model'
tokenizer = Tokenizer(tokenizer_model_file)

In [5]:
# Params
# Read the generation hyperparameters from a JSON file and echo them so the
# exact configuration is recorded in the notebook output.
param_file = "../../params/p15_d3_mixed.json"
with open(param_file, "r") as param_fh:
    params = json.load(param_fh)
print(f"Parameters: {params}")

# Expose the individual settings used by later cells as top-level names.
# A missing key raises KeyError here rather than deep inside generation.
alpha, temp = params["alpha"], params["temp"]
n_drafts, prompt_len = params["n_drafts"], params["prompt_len"]
n_token_sample = params["n_token_sample"]
i_weights, i_length = params["i_weights"], params["i_length"]

Parameters: {'alpha': 0.54, 'temp': 0.06, 'n_drafts': 3, 'prompt_len': 15, 'n_token_sample': 9, 'n_token_consider': 32000, 'mixing_method': 'sample_new_weights_with_score', 'smoothing': 'geom', 'sample_tokens': 0, 'sample_beams': 0, 'i_weights': [0.01, 0.04, 0.15, 0.18, 0.12], 'i_length': [1, 2, 3, 4, 5]}


In [6]:
# Create ngram models
# Bigram through sixgram models are requested; the sevengram model is disabled.
ngram_flags = dict(
    bigram=True,
    trigram=True,
    fourgram=True,
    fivegram=True,
    sixgram=True,
    sevengram=False,
)
ngrams = make_models("../../ckpts-200k", **ngram_flags)

Making bigram...
1310800
Making trigram...
671088728
Making fourgram...
2684354648
Making fivegram...
5368709200
Making sixgram...
5368709200


In [7]:
# Directory passed as ckpt_dir; the tokenizer model file is looked up inside it.
weight_path = "../../7B/"
model = SuperposedLlama.build(
    ckpt_dir=weight_path,
    # weight_path already ends with "/", so f-string concatenation produced
    # "../../7B//tokenizer.model". os.path.join yields the same single-slash
    # path that the standalone Tokenizer cell uses.
    tokenizer_path=os.path.join(weight_path, "tokenizer.model"),
    max_seq_len=100,
    max_batch_size=32,
    device=sup_device,
    model_parallel_size=1,
)

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


 _C._set_default_tensor_type(t)


Loaded in 25.15 seconds
cuda:0


# Inference

In [11]:
def decode(tokenizer, encoding):
    """
    Decode a 1-D tensor of token ids into text, cutting at the first EOS token.

    Args:
        tokenizer (Any): Object exposing an `eos_id` attribute and a
            `decode(list_of_ids)` method.
        encoding (torch.Tensor): Token ids; sliced along its only dimension.
    Returns:
        decoding (str): Result of `tokenizer.decode` on the ids that precede
            the first EOS, or on all ids when no EOS is present.
    """
    eos_locs = (encoding == tokenizer.eos_id).nonzero()
    # The previous check, `len(eos_locs > 0)`, measured the length of a boolean
    # comparison tensor and only worked by coincidence; test the number of EOS
    # hits directly instead.
    if len(eos_locs) > 0:
        # Convert the single-element index tensor to a plain int for slicing.
        first_eos = eos_locs[0].item()
        encoding = encoding[:first_eos]
    return tokenizer.decode(encoding.to(torch.int32).tolist())

In [22]:
# Example prompts; their token ids are fed to model.sup_generate in a later cell.
prompts = [
 "Hi my name is",
 "The Seattle Seahawks were Super Bowl",
 "Penguins are birds native to"
]
# NOTE(review): the two positional flags are presumably bos=True, eos=False
# (as in the LLaMA reference tokenizer) — confirm against Tokenizer.encode.
tokenized_prompts = tokenizer.encode(prompts, True, False)

In [23]:
# Collect the generation arguments in one place, then run superposed decoding.
# Values not taken from the params file (smoothing, max_gen_len, penalty) are
# fixed here.
generation_kwargs = dict(
    prompt_tokens=tokenized_prompts,
    smoothing="geom",
    max_gen_len=10,
    n_token_sample=n_token_sample,
    alpha=alpha,
    temp=temp,
    n_drafts=n_drafts,
    i_weights=i_weights,
    i_length=i_length,
    ngrams=ngrams,
    get_time=False,
    penalty=200,
)
# Only the first element of the returned pair is used; the second is discarded.
alive_gens, _ = model.sup_generate(**generation_kwargs)

In [24]:
# Flatten to one row per (prompt, draft) pair; the remaining dimension is inferred.
n_sequences = len(prompts) * n_drafts
gens = alive_gens[0].reshape(n_sequences, -1)

In [25]:
# Print the decoded text of every row in gens, one per line.
for draft_tokens in gens:
    decoded_text = decode(tokenizer, draft_tokens)
    print(decoded_text)

Hi
my name
is L
inda,
I am
a 
40
year old
woman who
