In [1]:
%load_ext autoreload
%autoreload 2

import copy
import json
import pickle
import os
import random
import re
import string
import math
from datetime import datetime

import evaluate
import torch
import numpy as np
from datasets import load_dataset
from transformers import LlamaTokenizer
from tqdm import tqdm

from eval import *
from superposed.llama.metrics import *
from superposed.llama.generation import Llama
from superposed.llama.superposed_generation import SuperposedLlama
from superposed.llama.tokenizer import Tokenizer
from superposed.ngrams.ngram_models import make_models

 from .autonotebook import tqdm as notebook_tqdm
2024-05-30 01:35:17.813978: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-30 01:35:20.452213: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Setup

In [3]:
# Params
# Hyperparameters are read from a JSON file; the whole dictionary is echoed so the
# configuration of this run is visible in the cell output.
param_file = "../../params/p15_d3_mixed.json"
with open(param_file, "r") as f:
    params = json.load(f)
print(f"Parameters: {params}")

# Pull the entries that later cells reference into notebook-level names.
alpha, temp = params["alpha"], params["temp"]
n_drafts, prompt_len = params["n_drafts"], params["prompt_len"]
n_token_sample = params["n_token_sample"]
i_weights, i_length = params["i_weights"], params["i_length"]

Parameters: {'alpha': 0.54, 'temp': 0.06, 'n_drafts': 3, 'prompt_len': 15, 'n_token_sample': 9, 'n_token_consider': 32000, 'mixing_method': 'sample_new_weights_with_score', 'smoothing': 'geom', 'sample_tokens': 0, 'sample_beams': 0, 'i_weights': [0.01, 0.04, 0.15, 0.18, 0.12], 'i_length': [1, 2, 3, 4, 5]}


In [5]:
# Build the n-gram models from "../../ckpts-200k": bigram through sixgram are
# enabled, the sevengram model is switched off.
ngrams = make_models(
    "../../ckpts-200k",
    bigram=True,
    trigram=True,
    fourgram=True,
    fivegram=True,
    sixgram=True,
    sevengram=False,
)

Making bigram...
1310800
Making trigram...
671088728
Making fourgram...
2684354648
Making fivegram...
5368709200
Making sixgram...
5368709200


In [10]:
# Each model is placed on its own GPU: index 0 is handed to the superposed model,
# index 1 to the regular model.
sup_device = torch.device("cuda", 0)
reg_device = torch.device("cuda", 1)

In [11]:
# Directory used as the checkpoint dir; tokenizer.model is looked up under it.
weight_path = "../../7B/"
# Superposed model, built on sup_device.
sup_model = SuperposedLlama.build(
    ckpt_dir=weight_path,
    tokenizer_path=f"{weight_path}/tokenizer.model",
    device=sup_device,
    max_seq_len=1000,
    max_batch_size=16,
    model_parallel_size=1,
)

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


 _C._set_default_tensor_type(t)


Loaded in 22.07 seconds
cuda:0


In [12]:
# Regular Llama model, built from the same weight directory with the same limits
# as the superposed model but placed on reg_device.
reg_model = Llama.build(
    ckpt_dir=weight_path,
    tokenizer_path=f"{weight_path}/tokenizer.model",
    device=reg_device,
    max_seq_len=1000,
    max_batch_size=16,
    model_parallel_size=1,
)

0
Loaded in 22.76 seconds


In [18]:
# Standalone tokenizer, loaded from the same tokenizer.model path given to the models.
tokenizer = Tokenizer(weight_path + "/tokenizer.model")

# Evaluation

In [13]:
# Load the evaluation questions; the records live under the top-level "Data" key.
trivia_path = "../../../datasets/qa/wikipedia-dev.json"
# Pin the encoding explicitly: without it open() falls back to the platform's
# locale encoding, which can fail on or mis-decode non-ASCII text in the JSON.
with open(trivia_path, "r", encoding="utf-8") as f:
    triviaqa = json.load(f)["Data"]
print(f"Length: {len(triviaqa)}")

Length: 7993


In [14]:
# Make 32-bit float the default floating-point dtype from this point on
# (torch.float is an alias of torch.float32).
torch.set_default_dtype(torch.float)

In [15]:
# Decoding strategies evaluate_trivia knows how to run. The active one is picked
# by name; switch the name to "regular" to evaluate the regular model instead.
model_types = ["superposed", "regular"]
model_type = model_types[model_types.index("superposed")]

In [16]:
# https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/triviaqa/default.yaml
def evaluate_trivia(model_type, question, max_gen_len):
    """Generate short answers for a single trivia question.

    The question is wrapped in a "Question: ... / Answer:" prompt and handed to
    the generation helper that matches `model_type`.

    Args:
        model_type: "regular" to generate with `reg_model` through
            `evaluate_nucleus_losses` (the prompt is repeated `n_drafts` times),
            or "superposed" to generate with `sup_model` through
            `evaluate_mixed_losses`.
        question: Question text without the "Question:" / "Answer:" framing.
        max_gen_len: Forwarded unchanged as `max_gen_len` to the helper.

    Returns:
        A list of strings, one per generated sequence: the decoded text after
        the prompt, stripped, and cut at the first comma, period or newline.

    Raises:
        ValueError: If `model_type` is neither "regular" nor "superposed".
    """
    # Fail fast on an unknown model type instead of hitting an unbound
    # `sequences` variable after the prompt has already been tokenized.
    if model_type not in ("regular", "superposed"):
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'regular' or 'superposed'"
        )

    prompt = "Question: " + question + "\nAnswer:"
    text_len = len(prompt)  # characters; used to cut the prompt off the decoded text
    # Token length of the prompt, passed to the model. Kept under a distinct local
    # name so it is not confused with the notebook-level `prompt_len` parameter.
    n_prompt_tokens = len(tokenizer.encode([prompt], True, False)[0])

    if model_type == "regular":
        # One copy of the prompt per draft so both model types yield n_drafts answers.
        prompts = [prompt] * n_drafts
        sequences, _ = evaluate_nucleus_losses(data=prompts,
                                               model=reg_model,
                                               tokenizer=tokenizer,
                                               prompt_len=n_prompt_tokens,
                                               max_gen_len=max_gen_len,
                                               temp=0.6, # Set to 0 for greedy
                                               bsz=8,
                                               marker=False)
    else:
        sequences, _ = evaluate_mixed_losses(data=[prompt],
                                             model=sup_model,
                                             tokenizer=tokenizer,
                                             prompt_len=n_prompt_tokens,
                                             max_gen_len=max_gen_len,
                                             alpha=alpha,
                                             temp=temp,
                                             n_drafts=n_drafts,
                                             n_token_sample=n_token_sample,
                                             smoothing=None, # greedy
                                             bsz=8,
                                             i_weights=i_weights,
                                             i_length=i_length,
                                             ngrams=ngrams,
                                             marker=False)

    # Process results: flatten every leading dimension so each row is one sequence.
    seq_len = sequences.shape[-1]
    drafts = []
    for draft in sequences.reshape(-1, seq_len).tolist():
        # Drop everything from the first -1 onwards (presumably a padding marker
        # written by the generation helpers -- TODO confirm).
        if -1 in draft:
            draft = draft[:draft.index(-1)]
        drafts.append(draft)
    decoded_seq = tokenizer.decode(drafts)

    # Assumes each decoded string starts with the prompt text, so slicing by the
    # prompt's character length leaves only the generated continuation.
    return [re.split("[,.\n]", s[text_len:].strip())[0] for s in decoded_seq]
 

In [None]:
# Run the selected model over every TriviaQA sample, collecting the question
# text and the list of predicted answers keyed by question id.
questions = {}
predictions = {}
print(f"Precision from 1 to {n_drafts}")
for sample in tqdm(triviaqa):
    # Adaptively select generation length: token length of the longest accepted
    # alias (0 when a sample lists no aliases).
    alias_lengths = [
        len(tokenizer.encode([alias], False, False)[0])
        for alias in sample["Answer"]["Aliases"]
    ]
    longest = max(alias_lengths, default=0)
    # Evaluation code
    # `question_id` rather than `id`, so the builtin id() stays usable in later cells.
    question_id = sample["QuestionId"]
    question = sample["Question"]
    # Allow 3 tokens beyond the longest alias.
    predictions[question_id] = evaluate_trivia(model_type, question, max_gen_len=longest + 3)
    questions[question_id] = question

In [None]:
# Save precisions
# For each k in 1..n_drafts keep only the first k predicted answers of every
# question; the outer dictionary is keyed by str(k).
precisions = {
    str(k): {qid: answers[:k] for qid, answers in predictions.items()}
    for k in range(1, n_drafts + 1)
}

In [None]:
# Print some results
# Show the question and its predicted answers for the first 10 entries only.
for k in list(predictions)[:10]:
    print(questions[k])
    print(predictions[k])
    print("================")

In [None]:
# Save results
# One JSON file per precision level (1..n_drafts), named after the model type.
out_dir = "../nucleus_extra/trivia_extra"
# Create the directory the files are actually written into; previously a
# different directory was created, so open() failed when this one was missing.
os.makedirs(out_dir, exist_ok=True)
for prec in range(1, n_drafts+1):
    out_path = f"{out_dir}/ngram_4trivia_{model_type}_{prec}_4.json"
    with open(out_path, "w") as f:
        json.dump(precisions[str(prec)], f, indent=4)