{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import json\n", "import os\n", "import re\n", "from datetime import datetime\n", "\n", "import torch\n", "from datasets import load_dataset\n", "from tqdm import tqdm\n", "\n", "from eval import *\n", "from superposed.llama.metrics import *\n", "from superposed.llama.generation import Llama\n", "from superposed.llama.superposed_generation import SuperposedLlama\n", "from superposed.llama.tokenizer import Tokenizer\n", "from superposed.ngrams.ngram_models import make_models" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "nq = load_dataset(\"nq_open\")[\"validation\"]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parameters: {'alpha': 0.54, 'temp': 0.06, 'n_drafts': 3, 'prompt_len': 15, 'n_token_sample': 9, 'n_token_consider': 32000, 'mixing_method': 'sample_new_weights_with_score', 'smoothing': 'geom', 'sample_tokens': 0, 'sample_beams': 0, 'i_weights': [0.01, 0.04, 0.15, 0.18, 0.12], 'i_length': [1, 2, 3, 4, 5]}\n" ] } ], "source": [ "# Params\n", "param_file = \"../../params/p15_d3_mixed.json\"\n", "with open(param_file, \"r\") as f:\n", " params = json.load(f)\n", " print(f\"Parameters: {params}\")\n", "alpha = params[\"alpha\"]\n", "temp = params[\"temp\"]\n", "n_drafts = params[\"n_drafts\"]\n", "prompt_len = params[\"prompt_len\"]\n", "n_token_sample = params[\"n_token_sample\"]\n", "i_weights = params[\"i_weights\"]\n", "i_length = params[\"i_length\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create Models" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Making bigram...\n", "1310800\n", "Making trigram...\n", "671088728\n", "Making fourgram...\n", "2684354648\n", "Making fivegram...\n", "5368709200\n", "Making sixgram...\n", "5368709200\n" ] } ], "source": [ "ngrams = make_models(\"../../ckpts-200k\", bigram=True, trigram=True, fourgram=True, fivegram=True, sixgram=True, sevengram=False)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "sup_device = torch.device(\"cuda:0\")\n", "reg_device = torch.device(\"cuda:1\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "> initializing model parallel with size 1\n", "> initializing ddp with size 1\n", "> initializing pipeline with size 1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/gscratch/raivn/ethans/miniconda3/envs/llms_12.1/lib/python3.11/site-packages/torch/__init__.py:614: UserWarning: torch.set_default_tensor_type() is deprecated as of PyTorch 2.1, please use torch.set_default_dtype() and torch.set_default_device() as alternatives. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:451.)\n", "  _C._set_default_tensor_type(t)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loaded in 33.68 seconds\n", "cuda:0\n" ] } ], "source": [ "# Load the superposed model\n", "weight_path = \"../../7B/\"\n", "sup_model = SuperposedLlama.build(ckpt_dir=weight_path, \n", "                                  tokenizer_path=f'{weight_path}/tokenizer.model', \n", "                                  max_seq_len=1000, \n", "                                  max_batch_size=16,\n", "                                  device=sup_device,\n", "                                  model_parallel_size=1)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "Loaded in 22.47 seconds\n" ] } ], "source": [ "# Load the regular model\n", "reg_model = Llama.build(ckpt_dir=weight_path, \n", "                        tokenizer_path=f'{weight_path}/tokenizer.model', \n", "                        max_seq_len=1000, \n", "                        max_batch_size=16,\n", "                        device=reg_device,\n", "                        model_parallel_size=1)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "tokenizer = Tokenizer(f\"{weight_path}/tokenizer.model\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Evaluation" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "model_types = [\"greedy\", \"superposed\", \"regular\"]\n", "model_type = model_types[1]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "def evaluate_nq(model_type, question, max_gen_len):\n", "    \"\"\"Generate n_drafts candidate answers for one NQ question with the chosen decoding scheme.\"\"\"\n", "    question = \"Answer these questions:\\n\\nQ: \" + question + \"?\\nA:\"\n", "    text_len = len(question)  # for truncating the prompt from decoded text\n", "    prompt_len = len(tokenizer.encode([question], True, False)[0])  # for the model\n", "    if model_type == \"regular\" or model_type == \"greedy\":\n", "        if model_type == \"regular\":\n", "            # Nucleus sampling: n_drafts independent samples of the same prompt\n", "            prompts = [question for _ in range(n_drafts)]\n", "            sequences, _ = evaluate_nucleus_losses(data=prompts,\n", "                                                   model=reg_model,\n", "                                                   tokenizer=tokenizer,\n", "                                                   prompt_len=prompt_len,\n", "                                                   max_gen_len=max_gen_len,\n", "                                                   temp=0.6,\n", "                                                   bsz=8,\n", "                                                   marker=False)\n", "        else:\n", "            # Greedy decoding: temperature 0 on a single prompt\n", "            sequences, _ = evaluate_nucleus_losses(data=[question],\n", "                                                   model=reg_model,\n", "                                                   tokenizer=tokenizer,\n", "                                                   prompt_len=prompt_len,\n", "                                                   max_gen_len=max_gen_len,\n", "                                                   temp=0,\n", "                                                   bsz=8,\n", "                                                   marker=False)\n", "        n_pd, seq_len = sequences.shape\n", "    elif model_type == \"superposed\":\n", "        sequences, _ = evaluate_mixed_losses(data=[question],\n", "                                             model=sup_model,\n", "                                             tokenizer=tokenizer,\n", "                                             prompt_len=prompt_len,\n", "                                             max_gen_len=max_gen_len,\n", "                                             alpha=alpha,\n", "                                             temp=temp,\n", "                                             n_drafts=n_drafts,\n", "                                             n_token_sample=n_token_sample,\n", "                                             smoothing=None,  # use greedy\n", "                                             bsz=8,\n", "                                             i_weights=i_weights,\n", "                                             i_length=i_length,\n", "                                             ngrams=ngrams,\n", "                                             marker=False)\n", "        n_p, n_d, seq_len = sequences.shape\n", "    # Process results: trim padding (-1), strip the prompt, keep the first clause\n", "    sequences = sequences.reshape(-1, seq_len).tolist()\n", "    for d_idx in range(len(sequences)):\n", "        draft = sequences[d_idx]\n", "        if -1 in draft:\n", "            draft = draft[:draft.index(-1)]\n", "        sequences[d_idx] = draft\n", "    decoded_seq = tokenizer.decode(sequences)\n", "    answers = []\n", "    for s in decoded_seq:\n", "        answers.append(re.split(\"[,.\\\\n]\", s[text_len:].strip())[0])\n", "    return answers" ] },
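{ "cell_type": "markdown", "metadata": {}, "source": [ "Before running the full validation set, it can help to sanity-check `evaluate_nq` on a single question. The question string and `max_gen_len` below are illustrative choices, not values from the evaluation protocol." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sanity check: decode drafts for one hand-picked question\n", "# (the question and max_gen_len are illustrative only)\n", "sample_answers = evaluate_nq(model_type, \"who wrote the declaration of independence\", max_gen_len=8)\n", "for i, ans in enumerate(sample_answers):\n", "    print(f\"Draft {i + 1}: {ans}\")" ] },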
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Run evaluation\n", "predictions = []\n", "print(f\"Precision from 1 to {n_drafts}\")\n", "for sample in tqdm(nq):\n", "    # Adaptively set the generation budget from the shortest gold answer\n", "    shortest = 1000\n", "    for gold in sample[\"answer\"]:\n", "        tmp = tokenizer.encode([gold], False, False)[0]\n", "        shortest = min(shortest, len(tmp))\n", "    question = sample[\"question\"]\n", "    answers = evaluate_nq(model_type, question, max_gen_len=shortest + 3)\n", "    predictions.append({\"question\": question, \"answer\": answers})" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "# Split results into precision buckets: bucket i keeps the first i drafts\n", "precisions = {}\n", "for i in range(1, n_drafts + 1):\n", "    prec = str(i)\n", "    responses = []\n", "    for result in predictions:\n", "        responses.append({\"question\": result[\"question\"], \"answer\": result[\"answer\"][:i]})\n", "    precisions[prec] = responses" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'question': 'when was the last time anyone was on the moon', 'answer': ['2019', '2019', '2019-', '2019-', '1019']}\n", "================\n", "{'question': \"who wrote he ain't heavy he's my brother lyrics\", 'answer': ['The song was written by', 'The lyr was written by', 'The Hol was written by', 'Neil song was written by', 'Neil lyr was written by']}\n", "================\n", "{'question': 'how many seasons of the bastard executioner are there', 'answer': ['1', 'There1', 'there1', '1', 'There1']}\n", "================\n", "{'question': 'when did the eagles win last super bowl', 'answer': ['2018', 'The2018', '1018', '2017', 'the2018']}\n", "================\n", "{'question': \"who won last year's ncaa women's basketball\", 'answer': ['the university of connecticut', 'The university of connecticut', 'university of connecticut', 'the University of connecticut', 'The University of connecticut']}\n", "================\n", "{'question': 'when did the isle of wight become an island', 'answer': ['1207', 'when1207', '1287', '1277', 'when1287']}\n", "================\n", "{'question': 'love yourself by justin bieber is about who', 'answer': ['love yourself by justin b', 'love yourself is justin b', 'Justin yourself by justin b', 'Justin yourself is justin b', 'It yourself by justin b']}\n", "================\n", "{'question': 'who was the ruler of england in 1616', 'answer': ['James I', 'James I of', 'King I', 'j I', 'James I']}\n", "================\n", "{'question': 'what is the hot coffee mod in san andreas', 'answer': ['The Hot Coffee mod is a modification for Grand', 'The Hot Coffee mod is a mod for Grand', 'The hot Coffee mod is a modification for Grand', 'The Hot Coffee mod is a modification that Grand', 'It Hot Coffee mod is a modification for Grand']}\n", "================\n", "{'question': 'what is the maximum data rate for the 802.11a standard select one', 'answer': ['54 Mbps', '54Mbps', '54 mbps', '54 Mbps', '54 Mbps']}\n", "================\n" ] } ], "source": [ "# Print the first ten results\n", "for k in predictions[:10]:\n", "    print(k)\n", "    print(\"================\")" ] },
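{ "cell_type": "markdown", "metadata": {}, "source": [ "The loop above collects predictions but never scores them. As a rough sanity check, the sketch below computes an any-draft exact-match rate against the NQ gold answers using a simple lowercase/punctuation normalization; this is an illustrative metric, not the repository's official evaluation." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import string\n", "\n", "def normalize(text):\n", "    # Lowercase, drop punctuation, and collapse whitespace\n", "    text = text.lower().translate(str.maketrans(\"\", \"\", string.punctuation))\n", "    return \" \".join(text.split())\n", "\n", "# A question counts as correct if any draft matches any gold answer\n", "# after normalization (illustrative any-draft exact match only)\n", "correct = 0\n", "for sample, pred in zip(nq, predictions):\n", "    golds = {normalize(a) for a in sample[\"answer\"]}\n", "    if any(normalize(d) in golds for d in pred[\"answer\"]):\n", "        correct += 1\n", "print(f\"Any-draft exact match: {correct / len(predictions):.3f}\")" ] },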
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Saving" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['1', '2', '3', '4', '5'])\n" ] } ], "source": [ "# Save one JSONL file per precision bucket\n", "os.makedirs(\"../../nq/\", exist_ok=True)\n", "print(precisions.keys())\n", "for prec in range(1, n_drafts + 1):\n", "    out_path = f\"../../nq/eval_{model_type}_{prec}_test.jsonl\"\n", "    with open(out_path, \"w\") as f:\n", "        for obj in precisions[str(prec)]:\n", "            f.write(json.dumps(obj) + \"\\n\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 4 }