hengyu committed on
Commit f030635
1 Parent(s): 17b85b7

update model

Files changed (3)
  1. evaluation.ipynb +56 -12
  2. model.onnx +2 -2
  3. weights.pb +2 -2
evaluation.ipynb CHANGED
@@ -86,14 +86,18 @@
   "    input_ids = pad(input_ids, (0, pad_len), value=1)\n",
   "    ort_inputs = {\n",
   "        'input_ids': input_ids.detach().cpu().numpy(),\n",
-  "        'attention_mask': torch.ones(input_ids.shape).detach().cpu().numpy().astype('int64')\n",
+  "        'attention_mask': torch.cat([torch.ones(input_ids.shape), torch.ones([1, 1])], dim=-1).detach().cpu().numpy().astype('int64')\n",
   "    }\n",
+  "    for i in range(28):\n",
+  "        ort_inputs[\"past_key_values.{}.key\".format(i)] = np.zeros((1,16,1,256), dtype='float32')\n",
+  "        ort_inputs[\"past_key_values.{}.value\".format(i)] = np.zeros((1,16,1,256), dtype='float32')\n",
   "    predictions = session.run(None, ort_inputs)\n",
   "    outputs = torch.from_numpy(predictions[0])\n",
   "    last_token_logits = outputs[:, -2 - pad_len, :]\n",
   "    pred = last_token_logits.argmax(dim=-1)\n",
   "    total += label.size(0)\n",
   "    hit += (pred == label).sum().item()\n",
+  "\n",
   "acc = hit / total\n",
   "print('acc: ', acc)"
   ]
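
Note on the hunk above: the updated model.onnx takes explicit past_key_values.{i}.key / past_key_values.{i}.value inputs, so even a single forward pass must be fed a KV cache. The notebook satisfies this with a one-step dummy cache of zeros per layer plus one extra attention-mask column to cover the dummy position; the (1, 16, 1, 256) shapes over 28 layers are consistent with a GPT-J-6B-style architecture. A minimal sketch of such an input builder, assuming batch size 1 (the helper name and defaults are illustrative, not from the commit):

import numpy as np

def build_ort_inputs(input_ids, num_layers=28, num_heads=16, head_dim=256):
    # input_ids: int64 array of shape (1, seq_len)
    inputs = {
        'input_ids': input_ids,
        # one extra mask column covers the one-step dummy past
        'attention_mask': np.concatenate(
            [np.ones(input_ids.shape, dtype='int64'),
             np.ones((1, 1), dtype='int64')], axis=-1),
    }
    for i in range(num_layers):
        kv_shape = (1, num_heads, 1, head_dim)  # (batch, heads, past_len, head_dim)
        inputs['past_key_values.{}.key'.format(i)] = np.zeros(kv_shape, dtype='float32')
        inputs['past_key_values.{}.value'.format(i)] = np.zeros(kv_shape, dtype='float32')
    return inputs
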
@@ -132,19 +136,59 @@
   "\n",
   "print(\"prompt: \", prompt)\n",
   "\n",
+  "total_time = 0.0\n",
+  "num_iter = 10\n",
+  "num_warmup = 3\n",
+  "\n",
   "# start\n",
-  "input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
-  "for i in range(32):\n",
+  "for idx in range(num_iter):\n",
+  "    text = []\n",
+  "    tic = time.time()\n",
+  "\n",
+  "    input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids\n",
+  "\n",
+  "    attention_mask = torch.ones(input_ids.shape[1] + 1)\n",
+  "    attention_mask[0] = 0\n",
+  "    attention_mask = attention_mask.unsqueeze(0)\n",
+  "\n",
   "    inp = {'input_ids': input_ids.detach().cpu().numpy(),\n",
-  "           'attention_mask': torch.ones(input_ids.shape).detach().cpu().numpy().astype('int64')}\n",
-  "    output = session.run(None, inp)\n",
-  "    logits = output[0]\n",
-  "    logits = torch.from_numpy(logits)\n",
-  "    next_token_logits = logits[:, -1, :]\n",
-  "    probs = torch.nn.functional.softmax(next_token_logits, dim=-1)\n",
-  "    next_tokens = torch.argmax(probs, dim=-1)\n",
-  "    input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)\n",
-  "print(tokenizer.decode(input_ids[0]))"
+  "           'attention_mask': attention_mask.detach().cpu().numpy().astype('int64')}\n",
+  "    for i in range(28):\n",
+  "        inp[\"past_key_values.{}.key\".format(i)] = torch.zeros([1,16,1,256]).detach().cpu().numpy()\n",
+  "        inp[\"past_key_values.{}.value\".format(i)] = torch.zeros([1,16,1,256]).detach().cpu().numpy()\n",
+  "\n",
+  "    for step in range(32):\n",
+  "\n",
+  "        output = session.run(None, inp)\n",
+  "        logits = output[0]\n",
+  "        logits = torch.from_numpy(logits)\n",
+  "        next_token_logits = logits[:, -1, :]\n",
+  "        probs = torch.nn.functional.softmax(next_token_logits, dim=-1)\n",
+  "        next_tokens = torch.argmax(probs, dim=-1)\n",
+  "        present_kv = output[1]\n",
+  "        for i in range(28):\n",
+  "\n",
+  "            if step == 0:\n",
+  "                inp[\"past_key_values.{}.key\".format(i)] = output[2*i+1][:, :, 1:, :]\n",
+  "                inp[\"past_key_values.{}.value\".format(i)] = output[2*i+2][:, :, 1:, :]\n",
+  "            else:\n",
+  "                inp[\"past_key_values.{}.key\".format(i)] = output[2*i+1]\n",
+  "                inp[\"past_key_values.{}.value\".format(i)] = output[2*i+2]\n",
+  "\n",
+  "        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)\n",
+  "        if step == 0:\n",
+  "            attention_mask = torch.cat([attention_mask[:, 1:], torch.ones([1, 1])], dim=-1)\n",
+  "        else:\n",
+  "            attention_mask = torch.cat([attention_mask, torch.ones([1, 1])], dim=-1)\n",
+  "\n",
+  "        inp['attention_mask'] = attention_mask.detach().cpu().numpy().astype('int64')\n",
+  "        inp['input_ids'] = input_ids[:, -1:].detach().cpu().numpy()\n",
+  "\n",
+  "    print(tokenizer.decode(input_ids[0]))\n",
+  "    toc = time.time()\n",
+  "    if idx >= num_warmup:\n",
+  "        total_time += (toc - tic)\n",
+  "print(\"Inference latency: %.3f s.\" % (total_time / (num_iter - num_warmup)))"
   ]
  }
 ],
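
Note on the generation loop above: each session.run returns logits as output[0] followed by per-layer present key/value tensors, read as output[2*i+1] / output[2*i+2]. On the first step the zero-filled dummy cache entry is sliced away ([:, :, 1:, :]) and its masked-out column (attention_mask[0] = 0) is swapped for a real one; on later steps the cache and mask simply grow by one position and only the newest token id is fed back. A sketch of that update as a standalone helper, under the same output-layout assumption and using numpy arrays throughout (the function itself is illustrative, not from the commit):

import numpy as np

def update_cache(inp, output, attention_mask, step, num_layers=28):
    # output layout: [logits, present.0.key, present.0.value, present.1.key, ...]
    for i in range(num_layers):
        key, value = output[2 * i + 1], output[2 * i + 2]
        if step == 0:
            # drop the dummy past position produced by the zero-filled cache
            key, value = key[:, :, 1:, :], value[:, :, 1:, :]
        inp['past_key_values.{}.key'.format(i)] = key
        inp['past_key_values.{}.value'.format(i)] = value
    one = np.ones((1, 1), dtype='int64')
    if step == 0:
        # replace the masked dummy column with a real one for the new token
        attention_mask = np.concatenate([attention_mask[:, 1:], one], axis=-1)
    else:
        attention_mask = np.concatenate([attention_mask, one], axis=-1)
    inp['attention_mask'] = attention_mask
    return attention_mask
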
model.onnx CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9c1e2627bdfc69469e0bb412d24acffd611a686be3fdf788f1c077040f5e0f92
- size 6127447
+ oid sha256:99af1fc6a93e6b02902f3f4c3fe32bf3d7bb4441406bee3bf0cbceaa5b9f64e3
+ size 6332176
weights.pb CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:938fa97d7d469cb2f373c92916a55e5bcfab1cff40bd878f6f789ccae240c655
- size 6790222720
+ oid sha256:e9641d64847996acc53c7093cf4ff9c02443b9c4fd61699cb9ac00b86861c528
+ size 6057661312
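
model.onnx and weights.pb are Git LFS pointer files: only the sha256 oid and the byte size change in this commit, while the actual payloads live in LFS storage (fetched with git lfs pull). A quick integrity check of a downloaded object against its pointer fields, sketched in Python:

import hashlib
import os

def verify_lfs_object(path, expected_oid, expected_size):
    # compare the local file's byte size and sha256 with the pointer's fields
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# the new weights.pb pointer from this commit
print(verify_lfs_object('weights.pb',
                        'e9641d64847996acc53c7093cf4ff9c02443b9c4fd61699cb9ac00b86861c528',
                        6057661312))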