{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Notebook where I identify which token IDs to pass to my completion_only LM training code" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/evan/miniconda3/envs/mlp_39/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"Gryphe/MythoMax-L2-13b\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "### Response (2 paragraphs, engaging, natural, authentic, descriptive, creative):\n", "#### Kurisu:\n", "#### Rintaro: How can I calm down!? Do you know how many times I’ve seen Mayuri die!? Covered in blood! Gasping for breath! The light fading from her eyes! Again... and again... and again... And all I could do... was watch!\n", "\n", "### Response (2 paragraphs, engaging, natural, authentic, descriptive, creative):\n", "#### Kurisu:\n" ] } ], "source": [ "annoying_prepended_stuff = \"\"\"#### Rintaro: How can I calm down!? Do you know how many times I’ve seen Mayuri die!? Covered in blood! Gasping for breath! The light fading from her eyes! Again... and again... and again... And all I could do... was watch!\n", "\n", "\"\"\"\n", "\n", "target_string = \"\"\"### Response (2 paragraphs, engaging, natural, authentic, descriptive, creative):\n", "#### Kurisu:\"\"\"\n", "\n", "# [tokenizer.decode(t) for t in tokenizer.encode(\"### Response\", add_special_tokens=False)]\n", "print(target_string)\n", "print(annoying_prepended_stuff + target_string)\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['###',\n", " 'Response',\n", " '(',\n", " '2',\n", " 'paragraph',\n", " 's',\n", " ',',\n", " 'eng',\n", " 'aging',\n", " ',',\n", " 'natural',\n", " ',',\n", " 'authentic',\n", " ',',\n", " 'descript',\n", " 'ive',\n", " ',',\n", " 'cre',\n", " 'ative',\n", " '):',\n", " '\\n',\n", " '####',\n", " 'Kur',\n", " 'isu',\n", " ':']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[tokenizer.decode(t) for t in tokenizer.encode(target_string, add_special_tokens=False)]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['##',\n", " '#',\n", " 'Response',\n", " '(',\n", " '2',\n", " 'paragraph',\n", " 's',\n", " ',',\n", " 'eng',\n", " 'aging',\n", " ',',\n", " 'natural',\n", " ',',\n", " 'authentic',\n", " ',',\n", " 'descript',\n", " 'ive',\n", " ',',\n", " 'cre',\n", " 'ative',\n", " '):',\n", " '\\n',\n", " '####',\n", " 'Kur',\n", " 'isu',\n", " ':']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[tokenizer.decode(t) for t in tokenizer.encode(annoying_prepended_stuff + target_string, add_special_tokens=False)][65:]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[2277,\n", " 29937,\n", " 13291,\n", " 313,\n", " 29906,\n", " 14880,\n", " 29879,\n", " 29892,\n", " 3033,\n", " 6751,\n", " 29892,\n", " 5613,\n", " 29892,\n", " 15585,\n", " 29892,\n", " 29037,\n", " 573,\n", " 29892,\n", " 907,\n", " 1230,\n", " 1125,\n", " 13,\n", " 4136,\n", " 9742,\n", " 28311,\n", " 29901]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# This is what we actually want to pass to the model \n", "\n", "[t for t in tokenizer.encode(annoying_prepended_stuff + target_string, add_special_tokens=False)][65:]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'## # Response ( 2 paragraph s , eng aging , natural , authentic , descript ive , cre ative ): \\n #### Kur isu :'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.decode(tokenizer.encode(''.join(['##',\n", " '#',\n", " 'Response',\n", " '(',\n", " '2',\n", " 'paragraph',\n", " 's',\n", " ',',\n", " 'eng',\n", " 'aging',\n", " ',',\n", " 'natural',\n", " ',',\n", " 'authentic',\n", " ',',\n", " 'descript',\n", " 'ive',\n", " ',',\n", " 'cre',\n", " 'ative',\n", " '):',\n", " '\\n',\n", " '####',\n", " 'Kur',\n", " 'isu',\n", " ':']), add_special_tokens=False))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "mlp_39", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.17" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }