{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/jamino/xsbpe/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import gradio as gr\n",
"from xsbpe.basic import BasicTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Train a byte-level BPE tokenizer on the Dune text.\n",
"tk = BasicTokenizer()\n",
"\n",
"# Read the corpus with a context manager so the file handle is closed\n",
"# promptly (the original open(...).read() leaked it), and pin the\n",
"# encoding so results don't depend on the platform default.\n",
"with open('dune.txt', encoding='utf-8') as corpus_file:\n",
"    corpus = corpus_file.read()\n",
"\n",
"# Target vocabulary: 256 base byte tokens + 10,000 learned merges.\n",
"tk.train(corpus, 256 + 10000, verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running on local URL: http://127.0.0.1:7896\n",
"Running on public URL: https://cb2bf07164e5cebb6e.gradio.live\n",
"\n",
"This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"https://cb2bf07164e5cebb6e.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import html\n",
"\n",
"def tokenize(text):\n",
"    \"\"\"Encode `text` with the trained BPE tokenizer and render the tokens as color-striped HTML.\n",
"\n",
"    Returns a 4-tuple: (html_markup, token_ids, token_count, char_count).\n",
"    \"\"\"\n",
"    tokens = tk.encode(text)\n",
"\n",
"    colors = ['rgba(107,64,216,.3)', 'rgba(104,222,122,.4)', 'rgba(244,172,54,.4)', 'rgba(239,65,70,.4)', 'rgba(39,181,234,.4)']\n",
"    colored_tokens = []\n",
"\n",
"    for i, token_id in enumerate(tokens):\n",
"        # A single byte-level BPE token can hold a partial UTF-8 sequence,\n",
"        # so a strict decode may raise UnicodeDecodeError; substitute\n",
"        # U+FFFD for undecodable bytes instead of crashing the UI.\n",
"        token_text = tk.vocab[token_id].decode('utf-8', errors='replace')\n",
"        # Escape '<', '>' and '&' so token text cannot break out of (or\n",
"        # inject into) the generated markup, then swap spaces for NBSP so\n",
"        # they stay visible and uncollapsed when rendered as HTML.\n",
"        token_text = html.escape(token_text).replace(' ', '\\u00a0')\n",
"        span = f'<span style=\"background-color: {colors[i % len(colors)]}\">{token_text}</span>'\n",
"        colored_tokens.append(span)\n",
"\n",
"    return '<p style=\"margin-left: 2px; margin-right: 2px; word-wrap: break-word\">' + ''.join(colored_tokens) + '</p>', tokens, len(tokens), len(text)\n",
"\n",
"interface = gr.Interface(\n",
" fn=tokenize, \n",
" inputs=[gr.TextArea(label='Input Text', type='text')], \n",
" outputs=[\n",
" gr.HTML(label='Tokenized Text'),\n",
" gr.Textbox(label='Token IDs', lines=1, max_lines=5),\n",
" gr.Textbox(label='Tokens', max_lines=1),\n",
" gr.Textbox(label='Characters', max_lines=1)\n",
" ],\n",
" title=\"BPE Tokenization Visualizer\",\n",
" live=True,\n",
" examples=[\n",
" 'BPE, or Byte Pair Encoding, is a method used to compress text by breaking it down into smaller units. In natural language processing, it helps tokenize words by merging the most frequent pairs of characters or symbols, creating more efficient and manageable tokens for analysis.'\n",
" ],\n",
" show_progress='hidden',\n",
" api_name='tokenize',\n",
" allow_flagging='never'\n",
").launch(share=True, inbrowser=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}