diff --git "a/space/space/space/notebooks/Softmax_PhoBERT.ipynb" "b/space/space/space/notebooks/Softmax_PhoBERT.ipynb" new file mode 100644--- /dev/null +++ "b/space/space/space/notebooks/Softmax_PhoBERT.ipynb" @@ -0,0 +1,5063 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 191 + }, + "id": "yRrmkevlCjXr", + "outputId": "b0abb114-925d-4ebf-f9ab-1abe0ce61723" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " window._wandbApiKey = new Promise((resolve, reject) => {\n", + " function loadScript(url) {\n", + " return new Promise(function(resolve, reject) {\n", + " let newScript = document.createElement(\"script\");\n", + " newScript.onerror = reject;\n", + " newScript.onload = resolve;\n", + " document.body.appendChild(newScript);\n", + " newScript.src = url;\n", + " });\n", + " }\n", + " loadScript(\"https://cdn.jsdelivr.net/npm/postmate/build/postmate.min.js\").then(() => {\n", + " const iframe = document.createElement('iframe')\n", + " iframe.style.cssText = \"width:0;height:0;border:none\"\n", + " document.body.appendChild(iframe)\n", + " const handshake = new Postmate({\n", + " container: iframe,\n", + " url: 'https://wandb.ai/authorize'\n", + " });\n", + " const timeout = setTimeout(() => reject(\"Couldn't auto authenticate\"), 5000)\n", + " handshake.then(function(child) {\n", + " child.on('authorize', data => {\n", + " clearTimeout(timeout)\n", + " resolve(data)\n", + " });\n", + " });\n", + " })\n", + " });\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", + "wandb: Paste an API key from your profile and hit enter:" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ··········\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: No netrc file found, creating one.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mlenguyenquocanh-vn-fptu\u001b[0m (\u001b[33mlenguyenquocanh-vn-fptu-fpt-university\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 1 + } + ], + "source": [ + "import wandb\n", + "wandb.login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YY74yDYXID_a" + }, + "source": [ + "# Data Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "66m2J73nGXEV", + "outputId": "24173ca4-38fe-4f9e-f9a8-b39bdfdabe72" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n", + "df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"train\"])\n", + "df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n", + "df = pd.concat([df_train, df_valid]).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "U81OmhBeGmMM", + "outputId": "f6c51bb0-3b7b-4029-e1fc-ae8d9a75ef87" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "... ... \n", + "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n", + "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n", + "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n", + "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n", + "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n", + "\n", + " id \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "... ... \n", + "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n", + "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " seg_text \\\n", + "0 Không_khí thật náo_nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n", + "4 Nhật_ký của thuyền_viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n", + "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n", + "\n", + " raw_text \\\n", + "0 Không khí thật náo nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n", + "4 Nhật ký của thuyền viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n", + "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n", + "\n", + " labels \n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] \n", + "... ... \n", + "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n", + "16854 [O, O, O, O, O, O, O, O] \n", + "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "\n", + "[16858 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokensidseg_textraw_textlabels
0[Không_khí, thật, náo_nhiệt, .][0, 0, 0, 0]Không_khí thật náo_nhiệt .Không khí thật náo nhiệt .[O, O, O, O]
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...
4[Nhật_ký, của, thuyền_viên, .][0, 0, 0, 0]Nhật_ký của thuyền_viên .Nhật ký của thuyền viên .[O, O, O, O]
..................
16853[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...
16854[Nhưng, mọi, chuyện, không, dừng, ở, đó, .][0, 0, 0, 0, 0, 0, 0, 0]Nhưng mọi chuyện không dừng ở đó .Nhưng mọi chuyện không dừng ở đó .[O, O, O, O, O, O, O, O]
16855[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...
16856[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...Biết bao người đã tình nguyện hiến dâng cả cuộ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
16857[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Trên đây mới là “ thành_tích ” tiêu tiền của m...Trên đây mới là “ thành tích ” tiêu tiền của m...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
\n", + "

16858 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 16858,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seg_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16787,\n \"samples\": [\n \"T\\u00ednh th\\u00f4ng_minh , l\\u1ea1i t\\u00f2_m\\u00f2 , anh Ki\\u1ec7m b\\u1eaft_\\u0111\\u1ea7u \\u0111i \\u0111\\u1ebfn c\\u00e1c x\\u01b0\\u1edfng c\\u01a1_kh\\u00ed \\u0111\\u1ec3 quan_s\\u00e1t c\\u00e1c lo\\u1ea1i m\\u00e1y_m\\u00f3c , r\\u1ed3i v\\u1ec1 nh\\u00e0 suy_ngh\\u0129 v\\u00e0 c\\u1ea7m b\\u00fat v\\u1ebd ph\\u00e1c_ho\\u1ea1 ra c\\u00e1i m\\u00e1y v\\u00fat g\\u1ea1o .\",\n \"V\\u1eady th\\u00ec , h\\u1ecd c\\u1ea7n ph\\u1ea3i \\u0111\\u01b0\\u1ee3c gi\\u00fap_\\u0111\\u1ee1 , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c s\\u1ed1ng \\u0111\\u00e0ng_ho\\u00e0ng , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c l\\u00e0m ng\\u01b0\\u1eddi d\\u00f9 ch\\u1ec9 l\\u00e0 nh\\u1eefng ng\\u00e0y cu\\u1ed1i_c\\u00f9ng .\",\n \"Nhi\\u1ec1u ng\\u01b0\\u1eddi th\\u00f4ng_d\\u1ecbch c\\u00f9ng th\\u1eddi v\\u1edbi Nguy\\u1ec5n Trung Hi\\u1ebfu c\\u0169ng \\u0111\\u00e3 ch\\u1ebft trong khi th\\u1ef1c_hi\\u1ec7n nhi\\u1ec7m_v\\u1ee5 t\\u1ea1i chi\\u1ebfn_tr\\u01b0\\u1eddng ho\\u1eb7c tr\\u00ean \\u0111\\u01b0\\u1eddng h\\u00e0nh_qu\\u00e2n .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"raw_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16785,\n \"samples\": [\n \"Trong kho\\u1ea3ng th\\u1eddi gian \\u0111\\u00f3 ch\\u1ecb c\\u1ed1 c\\u00f4ng t\\u1ef1 h\\u1ecdc ti\\u1ebfng Anh .\",\n \"Sau \\u0111\\u00f3 , ch\\u00ednh b\\u00e0 Susan \\u0111\\u00e3 \\u0111\\u01b0a Mai l\\u00ean h\\u1ecdc \\u0111\\u1ea1i h\\u1ecdc , m\\u1ed7i n\\u0103m chu c\\u1ea5p cho c\\u00f4 30.000 USD .\",\n \"T\\u1eeb r\\u1ea5t l\\u00e2u r\\u1ed3i t\\u00f4i v\\u1eabn ngh\\u0129 n\\u1ebfu nh\\u01b0 cu\\u1ed1n s\\u00e1ch \\u0111\\u01b0\\u1ee3c xu\\u1ea5t b\\u1ea3n , ho\\u1eb7c ng\\u01b0\\u1eddi ta l\\u00e0m phim v\\u1ec1 n\\u00f3 th\\u00ec t\\u00f4i s\\u1ebd d\\u00f9ng s\\u1ed1 ti\\u1ec1n b\\u00e1n s\\u00e1ch \\u0111\\u1ec3 thi\\u1ebft l\\u1eadp m\\u1ed9t s\\u1ed1 gi\\u01b0\\u1eddng b\\u1ec7nh t\\u1ea1i H\\u00e0 N\\u1ed9i .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "# Tạo thêm các cột khác\n", + "def join_tokens(tokens):\n", + " text = ' '.join(tokens)\n", + " return text\n", + "\n", + "def reform_raw_text(tokens):\n", + " text = ' '.join(tokens)\n", + " return text.replace(\"_\", \" \")\n", + "\n", + "def label(x):\n", + " return [id_tag[int(i)] for i in x]\n", + "\n", + "def replace_7_8(lst):\n", + " return [0 if x in (7, 8) else x for x in lst]\n", + "\n", + "\n", + "tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}\n", + "id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}\n", + "\n", + "\n", + "df['ner_tags'] = df['ner_tags'].apply(replace_7_8)\n", + "df['text_withseg'] = df['tokens'].apply(join_tokens)\n", + "df['text_raw'] = df['tokens'].apply(reform_raw_text)\n", + "df[\"ner_labels\"] = df.ner_tags.apply(label)\n", + "df.columns = ['tokens', 'id', 'seg_text', 'raw_text', 'labels']\n", + "df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Sgpm-btsXxzt", + "outputId": "4275e090-0bb5-47a2-9b51-682d13bd7e45" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Chị Lãnh và Xăng ra đi , mình đứng nhìn hai chị quần xắn tròn trên vế , lặn_lội qua dòng suối nước chảy rần_rần , tự_nhiên nước_mắt mình rưng_rưng ...\n" + ] + } + ], + "source": [ + "print(df['seg_text'][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "I_VaVTLfXxzu", + "outputId": "f5a568e0-0235-40b5-9a87-9df21b39af44" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Chị Lãnh và Xăng ra đi , mình đứng nhìn hai chị quần xắn tròn trên vế , lặn lội qua dòng suối nước chảy rần rần , tự nhiên nước mắt mình rưng rưng ...\n" + ] + } + ], + "source": [ + "print(df['raw_text'][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Zum7uCLSXxzu", + "outputId": "44b2d50f-fec1-42c5-fd52-854d510ba13d" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[np.int64(0), np.int64(1), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0)]\n" + ] + } + ], + "source": [ + "print(df['id'][1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ooewb479FdqS" + }, + "source": [ + "# Get Embedding Vectors" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 889, + "referenced_widgets": [ + "1d9aaa035056485c959f66b60cf41714", + "71054a4397e344a2a66e32892a37b59b", + "6c582399be1d42c3b5a4ef21743d1a26", + "26a6ef7f967a4504a698ff3152ccb24e", + "82669dd27686486588fdd7d11f49edd3", + "461c136b3eac4a9dadb8a3af7c11c98a", + "4fb68dcea1ac4e54b82c92ad64e9be95", + "df397db853874f6db911acb667785ec0", + "2de5befe0ab24de9a62ba076e5abf78e", + "d7da38e7c5e1484597bb1faae3c2d7f5", + "2ec3573cb04143a8ba5e555bfaf60165", + "67c557f2651b4e5c81e9af82531898fe", + "5a5a04069ebd41fdba7835e1b5da585e", + "6c5b30dc67d94071af4d4b14cf4be7e4", + "e9200c8269fa4ebd9c1157cc1b871005", + "50ab44e431a54c27b409dc74c068c392", + "9929425b4df94d3792dc454afe59b3fd", + "03efbd25cd4341cea6714ffc3585632a", + "ae22d9a4de574ce3905a6f6c82fac1aa", + "71fe669e2f68444ba4b81ffc14a39c03", + "4d4c131206f448c7ba6d5c4e41126d41", + "ec260d8279bd4a40ad2bbc3c1d7dadaa", + "08611ca37f8c462db079dc2883f06002", + "47eca14208cc4e5085d44cce42872a35", + "49d3e88f60e744e7b37bcbcca5bbb087", + "81b9bb3315e4402ea5b6768d0d189591", + "4b88329de5ed48738ca0da054a1f0131", + "1507e771a5ca4056b0605cd453d89c60", + "44946e0a5d31408a851b0e8ab5217c43", + "ee8c999b66e84cbda17702e916d48a3f", + "6c6228d416944599b110ffa97b20bd8c", + "c519c27334b742ee8f14e29da2ebdf9b", + "7d422740d70546559703cd0304be663f", + "371ab2b9d7c84402b3c4b934e89eca4b", + "7ead683f167c408f88ba72b2ee1599d1", + "737d3d1e5a3146de96c17ca8ec72d75e", + "7ec9beb535c9428a954367613fa7f4cd", + "3c51b3c0b7ba4a2eacb1b8b2be8e024c", + "50dc5fc6a7354394ace536241fa01714", + "78ba04a44d9e42dba9fb7617d28c91e9", + "ca87704af3bc4c7590eea8f8f0f50d94", + "f3870bce67da4affa8925d9d898638da", + "e6efac23366643dd861caf121a8a220b", + "1044cf40d7e54337859bab0057aa0b54", + "3a6bb86ac3db4f82ae139507f94607e0", + "04a96e9ef4774bd1a3cd3a1dd20fe194", + "6158b1f05f6c4851b492df312f0312ca", + "b81c4146511045e280ba4fa226074679", + "3676c8217c654790972189be8c1f4627", + "762b2c8d23824070b2eb115e151f0c73", + "d9d57b09b01846a4805882b4adf64e55", + "e804cb9279dd458aa8b661d28c4427ff", + "3169169dbabb4b1aa7906a0415eacdcb", + "408130d71bb74141906cbc1d2123bb63", + "4be265bd67e1470cbd856dd268908c00" + ] + }, + "id": "b04c2Xq7IBac", + "outputId": "b09c7e51-baad-4dde-fea2-9d380a5988d9" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/557 [00:00\", \"\"]:\n", + " continue\n", + " #Token là subword (có đuôi \"@@\")\n", + " if token.endswith(\"@@\"):\n", + " current_vecs.append(emb)\n", + " else: #Token là phần cuối của một từ (không có \"@@\")\n", + " current_vecs.append(emb)\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + " current_vecs = []\n", + "\n", + " if current_vecs: # Trong trường hợp sót lại cuối câu\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + "\n", + " return word_embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FSAhQKN1Xxzw", + "outputId": "ddd0edd3-145e-4966-b78c-2f66f83bfd14" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Không khí thật náo nhiệt .\n" + ] + } + ], + "source": [ + "raw_e = df['raw_text'][0]\n", + "print(raw_e)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OJ7ifS6wXxzw", + "outputId": "4908dbef-b495-4a17-e2f6-19a6b2b85eb3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Số lượng từ trong câu là: 31\n", + "[0, 1108, 19703, 6, 28163, 40, 57, 4, 68, 414, 364, 82, 213, 2747, 20899, 2533, 34, 23798, 4, 13468, 89, 532, 3364, 58, 2181, 33151, 4, 1124, 2396, 68, 17865, 135, 2]\n", + "31\n", + "độ dài của tokens 33\n", + "\n" + ] + } + ], + "source": [ + "sentence_e = 'Chị Lãnh và Xăng ra đi , mình đứng nhìn hai chị quần xắn tròn trên vế , lặn_lội qua dòng suối nước chảy rần_rần , tự_nhiên nước_mắt mình rưng_rưng ...'\n", + "id_e = [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "def dem_so_tu(cau):\n", + " # Tách câu thành các từ bằng khoảng trắng\n", + " tu_danh_sach = cau.split()\n", + " # Đếm số lượng từ\n", + " return len(tu_danh_sach)\n", + "\n", + "# Ví dụ sử dụng\n", + "sentence_e = 'Chị Lãnh và Xăng ra đi , mình đứng nhìn hai chị quần xắn tròn trên vế , lặn_lội qua dòng suối nước chảy rần_rần , tự_nhiên nước_mắt mình rưng_rưng ...'\n", + "so_tu = dem_so_tu(sentence_e)\n", + "print(\"Số lượng từ trong câu là:\", so_tu)\n", + "input_e = tokenizer.encode(sentence_e)\n", + "tokens_e = tokenizer.convert_ids_to_tokens(input_e[0])\n", + "print(input_e)\n", + "print(len(id_e))\n", + "print('độ dài của tokens',len(input_e))\n", + "print(tokens_e)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 830, + "referenced_widgets": [ + "fabfacd2a2964d23994070bcb6bc4b3c", + "09db5366452347dfa40bbd192d22d489", + "706046f4266a4ade9f5f1718fc0bbf4f", + "d39e9db3bbf84736b090e12ecf9bd1fd", + "654a5b527d0c4c51afce2e65fb3b36aa", + "fa1045a3003d4496a2a5c3055355120d", + "f00793fc0f1948fca2fda701eb461505", + "9994f44df6a0451792559446361557e9", + "cc2a9437c039472f8447f9d0194459dc", + "2c64cf42d35a4722a3b32b366d1dcd1b", + "a9886da1334d4df0b4c6536255df6420" + ] + }, + "id": "3wpjBGK3JuwS", + "outputId": "ec11cd7f-84ca-402d-c7c0-b86db3ea555c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + " 0%| | 18/16858 [00:00<08:06, 34.64it/s]" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "model.safetensors: 0%| | 0.00/543M [00:00 Train Loss: {total_train_loss/len(train_loader):.4f} | \"\n", + "# f\"Test Loss: {avg_test_loss:.4f} | test_f1: {f1:.4f} | test_acc: {accuracy:.4f}\")\n", + "\n", + "\n", + "# print(\"\\n--- Training Finished ---\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "xnPOKnLBJF7A" + }, + "outputs": [], + "source": [ + "# # --- 5. Final Evaluation Report ---\n", + "# print(\"\\nFinal Test Set Performance:\")\n", + "# model.eval()\n", + "# all_preds_final, all_true_final = [], []\n", + "# with torch.no_grad():\n", + "# for x, y, lengths in tqdm(test_loader, desc=\"Generating Final Report\"):\n", + "# x, y = x.to(device), y.to(device)\n", + "# preds = torch.argmax(model(x), dim=2)\n", + "# for i in range(len(lengths)):\n", + "# true_len = lengths[i]\n", + "# all_true_final.extend(y[i, :true_len].cpu().numpy())\n", + "# all_preds_final.extend(preds[i, :true_len].cpu().numpy())\n", + "\n", + "# # Generate and print the classification report\n", + "# target_names = [id_tag[i] for i in range(NUM_TAGS)]\n", + "# report = classification_report(all_true_final, all_preds_final, target_names=target_names, digits=4)\n", + "# print(\"\\nClassification Report:\\n\", report)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "CM4FYvo4IL3e", + "outputId": "a86b87f7-0f74-4c96-dd9a-9450ca3b905b" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Tracking run with wandb version 0.19.11" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Run data is saved locally in /content/wandb/run-20250610_121142-lmnb07kv" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Syncing run Softmax_VLSP2016 to Weights & Biases (docs)
" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View project at https://wandb.ai/lenguyenquocanh-vn-fptu-fpt-university/NER" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run at https://wandb.ai/lenguyenquocanh-vn-fptu-fpt-university/NER/runs/lmnb07kv" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Starting Softmax Model Training...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 1/20: 100%|██████████| 841/841 [00:05<00:00, 149.09it/s, loss=0.122]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_1.pt\n", + "Epoch 1/20 -> Train Loss: 0.1224 | Val Loss: 0.0489 | Val F1: 0.7949 | Val Acc: 0.9847\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 2/20: 100%|██████████| 841/841 [00:04<00:00, 173.24it/s, loss=0.0419]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_2.pt\n", + "Epoch 2/20 -> Train Loss: 0.0419 | Val Loss: 0.0359 | Val F1: 0.8518 | Val Acc: 0.9885\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 3/20: 100%|██████████| 841/841 [00:05<00:00, 159.28it/s, loss=0.0338]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_3.pt\n", + "Epoch 3/20 -> Train Loss: 0.0338 | Val Loss: 0.0314 | Val F1: 0.8666 | Val Acc: 0.9898\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 4/20: 100%|██████████| 841/841 [00:04<00:00, 174.77it/s, loss=0.03]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_4.pt\n", + "Epoch 4/20 -> Train Loss: 0.0300 | Val Loss: 0.0295 | Val F1: 0.8674 | Val Acc: 0.9899\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 5/20: 100%|██████████| 841/841 [00:05<00:00, 157.59it/s, loss=0.0279]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_5.pt\n", + "Epoch 5/20 -> Train Loss: 0.0279 | Val Loss: 0.0281 | Val F1: 0.8838 | Val Acc: 0.9908\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 6/20: 100%|██████████| 841/841 [00:04<00:00, 172.50it/s, loss=0.026]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_6.pt\n", + "Epoch 6/20 -> Train Loss: 0.0260 | Val Loss: 0.0268 | Val F1: 0.8838 | Val Acc: 0.9909\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 7/20: 100%|██████████| 841/841 [00:05<00:00, 156.51it/s, loss=0.0249]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_7.pt\n", + "Epoch 7/20 -> Train Loss: 0.0249 | Val Loss: 0.0262 | Val F1: 0.8855 | Val Acc: 0.9910\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 8/20: 100%|██████████| 841/841 [00:04<00:00, 173.05it/s, loss=0.0238]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_8.pt\n", + "Epoch 8/20 -> Train Loss: 0.0238 | Val Loss: 0.0258 | Val F1: 0.8849 | Val Acc: 0.9912\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 9/20: 100%|██████████| 841/841 [00:05<00:00, 158.86it/s, loss=0.0228]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 9/20 -> Train Loss: 0.0228 | Val Loss: 0.0256 | Val F1: 0.8850 | Val Acc: 0.9912\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 10/20: 100%|██████████| 841/841 [00:04<00:00, 170.77it/s, loss=0.0224]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_10.pt\n", + "Epoch 10/20 -> Train Loss: 0.0224 | Val Loss: 0.0254 | Val F1: 0.8866 | Val Acc: 0.9914\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 11/20: 100%|██████████| 841/841 [00:05<00:00, 163.16it/s, loss=0.0218]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_11.pt\n", + "Epoch 11/20 -> Train Loss: 0.0218 | Val Loss: 0.0249 | Val F1: 0.8908 | Val Acc: 0.9916\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 12/20: 100%|██████████| 841/841 [00:04<00:00, 170.64it/s, loss=0.021]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 12/20 -> Train Loss: 0.0210 | Val Loss: 0.0252 | Val F1: 0.8885 | Val Acc: 0.9914\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 13/20: 100%|██████████| 841/841 [00:05<00:00, 161.40it/s, loss=0.0209]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 13/20 -> Train Loss: 0.0209 | Val Loss: 0.0250 | Val F1: 0.8902 | Val Acc: 0.9915\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 14/20: 100%|██████████| 841/841 [00:04<00:00, 170.76it/s, loss=0.0203]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 14/20 -> Train Loss: 0.0203 | Val Loss: 0.0251 | Val F1: 0.8895 | Val Acc: 0.9915\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 15/20: 100%|██████████| 841/841 [00:05<00:00, 162.77it/s, loss=0.0199]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 15/20 -> Train Loss: 0.0199 | Val Loss: 0.0250 | Val F1: 0.8868 | Val Acc: 0.9913\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 16/20: 100%|██████████| 841/841 [00:04<00:00, 171.25it/s, loss=0.0197]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 16/20 -> Train Loss: 0.0197 | Val Loss: 0.0253 | Val F1: 0.8888 | Val Acc: 0.9912\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 17/20: 100%|██████████| 841/841 [00:05<00:00, 160.48it/s, loss=0.0195]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 17/20 -> Train Loss: 0.0195 | Val Loss: 0.0250 | Val F1: 0.8900 | Val Acc: 0.9915\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 18/20: 100%|██████████| 841/841 [00:04<00:00, 168.69it/s, loss=0.0192]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 18/20 -> Train Loss: 0.0192 | Val Loss: 0.0250 | Val F1: 0.8893 | Val Acc: 0.9914\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 19/20: 100%|██████████| 841/841 [00:05<00:00, 163.39it/s, loss=0.0188]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_19.pt\n", + "Epoch 19/20 -> Train Loss: 0.0188 | Val Loss: 0.0253 | Val F1: 0.8926 | Val Acc: 0.9915\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 20/20: 100%|██████████| 841/841 [00:04<00:00, 168.43it/s, loss=0.0188]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_20.pt\n", + "Epoch 20/20 -> Train Loss: 0.0188 | Val Loss: 0.0249 | Val F1: 0.8936 | Val Acc: 0.9918\n", + "\n", + "--- Training Finished ---\n", + "\n", + "Final Test Set Performance:\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " O 0.9973 0.9973 0.9973 68476\n", + " B-PER 0.9869 0.9768 0.9818 1464\n", + " I-PER 0.9810 0.9767 0.9788 686\n", + " B-ORG 0.7709 0.8249 0.7970 257\n", + " I-ORG 0.7981 0.7721 0.7849 430\n", + " B-LOC 0.8809 0.9001 0.8904 1241\n", + " I-LOC 0.8339 0.8159 0.8248 554\n", + "\n", + " accuracy 0.9918 73108\n", + " macro avg 0.8927 0.8948 0.8936 73108\n", + "weighted avg 0.9918 0.9918 0.9918 73108\n", + "\n" + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "import torch.nn as nn\n", + "from torch.utils.data import Dataset, DataLoader\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score\n", + "from tqdm import tqdm\n", + "import wandb\n", + "\n", + "# Create checkpoint directory\n", + "os.makedirs(\"checkpoints\", exist_ok=True)\n", + "\n", + "# Initialize Weights & Biases\n", + "wandb.init(\n", + " project=\"NER\",\n", + " name=\"Softmax_VLSP2016\",\n", + " config={\n", + " \"epochs\": 20,\n", + " \"batch_size\": 16,\n", + " \"learning_rate\": 1e-3,\n", + " \"input_dim\": 768,\n", + " \"test_size\": 0.2\n", + " }\n", + ")\n", + "\n", + "# --- Dataset ---\n", + "class NERDataset(Dataset):\n", + " def __init__(self, embeddings, labels):\n", + " self.embeddings = embeddings\n", + " self.labels = labels\n", + "\n", + " def __len__(self):\n", + " return len(self.embeddings)\n", + "\n", + " def __getitem__(self, idx):\n", + " return self.embeddings[idx], self.labels[idx]\n", + "\n", + "def collate_fn(batch):\n", + " embeddings, labels = zip(*batch)\n", + " lengths = [len(x) for x in embeddings]\n", + " max_len = max(lengths)\n", + "\n", + " padded_embs = torch.stack([\n", + " torch.cat([e, torch.zeros(max_len - len(e), e.size(1))]) for e in embeddings\n", + " ])\n", + " padded_labels = torch.stack([\n", + " torch.cat([l, torch.full((max_len - len(l),), -1)]) for l in labels\n", + " ])\n", + " return padded_embs, padded_labels, lengths\n", + "\n", + "# --- Model ---\n", + "class SoftmaxTagger(nn.Module):\n", + " def __init__(self, input_dim, num_tags):\n", + " super().__init__()\n", + " self.hidden2tag = nn.Linear(input_dim, num_tags)\n", + "\n", + " def forward(self, x):\n", + " return self.hidden2tag(x)\n", + "\n", + " def save_model(self, path):\n", + " torch.save(self.state_dict(), path)\n", + "\n", + " def load_model(self, path):\n", + " self.load_state_dict(torch.load(path))\n", + " self.eval()\n", + "\n", + "# --- Evaluation ---\n", + "def evaluate_softmax(model, dataloader, loss_fn, device):\n", + " model.eval()\n", + " total_loss = 0\n", + " all_preds, all_true = [], []\n", + "\n", + " with torch.no_grad():\n", + " for x, y, lengths in dataloader:\n", + " x, y = x.to(device), y.to(device)\n", + " emissions = model(x)\n", + " loss = loss_fn(emissions.view(-1, model.hidden2tag.out_features), y.view(-1))\n", + " total_loss += loss.item()\n", + " preds = torch.argmax(emissions, dim=2)\n", + " for i in range(len(lengths)):\n", + " true_len = lengths[i]\n", + " all_true.extend(y[i, :true_len].cpu().numpy())\n", + " all_preds.extend(preds[i, :true_len].cpu().numpy())\n", + "\n", + " precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='macro', zero_division=0)\n", + " accuracy = accuracy_score(all_true, all_preds)\n", + "\n", + " return total_loss / len(dataloader), precision, recall, f1, accuracy, all_preds, all_true\n", + "\n", + "# Train/test split\n", + "train_embs, test_embs, train_labels, test_labels = train_test_split(\n", + " all_embeddings, all_labels, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "train_dataset = NERDataset(train_embs, train_labels)\n", + "test_dataset = NERDataset(test_embs, test_labels)\n", + "\n", + "BATCH_SIZE = wandb.config.batch_size\n", + "train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)\n", + "test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)\n", + "\n", + "INPUT_DIM = wandb.config.input_dim\n", + "NUM_TAGS = max(label.max().item() for label in all_labels) + 1\n", + "LEARNING_RATE = wandb.config.learning_rate\n", + "EPOCHS = wandb.config.epochs\n", + "\n", + "model = SoftmaxTagger(INPUT_DIM, NUM_TAGS).to(device)\n", + "loss_fn = nn.CrossEntropyLoss(ignore_index=-1)\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)\n", + "\n", + "best_f1 = 0\n", + "best_acc = 0\n", + "\n", + "# --- Training Loop ---\n", + "print(\"Starting Softmax Model Training...\")\n", + "for epoch in range(1, EPOCHS + 1):\n", + " model.train()\n", + " total_train_loss = 0\n", + " all_train_preds, all_train_true = [], []\n", + "\n", + " train_bar = tqdm(train_loader, desc=f\"Epoch {epoch}/{EPOCHS}\")\n", + " for x, y, lengths in train_bar:\n", + " x, y = x.to(device), y.to(device)\n", + " emissions = model(x)\n", + " loss = loss_fn(emissions.view(-1, NUM_TAGS), y.view(-1))\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " total_train_loss += loss.item()\n", + " train_bar.set_postfix(loss=total_train_loss / len(train_bar))\n", + "\n", + " preds = torch.argmax(emissions, dim=2)\n", + " for i in range(len(lengths)):\n", + " true_len = lengths[i]\n", + " all_train_true.extend(y[i, :true_len].cpu().numpy())\n", + " all_train_preds.extend(preds[i, :true_len].cpu().numpy())\n", + "\n", + " train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(\n", + " all_train_true, all_train_preds, average='macro', zero_division=0\n", + " )\n", + " train_acc = accuracy_score(all_train_true, all_train_preds)\n", + "\n", + " # Validation\n", + " val_loss, val_precision, val_recall, val_f1, val_acc, _, _ = evaluate_softmax(model, test_loader, loss_fn, device)\n", + "\n", + " # Logging to wandb\n", + " wandb.log({\n", + " \"epoch\": epoch,\n", + " \"avg_train_loss\": total_train_loss / len(train_loader),\n", + " \"train_precision\": train_precision,\n", + " \"train_recall\": train_recall,\n", + " \"train_f1\": train_f1,\n", + " \"train_acc\": train_acc,\n", + " \"val_loss\": val_loss,\n", + " \"val_precision\": val_precision,\n", + " \"val_recall\": val_recall,\n", + " \"val_f1\": val_f1,\n", + " \"val_acc\": val_acc,\n", + " })\n", + "\n", + " # Save best model\n", + " if val_f1 > best_f1 or val_acc > best_acc:\n", + " best_f1 = max(val_f1, best_f1)\n", + " best_acc = max(val_acc, best_acc)\n", + " ckpt_path = f\"checkpoints/best_epoch_{epoch}.pt\"\n", + " model.save_model(ckpt_path)\n", + " wandb.save(ckpt_path)\n", + " print(f\"Saved improved model to {ckpt_path}\")\n", + "\n", + " print(f\"Epoch {epoch}/{EPOCHS} -> Train Loss: {total_train_loss/len(train_loader):.4f} | \"\n", + " f\"Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f} | Val Acc: {val_acc:.4f}\")\n", + "\n", + "print(\"\\n--- Training Finished ---\")\n", + "\n", + "# --- Final Evaluation Report ---\n", + "print(\"\\nFinal Test Set Performance:\")\n", + "model.eval()\n", + "_, _, _, _, _, all_preds_final, all_true_final = evaluate_softmax(model, test_loader, loss_fn, device)\n", + "\n", + "# Classification report table\n", + "target_names = [id_tag[i] for i in range(NUM_TAGS)]\n", + "report = classification_report(all_true_final, all_preds_final, target_names=target_names, digits=4, output_dict=True)\n", + "\n", + "# Log report as wandb table\n", + "table = wandb.Table(columns=[\"Label\", \"Precision\", \"Recall\", \"F1-score\", \"Support\"])\n", + "for label in target_names:\n", + " row = report[label]\n", + " table.add_data(label, row[\"precision\"], row[\"recall\"], row[\"f1-score\"], row[\"support\"])\n", + "\n", + "wandb.log({\"Test Classification Report\": table})\n", + "print(\"\\nClassification Report:\\n\", classification_report(all_true_final, all_preds_final, target_names=target_names, digits=4))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4Ppa-bdT8r2v" + }, + "source": [ + "# Lưu data" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "s9GulKoGqx6d" + }, + "outputs": [], + "source": [ + "def save_tensors(all_embeddings, all_labels, embed_path='embeddings.pt', label_path='labels.pt'):\n", + " torch.save(all_embeddings, embed_path)\n", + " torch.save(all_labels, label_path)\n", + " print(f\"Saved embeddings to {embed_path} and labels to {label_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "AGAJZH_h8ve6", + "outputId": "1893fbfa-dbcc-48f9-b6e3-ef17f9eef51c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved embeddings to embeddings.pt and labels to labels.pt\n", + "Mounted at /content/drive\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/content/drive/My Drive/labels.pt'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 22 + } + ], + "source": [ + "from google.colab import drive\n", + "import shutil\n", + "\n", + "# Gọi hàm đã viết\n", + "save_tensors(all_embeddings, all_labels)\n", + "\n", + "# Mount và tải lên Drive\n", + "drive.mount('/content/drive')\n", + "shutil.copy('embeddings.pt', '/content/drive/My Drive')\n", + "shutil.copy('labels.pt', '/content/drive/My Drive')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "id": "ESWu8QI59dwl" + }, + "outputs": [], + "source": [ + "torch.save(model.state_dict(), \"softmax_tagger.pth\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "p_iixQwcVuum", + "outputId": "9d4875b8-f67a-46aa-c5c4-a001836e9cb6" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "SoftmaxTagger(\n", + " (hidden2tag): Linear(in_features=768, out_features=7, bias=True)\n", + ")" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ], + "source": [ + "model = SoftmaxTagger(INPUT_DIM, NUM_TAGS)\n", + "model.load_state_dict(torch.load(\"softmax_tagger.pth\"))\n", + "model.eval() # chuyển sang chế độ đánh giá nếu cần\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "id": "vM2lbEBkXxzy" + }, + "outputs": [], + "source": [ + "from transformers import AutoModel, AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base\")\n", + "model_bert = AutoModel.from_pretrained(\"vinai/phobert-base\").to(device) # PhoBERT để lấy embedding\n", + "\n", + "# model là SoftmaxTagger đã train xong\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "InnYf_SyXxzy" + }, + "outputs": [], + "source": [ + "def predict_ner(text):\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " model.eval()\n", + "\n", + " # Tokenize văn bản đầu vào\n", + " input_ids = tokenizer.encode(text, return_tensors=\"pt\").to(device)\n", + " tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())\n", + "\n", + " # Lấy embedding đầu ra từ PhoBERT\n", + " with torch.no_grad():\n", + " outputs = model_bert(input_ids)\n", + " last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()\n", + "\n", + " # Gộp embedding của từ bị tách (sentencepiece)\n", + " word_embeds = group_embeddings(tokens, last_hidden_state)\n", + "\n", + " # Chuyển sang tensor\n", + " x_tensor = torch.stack(word_embeds).unsqueeze(0).to(device) # (1, seq_len, 768)\n", + "\n", + " # Dự đoán\n", + " with torch.no_grad():\n", + " emissions = model(x_tensor)\n", + " preds = torch.argmax(emissions, dim=2).squeeze(0).cpu().tolist()\n", + "\n", + " # Trích xuất token gốc không bị tách '@@'\n", + " final_tokens = []\n", + " current_token = \"\"\n", + " for tok in tokens:\n", + " if tok in [\"\", \"\"]:\n", + " continue\n", + " if tok.endswith(\"@@\"):\n", + " current_token += tok[:-2]\n", + " else:\n", + " current_token += tok\n", + " final_tokens.append(current_token)\n", + " current_token = \"\"\n", + "\n", + " # Ánh xạ sang tên nhãn\n", + " label_names = [id_tag[i] for i in preds]\n", + "\n", + " return preds, label_names, final_tokens\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "aSj0-dl1Xxzy", + "outputId": "26bbf3fe-b011-4a59-bb3e-03d9511be21a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: gradio in /usr/local/lib/python3.11/dist-packages (5.31.0)\n", + "Requirement already satisfied: aiofiles<25.0,>=22.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (24.1.0)\n", + "Requirement already satisfied: anyio<5.0,>=3.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (4.9.0)\n", + "Requirement already satisfied: fastapi<1.0,>=0.115.2 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.115.12)\n", + "Requirement already satisfied: ffmpy in /usr/local/lib/python3.11/dist-packages (from gradio) (0.6.0)\n", + "Requirement already satisfied: gradio-client==1.10.1 in /usr/local/lib/python3.11/dist-packages (from gradio) (1.10.1)\n", + "Requirement already satisfied: groovy~=0.1 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.1.2)\n", + "Requirement already satisfied: httpx>=0.24.1 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.28.1)\n", + "Requirement already satisfied: huggingface-hub>=0.28.1 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.32.4)\n", + "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (3.1.6)\n", + "Requirement already satisfied: markupsafe<4.0,>=2.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (3.0.2)\n", + "Requirement already satisfied: numpy<3.0,>=1.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (2.0.2)\n", + "Requirement already satisfied: orjson~=3.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (3.10.18)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from gradio) (24.2)\n", + "Requirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (2.2.2)\n", + "Requirement already satisfied: pillow<12.0,>=8.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (11.2.1)\n", + "Requirement already satisfied: pydantic<2.12,>=2.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (2.11.5)\n", + "Requirement already satisfied: pydub in /usr/local/lib/python3.11/dist-packages (from gradio) (0.25.1)\n", + "Requirement already satisfied: python-multipart>=0.0.18 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.0.20)\n", + "Requirement already satisfied: pyyaml<7.0,>=5.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (6.0.2)\n", + "Requirement already satisfied: ruff>=0.9.3 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.11.12)\n", + "Requirement already satisfied: safehttpx<0.2.0,>=0.1.6 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.1.6)\n", + "Requirement already satisfied: semantic-version~=2.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (2.10.0)\n", + "Requirement already satisfied: starlette<1.0,>=0.40.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.46.2)\n", + "Requirement already satisfied: tomlkit<0.14.0,>=0.12.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.13.2)\n", + "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.16.0)\n", + "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (4.14.0)\n", + "Requirement already satisfied: uvicorn>=0.14.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.34.3)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from gradio-client==1.10.1->gradio) (2025.3.2)\n", + "Requirement already satisfied: websockets<16.0,>=10.0 in /usr/local/lib/python3.11/dist-packages (from gradio-client==1.10.1->gradio) (15.0.1)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.11/dist-packages (from anyio<5.0,>=3.0->gradio) (3.10)\n", + "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.11/dist-packages (from anyio<5.0,>=3.0->gradio) (1.3.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx>=0.24.1->gradio) (2025.4.26)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx>=0.24.1->gradio) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx>=0.24.1->gradio) (0.16.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.28.1->gradio) (3.18.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.28.1->gradio) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.28.1->gradio) (4.67.1)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.28.1->gradio) (1.1.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0,>=1.0->gradio) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0,>=1.0->gradio) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0,>=1.0->gradio) (2025.2)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<2.12,>=2.0->gradio) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.33.2 in /usr/local/lib/python3.11/dist-packages (from pydantic<2.12,>=2.0->gradio) (2.33.2)\n", + "Requirement already satisfied: typing-inspection>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<2.12,>=2.0->gradio) (0.4.1)\n", + "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0,>=0.12->gradio) (8.2.1)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0,>=0.12->gradio) (1.5.4)\n", + "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0,>=0.12->gradio) (13.9.4)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas<3.0,>=1.0->gradio) (1.17.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.19.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.28.1->gradio) (3.4.2)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.28.1->gradio) (2.4.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio) (0.1.2)\n" + ] + } + ], + "source": [ + "pip install gradio" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SshnFWQzj6aS", + "outputId": "f25a7aa5-f179-472f-a79c-11df166497be" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "SoftmaxTagger(\n", + " (hidden2tag): Linear(in_features=768, out_features=7, bias=True)\n", + ")" + ] + }, + "metadata": {}, + "execution_count": 28 + } + ], + "source": [ + "model = SoftmaxTagger(INPUT_DIM, NUM_TAGS) # Make sure INPUT_DIM and NUM_TAGS are defined or accessible here\n", + "model.load_state_dict(torch.load(\"softmax_tagger.pth\"))\n", + "model.eval() # chuyển sang chế độ đánh giá nếu cần\n", + "model.to(device) # Add this line to move the model to the device" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 625 + }, + "id": "wIK-QRWmXxzz", + "outputId": "2547cdda-a687-46a9-9243-cea7a8916de6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n", + "\n", + "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n", + "* Running on public URL: https://c3f739dbf40a0a0681.gradio.live\n", + "\n", + "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "
" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [] + }, + "metadata": {}, + "execution_count": 29 + } + ], + "source": [ + "import gradio as gr\n", + "import json\n", + "import tempfile\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "\n", + "def ner_interface(text):\n", + " try:\n", + " ids, labels, tokens = predict_ner(text)\n", + " data = [[token, label, _id] for token, label, _id in zip(tokens, labels, ids)]\n", + " json_result = {\n", + " \"tokens\": tokens,\n", + " \"labels\": labels,\n", + " \"label_ids\": ids\n", + " }\n", + " return data, json_result\n", + " except Exception as e:\n", + " print(\"Error:\", e)\n", + " return [[\"Lỗi\", str(e), \"\"]], {\"error\": str(e)}\n", + "\n", + "def json_to_file(json_data):\n", + " # Tạo file tạm thời để trả về cho gr.File tải về\n", + " tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8')\n", + " json.dump(json_data, tmp, ensure_ascii=False, indent=2)\n", + " tmp.close()\n", + " return tmp.name\n", + "\n", + "\n", + "with gr.Blocks(title=\"Nhận dạng Thực thể (NER) với PhoBERT\") as demo:\n", + " gr.Markdown(\"## 📌 Hệ thống Nhận dạng Thực thể Tên (NER) sử dụng PhoBERT + Softmax\")\n", + "\n", + " with gr.Row():\n", + " with gr.Column(scale=3):\n", + " input_text = gr.Textbox(\n", + " lines=4,\n", + " label=\"✍️ Nhập văn bản đầu vào\",\n", + " placeholder=\"Ví dụ: Nguyễn Văn A sinh ra ở Hà Nội.\"\n", + " )\n", + " btn = gr.Button(\"🚀 Nhận dạng Thực thể\")\n", + "\n", + " with gr.Column(scale=5):\n", + " output_table = gr.Dataframe(\n", + " headers=[\"Token\", \"Label\", \"ID\"],\n", + " label=\"📄 Kết quả nhận dạng thực thể\",\n", + " wrap=True\n", + " )\n", + " output_json = gr.JSON(visible=False) # Có thể bật nếu muốn hiển thị JSON\n", + "\n", + " with gr.Row():\n", + " download_trigger = gr.Button(\"💾 Tải kết quả dưới dạng JSON\")\n", + " download_file = gr.File(label=\"📥 File JSON đã xử lý\")\n", + "\n", + " # Hành động xử lý NER\n", + " btn.click(fn=ner_interface, inputs=input_text, outputs=[output_table, output_json])\n", + " download_trigger.click(fn=json_to_file, inputs=output_json, outputs=download_file)\n", + "\n", + "demo.launch()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "mvQgpNetjo02" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "DiCxlUcHQ9NJ" + ], + "gpuType": "T4", + "provenance": [] + }, + "kaggle": { + "accelerator": "nvidiaTeslaT4", + "dataSources": [], + "dockerImageVersionId": 31040, + "isGpuEnabled": true, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "1d9aaa035056485c959f66b60cf41714": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_71054a4397e344a2a66e32892a37b59b", + "IPY_MODEL_6c582399be1d42c3b5a4ef21743d1a26", + "IPY_MODEL_26a6ef7f967a4504a698ff3152ccb24e" + ], + "layout": "IPY_MODEL_82669dd27686486588fdd7d11f49edd3" + } + }, + "71054a4397e344a2a66e32892a37b59b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_461c136b3eac4a9dadb8a3af7c11c98a", + "placeholder": "​", + "style": "IPY_MODEL_4fb68dcea1ac4e54b82c92ad64e9be95", + "value": "config.json: 100%" + } + }, + "6c582399be1d42c3b5a4ef21743d1a26": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df397db853874f6db911acb667785ec0", + "max": 557, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2de5befe0ab24de9a62ba076e5abf78e", + "value": 557 + } + }, + "26a6ef7f967a4504a698ff3152ccb24e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d7da38e7c5e1484597bb1faae3c2d7f5", + "placeholder": "​", + "style": "IPY_MODEL_2ec3573cb04143a8ba5e555bfaf60165", + "value": " 557/557 [00:00<00:00, 52.7kB/s]" + } + }, + "82669dd27686486588fdd7d11f49edd3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "461c136b3eac4a9dadb8a3af7c11c98a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4fb68dcea1ac4e54b82c92ad64e9be95": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "df397db853874f6db911acb667785ec0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2de5befe0ab24de9a62ba076e5abf78e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d7da38e7c5e1484597bb1faae3c2d7f5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2ec3573cb04143a8ba5e555bfaf60165": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "67c557f2651b4e5c81e9af82531898fe": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5a5a04069ebd41fdba7835e1b5da585e", + "IPY_MODEL_6c5b30dc67d94071af4d4b14cf4be7e4", + "IPY_MODEL_e9200c8269fa4ebd9c1157cc1b871005" + ], + "layout": "IPY_MODEL_50ab44e431a54c27b409dc74c068c392" + } + }, + "5a5a04069ebd41fdba7835e1b5da585e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9929425b4df94d3792dc454afe59b3fd", + "placeholder": "​", + "style": "IPY_MODEL_03efbd25cd4341cea6714ffc3585632a", + "value": "vocab.txt: 100%" + } + }, + "6c5b30dc67d94071af4d4b14cf4be7e4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ae22d9a4de574ce3905a6f6c82fac1aa", + "max": 895321, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_71fe669e2f68444ba4b81ffc14a39c03", + "value": 895321 + } + }, + "e9200c8269fa4ebd9c1157cc1b871005": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d4c131206f448c7ba6d5c4e41126d41", + "placeholder": "​", + "style": "IPY_MODEL_ec260d8279bd4a40ad2bbc3c1d7dadaa", + "value": " 895k/895k [00:00<00:00, 1.91MB/s]" + } + }, + "50ab44e431a54c27b409dc74c068c392": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9929425b4df94d3792dc454afe59b3fd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "03efbd25cd4341cea6714ffc3585632a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ae22d9a4de574ce3905a6f6c82fac1aa": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "71fe669e2f68444ba4b81ffc14a39c03": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4d4c131206f448c7ba6d5c4e41126d41": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ec260d8279bd4a40ad2bbc3c1d7dadaa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "08611ca37f8c462db079dc2883f06002": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_47eca14208cc4e5085d44cce42872a35", + "IPY_MODEL_49d3e88f60e744e7b37bcbcca5bbb087", + "IPY_MODEL_81b9bb3315e4402ea5b6768d0d189591" + ], + "layout": "IPY_MODEL_4b88329de5ed48738ca0da054a1f0131" + } + }, + "47eca14208cc4e5085d44cce42872a35": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1507e771a5ca4056b0605cd453d89c60", + "placeholder": "​", + "style": "IPY_MODEL_44946e0a5d31408a851b0e8ab5217c43", + "value": "bpe.codes: 100%" + } + }, + "49d3e88f60e744e7b37bcbcca5bbb087": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee8c999b66e84cbda17702e916d48a3f", + "max": 1135173, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6c6228d416944599b110ffa97b20bd8c", + "value": 1135173 + } + }, + "81b9bb3315e4402ea5b6768d0d189591": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c519c27334b742ee8f14e29da2ebdf9b", + "placeholder": "​", + "style": "IPY_MODEL_7d422740d70546559703cd0304be663f", + "value": " 1.14M/1.14M [00:00<00:00, 1.74MB/s]" + } + }, + "4b88329de5ed48738ca0da054a1f0131": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1507e771a5ca4056b0605cd453d89c60": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "44946e0a5d31408a851b0e8ab5217c43": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ee8c999b66e84cbda17702e916d48a3f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6c6228d416944599b110ffa97b20bd8c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c519c27334b742ee8f14e29da2ebdf9b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7d422740d70546559703cd0304be663f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "371ab2b9d7c84402b3c4b934e89eca4b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7ead683f167c408f88ba72b2ee1599d1", + "IPY_MODEL_737d3d1e5a3146de96c17ca8ec72d75e", + "IPY_MODEL_7ec9beb535c9428a954367613fa7f4cd" + ], + "layout": "IPY_MODEL_3c51b3c0b7ba4a2eacb1b8b2be8e024c" + } + }, + "7ead683f167c408f88ba72b2ee1599d1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_50dc5fc6a7354394ace536241fa01714", + "placeholder": "​", + "style": "IPY_MODEL_78ba04a44d9e42dba9fb7617d28c91e9", + "value": "tokenizer.json: 100%" + } + }, + "737d3d1e5a3146de96c17ca8ec72d75e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ca87704af3bc4c7590eea8f8f0f50d94", + "max": 3132320, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f3870bce67da4affa8925d9d898638da", + "value": 3132320 + } + }, + "7ec9beb535c9428a954367613fa7f4cd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e6efac23366643dd861caf121a8a220b", + "placeholder": "​", + "style": "IPY_MODEL_1044cf40d7e54337859bab0057aa0b54", + "value": " 3.13M/3.13M [00:00<00:00, 6.78MB/s]" + } + }, + "3c51b3c0b7ba4a2eacb1b8b2be8e024c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "50dc5fc6a7354394ace536241fa01714": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "78ba04a44d9e42dba9fb7617d28c91e9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ca87704af3bc4c7590eea8f8f0f50d94": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f3870bce67da4affa8925d9d898638da": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e6efac23366643dd861caf121a8a220b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1044cf40d7e54337859bab0057aa0b54": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3a6bb86ac3db4f82ae139507f94607e0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_04a96e9ef4774bd1a3cd3a1dd20fe194", + "IPY_MODEL_6158b1f05f6c4851b492df312f0312ca", + "IPY_MODEL_b81c4146511045e280ba4fa226074679" + ], + "layout": "IPY_MODEL_3676c8217c654790972189be8c1f4627" + } + }, + "04a96e9ef4774bd1a3cd3a1dd20fe194": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_762b2c8d23824070b2eb115e151f0c73", + "placeholder": "​", + "style": "IPY_MODEL_d9d57b09b01846a4805882b4adf64e55", + "value": "pytorch_model.bin: 100%" + } + }, + "6158b1f05f6c4851b492df312f0312ca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e804cb9279dd458aa8b661d28c4427ff", + "max": 542923308, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3169169dbabb4b1aa7906a0415eacdcb", + "value": 542923308 + } + }, + "b81c4146511045e280ba4fa226074679": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_408130d71bb74141906cbc1d2123bb63", + "placeholder": "​", + "style": "IPY_MODEL_4be265bd67e1470cbd856dd268908c00", + "value": " 543M/543M [00:01<00:00, 366MB/s]" + } + }, + "3676c8217c654790972189be8c1f4627": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "762b2c8d23824070b2eb115e151f0c73": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d9d57b09b01846a4805882b4adf64e55": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e804cb9279dd458aa8b661d28c4427ff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3169169dbabb4b1aa7906a0415eacdcb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "408130d71bb74141906cbc1d2123bb63": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4be265bd67e1470cbd856dd268908c00": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fabfacd2a2964d23994070bcb6bc4b3c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_09db5366452347dfa40bbd192d22d489", + "IPY_MODEL_706046f4266a4ade9f5f1718fc0bbf4f", + "IPY_MODEL_d39e9db3bbf84736b090e12ecf9bd1fd" + ], + "layout": "IPY_MODEL_654a5b527d0c4c51afce2e65fb3b36aa" + } + }, + "09db5366452347dfa40bbd192d22d489": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fa1045a3003d4496a2a5c3055355120d", + "placeholder": "​", + "style": "IPY_MODEL_f00793fc0f1948fca2fda701eb461505", + "value": "model.safetensors: 100%" + } + }, + "706046f4266a4ade9f5f1718fc0bbf4f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9994f44df6a0451792559446361557e9", + "max": 542900336, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cc2a9437c039472f8447f9d0194459dc", + "value": 542900336 + } + }, + "d39e9db3bbf84736b090e12ecf9bd1fd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2c64cf42d35a4722a3b32b366d1dcd1b", + "placeholder": "​", + "style": "IPY_MODEL_a9886da1334d4df0b4c6536255df6420", + "value": " 543M/543M [00:02<00:00, 213MB/s]" + } + }, + "654a5b527d0c4c51afce2e65fb3b36aa": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fa1045a3003d4496a2a5c3055355120d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f00793fc0f1948fca2fda701eb461505": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9994f44df6a0451792559446361557e9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cc2a9437c039472f8447f9d0194459dc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2c64cf42d35a4722a3b32b366d1dcd1b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9886da1334d4df0b4c6536255df6420": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file