{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "5e06060e-f3d7-4e1e-b97e-dc57d8d17ce5", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "6831b89a-0776-4014-a3db-9e1860a4c80c", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /home/inflaton/code/projects/courses/novel-translation\n" ] } ], "source": [ "import os\n", "import sys\n", "from pathlib import Path\n", "\n", "workding_dir = str(Path.cwd().parent)\n", "os.chdir(workding_dir)\n", "sys.path.append(workding_dir)\n", "print(\"workding dir:\", workding_dir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "1bdd4cdb-cb26-4527-862d-66ea2a7a1f05", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading env vars from: /home/inflaton/code/projects/courses/novel-translation/.env\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from dotenv import find_dotenv, load_dotenv\n", "\n", "found_dotenv = find_dotenv(\".env\")\n", "\n", "if len(found_dotenv) == 0:\n", " found_dotenv = find_dotenv(\".env.example\")\n", "print(f\"loading env vars from: {found_dotenv}\")\n", "load_dotenv(found_dotenv, override=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "14807e21-2648-48a3-9916-6c576fc61d2e", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/plain": [ 
"('unsloth/Qwen2-1.5B-Instruct',\n", " True,\n", " 'models/Qwen2-1.5B-Instruct-MAC-',\n", " 'Qwen2-1.5B-Instruct-MAC-',\n", " 2048,\n", " 10,\n", " None,\n", " 'datasets/mac/mac.tsv')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "model_name = os.getenv(\"MODEL_NAME\")\n", "token = os.getenv(\"HF_TOKEN\") or None\n", "load_in_4bit = os.getenv(\"LOAD_IN_4BIT\") == \"true\"\n", "local_model = os.getenv(\"LOCAL_MODEL\")\n", "hub_model = os.getenv(\"HUB_MODEL\")\n", "num_train_epochs = int(os.getenv(\"NUM_TRAIN_EPOCHS\") or 0)\n", "data_path = os.getenv(\"DATA_PATH\")\n", "\n", "max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n", "dtype = (\n", " None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", ")\n", "\n", "model_name, load_in_4bit, local_model, hub_model, max_seq_length, num_train_epochs, dtype, data_path" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "bc44b98b-6394-4b2c-af2f-8caa40b28453", "showTitle": false, "title": "" }, "id": "r2v_X2fA0Df5" }, "source": [ "* We support Llama, Mistral, Phi-3, Gemma, Yi, DeepSeek, Qwen, TinyLlama, Vicuna, Open Hermes etc\n", "* We support 16bit LoRA or 4bit QLoRA. Both 2x faster.\n", "* `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.\n", "* With [PR 26037](https://github.com/huggingface/transformers/pull/26037), we support downloading 4bit models **4x faster**! [Our repo](https://huggingface.co/unsloth) has Llama, Mistral 4bit models.\n", "* [**NEW**] We make Phi-3 Medium / Mini **2x faster**! 
See our [Phi-3 Medium notebook](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "b952e9b9-edf1-4bb8-b52b-bb714852c721", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/", "height": 353, "referenced_widgets": [ "98c58f23f4d549518832cb2d18f796e8", "09b76013aa9e45efb6deb23a7a0d0925", "39b29a75374b45c0a22506010be2b84e", "78e5400bff924a92a4cc61c4ff18b182", "2a58d04b428c46f4b3dbadd3bc6cd529", "dea41c5260884aa6879b5e1d1697b14f", "89965917796a4f81b899fdc7685f33df", "30cdc32298134cb0be4d41615b9e5774", "47928317548c454bba6358ab132e8dee", "b9b313fd861948f5aba25b24b1518d30", "4c666f4ace3943f8b80ecd20e7503236", "c22f71b1f85843209d7e5321506b9cb9", "1f44c9ce1adf470cbb19784493ed209f", "f1addc4479d849879e743cf9089e6540", "8b3505352a5a42bf910428c40ce40465", "4c4c88d4c701450692fa0f6b0c5764b0", "0c34be936c8145d3ab41282f30a70713", "0a92c56bfa134ef583220d7ef0b13e17", "43dec2ede91341f5af60eb522e18e984", "d8e5318cead340c4adbeaccc05d39225", "49277aeeac16434a865a4d12308b1abc", "2157f01726d748f8a9ae4a00664430da", "fce7a61c25ec4390af43d92b7c473a45", "30307300bc4e4baf96560e30969a82b6", "8fc142b628fb40568730234de1cafde2", "a8464a4c711e4e00aafdfc919b60d07e", "5f40db8173dd4d76b6ef5ed6d9ec8b6e", "e36a3f9eff0e4cf68834d66b0213ae96", "a0037bdccf254159becde630bee3d1db", "4ae7e449e4ea4c729b5f34607c18ebae", "3572201bd4d74a58b7a665f9bdfdcdba", "fb995c740590427b882572c81d4e848c", "201b59ccd9f845e197029b57e424aefc", "cf245afeb1c04f29a24d291608c3d157", "b518dcee69074b87be73957cd810e7ed", "e29104486d594b2992d7285e0ef77371", "6578fd7acdb54c4c93528ea431fd0144", "d35db8148a354c56aaac56dbae22536f", "d891f8d0b1fc462f8008d02bb2a15692", "cced8fd7e998472794f3f3e3018956a5", "a9f0cc51fc3d4d7b874c32dcf1c5bdf2", "2f6c70dd266c4816bfad3fd3d192929a", "370692d819df41828b48c4ad446f977b", 
"a0bf9160eb2647409b3200270914b90f", "2d18ddf6482c4d97829ac0e5a7b9868f", "9f679ad3ec7f4fe8ad0510ffb57bc2ab", "f2df530d22c74977b249dd9fb5f4829b", "89b2ef0dbfea47ab8e6f8d659e3351d1", "3056b148aa9f4e6e8aa3b61d26886255", "4ea63adfce694725bdba878aef709dd3", "74501720ac7e4dbb911a4a99b3633bc6", "21db8a77b00d4a4e82fdfa608657531f", "6dbbedeca9314e66ae50e44ffa31a414", "b8908fa0df3743ecb9d12983a739104f", "177c78fce95d4b4ab33057c5a048d693", "27155728b6b84cb199c91c940095d0a8", "6b91feeed5464877991ac2c207aebe7c", "cca8113c54c0495daedce1327bf9c68b", "2e63a29e2f7247bba5beede9a568c99f", "5c9d781c28944f3eb86e2a6d44efdf18", "4b2061b8a73c43ffb0c2f83daf0d0183", "69ac12aec0714318bf2c83d4f4e745f5", "e02f9b7849c64531835eb77b860d1c93", "56aee4853b7740e6a977254f5d1fa66d", "b993eaec6b224440bf80c0958c6fb536", "de868e26e7154f62aa86223a539ad421" ] }, "id": "QmUBVEnvCDJv", "outputId": "a0e2d781-4934-415a-90b4-35165b9e44c5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a44529371839466cae7797d068873634", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/707 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "==((====))== Unsloth: Fast Qwen2 patching release 2024.5\n", " \\\\ /| GPU: NVIDIA GeForce RTX 4080 Laptop GPU. Max memory: 11.994 GB. Platform = Linux.\n", "O^O/ \\_/ \\ Pytorch: 2.2.2+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.\n", "\\ / Bfloat16 = TRUE. Xformers = 0.0.25.post1. 
FA = False.\n", " \"-____-\" Free Apache license: http://github.com/unslothai/unsloth\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6303708b46824ec791429f29c5fc9e3c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors: 0%| | 0.00/3.09G [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1f7bee3044444f50bb516e950154cd8a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "generation_config.json: 0%| | 0.00/242 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3400b41d20884eed8a36b4b7abe91035", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/1.32k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "443feea33b4a4ed5b703b6963c79e7c5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "vocab.json: 0%| | 0.00/2.78M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "53f5f46aa4de429b81ecaa8c0af52630", "version_major": 2, "version_minor": 0 }, "text/plain": [ "merges.txt: 0%| | 0.00/1.67M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "15ab203e3e8c4c3da4591ecb09d71d77", "version_major": 2, "version_minor": 0 }, "text/plain": [ "added_tokens.json: 0%| | 0.00/80.0 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "09a9a9b536c5472b950db964893a1176", "version_major": 2, "version_minor": 0 }, "text/plain": [ "special_tokens_map.json: 0%| | 0.00/367 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { 
"application/vnd.jupyter.widget-view+json": { "model_id": "9fb16fa647f341d19545f4ce5d7c7816", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/7.03M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 25.5 s, sys: 17.6 s, total: 43.1 s\n", "Wall time: 4min 14s\n" ] } ], "source": [ "%%time\n", "\n", "from llm_toolkit.translation_engine import *\n", "\n", "model, tokenizer = load_model(model_name, load_in_4bit)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "28049473-3b0f-4aa6-bcad-11b8954d8066", "showTitle": false, "title": "" }, "id": "SXd9bTZd1aaL" }, "source": [ "We now add LoRA adapters so we only need to update 1 to 10% of all parameters!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "f1615e9f-a306-472f-9fa3-7c78b0edc319", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "6bZsfBuZDeCL", "outputId": "bc6d9ce7-f82a-4191-d0c5-ec8247d9b9eb" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Unsloth 2024.5 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 12.6 s, sys: 0 ns, total: 12.6 s\n", "Wall time: 1.88 s\n" ] } ], "source": [ "%%time\n", "\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " r=16, # Choose any number > 0 ! 
Suggested 8, 16, 32, 64, 128\n", " target_modules=[\n", " \"q_proj\",\n", " \"k_proj\",\n", " \"v_proj\",\n", " \"o_proj\",\n", " \"gate_proj\",\n", " \"up_proj\",\n", " \"down_proj\",\n", " ],\n", " lora_alpha=16,\n", " lora_dropout=0, # Supports any, but = 0 is optimized\n", " bias=\"none\", # Supports any, but = \"none\" is optimized\n", " # [NEW] \"unsloth\" uses 30% less VRAM, fits 2x larger batch sizes!\n", " use_gradient_checkpointing=\"unsloth\", # True or \"unsloth\" for very long context\n", " random_state=3407,\n", " use_rslora=False, # We support rank stabilized LoRA\n", " loftq_config=None, # And LoftQ\n", ")" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "16e3c2ff-36ff-4895-bfd0-59ab1b2130cc", "showTitle": false, "title": "" }, "id": "vITh0KVJ10qX" }, "source": [ "\n", "### Data Prep\n", "This notebook fine-tunes on a Chinese-to-English translation dataset: `load_translation_dataset` reads the file configured in the `DATA_PATH` environment variable (`datasets/mac/mac.tsv` in the recorded run) and returns `train` / `test` splits with `chinese`, `english`, `text` and `prompt` columns. (The Unsloth template this notebook is based on used the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html).) You can replace this code section with your own data prep.\n", "\n", "**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).\n", "\n", "**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!\n", "\n", "If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).\n", "\n", "For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "4426fdab-78f7-4a28-abf7-dc55b19db864", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading train/test data files\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "95a1b6aa815f461a8281e33633a28a9b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/4528 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "06c43290ece44320a77fae8dd24fe380", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1133 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['chinese', 'english', 'text', 'prompt'],\n", " num_rows: 4528\n", " })\n", " test: Dataset({\n", " features: ['chinese', 'english', 'text', 'prompt'],\n", " num_rows: 1133\n", " })\n", "})\n" ] } ], "source": [ "import os\n", "from llm_toolkit.translation_engine import *\n", "\n", "datasets = load_translation_dataset(data_path, tokenizer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "14384095-a677-4439-b906-bd4f545775cd", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/plain": [ "({'chinese': '全仗着狐仙搭救。',\n", " 'english': 'Because I was protected by a fox fairy.',\n", " 'text': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n全仗着狐仙搭救。<|im_end|>\\n<|im_start|>assistant\\nBecause I was protected by a fox fairy.<|im_end|>',\n", " 'prompt': '<|im_start|>system\\nYou are an expert in translating 
Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n全仗着狐仙搭救。<|im_end|>\\n<|im_start|>assistant\\n'},\n", " {'chinese': '老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。',\n", " 'english': 'Old Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.',\n", " 'text': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。<|im_end|>\\n<|im_start|>assistant\\nOld Geng picked up his shotgun, squinted, and pulled the trigger. Two sparrows crashed to the ground like hailstones as shotgun pellets tore noisily through the branches.<|im_end|>',\n", " 'prompt': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞迸着,嚓嚓有声。<|im_end|>\\n<|im_start|>assistant\\n'})" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "datasets[\"train\"][0], datasets[\"test\"][0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "3e839830-d2da-48e3-b6f4-63da7a7b9dab", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/plain": [ "({'chinese': '周瑞家的道:“太太说:‘他们原不是一家子; 当年他们的祖和太老爷在一处做官,因连了宗的。',\n", " 'english': \"'She said they don't really belong to the family but were adopted into the clan years ago when your grandfather and theirs were working in the same office.\",\n", " 'text': \"<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n周瑞家的道:“太太说:‘他们原不是一家子; 当年他们的祖和太老爷在一处做官,因连了宗的。<|im_end|>\\n<|im_start|>assistant\\n'She said 
they don't really belong to the family but were adopted into the clan years ago when your grandfather and theirs were working in the same office.<|im_end|>\",\n", " 'prompt': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n周瑞家的道:“太太说:‘他们原不是一家子; 当年他们的祖和太老爷在一处做官,因连了宗的。<|im_end|>\\n<|im_start|>assistant\\n'},\n", " {'chinese': '“听到了吗?',\n", " 'english': \"'Did you hear that?'\",\n", " 'text': \"<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n“听到了吗?<|im_end|>\\n<|im_start|>assistant\\n'Did you hear that?'<|im_end|>\",\n", " 'prompt': '<|im_start|>system\\nYou are an expert in translating Chinese into English.<|im_end|>\\n<|im_start|>user\\nTranslate from Chinese to English.\\n“听到了吗?<|im_end|>\\n<|im_start|>assistant\\n'})" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "datasets[\"train\"][1000], datasets[\"test\"][1000]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "03a3c02c-d3d9-49f4-87b5-2e568c174175", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/", "height": 145, "referenced_widgets": [ "26e4202cca81496a90d15a0dd4ca9cf1", "ba90fdb8822d47dab7ba203bee297f37", "61560ff6a36b44f4a9dfdae5c52791d4", "95fbe66647904c06a20f640630d6dc0e", "57182a263d324a3dbf1471c74290a0d5", "0f8b6bfe16894500838793f2491d403f", "bb19f6c747754682a514373a3a0535ba", "db19fc8d37db4e45a5790a876836d8c4", "36166c7bcb854b34aca1f41a5d6ea50b", "b0a370dc20654b279b9680692e34418e", "cfeb365ddf7548d58b2557f22737fcf5", "73e352a3404f4c7dad0737f57d29e92f", "988a0e8c1f89446086858da0a891a79c", "4ccedf0d93094e63b57a0f8a434fba06", "6b2012c3f88547af8884a9ea90e3164b", "7e29cb8dd4df4d5b94407cd8fd3f2011", "ad2be500fc164c0f86f33e914ef8e6a0", 
"5234566b1bfc4655b8d582ea5b46ed9f", "4463edd481c1467f914c7dcd6c6e6ffc", "6d3b9a05db0b4dadb638c686faa0c40a", "938f45f1b3e24118b815d96ae34ba86a", "9367047a800747f79c6b225d92397846", "d1b47d39450d4019ae85c9b2f943eeaf", "4dcf6ff672d24983a1877a8431709aa9", "7975adbc2ec5489ea7fa0167e620d85c", "71ce208e20d6483abb9ed923510c86d7", "cfe8cae0e22b495bafa221a63d13b283", "5807d5fb827d490fb3bc698f801ffff5", "c4f2b06a82fd4987b8b659524a7b503b", "6e34619b45934040b6092e6fb01ea7fe", "271ddaa553a042d09b6db7b450643d8f", "d69dc491b3ab44d7852b21873ed7bb7f", "f401d53bf28e44eb906bce6c05412662", "daf4cd890b35422683d22fd30bc71e83", "b0240cd9a4554b29ae11f8051984a1c6", "bc883d4cf13e4f8b8a4fe5f410cb6efd", "99fdbb0300c14c139d1937c646f0cfe7", "c161d94df0f04feba9542237e0856c22", "edaf890370314a218f138015faa0b05d", "697f027529b54ee9956bae78a11e0611", "e9159e03e61f4f56978ece9c3bca49b2", "810ff6c0e17d4fa09a30fef27eacff90", "7358cdad832342c983e31efb8754ab78", "e9adf418296e436fb48bb9f78885598b" ] }, "id": "LjY75GoYUCB8", "outputId": "7e2045fb-9ce9-49b1-b6e7-d5c9bc92455c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<|im_start|>system\n", "You are an expert in translating Chinese into English.<|im_end|>\n", "<|im_start|>user\n", "Translate from Chinese to English.\n", "“听到了吗?<|im_end|>\n", "<|im_start|>assistant\n", "\n", "----------------------------------------\n", "<|im_start|>system\n", "You are an expert in translating Chinese into English.<|im_end|>\n", "<|im_start|>user\n", "Translate from Chinese to English.\n", "“听到了吗?<|im_end|>\n", "<|im_start|>assistant\n", "Did you hear that?<|im_end|>\n", "CPU times: user 1.8 s, sys: 873 ms, total: 2.68 s\n", "Wall time: 2.72 s\n" ] } ], "source": [ "%%time\n", "\n", "prompt1 = datasets[\"test\"][\"prompt\"][1000]\n", "print(prompt1)\n", "print(\"--\" * 20)\n", "test_model(model, tokenizer, prompt1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, 
"inputWidgets": {}, "nuid": "22ad05ed-04e7-420f-82bf-8f990efce37c", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1133/1133 [30:01<00:00, 1.59s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 27min 10s, sys: 2min 52s, total: 30min 2s\n", "Wall time: 30min 1s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "%%time\n", "\n", "predictions = eval_model(model, tokenizer, datasets[\"test\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "eeba4278-d952-4391-8f63-c123e6098ffd", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/plain": [ "{'accuracy': 0.00176522506619594,\n", " 'correct_ids': [658, 659],\n", " 'bleu_scores': {'bleu': 0.08285381577653864,\n", " 'precisions': [0.40636974021865224,\n", " 0.12583290620194773,\n", " 0.051405438435685916,\n", " 0.02290685609386224],\n", " 'brevity_penalty': 0.9405675222192741,\n", " 'length_ratio': 0.9422656508777741,\n", " 'translation_length': 28447,\n", " 'reference_length': 30190},\n", " 'rouge_scores': {'rouge1': 0.38844471682897896,\n", " 'rouge2': 0.14120062297432684,\n", " 'rougeL': 0.3280668137668106,\n", " 'rougeLsum': 0.3280344032501499}}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "calc_metrics(datasets[\"test\"][\"english\"], predictions, debug=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "2485caac-9b06-42f5-a4da-213d3e522a06", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Unnamed: 0 chinese \\\n", "0 0 老耿端起枪,眯缝起一只三角眼,一搂扳机响了枪,冰雹般的金麻雀劈哩啪啦往下落,铁砂子在柳枝间飞... 
\n", "\n", " english \\\n", "0 Old Geng picked up his shotgun, squinted, and ... \n", "\n", " unsloth/Qwen2-0.5B-Instruct(finetuned) \\\n", "0 Old Geng lifted his rifle and narrowed his eye... \n", "\n", " unsloth/Qwen2-1.5B-Instruct \n", "0 Old Geng took up his gun, squinted one of its ... \n" ] } ], "source": [ "save_results(\n", " model_name,\n", " \"results/mac-results.csv\",\n", " datasets[\"test\"],\n", " predictions,\n", " debug=True,\n", ")" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "5c3f9939-9068-4edf-b057-e4898efeb94e", "showTitle": false, "title": "" }, "id": "idAEIeSQ3xdS" }, "source": [ "\n", "### Train the model\n", "Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). The number of training epochs is read from the `NUM_TRAIN_EPOCHS` environment variable (10 in the recorded run, i.e. 5,660 optimizer steps) and no `max_steps` cap is set; the Unsloth template this notebook is based on instead did 60 steps via `max_steps` to speed things up and suggested `num_train_epochs=1` for a full run. We also support TRL's `DPOTrainer`!" 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "053bd880-409c-4ae0-a5a5-06084ada19d5", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/", "height": 122, "referenced_widgets": [ "3cf2dd993b5e4d3daecf61e4bab5a404", "087b76a8b7514269b1f0ab29b062e444", "35b0e8c26d6640e9bd0ed7b242a423d8", "54ad89e05fd74576b9b8b5b5a10eaf8d", "a41dc44766444a998bec2d777f249d23", "a069d2ab23824f29aa320ac256e2cfe9", "06e806c82c7b4cbea31c5358dd9c3434", "2e5087c76f98437cb5dc729230358cba", "036fc5746f43416db18c19ad8fd36677", "fdb1941405ed4e4aa06019933892deb3", "668d5377ca56426a99753867e6e24862" ] }, "id": "95_Nn-89DhsL", "outputId": "bce9db22-b022-4e43-de3f-c7ea4c9c3c4e" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "6b952d520d494e58811bae80cf5ae883", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map (num_proc=2): 0%| | 0/4528 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", "from unsloth import is_bfloat16_supported\n", "\n", "# Supervised fine-tuning on the pre-rendered chat text in the `text` column of the train split.\n", "trainer = SFTTrainer(\n", " model=model,\n", " tokenizer=tokenizer,\n", " train_dataset=datasets[\"train\"],\n", " dataset_text_field=\"text\",\n", " max_seq_length=max_seq_length,\n", " dataset_num_proc=2,\n", " packing=False, # Can make training 5x faster for short sequences.\n", " args=TrainingArguments(\n", " per_device_train_batch_size=2,\n", " gradient_accumulation_steps=4, # 2 per device x 4 accumulation steps = 8 per optimizer step on one GPU\n", " warmup_steps=5,\n", " num_train_epochs=num_train_epochs, # read from NUM_TRAIN_EPOCHS in the config cell\n", " learning_rate=2e-4,\n", " # Mixed precision: bf16 when the GPU supports it, otherwise fp16.\n", " fp16=not is_bfloat16_supported(),\n", " bf16=is_bfloat16_supported(),\n", " logging_steps=100,\n", " optim=\"adamw_8bit\",\n", " weight_decay=0.01,\n", " lr_scheduler_type=\"linear\",\n", " seed=3407,\n", " output_dir=\"outputs\",\n", " ),\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "b7b322ec-e1bb-467e-9a24-7a9cff6c2402", "showTitle": false, "title": "" }, "cellView": "form", "colab": { "base_uri": "https://localhost:8080/" }, "id": "2ejIt2xSNKKp", "outputId": "c73d8dfa-f4a1-4a01-a6dc-018bf82516a2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GPU = NVIDIA GeForce RTX 4080 Laptop GPU. Max memory = 11.994 GB.\n", "3.633 GB of memory reserved.\n" ] } ], "source": [ "# @title Show current memory stats\n", "import torch\n", "\n", "gpu_stats = torch.cuda.get_device_properties(0)\n", "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n", "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n", "print(f\"GPU = {gpu_stats.name}. 
Max memory = {max_memory} GB.\")\n", "print(f\"{start_gpu_memory} GB of memory reserved.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "31565b22-348c-4ebd-a478-ecac933086a6", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "yqxqAZ7KJ4oL", "outputId": "69117b9b-b6f8-4d0e-c262-6998ba2c46bd" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1\n", " \\\\ /| Num examples = 4,528 | Num Epochs = 10\n", "O^O/ \\_/ \\ Batch size per device = 2 | Gradient Accumulation steps = 4\n", "\\ / Total batch size = 8 | Total steps = 5,660\n", " \"-____-\" Number of trainable parameters = 18,464,768\n" ] }, { "data": { "text/html": [ "\n", "
Step | \n", "Training Loss | \n", "
---|---|
100 | \n", "1.919100 | \n", "
200 | \n", "1.774900 | \n", "
300 | \n", "1.722600 | \n", "
400 | \n", "1.721900 | \n", "
500 | \n", "1.695700 | \n", "
600 | \n", "1.612500 | \n", "
700 | \n", "1.473700 | \n", "
800 | \n", "1.518000 | \n", "
900 | \n", "1.452100 | \n", "
1000 | \n", "1.454900 | \n", "
1100 | \n", "1.509600 | \n", "
1200 | \n", "1.272200 | \n", "
1300 | \n", "1.128400 | \n", "
1400 | \n", "1.161200 | \n", "
1500 | \n", "1.165600 | \n", "
1600 | \n", "1.169700 | \n", "
1700 | \n", "1.140900 | \n", "
1800 | \n", "0.796500 | \n", "
1900 | \n", "0.812800 | \n", "
2000 | \n", "0.815000 | \n", "
2100 | \n", "0.806600 | \n", "
2200 | \n", "0.850100 | \n", "
2300 | \n", "0.737200 | \n", "
2400 | \n", "0.533900 | \n", "
2500 | \n", "0.521600 | \n", "
2600 | \n", "0.562600 | \n", "
2700 | \n", "0.557700 | \n", "
2800 | \n", "0.563000 | \n", "
2900 | \n", "0.418500 | \n", "
3000 | \n", "0.343000 | \n", "
3100 | \n", "0.353900 | \n", "
3200 | \n", "0.368300 | \n", "
3300 | \n", "0.367600 | \n", "
3400 | \n", "0.361000 | \n", "
3500 | \n", "0.230000 | \n", "
3600 | \n", "0.244000 | \n", "
3700 | \n", "0.246400 | \n", "
3800 | \n", "0.245400 | \n", "
3900 | \n", "0.256800 | \n", "
4000 | \n", "0.232000 | \n", "
4100 | \n", "0.178700 | \n", "
4200 | \n", "0.186600 | \n", "
4300 | \n", "0.189200 | \n", "
4400 | \n", "0.189600 | \n", "
4500 | \n", "0.190100 | \n", "
4600 | \n", "0.160900 | \n", "
4700 | \n", "0.155000 | \n", "
4800 | \n", "0.155300 | \n", "
4900 | \n", "0.157400 | \n", "
5000 | \n", "0.159500 | \n", "
5100 | \n", "0.157000 | \n", "
5200 | \n", "0.138300 | \n", "
5300 | \n", "0.138600 | \n", "
5400 | \n", "0.139500 | \n", "
5500 | \n", "0.141400 | \n", "
5600 | \n", "0.144900 | \n", "
"
],
"text/plain": [
"