{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "0ea8b46b-839b-445b-8043-ccdf4e920ace",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "6d394937-6c99-4a7c-9d32-7600a280032f",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "workding dir: /home/inflaton/code/projects/courses/llm-finetuning\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "# Switch to the notebook's parent directory and make it importable.\n",
    "# Guarded so that re-running this cell in the same kernel does not climb one\n",
    "# directory higher on every run or append duplicate sys.path entries.\n",
    "# (The misspelled name `workding_dir` is kept: other cells may reference it.)\n",
    "if \"workding_dir\" not in globals():\n",
    "    workding_dir = str(Path.cwd().parent)\n",
    "    os.chdir(workding_dir)\n",
    "if workding_dir not in sys.path:\n",
    "    sys.path.append(workding_dir)\n",
    "print(\"workding dir:\", workding_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "9f67ec60-2f24-411c-84eb-0dd664b44775",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading env vars from: /home/inflaton/code/projects/courses/llm-finetuning/.env\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dotenv import find_dotenv, load_dotenv\n",
    "\n",
    "# Prefer .env; fall back to .env.example when the first lookup comes back empty.\n",
    "found_dotenv = find_dotenv(\".env\") or find_dotenv(\".env.example\")\n",
    "print(f\"loading env vars from: {found_dotenv}\")\n",
    "load_dotenv(found_dotenv, override=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "f1597656-8042-4878-9d3b-9ebfb8dd86dc",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
"('unsloth/Qwen2-0.5B-Instruct-bnb-4bit',\n",
       " True,\n",
       " None,\n",
       " None,\n",
       " 2048,\n",
       " 10,\n",
       " None,\n",
       " 'datasets/mac/mac.tsv',\n",
       " 'results/mac-results_lf.csv')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Run configuration, read from environment variables.\n",
    "model_name = os.getenv(\"MODEL_NAME\")\n",
    "token = os.getenv(\"HF_TOKEN\") or None\n",
    "# Compare case- and whitespace-insensitively so that \"True\" / \"TRUE\" / \"true \"\n",
    "# are not silently read as False.\n",
    "load_in_4bit = (os.getenv(\"LOAD_IN_4BIT\") or \"\").strip().lower() == \"true\"\n",
    "local_model = os.getenv(\"LOCAL_MODEL\")\n",
    "hub_model = os.getenv(\"HUB_MODEL\")\n",
    "# An unset or empty NUM_TRAIN_EPOCHS falls back to 0.\n",
    "num_train_epochs = int(os.getenv(\"NUM_TRAIN_EPOCHS\") or 0)\n",
    "data_path = os.getenv(\"DATA_PATH\")\n",
    "results_path = os.getenv(\"RESULTS_PATH\")\n",
    "\n",
    "max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!\n",
    "dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
    "\n",
    "# Echo the settings as the cell result; `token` is not included.\n",
    "(\n",
    "    model_name,\n",
    "    load_in_4bit,\n",
    "    local_model,\n",
    "    hub_model,\n",
    "    max_seq_length,\n",
    "    num_train_epochs,\n",
    "    dtype,\n",
    "    data_path,\n",
    "    results_path,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sat Jun 29 17:26:00 2024 \n",
      "+---------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 545.23.07 Driver Version: 546.12 CUDA Version: 12.3 |\n",
      "|-----------------------------------------+----------------------+----------------------+\n",
      "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
      "| | | MIG M. |\n",
      "|=========================================+======================+======================|\n",
      "| 0 NVIDIA GeForce RTX 4080 ...
On | 00000000:01:00.0 Off | N/A |\n", "| N/A 50C P8 4W / 150W | 129MiB / 12282MiB | 0% Default |\n", "| | | N/A |\n", "+-----------------------------------------+----------------------+----------------------+\n", " \n", "+---------------------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=======================================================================================|\n", "| No running processes found |\n", "+---------------------------------------------------------------------------------------+\n" ] } ], "source": [ "!nvidia-smi" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "๐ฆฅ Unsloth: Will patch your computer to enable 2x faster free finetuning.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /home/inflaton/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package punkt to /home/inflaton/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package omw-1.4 to /home/inflaton/nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "loading /home/inflaton/code/projects/courses/llm-finetuning/llm_toolkit/translation_engine.py\n", "loading train/test data files\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fabc731ff8e5499a9c842ef6833f3e98", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0 examples [00:00, ? 
examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2e186baa65dc4dd1956fa2db0d83b4a1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      " train: Dataset({\n",
      " features: ['chinese', 'english'],\n",
      " num_rows: 4528\n",
      " })\n",
      " test: Dataset({\n",
      " features: ['chinese', 'english'],\n",
      " num_rows: 1133\n",
      " })\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "from llm_toolkit.translation_engine import load_translation_dataset\n",
    "\n",
    "# Load the dataset from the file named by DATA_PATH.\n",
    "dataset = load_translation_dataset(data_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training split as a pandas DataFrame.\n",
    "df = dataset[\"train\"].to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# One fixed instruction per training row: the scalar is broadcast over a\n",
    "# 0..len(df)-1 index instead of building a repeated Python list.\n",
    "df_alpaca = pd.DataFrame(\n",
    "    {\n",
    "        \"instruction\": \"Please translate the following Chinese text into English and provide only the translated content, nothing else.\"\n",
    "    },\n",
    "    index=range(len(df)),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | instruction | \n", "input | \n", "output | \n", "
---|---|---|---|
0 | \n", "Please translate the following Chinese text in... | \n", "ๅ จไป็็ไปๆญๆใ | \n", "Because I was protected by a fox fairy. | \n", "
1 | \n", "Please translate the following Chinese text in... | \n", "่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ | \n", "He was the director, the cousin later told the... | \n", "
2 | \n", "Please translate the following Chinese text in... | \n", "่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ | \n", "Xi-feng suddenly seemed to remember something,... | \n", "
3 | \n", "Please translate the following Chinese text in... | \n", "ไธไธช่็บขๅซๅ ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... | \n", "The three old Red Guards stood in front of Ye ... | \n", "
4 | \n", "Please translate the following Chinese text in... | \n", "็จๅ ็็ งๅๅ จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ | \n", "Mr. Cheng accepted their toast with equanimity... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
4523 | \n", "Please translate the following Chinese text in... | \n", "ๅค่พนๆไธคๅผ ่ ฟๆญช้ข่ฃ็ๅ ซไปๆกๅญ๏ผๆกๆ่กไนฑๆก็ๅ ๆก็ญ็ช็ๆจๅณใ | \n", "Two rickety tables with scarred tops and a few... | \n", "
4524 | \n", "Please translate the following Chinese text in... | \n", "่ดพ็ๅฌไบ๏ผๅ็ๆ่ณๆ ่ ฎใ | \n", "At this last remark Jia Rui positively scratch... | \n", "
4525 | \n", "Please translate the following Chinese text in... | \n", "ๅฌไบ่ฟๆ ท็่ฏไปท๏ผๆไปฌๅฟๆ ๆฟๅจ๏ผๅๅคงๅฎถไธ่ตทๆฏ่้ซๅผ๏ผๆๅ็ไบ๏ผ | \n", "Hearing comments like this, our emotions were ... | \n", "
4526 | \n", "Please translate the following Chinese text in... | \n", "ๆตท่ๅ ฌ้๏ผโ่ฎฐไฝไบๅ๏ผโ | \n", "'Can you remember that?' | \n", "
4527 | \n", "Please translate the following Chinese text in... | \n", "ไธ้ข่ฏด๏ผ่ฟๆ ทๅ็ผบๅฐ็ป่ใ | \n", "This time the opinions from above said it need... | \n", "
4528 rows ร 3 columns
\n", "\n", " | instruction | \n", "input | \n", "output | \n", "
---|---|---|---|
0 | \n", "Please translate the following Chinese text in... | \n", "ๅ จไป็็ไปๆญๆใ | \n", "Because I was protected by a fox fairy. | \n", "
1 | \n", "Please translate the following Chinese text in... | \n", "่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ | \n", "He was the director, the cousin later told the... | \n", "
2 | \n", "Please translate the following Chinese text in... | \n", "่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ | \n", "Xi-feng suddenly seemed to remember something,... | \n", "
3 | \n", "Please translate the following Chinese text in... | \n", "ไธไธช่็บขๅซๅ ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... | \n", "The three old Red Guards stood in front of Ye ... | \n", "
4 | \n", "Please translate the following Chinese text in... | \n", "็จๅ ็็ งๅๅ จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ | \n", "Mr. Cheng accepted their toast with equanimity... | \n", "