0xhaz committed on
Commit 3257c25
Parent: 3d6ba30

Upload bert2bert_kami_3000.ipynb

Files changed (1)
  1. bert2bert_kami_3000.ipynb +708 -0
bert2bert_kami_3000.ipynb ADDED
@@ -0,0 +1,708 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "id": "qcv24GSIQE5d"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
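+ "# Colab display helper: inject CSS before each cell runs so long output lines wrap\n",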
+ "from IPython.display import HTML, display\n",
12
+ "\n",
13
+ "def set_css():\n",
14
+ " display(HTML('''\n",
15
+ " <style>\n",
16
+ " pre {\n",
17
+ " white-space: pre-wrap;\n",
18
+ " }\n",
19
+ " </style>\n",
20
+ " '''))\n",
21
+ "get_ipython().events.register('pre_run_cell', set_css)"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {
28
+ "id": "SH8dkqPxQtP7"
29
+ },
30
+ "outputs": [],
31
+ "source": [
32
+ "!pip install --upgrade pip\n",
33
+ "!pip install transformers\n",
34
+ "!pip install datasets\n",
35
+ "!pip install sentencepiece"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "markdown",
40
+ "metadata": {
41
+ "id": "D8hhA8gaQwRR"
42
+ },
43
+ "source": [
44
+ "# 📂 Dataset"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "markdown",
49
+ "metadata": {
50
+ "id": "NF-ouJiDQ1FO"
51
+ },
52
+ "source": [
53
+ "### Loading the dataset\n",
54
+ "---"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {
61
+ "id": "moK3d7mTQ1v-"
62
+ },
63
+ "outputs": [],
64
+ "source": [
65
+ "from datasets import load_dataset\n",
66
+ "\n",
67
+ "!wget 'https://raw.githubusercontent.com/jamesesguerra/dataset_repo/main/kami-3000.csv'\n",
68
+ "\n",
69
+ "dataset = load_dataset('csv', data_files='kami-3000.csv')\n",
70
+ "\n",
71
+ "print(dataset)\n",
72
+ "print()\n",
73
+ "print(dataset['train'].features)"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "source": [],
79
+ "metadata": {
80
+ "id": "HEWGrOI_VlkN"
81
+ },
82
+ "execution_count": null,
83
+ "outputs": []
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "source": [
88
+ "'''USE THIS CODE BLOCK FOR LOCAL INITIALIZATION'''\n",
89
+ "\n",
90
+ "from datasets import load_dataset\n",
91
+ "\n",
92
+ "dataset = load_dataset('csv', data_files='C:/Users/Public/Documents/hazielle/kami-3000.csv')\n",
93
+ "\n",
94
+ "print(dataset)\n",
95
+ "print()\n",
96
+ "print(dataset['train'].features)"
97
+ ],
98
+ "metadata": {
99
+ "id": "NgtZQydpwpB-"
100
+ },
101
+ "execution_count": null,
102
+ "outputs": []
103
+ },
104
+ {
105
+ "cell_type": "markdown",
106
+ "metadata": {
107
+ "id": "zbxmMmtWRCtX"
108
+ },
109
+ "source": [
110
+ "### Filtering rows\n",
111
+ "---"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "markdown",
116
+ "metadata": {
117
+ "id": "QgoQRt8QREVi"
118
+ },
119
+ "source": [
120
+ "**Removing rows with blank article text and blank summary**"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {
127
+ "id": "twzcsfXuRFQQ"
128
+ },
129
+ "outputs": [],
130
+ "source": [
131
+ "dataset = dataset.filter(lambda x: x['article_text'] is not None)\n",
132
+ "dataset = dataset.filter(lambda x: x['summary'] is not None)\n",
133
+ "\n",
134
+ "print(dataset['train'])"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "markdown",
139
+ "metadata": {
140
+ "id": "30Xl1LGoRKkY"
141
+ },
142
+ "source": [
143
+ "**Removing rows with `len(article text)` < 25** and **`len(summary)` < 10**\n",
144
+ "(based on [this paper](http://www.diva-portal.org/smash/get/diva2:1563580/FULLTEXT01.pdf))"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": null,
150
+ "metadata": {
151
+ "id": "6MjsxAZPRLFk"
152
+ },
153
+ "outputs": [],
154
+ "source": [
155
+ "dataset = dataset.filter(lambda x: len(x['article_text'].split()) > 25)\n",
156
+ "dataset = dataset.filter(lambda x: len(x['summary'].split()) > 10)\n",
157
+ "\n",
158
+ "print(dataset['train'])"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "markdown",
163
+ "metadata": {
164
+ "id": "YLA2bQeNRPAl"
165
+ },
166
+ "source": [
167
+ "### Cleaning\n",
168
+ "---"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "markdown",
173
+ "metadata": {
174
+ "id": "z26t9F1URSCO"
175
+ },
176
+ "source": [
177
+ "**Unescaping HTML character codes**"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": null,
183
+ "metadata": {
184
+ "id": "BcUTqeFwRQpC"
185
+ },
186
+ "outputs": [],
187
+ "source": [
188
+ "import html\n",
189
+ "\n",
190
+ "dataset = dataset.map(\n",
191
+ " lambda x: {'article_text': [html.unescape(o) for o in x['article_text']]}, batched=True\n",
192
+ ")"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "markdown",
197
+ "metadata": {
198
+ "id": "Y9BFM_A-RVdR"
199
+ },
200
+ "source": [
201
+ "**Removing unicode hard spaces**"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": null,
207
+ "metadata": {
208
+ "id": "D-MJvuTkRY8c"
209
+ },
210
+ "outputs": [],
211
+ "source": [
212
+ "from unicodedata import normalize\n",
213
+ "\n",
214
+ "dataset = dataset.map(lambda x: {'article_text': normalize('NFKD', x['article_text'])})"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "markdown",
219
+ "metadata": {
220
+ "id": "6th91MJ3RmJW"
221
+ },
222
+ "source": [
223
+ "## Dataset splits\n",
224
+ "---"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {
231
+ "id": "jVJ--r53RoL6"
232
+ },
233
+ "outputs": [],
234
+ "source": [
235
+ "dataset = dataset['train'].train_test_split(train_size=0.8, seed=42)\n",
236
+ "\n",
237
+ "dataset['validation'] = dataset.pop('test')\n",
238
+ "\n",
239
+ "print(dataset)"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "markdown",
244
+ "metadata": {
245
+ "id": "UFN9ufDYRp9G"
246
+ },
247
+ "source": [
248
+ "# 🪙 Tokenization"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": null,
254
+ "metadata": {
255
+ "id": "rP1sC2L0R0HB"
256
+ },
257
+ "outputs": [],
258
+ "source": [
259
+ "from transformers import AutoTokenizer\n",
260
+ "\n",
261
+ "checkpoint = \"patrickvonplaten/bert2bert-cnn_dailymail-fp16\"\n",
262
+ "tokenizer = AutoTokenizer.from_pretrained(checkpoint)"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "markdown",
267
+ "metadata": {
268
+ "id": "1X9Ji15LR8et"
269
+ },
270
+ "source": [
271
+ "**Define preprocess function**"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": null,
277
+ "metadata": {
278
+ "id": "T1L-Q2v8R93o"
279
+ },
280
+ "outputs": [],
281
+ "source": [
282
+ "# set upper limit on how long the articles and their summaries can be\n",
283
+ "max_input_length = 512\n",
284
+ "max_target_length = 128\n",
285
+ "\n",
286
+ "def preprocess_function(rows):\n",
287
+ " model_inputs = tokenizer(rows['article_text'], max_length=max_input_length, truncation=True)\n",
288
+ " \n",
289
+ " with tokenizer.as_target_tokenizer():\n",
290
+ " labels = tokenizer(rows['summary'], max_length=max_target_length, truncation=True)\n",
291
+ " \n",
292
+ " model_inputs['labels'] = labels['input_ids']\n",
293
+ " return model_inputs"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "markdown",
298
+ "metadata": {
299
+ "id": "JEVi769uSARU"
300
+ },
301
+ "source": [
302
+ "**Tokenize the dataset**"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": null,
308
+ "metadata": {
309
+ "id": "IU5943MESBrK"
310
+ },
311
+ "outputs": [],
312
+ "source": [
313
+ "tokenized_dataset = dataset.map(preprocess_function, batched=True)"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "markdown",
318
+ "metadata": {
319
+ "id": "o8D04VHjSI6b"
320
+ },
321
+ "source": [
322
+ "# 📊 Evaluation Metrics"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "markdown",
327
+ "metadata": {
328
+ "id": "2GB7-jfKSMrE"
329
+ },
330
+ "source": [
331
+ "## ROUGE\n",
332
+ "---"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "markdown",
337
+ "metadata": {
338
+ "id": "3TljkwZbSQZV"
339
+ },
340
+ "source": [
341
+ "**installing `rouge_score` and loading the metric**"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": null,
347
+ "metadata": {
348
+ "id": "HItzZO_mSQG-"
349
+ },
350
+ "outputs": [],
351
+ "source": [
352
+ "!pip install rouge_score"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": null,
358
+ "metadata": {
359
+ "id": "7wrZ5kAMSOlH"
360
+ },
361
+ "outputs": [],
362
+ "source": [
363
+ "from datasets import load_metric\n",
364
+ "rouge_score = load_metric('rouge')"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "markdown",
369
+ "metadata": {
370
+ "id": "tGOAR4SnSeVY"
371
+ },
372
+ "source": [
373
+ "## Creating a lead-3 baseline\n",
374
+ "---"
375
+ ]
376
+ },
377
+ {
378
+ "cell_type": "markdown",
379
+ "metadata": {
380
+ "id": "3OAa8kIfSgC5"
381
+ },
382
+ "source": [
383
+ "**import and download dependencies**"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": null,
389
+ "metadata": {
390
+ "id": "x8LFH_0qShRO"
391
+ },
392
+ "outputs": [],
393
+ "source": [
394
+ "!pip install nltk\n",
395
+ "import nltk\n",
396
+ "\n",
397
+ "nltk.download(\"punkt\")"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "markdown",
402
+ "metadata": {
403
+ "id": "WdcIWK8GShzb"
404
+ },
405
+ "source": [
406
+ "**define fn to extract the first 3 sentences in an article**"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": null,
412
+ "metadata": {
413
+ "id": "17LcLH1FSjtz"
414
+ },
415
+ "outputs": [],
416
+ "source": [
417
+ "from nltk.tokenize import sent_tokenize\n",
418
+ "\n",
419
+ "def extract_sentences(text):\n",
420
+ " return \"\\n\".join(sent_tokenize(text)[:3])\n",
421
+ "\n",
422
+ "print(extract_sentences(dataset[\"train\"][4][\"article_text\"]))"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "markdown",
427
+ "metadata": {
428
+ "id": "0aHfU4_tSolA"
429
+ },
430
+ "source": [
431
+ "**define fn to extract summaries from the data and compute ROUGE scores for the baseline**"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "execution_count": null,
437
+ "metadata": {
438
+ "id": "08n3A6OGSqK2"
439
+ },
440
+ "outputs": [],
441
+ "source": [
442
+ "def evaluate_baseline(dataset, metric):\n",
443
+ " summaries = [extract_sentences(text) for text in dataset[\"article_text\"]]\n",
444
+ " return metric.compute(predictions=summaries, references=dataset[\"summary\"])"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "markdown",
449
+ "metadata": {
450
+ "id": "0fZ67opnSsbe"
451
+ },
452
+ "source": [
453
+ "**use fn to compute ROUGE scores over the validation set**"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": null,
459
+ "metadata": {
460
+ "id": "nMfTYxxOSwRk"
461
+ },
462
+ "outputs": [],
463
+ "source": [
464
+ "import pandas as pd\n",
465
+ "\n",
466
+ "score = evaluate_baseline(dataset[\"validation\"], rouge_score)\n",
467
+ "rouge_names = [\"rouge1\", \"rouge2\", \"rougeL\", \"rougeLsum\"]\n",
468
+ "rouge_dict = dict((rn, round(score[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)\n",
469
+ "print(rouge_dict)"
470
+ ]
471
+ },
472
+ {
473
+ "cell_type": "markdown",
474
+ "metadata": {
475
+ "id": "tyfkBzlxSyA7"
476
+ },
477
+ "source": [
478
+ "# 🔩 Fine-tuning"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "markdown",
483
+ "metadata": {
484
+ "id": "PqlM9-HgS804"
485
+ },
486
+ "source": [
487
+ "**Loading the model**"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "code",
492
+ "execution_count": null,
493
+ "metadata": {
494
+ "id": "R1y2goZ3S-CC"
495
+ },
496
+ "outputs": [],
497
+ "source": [
498
+ "from transformers import EncoderDecoderModel\n",
499
+ "\n",
500
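+ "# pad_token_id=0 matches BERT's [PAD] token id\n",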
+ "model = EncoderDecoderModel.from_pretrained(checkpoint, pad_token_id=0)\n"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "markdown",
505
+ "metadata": {
506
+ "id": "MMsjH4Z6TA73"
507
+ },
508
+ "source": [
509
+ "**Logging in Hugging Face Hub**"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": null,
515
+ "metadata": {
516
+ "id": "BLSPzmoBTCLk"
517
+ },
518
+ "outputs": [],
519
+ "source": [
520
+ "from huggingface_hub import notebook_login\n",
521
+ "notebook_login()"
522
+ ]
523
+ },
524
+ {
525
+ "cell_type": "markdown",
526
+ "metadata": {
527
+ "id": "IHH0nuznTD2L"
528
+ },
529
+ "source": [
530
+ "**set up hyperparameters for training**"
531
+ ]
532
+ },
533
+ {
534
+ "cell_type": "code",
535
+ "execution_count": null,
536
+ "metadata": {
537
+ "id": "CCEGxd76TEff"
538
+ },
539
+ "outputs": [],
540
+ "source": [
541
+ "from transformers import Seq2SeqTrainingArguments\n",
542
+ "\n",
543
+ "batch_size = 4\n",
544
+ "num_train_epochs = 2\n",
545
+ "logging_steps = len(tokenized_dataset['train']) // batch_size\n",
546
+ "model_name = checkpoint.split('/')[-1]\n",
547
+ "\n",
548
+ "args = Seq2SeqTrainingArguments(\n",
549
+ " output_dir=f\"{model_name}-finetuned-1.0.0\",\n",
550
+ " evaluation_strategy=\"epoch\",\n",
551
+ " learning_rate=5e-5,\n",
552
+ " per_device_train_batch_size=batch_size,\n",
553
+ " per_device_eval_batch_size=batch_size,\n",
554
+ " weight_decay=0.01,\n",
555
+ " save_total_limit=3,\n",
556
+ " num_train_epochs=num_train_epochs,\n",
557
+ " predict_with_generate=True,\n",
558
+ " logging_steps=logging_steps,\n",
559
+ " push_to_hub=True,\n",
560
+ ")"
561
+ ]
562
+ },
563
+ {
564
+ "cell_type": "markdown",
565
+ "metadata": {
566
+ "id": "PdY0ecY9THT8"
567
+ },
568
+ "source": [
569
+ "**define fn to evaluate model during training**"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": null,
575
+ "metadata": {
576
+ "id": "d6DqOp4ITKGs"
577
+ },
578
+ "outputs": [],
579
+ "source": [
580
+ "import numpy as np\n",
581
+ "\n",
582
+ "\n",
583
+ "def compute_metrics(eval_pred):\n",
584
+ " predictions, labels = eval_pred\n",
585
+ " decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)\n",
586
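+ " # replace -100 (label positions ignored by the loss) with the pad token id so they can be decoded\n",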
+ " labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n",
587
+ " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
588
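+ " # ROUGE-Lsum expects each sentence on its own line, so rejoin with newlines\n",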
+ " decoded_preds = [\"\\n\".join(sent_tokenize(pred.strip())) for pred in decoded_preds]\n",
589
+ " decoded_labels = [\"\\n\".join(sent_tokenize(label.strip())) for label in decoded_labels]\n",
590
+ " result = rouge_score.compute(\n",
591
+ " predictions=decoded_preds, references=decoded_labels, use_stemmer=True\n",
592
+ " )\n",
593
+ " result = {key: value.mid.fmeasure * 100 for key, value in result.items()}\n",
594
+ " return {k: round(v, 4) for k, v in result.items()}"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "markdown",
599
+ "metadata": {
600
+ "id": "y_wEoWIWTMjr"
601
+ },
602
+ "source": [
603
+ "**define data collator for dynamic padding**"
604
+ ]
605
+ },
606
+ {
607
+ "cell_type": "code",
608
+ "execution_count": null,
609
+ "metadata": {
610
+ "id": "ThUqaIr2TPh4"
611
+ },
612
+ "outputs": [],
613
+ "source": [
614
+ "from transformers import DataCollatorForSeq2Seq\n",
615
+ "\n",
616
+ "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "markdown",
621
+ "metadata": {
622
+ "id": "v_Q4XoW7UaTi"
623
+ },
624
+ "source": [
625
+ "**instantiate trainer with arguments**"
626
+ ]
627
+ },
628
+ {
629
+ "cell_type": "code",
630
+ "execution_count": null,
631
+ "metadata": {
632
+ "id": "zkCyYVTdUbE7"
633
+ },
634
+ "outputs": [],
635
+ "source": [
636
+ "from transformers import Seq2SeqTrainer\n",
637
+ "\n",
638
+ "trainer = Seq2SeqTrainer(\n",
639
+ " model,\n",
640
+ " args,\n",
641
+ " train_dataset=tokenized_dataset[\"train\"],\n",
642
+ " eval_dataset=tokenized_dataset[\"validation\"],\n",
643
+ " data_collator=data_collator,\n",
644
+ " tokenizer=tokenizer,\n",
645
+ " compute_metrics=compute_metrics,\n",
646
+ ")"
647
+ ]
648
+ },
649
+ {
650
+ "cell_type": "markdown",
651
+ "metadata": {
652
+ "id": "Ksa_utSpUnO6"
653
+ },
654
+ "source": [
655
+ "**launch training run**"
656
+ ]
657
+ },
658
+ {
659
+ "cell_type": "code",
660
+ "execution_count": null,
661
+ "metadata": {
662
+ "id": "YBGhf1xYUp7B"
663
+ },
664
+ "outputs": [],
665
+ "source": [
666
+ "trainer.train()"
667
+ ]
668
+ },
669
+ {
670
+ "cell_type": "code",
671
+ "execution_count": null,
672
+ "metadata": {
673
+ "id": "YCwxQuydUI7K"
674
+ },
675
+ "outputs": [],
676
+ "source": [
677
+ "trainer.evaluate()"
678
+ ]
679
+ },
680
+ {
681
+ "cell_type": "code",
682
+ "execution_count": null,
683
+ "metadata": {
684
+ "id": "4eNoOqM2rWw1"
685
+ },
686
+ "outputs": [],
687
+ "source": [
688
+ "trainer.push_to_hub()"
689
+ ]
690
+ }
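+ ,
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Generating a summary with the fine-tuned model** (a minimal sketch for trying the model out: it assumes the `model`, `tokenizer`, `dataset`, `max_input_length`, and `max_target_length` objects from the cells above are still in memory, and relies on the generation settings inherited from the CNN/DailyMail bert2bert checkpoint)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: summarize one validation article with the fine-tuned encoder-decoder\n",
+ "sample_text = dataset['validation'][0]['article_text']\n",
+ "\n",
+ "# tokenize the article and move the tensors to the model's device\n",
+ "inputs = tokenizer(sample_text, max_length=max_input_length, truncation=True, return_tensors='pt').to(model.device)\n",
+ "\n",
+ "# beam-search decoding, capped at the same target length used during training\n",
+ "summary_ids = model.generate(\n",
+ "    inputs['input_ids'],\n",
+ "    attention_mask=inputs['attention_mask'],\n",
+ "    max_length=max_target_length,\n",
+ "    num_beams=4,\n",
+ ")\n",
+ "print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))"
+ ]
+ }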
691
+ ],
692
+ "metadata": {
693
+ "accelerator": "GPU",
694
+ "colab": {
695
+ "provenance": []
696
+ },
697
+ "gpuClass": "standard",
698
+ "kernelspec": {
699
+ "display_name": "Python 3",
700
+ "name": "python3"
701
+ },
702
+ "language_info": {
703
+ "name": "python"
704
+ }
705
+ },
706
+ "nbformat": 4,
707
+ "nbformat_minor": 0
708
+ }