{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "eiQ3FKJDhBW0" }, "source": [ "NLP EXERCISE: TEXT-CLASSIFICATION ON A DRUG DATASET.\n", "\n", "DATA WRANGLING, FINE-TUNING AND PUSHING THE MODEL TO THE HUGGING-FACE HUB" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "HedrkDuchBW2" }, "outputs": [], "source": [ "# Install the necessary libraries from Hugging Face\n", "!pip install datasets evaluate transformers[sentencepiece] transformers[torch]" ] }, { "cell_type": "code", "source": [ "from huggingface_hub import notebook_login\n", "# Authenticate with the Hugging Face Hub\n", "notebook_login()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 145, "referenced_widgets": [ "d02e8f7bdeb145d4bae8ebcbceab2e72", "6db4a180376840d6aa8e38c46033cc93", "893c0a9c9b51450d99c360a44aee61d4", "3def0a68df05423baf2effef91ff64d0", "3b78ea24be0c42ea8074a48ae4671e12", "50b3cf9cb0c742f2907423f456e9863f", "d252a4002a8d434db262f5056860c5c9", "319deda6147e44ac9e1f53a75f6fdd41", "bfe9fdcac47c4ba8b26b54a8d588afd7", "ba29ea12a9ce4a099992df57d4b851c2", "c5bb554a9f0e4b9b8178a25ed1fb1344", "9d6442503c794161bf927884aaac959e", "69f29945f16842d3bffea0b0dd50921e", "948a520ec0e742868f2f61f00c329f0f", "b9d706620bf24817b391cd76ee0a0866", "a536259beed540aabdabbe694515eb56", "b5089fdb55134b7a840d882b2488b658", "2d7b18bfad5c45ccbce5edbc8adc364c", "4515e7be5a5d4e0a83e7d3717eebe626", "0ac78e74c26044faa5a484cf5e658856", "a9cc217968cb447e875c25a60472bfea", "4e2105e83c4743428555be2aa7f4076e", "1160eec48195412ab7f7f9d29dcf4673", "6b1e242e660649f0b00d75c419569e25", "04c5b7bde3b840919d3ad3e1e04f7c6f", "9fffc2e7f3654cc3addfe5a786d32620", "120d065f31614ae395a30f3fdadf5b79", "dd54337179ed44fca73911f7177896a7", "fc4149e666d940fead7acb14b6ed0ff7", "da5f42f0817e407bbd9cb7001970d92b", "2d508ef9a1304e40b725c03774c4a8d6", "241d70158ae740f0ba73be9f2b633801" ] }, "id": "Z4IolwVzaQQh", "outputId": "c5901a51-7f5f-4ad9-b90d-0cbb1b8c381a" }, "execution_count": 2, 
"outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "VBox(children=(HTML(value='
Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "Accuracy | \n", "Precision | \n", "Recall | \n", "F1 | \n", "
---|---|---|---|---|---|---|
1 | \n", "0.846900 | \n", "0.827508 | \n", "0.767283 | \n", "0.768594 | \n", "0.767283 | \n", "0.755066 | \n", "
2 | \n", "0.631900 | \n", "0.689465 | \n", "0.809415 | \n", "0.808975 | \n", "0.809415 | \n", "0.797826 | \n", "
3 | \n", "0.411600 | \n", "0.667829 | \n", "0.837578 | \n", "0.832482 | \n", "0.837578 | \n", "0.831665 | \n", "
" ] }, "metadata": {} }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "TrainOutput(global_step=40170, training_loss=0.7559129410606727, metrics={'train_runtime': 4322.1673, 'train_samples_per_second': 74.352, 'train_steps_per_second': 9.294, 'total_flos': 3.0660538964049024e+16, 'train_loss': 0.7559129410606727, 'epoch': 3.0})" ] }, "metadata": {}, "execution_count": 27 } ] } ], "metadata": { "colab": { "provenance": [] }, "language_info": { "name": "python" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "d02e8f7bdeb145d4bae8ebcbceab2e72": { "model_module": "@jupyter-widgets/controls", "model_name": "VBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [ "IPY_MODEL_a9cc217968cb447e875c25a60472bfea", "IPY_MODEL_4e2105e83c4743428555be2aa7f4076e", "IPY_MODEL_1160eec48195412ab7f7f9d29dcf4673", "IPY_MODEL_6b1e242e660649f0b00d75c419569e25" ], "layout": "IPY_MODEL_d252a4002a8d434db262f5056860c5c9" } }, 
"6db4a180376840d6aa8e38c46033cc93": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_319deda6147e44ac9e1f53a75f6fdd41", "placeholder": "", "style": "IPY_MODEL_bfe9fdcac47c4ba8b26b54a8d588afd7", "value": "