{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" }, "colab": { "name": "AraBert output Embeddings - PyTorch.ipynb", "provenance": [], "collapsed_sections": [] }, "widgets": { "application/vnd.jupyter.widget-state+json": { "854fc85905904877bdd188392584a0eb": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_1833452be0b04736a7f1fa8f2d9cb4da", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_79be542f4c144beebdc038976bcf9cc8", "IPY_MODEL_e06f6ef13f5c46ef9cdad1901605a159" ] } }, "1833452be0b04736a7f1fa8f2d9cb4da": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "79be542f4c144beebdc038976bcf9cc8": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_fb12ee856871474d99492064c1b1a814", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 580, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 580, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_074d994a7d8642e483c0a011fd9b7b45" } }, "e06f6ef13f5c46ef9cdad1901605a159": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_b086bc6629684e7e91b377bc681976eb", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 580/580 [00:00<00:00, 746B/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_bcfed4bb69f14a08ab2afaf1c2b70b5c" } }, "fb12ee856871474d99492064c1b1a814": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "074d994a7d8642e483c0a011fd9b7b45": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "b086bc6629684e7e91b377bc681976eb": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "bcfed4bb69f14a08ab2afaf1c2b70b5c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "9217e1d8d2424d3a85cd38555a18bf59": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_3c585aefa1b54066bcd2a7a931ac5a1c", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_1a7fff9d58ad4a2ca7090bbab5b21733", "IPY_MODEL_35835ae44d19486cba8c34a0c488561d" ] } }, "3c585aefa1b54066bcd2a7a931ac5a1c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "1a7fff9d58ad4a2ca7090bbab5b21733": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_8fe725f0fa3140538a5df031d1785643", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 717153, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 717153, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_0c9e747cfd004bd1b93fc02fa04a54cf" } }, "35835ae44d19486cba8c34a0c488561d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_81ce4cb0ea0c46dbb2c63466fde2a4eb", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 717k/717k [00:03<00:00, 182kB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_d756fbf4ba1d4c2f9bd1ca1d5359d923" } }, "8fe725f0fa3140538a5df031d1785643": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "0c9e747cfd004bd1b93fc02fa04a54cf": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "81ce4cb0ea0c46dbb2c63466fde2a4eb": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "d756fbf4ba1d4c2f9bd1ca1d5359d923": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "a87dd1736ddb45aba797e15f44c8776b": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_78e8617af99f4175923b83f99177fb46", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_eef9894d92044e558cfaf45ff2b3e66a", "IPY_MODEL_0e991a2df4584261a64af02637279af6" ] } }, "78e8617af99f4175923b83f99177fb46": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "eef9894d92044e558cfaf45ff2b3e66a": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_781ba8c7f77f49718a35d4ab918ae806", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 543450661, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 543450661, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_22fa6ae514134836975907afb146de1d" } }, "0e991a2df4584261a64af02637279af6": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_8673398ee3184573953cfd042c00c6f6", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 543M/543M [00:08<00:00, 63.9MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_7e3ad5b7e8484d309999b43c8fd56322" } }, "781ba8c7f77f49718a35d4ab918ae806": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "22fa6ae514134836975907afb146de1d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "8673398ee3184573953cfd042c00c6f6": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "7e3ad5b7e8484d309999b43c8fd56322": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "code", "metadata": { "id": "N-FLoZGM2tjC", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "ce8637c8-1d03-4a90-873a-847b216a736b" }, "source": [ "!pip install transformers\n", "!git clone https://github.com/aub-mind/arabert\n", "!pip install pyarabic\n", "!pip install farasapy" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Collecting transformers\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)\n", "\u001b[K |████████████████████████████████| 757kB 2.8MB/s \n", "\u001b[?25hCollecting sentencepiece\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)\n", "\u001b[K |████████████████████████████████| 1.1MB 14.6MB/s \n", "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", "Collecting tokenizers==0.8.0-rc4\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)\n", "\u001b[K |████████████████████████████████| 3.0MB 13.9MB/s \n", "\u001b[?25hCollecting sacremoses\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n", "\u001b[K |████████████████████████████████| 890kB 41.0MB/s \n", "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.4)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.5)\n", "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.15.1)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.6.20)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.9)\n", "Building wheels for collected packages: sacremoses\n", " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893260 sha256=44a0bddb233906b1fed45e51bdb7aa7737141f24ae25aeb9464eba3d2faf0e6e\n", " Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n", "Successfully built sacremoses\n", "Installing collected packages: sentencepiece, tokenizers, sacremoses, transformers\n", "Successfully installed sacremoses-0.0.43 sentencepiece-0.1.91 tokenizers-0.8.0rc4 transformers-3.0.0\n", "Cloning into 'arabert'...\n", "remote: Enumerating objects: 198, done.\u001b[K\n", "remote: Counting objects: 100% (198/198), done.\u001b[K\n", "remote: Compressing objects: 100% (165/165), done.\u001b[K\n", "remote: Total 198 (delta 112), reused 73 (delta 31), pack-reused 0\u001b[K\n", "Receiving objects: 100% (198/198), 2.24 MiB | 2.19 MiB/s, done.\n", "Resolving deltas: 100% (112/112), done.\n", "Collecting pyarabic\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2c/d4/8b5b7288dd313680d6fdea9c9ded2946f12ed2c81be4b44940bbd478da8c/PyArabic-0.6.8.tar.gz (105kB)\n", "\u001b[K |████████████████████████████████| 112kB 2.8MB/s \n", "\u001b[?25hBuilding wheels for collected packages: pyarabic\n", " Building wheel for pyarabic (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for pyarabic: filename=PyArabic-0.6.8-cp36-none-any.whl size=110609 sha256=5ca1c47ddd7d091338376db356cd41bc0c5ae1b7bf6c4496b4ff27dd444931c6\n", " Stored in directory: /root/.cache/pip/wheels/ca/fa/26/d82414a6635b3ee82bee0729bfdb1dc8d09879742206e004bb\n", "Successfully built pyarabic\n", "Installing collected packages: pyarabic\n", "Successfully installed pyarabic-0.6.8\n", "Collecting farasapy\n", " Downloading https://files.pythonhosted.org/packages/c9/32/3647a6763dbd2cb4d5777a9a7b0f8443daa2924277518d7a9700617e82c4/farasapy-0.0.5-py3-none-any.whl\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from farasapy) (4.41.1)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from farasapy) (2.23.0)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->farasapy) (1.24.3)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->farasapy) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->farasapy) (2020.6.20)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->farasapy) (2.9)\n", "Installing collected packages: farasapy\n", "Successfully installed farasapy-0.0.5\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "ieh6jjDVwAMP", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 385, "referenced_widgets": [ "854fc85905904877bdd188392584a0eb", "1833452be0b04736a7f1fa8f2d9cb4da", "79be542f4c144beebdc038976bcf9cc8", "e06f6ef13f5c46ef9cdad1901605a159", "fb12ee856871474d99492064c1b1a814", "074d994a7d8642e483c0a011fd9b7b45", "b086bc6629684e7e91b377bc681976eb", "bcfed4bb69f14a08ab2afaf1c2b70b5c", "9217e1d8d2424d3a85cd38555a18bf59", "3c585aefa1b54066bcd2a7a931ac5a1c", "1a7fff9d58ad4a2ca7090bbab5b21733", "35835ae44d19486cba8c34a0c488561d", "8fe725f0fa3140538a5df031d1785643", "0c9e747cfd004bd1b93fc02fa04a54cf", "81ce4cb0ea0c46dbb2c63466fde2a4eb", "d756fbf4ba1d4c2f9bd1ca1d5359d923", "a87dd1736ddb45aba797e15f44c8776b", "78e8617af99f4175923b83f99177fb46", "eef9894d92044e558cfaf45ff2b3e66a", "0e991a2df4584261a64af02637279af6", "781ba8c7f77f49718a35d4ab918ae806", "22fa6ae514134836975907afb146de1d", "8673398ee3184573953cfd042c00c6f6", "7e3ad5b7e8484d309999b43c8fd56322" ] }, "outputId": "a9c20bd0-4a7d-494a-d3d6-8c275284a7c4" }, "source": [ "from transformers import AutoTokenizer, AutoModel\n", "from arabert.preprocess_arabert import never_split_tokens, preprocess\n", "from farasa.segmenter import FarasaSegmenter\n", "import torch\n", "\n", "arabert_tokenizer = AutoTokenizer.from_pretrained(\n", " \"aubmindlab/bert-base-arabert\",\n", " do_lower_case=False,\n", " do_basic_tokenize=True,\n", " never_split=never_split_tokens)\n", "arabert_model = AutoModel.from_pretrained(\"aubmindlab/bert-base-arabert\") #you can replace the path here with the folder containing the the pytorch model\n", "\n", "farasa_segmenter = FarasaSegmenter(interactive=True)" ], "execution_count": 2, "outputs": [ { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "854fc85905904877bdd188392584a0eb", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=580.0, style=ProgressStyle(description_…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9217e1d8d2424d3a85cd38555a18bf59", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=717153.0, style=ProgressStyle(descripti…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a87dd1736ddb45aba797e15f44c8776b", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=543450661.0, style=ProgressStyle(descri…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n", "perform system check...\n", "check java version...\n", "Your java version is 11.0 which is compatiple with Farasa \n", "check toolkit binaries...\n", "some binaries are not existed..\n", "downloading zipped binaries...\n", " 99%|█████████▉| 198M/200M [00:13<00:00, 14.8MiB/s]extracting...\n", "toolkit binaries are downloaded and extracted.\n", "Dependencies seem to be satisfied..\n", "\u001b[37minitializing [SEGMENT] task in \u001b[32mINTERACTIVE \u001b[37mmode...\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/farasa/__base.py:45: UserWarning: Be careful with large lines as they may break on interactive mode. You may switch to Standalone mode for such cases.\n", " \"Be careful with large lines as they may break on interactive mode. You may switch to Standalone mode for such cases.\"\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "task [SEGMENT] is initialized interactively.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "7nFgjN_KwSyQ", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 68 }, "outputId": "a8c7c48f-1fe5-4ffd-a73d-f9985c9e7ae5" }, "source": [ "text= \"الجو جميل اليوم\"\n", "text_preprocessed = preprocess( text,\n", " do_farasa_tokenization = True,\n", " farasa = farasa_segmenter,\n", " use_farasapy = True) # if you want to use AraBERT v0.1 do_farasa_tokenization = False\n", "print(text)\n", "print(\"---------------------\")\n", "print(text_preprocessed)" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "الجو جميل اليوم\n", "---------------------\n", "ال+ جو جميل ال+ يوم\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "xfd4wszWwaxs", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 51 }, "outputId": "ca2b70bb-c70b-4a67-98c2-64db1b6edfb2" }, "source": [ "arabert_input = arabert_tokenizer.encode(text_preprocessed,add_special_tokens=True)\n", "print(arabert_input)\n", "print(arabert_tokenizer.convert_ids_to_tokens(arabert_input))\n", "# you should ignore the fisrt and the last embeddings" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "text": [ "[29756, 3000, 516, 15724, 3000, 7447, 29758]\n", "['[CLS]', 'ال+', 'جو', 'جميل', 'ال+', 'يوم', '[SEP]']\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "zXQ-bhV8x035", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "548a5be2-0419-4b1a-dccc-b871981edd7a" }, "source": [ "tensor_input_ids = torch.tensor(arabert_input).unsqueeze(0)\n", "print(tensor_input_ids)" ], "execution_count": 5, "outputs": [ { "output_type": "stream", "text": [ "tensor([[29756, 3000, 516, 15724, 3000, 7447, 29758]])\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "kH6DquqBxGIi", "colab_type": "code", "colab": {} }, "source": [ "output = arabert_model(tensor_input_ids)" ], "execution_count": 6, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "HhuCCQgExQpv", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "outputId": "e5bcc1ce-6515-4a32-cab3-68fc205143d9" }, "source": [ "output[0].shape # batch_size x seq_len x emb_dim" ], "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "torch.Size([1, 7, 768])" ] }, "metadata": { "tags": [] }, "execution_count": 7 } ] }, { "cell_type": "code", "metadata": { "id": "l7D4sTFv0eHz", "colab_type": "code", "colab": {} }, "source": [ "embeddings = output[0][0][1:-1]" ], "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "yg8vCudW1ftS", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "outputId": "a20c7480-bb43-4e35-f5c6-6c943a85e68b" }, "source": [ "print(embeddings.shape)\n", "print(embeddings)" ], "execution_count": 9, "outputs": [ { "output_type": "stream", "text": [ "torch.Size([5, 768])\n", "tensor([[ 0.3447, 1.3039, 1.3462, ..., -0.4002, 0.5298, 0.2837],\n", " [-0.0045, -0.0522, 0.4295, ..., -1.0770, 0.0021, -0.3939],\n", " [ 0.3940, -0.2821, 0.3980, ..., -0.2659, 0.3846, -0.4675],\n", " [ 0.3452, 1.3047, 1.3463, ..., -0.3996, 0.5308, 0.2845],\n", " [ 0.3442, 0.0026, -0.1066, ..., -0.2442, 0.2427, -0.3018]],\n", " grad_fn=)\n" ], "name": "stdout" } ] } ] }