{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30787,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2024-10-10T09:28:17.789573Z","iopub.execute_input":"2024-10-10T09:28:17.789974Z","iopub.status.idle":"2024-10-10T09:28:17.796087Z","shell.execute_reply.started":"2024-10-10T09:28:17.789935Z","shell.execute_reply":"2024-10-10T09:28:17.795140Z"},"trusted":true},"execution_count":70,"outputs":[]},{"cell_type":"code","source":"import torch\nfrom torch.utils.data import DataLoader\nfrom transformers import AutoTokenizer, XLMRobertaForTokenClassification, AdamW, get_linear_schedule_with_warmup\nfrom datasets import DatasetDict\nfrom seqeval.metrics import classification_report","metadata":{"execution":{"iopub.status.busy":"2024-10-10T09:31:09.927528Z","iopub.execute_input":"2024-10-10T09:31:09.928160Z","iopub.status.idle":"2024-10-10T09:31:09.933799Z","shell.execute_reply.started":"2024-10-10T09:31:09.928120Z","shell.execute_reply":"2024-10-10T09:31:09.932931Z"},"trusted":true},"execution_count":77,"outputs":[]},{"cell_type":"code","source":"from datasets import load_dataset\ndataset = load_dataset('masakhane/masakhaner2', 'kin') ","metadata":{"execution":{"iopub.status.busy":"2024-10-10T09:31:10.622463Z","iopub.execute_input":"2024-10-10T09:31:10.623463Z","iopub.status.idle":"2024-10-10T09:31:11.330817Z","shell.execute_reply.started":"2024-10-10T09:31:10.623419Z","shell.execute_reply":"2024-10-10T09:31:11.329841Z"},"trusted":true},"execution_count":78,"outputs":[]},{"cell_type":"code","source":"print(\"Original dataset sizes:\")\nprint(f\"Train: {len(dataset['train'])}\")\nprint(f\"Validation: {len(dataset['validation'])}\")\nprint(f\"Test: 
{"cell_type":"code","source":"print(\"Original dataset sizes:\")\nprint(f\"Train: {len(dataset['train'])}\")\nprint(f\"Validation: {len(dataset['validation'])}\")\nprint(f\"Test: {len(dataset['test'])}\")","metadata":{"execution":{"iopub.status.busy":"2024-10-10T09:31:13.439874Z","iopub.execute_input":"2024-10-10T09:31:13.440252Z","iopub.status.idle":"2024-10-10T09:31:13.445759Z","shell.execute_reply.started":"2024-10-10T09:31:13.440213Z","shell.execute_reply":"2024-10-10T09:31:13.444745Z"},"trusted":true},"execution_count":79,"outputs":[{"name":"stdout","text":"Original dataset sizes:\nTrain: 7825\nValidation: 1118\nTest: 2235\n","output_type":"stream"}]},{"cell_type":"code","source":"def tokenize_and_align_labels(examples, tokenizer, max_length=128):\n    tokenized_inputs = tokenizer(examples[\"tokens\"], truncation=True, is_split_into_words=True,\n                                 max_length=max_length, padding=\"max_length\")\n\n    labels = []\n    for i, label in enumerate(examples[\"ner_tags\"]):\n        word_ids = tokenized_inputs.word_ids(batch_index=i)\n        previous_word_idx = None\n        label_ids = []\n        for word_idx in word_ids:\n            if word_idx is None:\n                # Special tokens (<s>, </s>, padding) are ignored by the loss\n                label_ids.append(-100)\n            elif word_idx != previous_word_idx:\n                # The first sub-token of each word carries the word-level NER tag\n                label_ids.append(label[word_idx])\n            else:\n                # Remaining sub-tokens of the same word are also ignored\n                label_ids.append(-100)\n            previous_word_idx = word_idx\n        labels.append(label_ids)\n\n    tokenized_inputs[\"labels\"] = labels\n    return tokenized_inputs","metadata":{"execution":{"iopub.status.busy":"2024-10-10T09:31:13.983275Z","iopub.execute_input":"2024-10-10T09:31:13.983670Z","iopub.status.idle":"2024-10-10T09:31:13.990882Z","shell.execute_reply.started":"2024-10-10T09:31:13.983632Z","shell.execute_reply":"2024-10-10T09:31:13.989970Z"},"trusted":true},"execution_count":80,"outputs":[]},{"cell_type":"code","source":"model_name = 'Davlan/afro-xlmr-base'\ntokenizer = AutoTokenizer.from_pretrained(model_name)","metadata":{"execution":{"iopub.status.busy":"2024-10-10T09:31:37.056557Z","iopub.execute_input":"2024-10-10T09:31:37.056927Z","iopub.status.idle":"2024-10-10T09:31:38.513591Z","shell.execute_reply.started":"2024-10-10T09:31:37.056894Z","shell.execute_reply":"2024-10-10T09:31:38.512784Z"},"trusted":true},"execution_count":83,"outputs":[]},{"cell_type":"code","source":"tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True,\n                                 fn_kwargs={\"tokenizer\": tokenizer})","metadata":{"execution":{"iopub.status.busy":"2024-10-10T09:31:39.503200Z","iopub.execute_input":"2024-10-10T09:31:39.503601Z","iopub.status.idle":"2024-10-10T09:31:42.720584Z","shell.execute_reply.started":"2024-10-10T09:31:39.503563Z","shell.execute_reply":"2024-10-10T09:31:42.719623Z"},"trusted":true},"execution_count":84,"outputs":[]}
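,{"cell_type":"markdown","source":"*Added sanity check (a sketch, not part of the original run):* the next cell prints the MasakhaNER2 label names and, for one training example, the sub-tokens that kept a label after `tokenize_and_align_labels`. It assumes the tag column is the `ner_tags` ClassLabel feature used above.","metadata":{}},{"cell_type":"code","source":"# Added sketch: inspect the label set and spot-check the label alignment\n# produced by tokenize_and_align_labels on one training example.\nlabel_names = dataset['train'].features['ner_tags'].feature.names\nprint('Labels:', label_names)\n\nexample = tokenized_datasets['train'][0]\ntokens = tokenizer.convert_ids_to_tokens(example['input_ids'])\nfor token, label_id in zip(tokens, example['labels']):\n    if label_id != -100:  # -100 marks special tokens and continuation sub-tokens\n        print(token, '->', label_names[label_id])","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]}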