"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
"
\n",
" [1638/1827 1:55:38 < 3:16:54, 0.02 it/s, Epoch 0.90/1]\n",
"
\n",
" \n",
" \n",
" \n",
" Epoch | \n",
" Training Loss | \n",
" Validation Loss | \n",
"
\n",
" \n",
" \n",
" \n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"trainer.train(resume_from_checkpoint = True)\n",
"wandb.finish()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "bbf22bea-7026-42c9-a643-ba65ab8cdbff",
"metadata": {
"execution": {
"iopub.execute_input": "2023-07-10T18:26:14.038132Z",
"iopub.status.busy": "2023-07-10T18:26:14.037456Z",
"iopub.status.idle": "2023-07-10T18:57:49.712287Z",
"shell.execute_reply": "2023-07-10T18:57:49.711640Z",
"shell.execute_reply.started": "2023-07-10T18:26:14.038103Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
"
\n",
" [50731/50731 31:35]\n",
"
\n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Perplexity: 78.33\n"
]
}
],
"source": [
"eval_results = trainer.evaluate()\n",
"print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "f04eaadd-a13d-4651-ad14-91bcc01f92e1",
"metadata": {
"execution": {
"iopub.execute_input": "2023-07-10T18:58:07.077477Z",
"iopub.status.busy": "2023-07-10T18:58:07.076979Z",
"iopub.status.idle": "2023-07-10T18:58:07.082203Z",
"shell.execute_reply": "2023-07-10T18:58:07.081426Z",
"shell.execute_reply.started": "2023-07-10T18:58:07.077454Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'eval_loss': 4.360935211181641,\n",
" 'eval_runtime': 1895.6573,\n",
" 'eval_samples_per_second': 214.09,\n",
" 'eval_steps_per_second': 26.762}"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eval_results"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "d3417a50-f0a7-4cd7-bc3b-14106660be58",
"metadata": {
"execution": {
"iopub.execute_input": "2023-07-10T18:58:52.507374Z",
"iopub.status.busy": "2023-07-10T18:58:52.506748Z",
"iopub.status.idle": "2023-07-10T18:58:53.770508Z",
"shell.execute_reply": "2023-07-10T18:58:53.769992Z",
"shell.execute_reply.started": "2023-07-10T18:58:52.507341Z"
}
},
"outputs": [],
"source": [
"trainer.save_model(\"path_to_save\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "6a3b42de-552c-41fc-a454-afe8a0bf567d",
"metadata": {
"execution": {
"iopub.execute_input": "2023-07-10T18:59:46.871782Z",
"iopub.status.busy": "2023-07-10T18:59:46.871272Z",
"iopub.status.idle": "2023-07-10T18:59:49.794057Z",
"shell.execute_reply": "2023-07-10T18:59:49.793583Z",
"shell.execute_reply.started": "2023-07-10T18:59:46.871761Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at /notebooks/path_to_save were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']\n",
"- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of RobertaModel were not initialized from the model checkpoint at /notebooks/path_to_save and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"from transformers import AutoModel \n",
"model = AutoModel.from_pretrained('/notebooks/path_to_save',local_files_only=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6f2c49a-9a09-4949-b67f-29df6d0aa895",
"metadata": {
"execution": {
"iopub.execute_input": "2023-07-10T19:01:49.192299Z",
"iopub.status.busy": "2023-07-10T19:01:49.191549Z"
}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "77a76086b50b43a2a0bb1cc869ef8e26",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"pytorch_model.bin: 0%| | 0.00/1.27G [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model.push_to_hub('9wimu9/sinhala-bert-1.2')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "d4553ec7-1e38-4b44-8c5f-e46786cd3cfc",
"metadata": {
"execution": {
"iopub.execute_input": "2023-07-09T13:08:15.514124Z",
"iopub.status.busy": "2023-07-09T13:08:15.513517Z",
"iopub.status.idle": "2023-07-09T13:08:15.918801Z",
"shell.execute_reply": "2023-07-09T13:08:15.918326Z",
"shell.execute_reply.started": "2023-07-09T13:08:15.514097Z"
}
},
"outputs": [],
"source": [
"from huggingface_hub import HfApi\n",
"api = HfApi()\n",
"files = ['tokenizer.json','training_args.bin','trainer.ipynb']\n",
"for file in files:\n",
" api.upload_file(\n",
" path_or_fileobj=\"/notebooks/path_to_save/\"+file,\n",
" path_in_repo=file,\n",
" repo_id=\"9wimu9/sinhala-bert-1.1\",\n",
" repo_type=\"model\",\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1614503-df5d-454f-a81d-d96bb1899443",
"metadata": {},
"outputs": [],
"source": [
"Learning rate scheduler details can be found here:\n",
"https://dev.classmethod.jp/articles/huggingface-usage-scheluder-type/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd184295-1c0b-4625-a516-da417beb814f",
"metadata": {},
"outputs": [],
"source": [
"bert hyper params\n",
"======================\n",
"β1 = 0.9,\n",
"β2 = 0.999, \n",
"ε = 1e-6\n",
"L2 weight decay = 0.01\n",
"learning rate = warmed up over the first 10k steps to a peak of 1e-4, then linearly decayed\n",
"dropout = 0.1\n",
"batch size = 256\n",
"training steps = 1M\n",
"max_token_length = 512\n",
"\n",
"roberta\n",
"============\n",
"β2 = 0.98 for large batch sizes\n",
"max_token_length = 512\n",
"batch size = 2k\n",
"lr = 7e-4\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}