Commit 60a0e26
1 Parent(s): 2db8b11

Cleaning the notebook code

flisol-cordoba-2023.ipynb  CHANGED  (+16 -43)
@@ -607,7 +607,7 @@
     "import torch\n",
     "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
     "\n",
-    "BASE_MODEL =
+    "BASE_MODEL = 'DeepESP/gpt2-spanish' # We play with a smaller model\n",
     "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
     "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)"
    ]
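This cell only pins the base checkpoint and loads it. A quick sanity check of what arrives (illustrative, not part of the committed notebook):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    BASE_MODEL = 'DeepESP/gpt2-spanish'
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

    print(type(model).__name__)                      # GPT2LMHeadModel
    print(f'{model.num_parameters():,} parameters')  # roughly 124M for a GPT-2 small variant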
@@ -685,20 +685,6 @@
     }
    },
    "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "123690f207a94d3e850acef7a13133a6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       " 0%| | 0/2 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -718,9 +704,9 @@
    "source": [
     "from datasets import load_dataset\n",
     "\n",
-    "datasets = load_dataset(
-    "
-    "print('\\n'.join(datasets[
+    "datasets = load_dataset('text', data_files={'train': './data/martin-fierro_train.txt',\n",
+    "                                            'validation': './data/martin-fierro_validation.txt'})\n",
+    "print('\\n'.join(datasets['train'][:9]['text']))"
    ]
   },
   {
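For reference, load_dataset('text', ...) with a data_files mapping returns a DatasetDict keyed by split, where each row is a dict with a single 'text' field holding one line of the file. A short inspection sketch, assuming the same local files:

    from datasets import load_dataset

    datasets = load_dataset('text', data_files={'train': './data/martin-fierro_train.txt',
                                                'validation': './data/martin-fierro_validation.txt'})
    print(datasets)              # DatasetDict with 'train' and 'validation' splits
    print(datasets['train'][0])  # a dict like {'text': '...'}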
@@ -752,7 +738,7 @@
    "source": [
     "from utils import tokenize # local module in the repository\n",
     "\n",
-    "tokenized_datasets = datasets.map(tokenize(tokenizer), batched=True, num_proc=4, remove_columns=[
+    "tokenized_datasets = datasets.map(tokenize(tokenizer), batched=True, num_proc=4, remove_columns=['text'])"
    ]
   },
   {
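The repository's utils.tokenize helper is not part of this diff; judging from how it is called, it is a closure that binds the tokenizer and returns a function datasets.map can apply batch-wise. A minimal sketch under that assumption:

    def tokenize(tokenizer):
        # Returns a map-compatible function; each batch is a dict with a 'text' list.
        def _tokenize(examples):
            return tokenizer(examples['text'])
        return _tokenize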
@@ -879,7 +865,7 @@
     }
    ],
    "source": [
-    "print(tokenizer.decode(lm_datasets[
+    "print(tokenizer.decode(lm_datasets['train'][0]['input_ids']))"
    ]
   },
   {
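lm_datasets is built in cells outside this diff. A common recipe for causal language modeling, and a plausible reconstruction here, is to concatenate the tokenized examples and re-split them into fixed-length blocks (block_size is an illustrative value; the notebook may use a different one):

    block_size = 128  # assumption, not from this diff

    def group_texts(examples):
        # Concatenate all token lists in the batch, then cut into block_size chunks.
        concatenated = {k: sum(examples[k], []) for k in examples}
        total_length = (len(concatenated['input_ids']) // block_size) * block_size
        result = {k: [v[i:i + block_size] for i in range(0, total_length, block_size)]
                  for k, v in concatenated.items()}
        result['labels'] = result['input_ids'].copy()  # causal LM: labels mirror inputs
        return result

    lm_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)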
@@ -907,22 +893,7 @@
      "slide_type": "fragment"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "94b41ffd721d4bbf8840df3fee46bbb2",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from huggingface_hub import notebook_login\n",
     "\n",
@@ -1040,8 +1011,8 @@
    "from transformers import Trainer, TrainingArguments\n",
    "\n",
    "training_args = TrainingArguments(\n",
-   "
-   "    evaluation_strategy
+   "    'flisol-cba-martin-fierro',\n",
+   "    evaluation_strategy='epoch',\n",
    "    num_train_epochs=10,\n",
    "    learning_rate=2e-5,\n",
    "    weight_decay=0.01,\n",
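A note for anyone re-running this today: the first positional argument of TrainingArguments is output_dir, which push_to_hub() also uses as the default Hub repo name, and on recent transformers releases evaluation_strategy has been renamed to eval_strategy (the old spelling was deprecated around v4.41).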
@@ -1051,12 +1022,13 @@
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
-   "    train_dataset=lm_datasets[
-   "    eval_dataset=lm_datasets[
+   "    train_dataset=lm_datasets['train'],\n",
+   "    eval_dataset=lm_datasets['validation']\n",
    ")\n",
    "\n",
    "trainer.train()\n",
-   "trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository"
+   "trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository\n",
+   "tokenizer.push_to_hub('flisol-cba-martin-fierro')"
    ]
   },
   {
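The added tokenizer.push_to_hub('flisol-cba-martin-fierro') call is what makes the inference cell below work: since no tokenizer was passed to Trainer, trainer.push_to_hub() uploads the model weights but not the tokenizer files, and AutoTokenizer.from_pretrained('flisol-cba-martin-fierro') would otherwise have nothing to load.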
@@ -1105,8 +1077,9 @@
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "\n",
-   "
-   "
+   "MODEL = 'flisol-cba-martin-fierro'\n",
+   "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
+   "model = AutoModelForCausalLM.from_pretrained(MODEL)\n",
    "\n",
    "torch.manual_seed(42) # To ensure determinism\n",
    "\n",
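With the fine-tuned checkpoint loaded and the seed fixed, the notebook presumably goes on to sample from the model; a generation sketch in that spirit (the prompt and decoding settings are illustrative assumptions, not from this diff):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MODEL = 'flisol-cba-martin-fierro'
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForCausalLM.from_pretrained(MODEL)

    torch.manual_seed(42)  # To ensure determinism

    inputs = tokenizer('Aquí me pongo a cantar', return_tensors='pt')  # illustrative prompt
    outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_k=50,
                             pad_token_id=tokenizer.eos_token_id)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))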