crscardellino committed on
Commit
60a0e26
1 Parent(s): 2db8b11

Cleaning the notebook code

Browse files
Files changed (1) hide show
  1. flisol-cordoba-2023.ipynb +16 -43
flisol-cordoba-2023.ipynb CHANGED
@@ -607,7 +607,7 @@
607
  "import torch\n",
608
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
609
  "\n",
610
- "BASE_MODEL = \"DeepESP/gpt2-spanish\" # We play with a smaller model\n",
611
  "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
612
  "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)"
613
  ]
@@ -685,20 +685,6 @@
685
  }
686
  },
687
  "outputs": [
688
- {
689
- "data": {
690
- "application/vnd.jupyter.widget-view+json": {
691
- "model_id": "123690f207a94d3e850acef7a13133a6",
692
- "version_major": 2,
693
- "version_minor": 0
694
- },
695
- "text/plain": [
696
- " 0%| | 0/2 [00:00<?, ?it/s]"
697
- ]
698
- },
699
- "metadata": {},
700
- "output_type": "display_data"
701
- },
702
  {
703
  "name": "stdout",
704
  "output_type": "stream",
@@ -718,9 +704,9 @@
718
  "source": [
719
  "from datasets import load_dataset\n",
720
  "\n",
721
- "datasets = load_dataset(\"text\", data_files={\"train\": './data/martin-fierro_train.txt',\n",
722
- " \"validation\": './data/martin-fierro_validation.txt'})\n",
723
- "print('\\n'.join(datasets[\"train\"][:9]['text']))"
724
  ]
725
  },
726
  {
@@ -752,7 +738,7 @@
752
  "source": [
753
  "from utils import tokenize # local module in the repository\n",
754
  "\n",
755
- "tokenized_datasets = datasets.map(tokenize(tokenizer), batched=True, num_proc=4, remove_columns=[\"text\"])"
756
  ]
757
  },
758
  {
@@ -879,7 +865,7 @@
879
  }
880
  ],
881
  "source": [
882
- "print(tokenizer.decode(lm_datasets[\"train\"][0][\"input_ids\"]))"
883
  ]
884
  },
885
  {
@@ -907,22 +893,7 @@
907
  "slide_type": "fragment"
908
  }
909
  },
910
- "outputs": [
911
- {
912
- "data": {
913
- "application/vnd.jupyter.widget-view+json": {
914
- "model_id": "94b41ffd721d4bbf8840df3fee46bbb2",
915
- "version_major": 2,
916
- "version_minor": 0
917
- },
918
- "text/plain": [
919
- "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
920
- ]
921
- },
922
- "metadata": {},
923
- "output_type": "display_data"
924
- }
925
- ],
926
  "source": [
927
  "from huggingface_hub import notebook_login\n",
928
  "\n",
@@ -1040,8 +1011,8 @@
1040
  "from transformers import Trainer, TrainingArguments\n",
1041
  "\n",
1042
  "training_args = TrainingArguments(\n",
1043
- " \"flisol-cba-martin-fierro\",\n",
1044
- " evaluation_strategy=\"epoch\",\n",
1045
  " num_train_epochs=10,\n",
1046
  " learning_rate=2e-5,\n",
1047
  " weight_decay=0.01,\n",
@@ -1051,12 +1022,13 @@
1051
  "trainer = Trainer(\n",
1052
  " model=model,\n",
1053
  " args=training_args,\n",
1054
- " train_dataset=lm_datasets[\"train\"],\n",
1055
- " eval_dataset=lm_datasets[\"validation\"]\n",
1056
  ")\n",
1057
  "\n",
1058
  "trainer.train()\n",
1059
- "trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository"
 
1060
  ]
1061
  },
1062
  {
@@ -1105,8 +1077,9 @@
1105
  "import torch\n",
1106
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
1107
  "\n",
1108
- "tokenizer = AutoTokenizer.from_pretrained(\"DeepESP/gpt2-spanish\")\n",
1109
- "model = AutoModelForCausalLM.from_pretrained(\"crscardellino/flisol-cba-martin-fierro\")\n",
 
1110
  "\n",
1111
  "torch.manual_seed(42) # To ensure determinism\n",
1112
  "\n",
 
607
  "import torch\n",
608
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
609
  "\n",
610
+ "BASE_MODEL = 'DeepESP/gpt2-spanish' # We play with a smaller model\n",
611
  "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
612
  "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)"
613
  ]
 
685
  }
686
  },
687
  "outputs": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
688
  {
689
  "name": "stdout",
690
  "output_type": "stream",
 
704
  "source": [
705
  "from datasets import load_dataset\n",
706
  "\n",
707
+ "datasets = load_dataset('text', data_files={'train': './data/martin-fierro_train.txt',\n",
708
+ " 'validation': './data/martin-fierro_validation.txt'})\n",
709
+ "print('\\n'.join(datasets['train'][:9]['text']))"
710
  ]
711
  },
712
  {
 
738
  "source": [
739
  "from utils import tokenize # local module in the repository\n",
740
  "\n",
741
+ "tokenized_datasets = datasets.map(tokenize(tokenizer), batched=True, num_proc=4, remove_columns=['text'])"
742
  ]
743
  },
744
  {
 
865
  }
866
  ],
867
  "source": [
868
+ "print(tokenizer.decode(lm_datasets['train'][0]['input_ids']))"
869
  ]
870
  },
871
  {
 
893
  "slide_type": "fragment"
894
  }
895
  },
896
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
  "source": [
898
  "from huggingface_hub import notebook_login\n",
899
  "\n",
 
1011
  "from transformers import Trainer, TrainingArguments\n",
1012
  "\n",
1013
  "training_args = TrainingArguments(\n",
1014
+ " 'flisol-cba-martin-fierro',\n",
1015
+ " evaluation_strategy='epoch',\n",
1016
  " num_train_epochs=10,\n",
1017
  " learning_rate=2e-5,\n",
1018
  " weight_decay=0.01,\n",
 
1022
  "trainer = Trainer(\n",
1023
  " model=model,\n",
1024
  " args=training_args,\n",
1025
+ " train_dataset=lm_datasets['train'],\n",
1026
+ " eval_dataset=lm_datasets['validation']\n",
1027
  ")\n",
1028
  "\n",
1029
  "trainer.train()\n",
1030
+ "trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository\n",
1031
+ "tokenizer.push_to_hub('flisol-cba-martin-fierro')"
1032
  ]
1033
  },
1034
  {
 
1077
  "import torch\n",
1078
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
1079
  "\n",
1080
+ "MODEL = 'flisol-cba-martin-fierro'\n",
1081
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
1082
+ "model = AutoModelForCausalLM.from_pretrained(MODEL)\n",
1083
  "\n",
1084
  "torch.manual_seed(42) # To ensure determinism\n",
1085
  "\n",