crscardellino committed on
Commit
8cbe4b9
1 Parent(s): f781de0

Updated the notebook

Browse files
Files changed (1) hide show
  1. flisol-cordoba-2023.ipynb +44 -226
flisol-cordoba-2023.ipynb CHANGED
@@ -258,7 +258,7 @@
258
  },
259
  {
260
  "cell_type": "code",
261
- "execution_count": 1,
262
  "id": "0e0d53be",
263
  "metadata": {
264
  "slideshow": {
@@ -276,7 +276,9 @@
276
  "\n",
277
  "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
278
  "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
279
- "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, low_cpu_mem_usage=True, torch_dtype=\"auto\").to(device)"
 
 
280
  ]
281
  },
282
  {
@@ -339,7 +341,7 @@
339
  },
340
  {
341
  "cell_type": "code",
342
- "execution_count": 2,
343
  "id": "c1227c49",
344
  "metadata": {
345
  "slideshow": {
@@ -378,32 +380,14 @@
378
  },
379
  {
380
  "cell_type": "code",
381
- "execution_count": 3,
382
  "id": "11bec6de",
383
  "metadata": {
384
  "slideshow": {
385
  "slide_type": "fragment"
386
  }
387
  },
388
- "outputs": [
389
- {
390
- "name": "stdout",
391
- "output_type": "stream",
392
- "text": [
393
- "La siguiente es una conversación entre un HUMANO y un bot EXPERTO en software libre.\n",
394
- "El EXPERTO le ayuda al HUMANO con preguntas acerca de software libre.\n",
395
- "El EXPERTO es conversacional, optimista, flexible, creativo y genera respuestas parecidas a un humano.\n",
396
- "\n",
397
- "HUMANO: Hola, ¿Cómo estás?\n",
398
- "EXPERTO: Hola, pmuy bien. Estoy acá para ayudarte con preguntas respecto al software libre.\n",
399
- "\n",
400
- "HUMANO: ¿Qué es el software libre?\n",
401
- "EXPERTO: El software libre es un software que se puede modificar, redistribuir y distribuir libremente.\n",
402
- "HUMANO: ¿En qué consiste la licencia GPL?\n",
403
- "EXPERTO: La licencia GPL es una licencia de software libre que permite a los usuarios modificar, redistribuir\n"
404
- ]
405
- }
406
- ],
407
  "source": [
408
  "MAX_TOKENS = 50\n",
409
  "input_ids = tokenizer.encode(PROMPT, return_tensors=\"pt\").to(device)\n",
@@ -433,33 +417,14 @@
433
  },
434
  {
435
  "cell_type": "code",
436
- "execution_count": 4,
437
  "id": "dc66f288",
438
  "metadata": {
439
  "slideshow": {
440
  "slide_type": "fragment"
441
  }
442
  },
443
- "outputs": [
444
- {
445
- "name": "stdout",
446
- "output_type": "stream",
447
- "text": [
448
- "La siguiente es una conversación entre un HUMANO y un bot EXPERTO en software libre.\n",
449
- "El EXPERTO le ayuda al HUMANO con preguntas acerca de software libre.\n",
450
- "El EXPERTO es conversacional, optimista, flexible, creativo y genera respuestas parecidas a un humano.\n",
451
- "\n",
452
- "HUMANO: Hola, ¿Cómo estás?\n",
453
- "EXPERTO: Hola, pmuy bien. Estoy acá para ayudarte con preguntas respecto al software libre.\n",
454
- "\n",
455
- "HUMANO: ¿Qué es el software libre?\n",
456
- "EXPERTO: El software libre, es aquel software que esta escrito en un lenguaje de programación que puede ser modificado y copiado por cualquier persona o entidad.\n",
457
- "\n",
458
- "HUMANO: ¿En general cuáles son los usos que se pueden dar a un software libre?\n",
459
- "EXPERTO\n"
460
- ]
461
- }
462
- ],
463
  "source": [
464
  "torch.manual_seed(42) # To ensure determinism\n",
465
  "\n",
@@ -604,7 +569,7 @@
604
  },
605
  {
606
  "cell_type": "code",
607
- "execution_count": 1,
608
  "id": "17f2884d",
609
  "metadata": {
610
  "slideshow": {
@@ -616,6 +581,8 @@
616
  "import torch\n",
617
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
618
  "\n",
 
 
619
  "BASE_MODEL = \"DeepESP/gpt2-spanish\" # We play with a smaller model\n",
620
  "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
621
  "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL).to(device)"
@@ -637,26 +604,14 @@
637
  },
638
  {
639
  "cell_type": "code",
640
- "execution_count": 2,
641
  "id": "322a4a9b",
642
  "metadata": {
643
  "slideshow": {
644
  "slide_type": "fragment"
645
  }
646
  },
647
- "outputs": [
648
- {
649
- "name": "stdout",
650
- "output_type": "stream",
651
- "text": [
652
- "Aquí me pongo a cantar y a llorar. \n",
653
- "\n",
654
- "Los sollozos de Meggie se desvanecen por la noche en el salón. Al parecer no se ve nada. \n",
655
- "\n",
656
- "—¿Y si no fuera el final del mundo, el fin de un mundo?\n"
657
- ]
658
- }
659
- ],
660
  "source": [
661
  "torch.manual_seed(42) # To ensure determinism\n",
662
  "\n",
@@ -686,30 +641,14 @@
686
  },
687
  {
688
  "cell_type": "code",
689
- "execution_count": 3,
690
  "id": "5a27197e",
691
  "metadata": {
692
  "slideshow": {
693
  "slide_type": "fragment"
694
  }
695
  },
696
- "outputs": [
697
- {
698
- "name": "stdout",
699
- "output_type": "stream",
700
- "text": [
701
- "I - Cantor y Gaucho.\n",
702
- "\n",
703
- "1\n",
704
- "Aquí me pongo a cantar\n",
705
- "Al compás de la vigüela,\n",
706
- "Que el hombre que lo desvela\n",
707
- "Una pena estraordinaria\n",
708
- "Como la ave solitaria\n",
709
- "Con el cantar se consuela.\n"
710
- ]
711
- }
712
- ],
713
  "source": [
714
  "from datasets import load_dataset\n",
715
  "\n",
@@ -740,7 +679,7 @@
740
  },
741
  {
742
  "cell_type": "code",
743
- "execution_count": 4,
744
  "id": "33059c5f",
745
  "metadata": {
746
  "scrolled": true,
@@ -778,7 +717,7 @@
778
  },
779
  {
780
  "cell_type": "code",
781
- "execution_count": 5,
782
  "id": "3100e195",
783
  "metadata": {
784
  "scrolled": true,
@@ -817,23 +756,14 @@
817
  },
818
  {
819
  "cell_type": "code",
820
- "execution_count": 6,
821
  "id": "b9d33b7b",
822
  "metadata": {
823
  "slideshow": {
824
  "slide_type": "fragment"
825
  }
826
  },
827
- "outputs": [
828
- {
829
- "name": "stdout",
830
- "output_type": "stream",
831
- "text": [
832
- "128\n",
833
- "[50, 1368, 6505, 282, 324, 24275, 526, 23, 208, 208]\n"
834
- ]
835
- }
836
- ],
837
  "source": [
838
  "print(len(lm_datasets[\"train\"][0][\"input_ids\"]))\n",
839
  "print(lm_datasets[\"train\"][0][\"input_ids\"][:10])"
@@ -841,44 +771,14 @@
841
  },
842
  {
843
  "cell_type": "code",
844
- "execution_count": 7,
845
  "id": "7dfb316d",
846
  "metadata": {
847
  "slideshow": {
848
  "slide_type": "fragment"
849
  }
850
  },
851
- "outputs": [
852
- {
853
- "name": "stdout",
854
- "output_type": "stream",
855
- "text": [
856
- "I - Cantor y Gaucho.\n",
857
- "\n",
858
- "1\n",
859
- "Aquí me pongo a cantar\n",
860
- "Al compás de la vigüela,\n",
861
- "Que el hombre que lo desvela\n",
862
- "Una pena estraordinaria\n",
863
- "Como la ave solitaria\n",
864
- "Con el cantar se consuela.\n",
865
- "\n",
866
- "2\n",
867
- "Pido a los Santos del Cielo\n",
868
- "Que ayuden mi pensamiento;\n",
869
- "Les pido en este momento\n",
870
- "Que voy a cantar mi historia\n",
871
- "Me refresquen la memoria\n",
872
- "Y aclaren mi entendimiento.\n",
873
- "\n",
874
- "3\n",
875
- "Vengan Santos milagrosos,\n",
876
- "Vengan todos en mi ayuda,\n",
877
- "Que la lengua se me añuda\n",
878
- "Y se me turba\n"
879
- ]
880
- }
881
- ],
882
  "source": [
883
  "print(tokenizer.decode(lm_datasets[\"train\"][0][\"input_ids\"]))"
884
  ]
@@ -900,7 +800,7 @@
900
  },
901
  {
902
  "cell_type": "code",
903
- "execution_count": 8,
904
  "id": "a8b90ba2",
905
  "metadata": {
906
  "scrolled": true,
@@ -935,100 +835,21 @@
935
  },
936
  {
937
  "cell_type": "code",
938
- "execution_count": 9,
939
- "id": "3b121d21",
940
  "metadata": {
941
  "slideshow": {
942
  "slide_type": "subslide"
943
  }
944
  },
945
- "outputs": [
946
- {
947
- "data": {
948
- "text/html": [
949
- "\n",
950
- " <div>\n",
951
- " \n",
952
- " <progress value='180' max='180' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
953
- " [180/180 11:44, Epoch 10/10]\n",
954
- " </div>\n",
955
- " <table border=\"1\" class=\"dataframe\">\n",
956
- " <thead>\n",
957
- " <tr style=\"text-align: left;\">\n",
958
- " <th>Epoch</th>\n",
959
- " <th>Training Loss</th>\n",
960
- " <th>Validation Loss</th>\n",
961
- " </tr>\n",
962
- " </thead>\n",
963
- " <tbody>\n",
964
- " <tr>\n",
965
- " <td>1</td>\n",
966
- " <td>4.386400</td>\n",
967
- " <td>4.202457</td>\n",
968
- " </tr>\n",
969
- " <tr>\n",
970
- " <td>2</td>\n",
971
- " <td>3.948000</td>\n",
972
- " <td>4.043974</td>\n",
973
- " </tr>\n",
974
- " <tr>\n",
975
- " <td>3</td>\n",
976
- " <td>3.796200</td>\n",
977
- " <td>3.980350</td>\n",
978
- " </tr>\n",
979
- " <tr>\n",
980
- " <td>4</td>\n",
981
- " <td>3.610500</td>\n",
982
- " <td>3.945783</td>\n",
983
- " </tr>\n",
984
- " <tr>\n",
985
- " <td>5</td>\n",
986
- " <td>3.444400</td>\n",
987
- " <td>3.927984</td>\n",
988
- " </tr>\n",
989
- " <tr>\n",
990
- " <td>6</td>\n",
991
- " <td>3.385500</td>\n",
992
- " <td>3.919229</td>\n",
993
- " </tr>\n",
994
- " <tr>\n",
995
- " <td>7</td>\n",
996
- " <td>3.314200</td>\n",
997
- " <td>3.909090</td>\n",
998
- " </tr>\n",
999
- " <tr>\n",
1000
- " <td>8</td>\n",
1001
- " <td>3.219200</td>\n",
1002
- " <td>3.907399</td>\n",
1003
- " </tr>\n",
1004
- " <tr>\n",
1005
- " <td>9</td>\n",
1006
- " <td>3.161500</td>\n",
1007
- " <td>3.906959</td>\n",
1008
- " </tr>\n",
1009
- " <tr>\n",
1010
- " <td>10</td>\n",
1011
- " <td>3.163700</td>\n",
1012
- " <td>3.906726</td>\n",
1013
- " </tr>\n",
1014
- " </tbody>\n",
1015
- "</table><p>"
1016
- ],
1017
- "text/plain": [
1018
- "<IPython.core.display.HTML object>"
1019
- ]
1020
- },
1021
- "metadata": {},
1022
- "output_type": "display_data"
1023
- }
1024
- ],
1025
  "source": [
1026
  "from transformers import Trainer, TrainingArguments\n",
1027
  "\n",
1028
  "training_args = TrainingArguments(\n",
1029
  " \"flisol-cba-martin-fierro\",\n",
1030
- " evaluation_strategy=\"epoch\",\n",
1031
- " num_train_epochs=10,\n",
1032
  " learning_rate=2e-5,\n",
1033
  " weight_decay=0.01,\n",
1034
  " logging_steps=5,\n",
@@ -1041,7 +862,20 @@
1041
  " eval_dataset=lm_datasets[\"validation\"],\n",
1042
  ")\n",
1043
  "\n",
1044
- "trainer.train()\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
1045
  "trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository\n",
1046
  "tokenizer.push_to_hub(\"flisol-cba-martin-fierro\")"
1047
  ]
@@ -1064,30 +898,14 @@
1064
  },
1065
  {
1066
  "cell_type": "code",
1067
- "execution_count": 1,
1068
  "id": "6a35e80f",
1069
  "metadata": {
1070
  "slideshow": {
1071
  "slide_type": "fragment"
1072
  }
1073
  },
1074
- "outputs": [
1075
- {
1076
- "name": "stdout",
1077
- "output_type": "stream",
1078
- "text": [
1079
- "Aquí me pongo a cantar;\n",
1080
- "y si tengo el sueño:\n",
1081
- "de pronto se me ha quedado la sangre:\n",
1082
- "como te asombre se me\n",
1083
- "lo oí decir muchas veces,\n",
1084
- "pero el tiempo me ha borrado.\n",
1085
- "\n",
1086
- "2\n",
1087
- "Soy\n"
1088
- ]
1089
- }
1090
- ],
1091
  "source": [
1092
  "import torch\n",
1093
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
 
258
  },
259
  {
260
  "cell_type": "code",
261
+ "execution_count": null,
262
  "id": "0e0d53be",
263
  "metadata": {
264
  "slideshow": {
 
276
  "\n",
277
  "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
278
  "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
279
+ "model = AutoModelForCausalLM.from_pretrained(\n",
280
+ " BASE_MODEL, torch_dtype=\"auto\"\n",
281
+ ").to(device)"
282
  ]
283
  },
284
  {
 
341
  },
342
  {
343
  "cell_type": "code",
344
+ "execution_count": null,
345
  "id": "c1227c49",
346
  "metadata": {
347
  "slideshow": {
 
380
  },
381
  {
382
  "cell_type": "code",
383
+ "execution_count": null,
384
  "id": "11bec6de",
385
  "metadata": {
386
  "slideshow": {
387
  "slide_type": "fragment"
388
  }
389
  },
390
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  "source": [
392
  "MAX_TOKENS = 50\n",
393
  "input_ids = tokenizer.encode(PROMPT, return_tensors=\"pt\").to(device)\n",
 
417
  },
418
  {
419
  "cell_type": "code",
420
+ "execution_count": null,
421
  "id": "dc66f288",
422
  "metadata": {
423
  "slideshow": {
424
  "slide_type": "fragment"
425
  }
426
  },
427
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  "source": [
429
  "torch.manual_seed(42) # To ensure determinism\n",
430
  "\n",
 
569
  },
570
  {
571
  "cell_type": "code",
572
+ "execution_count": null,
573
  "id": "17f2884d",
574
  "metadata": {
575
  "slideshow": {
 
581
  "import torch\n",
582
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
583
  "\n",
584
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
585
+ "\n",
586
  "BASE_MODEL = \"DeepESP/gpt2-spanish\" # We play with a smaller model\n",
587
  "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
588
  "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL).to(device)"
 
604
  },
605
  {
606
  "cell_type": "code",
607
+ "execution_count": null,
608
  "id": "322a4a9b",
609
  "metadata": {
610
  "slideshow": {
611
  "slide_type": "fragment"
612
  }
613
  },
614
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
615
  "source": [
616
  "torch.manual_seed(42) # To ensure determinism\n",
617
  "\n",
 
641
  },
642
  {
643
  "cell_type": "code",
644
+ "execution_count": null,
645
  "id": "5a27197e",
646
  "metadata": {
647
  "slideshow": {
648
  "slide_type": "fragment"
649
  }
650
  },
651
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  "source": [
653
  "from datasets import load_dataset\n",
654
  "\n",
 
679
  },
680
  {
681
  "cell_type": "code",
682
+ "execution_count": null,
683
  "id": "33059c5f",
684
  "metadata": {
685
  "scrolled": true,
 
717
  },
718
  {
719
  "cell_type": "code",
720
+ "execution_count": null,
721
  "id": "3100e195",
722
  "metadata": {
723
  "scrolled": true,
 
756
  },
757
  {
758
  "cell_type": "code",
759
+ "execution_count": null,
760
  "id": "b9d33b7b",
761
  "metadata": {
762
  "slideshow": {
763
  "slide_type": "fragment"
764
  }
765
  },
766
+ "outputs": [],
 
 
 
 
 
 
 
 
 
767
  "source": [
768
  "print(len(lm_datasets[\"train\"][0][\"input_ids\"]))\n",
769
  "print(lm_datasets[\"train\"][0][\"input_ids\"][:10])"
 
771
  },
772
  {
773
  "cell_type": "code",
774
+ "execution_count": null,
775
  "id": "7dfb316d",
776
  "metadata": {
777
  "slideshow": {
778
  "slide_type": "fragment"
779
  }
780
  },
781
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  "source": [
783
  "print(tokenizer.decode(lm_datasets[\"train\"][0][\"input_ids\"]))"
784
  ]
 
800
  },
801
  {
802
  "cell_type": "code",
803
+ "execution_count": null,
804
  "id": "a8b90ba2",
805
  "metadata": {
806
  "scrolled": true,
 
835
  },
836
  {
837
  "cell_type": "code",
838
+ "execution_count": null,
839
+ "id": "ccd8e608-7e14-4796-9e52-c55b6df3ce6f",
840
  "metadata": {
841
  "slideshow": {
842
  "slide_type": "subslide"
843
  }
844
  },
845
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
846
  "source": [
847
  "from transformers import Trainer, TrainingArguments\n",
848
  "\n",
849
  "training_args = TrainingArguments(\n",
850
  " \"flisol-cba-martin-fierro\",\n",
851
+ " eval_strategy=\"epoch\",\n",
852
+ " num_train_epochs=25,\n",
853
  " learning_rate=2e-5,\n",
854
  " weight_decay=0.01,\n",
855
  " logging_steps=5,\n",
 
862
  " eval_dataset=lm_datasets[\"validation\"],\n",
863
  ")\n",
864
  "\n",
865
+ "trainer.train();"
866
+ ]
867
+ },
868
+ {
869
+ "cell_type": "code",
870
+ "execution_count": null,
871
+ "id": "16695fc7-9baa-41f7-9367-d8fbcd987b79",
872
+ "metadata": {
873
+ "slideshow": {
874
+ "slide_type": "subslide"
875
+ }
876
+ },
877
+ "outputs": [],
878
+ "source": [
879
  "trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository\n",
880
  "tokenizer.push_to_hub(\"flisol-cba-martin-fierro\")"
881
  ]
 
898
  },
899
  {
900
  "cell_type": "code",
901
+ "execution_count": null,
902
  "id": "6a35e80f",
903
  "metadata": {
904
  "slideshow": {
905
  "slide_type": "fragment"
906
  }
907
  },
908
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
909
  "source": [
910
  "import torch\n",
911
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",