crscardellino committed on
Commit
b161832
1 Parent(s): 184726d

Finished presentation notebook

Browse files
Files changed (1) hide show
  1. flisol-cordoba-2023.ipynb +80 -88
flisol-cordoba-2023.ipynb CHANGED
@@ -261,7 +261,7 @@
261
  },
262
  {
263
  "cell_type": "code",
264
- "execution_count": null,
265
  "id": "0e0d53be",
266
  "metadata": {
267
  "slideshow": {
@@ -330,7 +330,7 @@
330
  },
331
  {
332
  "cell_type": "code",
333
- "execution_count": null,
334
  "id": "c1227c49",
335
  "metadata": {
336
  "slideshow": {
@@ -369,14 +369,32 @@
369
  },
370
  {
371
  "cell_type": "code",
372
- "execution_count": null,
373
  "id": "11bec6de",
374
  "metadata": {
375
  "slideshow": {
376
  "slide_type": "fragment"
377
  }
378
  },
379
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  "source": [
381
  "MAX_TOKENS = 50\n",
382
  "input_ids = tokenizer.encode(PROMPT, return_tensors='pt')\n",
@@ -406,14 +424,33 @@
406
  },
407
  {
408
  "cell_type": "code",
409
- "execution_count": null,
410
  "id": "dc66f288",
411
  "metadata": {
412
  "slideshow": {
413
  "slide_type": "fragment"
414
  }
415
  },
416
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  "source": [
418
  "torch.manual_seed(42) # To ensure determinism\n",
419
  "\n",
@@ -473,8 +510,7 @@
473
  "\"\"\".strip()\n",
474
  "\n",
475
  "chatbot = ChatBot(\n",
476
- " base_model=model,\n",
477
- " tokenizer=tokenizer,\n",
478
  " initial_prompt=PROMPT,\n",
479
  " keep_context=True,\n",
480
  " creative=True,\n",
@@ -597,14 +633,6 @@
597
  }
598
  },
599
  "outputs": [
600
- {
601
- "name": "stderr",
602
- "output_type": "stream",
603
- "text": [
604
- "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
605
- "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
606
- ]
607
- },
608
  {
609
  "name": "stdout",
610
  "output_type": "stream",
@@ -654,17 +682,10 @@
654
  }
655
  },
656
  "outputs": [
657
- {
658
- "name": "stderr",
659
- "output_type": "stream",
660
- "text": [
661
- "Found cached dataset text (/home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)\n"
662
- ]
663
- },
664
  {
665
  "data": {
666
  "application/vnd.jupyter.widget-view+json": {
667
- "model_id": "0fe0bb8953f24e05b2a56ad08c462976",
668
  "version_major": 2,
669
  "version_minor": 0
670
  },
@@ -724,16 +745,7 @@
724
  "slide_type": "fragment"
725
  }
726
  },
727
- "outputs": [
728
- {
729
- "name": "stderr",
730
- "output_type": "stream",
731
- "text": [
732
- "Loading cached processed dataset at /home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-5a0f77d99160fc1c_*_of_00004.arrow\n",
733
- "Loading cached processed dataset at /home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-003d85e2eebe3231_*_of_00004.arrow\n"
734
- ]
735
- }
736
- ],
737
  "source": [
738
  "from utils import tokenize # local module in the repository\n",
739
  "\n",
@@ -769,16 +781,7 @@
769
  "slide_type": "fragment"
770
  }
771
  },
772
- "outputs": [
773
- {
774
- "name": "stderr",
775
- "output_type": "stream",
776
- "text": [
777
- "Loading cached processed dataset at /home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-01936c1905752293_*_of_00004.arrow\n",
778
- "Loading cached processed dataset at /home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-af8dcd60a546c28d_*_of_00004.arrow\n"
779
- ]
780
- }
781
- ],
782
  "source": [
783
  "from functools import partial\n",
784
  "from utils import group_texts # local module in the repository\n",
@@ -903,13 +906,18 @@
903
  },
904
  "outputs": [
905
  {
906
- "name": "stdout",
907
- "output_type": "stream",
908
- "text": [
909
- "Token is valid.\n",
910
- "Your token has been saved to /home/crscardellino/.cache/huggingface/token\n",
911
- "Login successful\n"
912
- ]
 
 
 
 
 
913
  }
914
  ],
915
  "source": [
@@ -946,14 +954,6 @@
946
  }
947
  },
948
  "outputs": [
949
- {
950
- "name": "stderr",
951
- "output_type": "stream",
952
- "text": [
953
- "/home/crscardellino/Projects/research/flisol/flisol-cba-martin-fierro/venv/lib/python3.10/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
954
- " warnings.warn(\n"
955
- ]
956
- },
957
  {
958
  "data": {
959
  "text/html": [
@@ -1031,16 +1031,6 @@
1031
  },
1032
  "metadata": {},
1033
  "output_type": "display_data"
1034
- },
1035
- {
1036
- "data": {
1037
- "text/plain": [
1038
- "TrainOutput(global_step=180, training_loss=3.5808190133836533, metrics={'train_runtime': 707.4357, 'train_samples_per_second': 1.951, 'train_steps_per_second': 0.254, 'total_flos': 90145751040000.0, 'train_loss': 3.5808190133836533, 'epoch': 10.0})"
1039
- ]
1040
- },
1041
- "execution_count": 9,
1042
- "metadata": {},
1043
- "output_type": "execute_result"
1044
  }
1045
  ],
1046
  "source": [
@@ -1062,20 +1052,7 @@
1062
  " eval_dataset=lm_datasets[\"validation\"]\n",
1063
  ")\n",
1064
  "\n",
1065
- "trainer.train()"
1066
- ]
1067
- },
1068
- {
1069
- "cell_type": "code",
1070
- "execution_count": null,
1071
- "id": "d43c5555",
1072
- "metadata": {
1073
- "slideshow": {
1074
- "slide_type": "-"
1075
- }
1076
- },
1077
- "outputs": [],
1078
- "source": [
1079
  "trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository"
1080
  ]
1081
  },
@@ -1097,21 +1074,36 @@
1097
  },
1098
  {
1099
  "cell_type": "code",
1100
- "execution_count": null,
1101
  "id": "6a35e80f",
1102
  "metadata": {
1103
  "slideshow": {
1104
  "slide_type": "fragment"
1105
  }
1106
  },
1107
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1108
  "source": [
1109
  "import torch\n",
1110
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
1111
  "\n",
1112
- "BASE_MODEL = \"crscardellino/flisol-cba-martin-fierro\"\n",
1113
- "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
1114
- "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)\n",
1115
  "\n",
1116
  "torch.manual_seed(42) # To ensure determinism\n",
1117
  "\n",
 
261
  },
262
  {
263
  "cell_type": "code",
264
+ "execution_count": 1,
265
  "id": "0e0d53be",
266
  "metadata": {
267
  "slideshow": {
 
330
  },
331
  {
332
  "cell_type": "code",
333
+ "execution_count": 2,
334
  "id": "c1227c49",
335
  "metadata": {
336
  "slideshow": {
 
369
  },
370
  {
371
  "cell_type": "code",
372
+ "execution_count": 3,
373
  "id": "11bec6de",
374
  "metadata": {
375
  "slideshow": {
376
  "slide_type": "fragment"
377
  }
378
  },
379
+ "outputs": [
380
+ {
381
+ "name": "stdout",
382
+ "output_type": "stream",
383
+ "text": [
384
+ "La siguiente es una conversación entre un HUMANO y un bot EXPERTO en software libre.\n",
385
+ "El EXPERTO le ayuda al HUMANO con preguntas acerca de software libre.\n",
386
+ "El EXPERTO es conversacional, optimista, flexible, creativo y genera respuestas parecidas a un humano.\n",
387
+ "\n",
388
+ "HUMANO: Hola, ¿Cómo estás?\n",
389
+ "EXPERTO: Hola, pmuy bien. Estoy acá para ayudarte con preguntas respecto al software libre.\n",
390
+ "\n",
391
+ "HUMANO: ¿Qué es el software libre?\n",
392
+ "EXPERTO: El software libre es un software que se puede modificar, redistribuir y distribuir libremente.\n",
393
+ "HUMANO: ¿En qué consiste la licencia GPL?\n",
394
+ "EXPERTO: La licencia GPL es una licencia de software libre que permite a los usuarios modificar, redistribuir\n"
395
+ ]
396
+ }
397
+ ],
398
  "source": [
399
  "MAX_TOKENS = 50\n",
400
  "input_ids = tokenizer.encode(PROMPT, return_tensors='pt')\n",
 
424
  },
425
  {
426
  "cell_type": "code",
427
+ "execution_count": 4,
428
  "id": "dc66f288",
429
  "metadata": {
430
  "slideshow": {
431
  "slide_type": "fragment"
432
  }
433
  },
434
+ "outputs": [
435
+ {
436
+ "name": "stdout",
437
+ "output_type": "stream",
438
+ "text": [
439
+ "La siguiente es una conversación entre un HUMANO y un bot EXPERTO en software libre.\n",
440
+ "El EXPERTO le ayuda al HUMANO con preguntas acerca de software libre.\n",
441
+ "El EXPERTO es conversacional, optimista, flexible, creativo y genera respuestas parecidas a un humano.\n",
442
+ "\n",
443
+ "HUMANO: Hola, ¿Cómo estás?\n",
444
+ "EXPERTO: Hola, pmuy bien. Estoy acá para ayudarte con preguntas respecto al software libre.\n",
445
+ "\n",
446
+ "HUMANO: ¿Qué es el software libre?\n",
447
+ "EXPERTO: El software libre, es aquel software que esta escrito en un lenguaje de programación que puede ser modificado y copiado por cualquier persona o entidad.\n",
448
+ "\n",
449
+ "HUMANO: ¿En general cuáles son los usos que se pueden dar a un software libre?\n",
450
+ "EXPERTO\n"
451
+ ]
452
+ }
453
+ ],
454
  "source": [
455
  "torch.manual_seed(42) # To ensure determinism\n",
456
  "\n",
 
510
  "\"\"\".strip()\n",
511
  "\n",
512
  "chatbot = ChatBot(\n",
513
+ " base_model='bigscience/bloom-3b',\n",
 
514
  " initial_prompt=PROMPT,\n",
515
  " keep_context=True,\n",
516
  " creative=True,\n",
 
633
  }
634
  },
635
  "outputs": [
 
 
 
 
 
 
 
 
636
  {
637
  "name": "stdout",
638
  "output_type": "stream",
 
682
  }
683
  },
684
  "outputs": [
 
 
 
 
 
 
 
685
  {
686
  "data": {
687
  "application/vnd.jupyter.widget-view+json": {
688
+ "model_id": "123690f207a94d3e850acef7a13133a6",
689
  "version_major": 2,
690
  "version_minor": 0
691
  },
 
745
  "slide_type": "fragment"
746
  }
747
  },
748
+ "outputs": [],
 
 
 
 
 
 
 
 
 
749
  "source": [
750
  "from utils import tokenize # local module in the repository\n",
751
  "\n",
 
781
  "slide_type": "fragment"
782
  }
783
  },
784
+ "outputs": [],
 
 
 
 
 
 
 
 
 
785
  "source": [
786
  "from functools import partial\n",
787
  "from utils import group_texts # local module in the repository\n",
 
906
  },
907
  "outputs": [
908
  {
909
+ "data": {
910
+ "application/vnd.jupyter.widget-view+json": {
911
+ "model_id": "94b41ffd721d4bbf8840df3fee46bbb2",
912
+ "version_major": 2,
913
+ "version_minor": 0
914
+ },
915
+ "text/plain": [
916
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
917
+ ]
918
+ },
919
+ "metadata": {},
920
+ "output_type": "display_data"
921
  }
922
  ],
923
  "source": [
 
954
  }
955
  },
956
  "outputs": [
 
 
 
 
 
 
 
 
957
  {
958
  "data": {
959
  "text/html": [
 
1031
  },
1032
  "metadata": {},
1033
  "output_type": "display_data"
 
 
 
 
 
 
 
 
 
 
1034
  }
1035
  ],
1036
  "source": [
 
1052
  " eval_dataset=lm_datasets[\"validation\"]\n",
1053
  ")\n",
1054
  "\n",
1055
+ "trainer.train()\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
1056
  "trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository"
1057
  ]
1058
  },
 
1074
  },
1075
  {
1076
  "cell_type": "code",
1077
+ "execution_count": 1,
1078
  "id": "6a35e80f",
1079
  "metadata": {
1080
  "slideshow": {
1081
  "slide_type": "fragment"
1082
  }
1083
  },
1084
+ "outputs": [
1085
+ {
1086
+ "name": "stdout",
1087
+ "output_type": "stream",
1088
+ "text": [
1089
+ "Aquí me pongo a cantar;\n",
1090
+ "y si tengo el sueño:\n",
1091
+ "de pronto se me ha quedado la sangre:\n",
1092
+ "como te asombre se me\n",
1093
+ "lo oí decir muchas veces,\n",
1094
+ "pero el tiempo me ha borrado.\n",
1095
+ "\n",
1096
+ "2\n",
1097
+ "Soy\n"
1098
+ ]
1099
+ }
1100
+ ],
1101
  "source": [
1102
  "import torch\n",
1103
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
1104
  "\n",
1105
+ "tokenizer = AutoTokenizer.from_pretrained(\"DeepESP/gpt2-spanish\")\n",
1106
+ "model = AutoModelForCausalLM.from_pretrained(\"crscardellino/flisol-cba-martin-fierro\")\n",
 
1107
  "\n",
1108
  "torch.manual_seed(42) # To ensure determinism\n",
1109
  "\n",