crscardellino
committed on
Commit
•
dd15d4a
1
Parent(s):
2b999a0
Full blank notebook. Needs to be run with the training model ready
Browse files- flisol-cordoba-2023.ipynb +443 -14
flisol-cordoba-2023.ipynb
CHANGED
@@ -53,7 +53,7 @@
|
|
53 |
"1. [¿Qué hay detrás de ChatGPT?](#¿Qué-hay-detrás-de-ChatGPT?)\n",
|
54 |
"2. [Hugging Face](#Hugging-Face)\n",
|
55 |
"3. [Utilizando un modelo de Hugging Face](#Utilizando-un-modelo-de-Hugging-Face)\n",
|
56 |
-
"4. Personalizando un modelo de Hugging Face"
|
57 |
]
|
58 |
},
|
59 |
{
|
@@ -244,7 +244,7 @@
|
|
244 |
},
|
245 |
{
|
246 |
"cell_type": "markdown",
|
247 |
-
"id": "
|
248 |
"metadata": {
|
249 |
"slideshow": {
|
250 |
"slide_type": "subslide"
|
@@ -262,10 +262,10 @@
|
|
262 |
{
|
263 |
"cell_type": "code",
|
264 |
"execution_count": null,
|
265 |
-
"id": "
|
266 |
"metadata": {
|
267 |
"slideshow": {
|
268 |
-
"slide_type": "
|
269 |
}
|
270 |
},
|
271 |
"outputs": [],
|
@@ -287,7 +287,7 @@
|
|
287 |
},
|
288 |
{
|
289 |
"cell_type": "markdown",
|
290 |
-
"id": "
|
291 |
"metadata": {
|
292 |
"slideshow": {
|
293 |
"slide_type": "subslide"
|
@@ -308,7 +308,7 @@
|
|
308 |
},
|
309 |
{
|
310 |
"cell_type": "markdown",
|
311 |
-
"id": "
|
312 |
"metadata": {
|
313 |
"slideshow": {
|
314 |
"slide_type": "subslide"
|
@@ -331,7 +331,7 @@
|
|
331 |
{
|
332 |
"cell_type": "code",
|
333 |
"execution_count": null,
|
334 |
-
"id": "
|
335 |
"metadata": {
|
336 |
"slideshow": {
|
337 |
"slide_type": "subslide"
|
@@ -353,7 +353,7 @@
|
|
353 |
},
|
354 |
{
|
355 |
"cell_type": "markdown",
|
356 |
-
"id": "
|
357 |
"metadata": {
|
358 |
"slideshow": {
|
359 |
"slide_type": "subslide"
|
@@ -370,7 +370,7 @@
|
|
370 |
{
|
371 |
"cell_type": "code",
|
372 |
"execution_count": null,
|
373 |
-
"id": "
|
374 |
"metadata": {
|
375 |
"slideshow": {
|
376 |
"slide_type": "fragment"
|
@@ -388,7 +388,7 @@
|
|
388 |
},
|
389 |
{
|
390 |
"cell_type": "markdown",
|
391 |
-
"id": "
|
392 |
"metadata": {
|
393 |
"slideshow": {
|
394 |
"slide_type": "subslide"
|
@@ -407,7 +407,7 @@
|
|
407 |
{
|
408 |
"cell_type": "code",
|
409 |
"execution_count": null,
|
410 |
-
"id": "
|
411 |
"metadata": {
|
412 |
"slideshow": {
|
413 |
"slide_type": "fragment"
|
@@ -431,7 +431,7 @@
|
|
431 |
},
|
432 |
{
|
433 |
"cell_type": "markdown",
|
434 |
-
"id": "
|
435 |
"metadata": {
|
436 |
"slideshow": {
|
437 |
"slide_type": "subslide"
|
@@ -453,7 +453,7 @@
|
|
453 |
{
|
454 |
"cell_type": "code",
|
455 |
"execution_count": null,
|
456 |
-
"id": "
|
457 |
"metadata": {
|
458 |
"slideshow": {
|
459 |
"slide_type": "subslide"
|
@@ -461,7 +461,7 @@
|
|
461 |
},
|
462 |
"outputs": [],
|
463 |
"source": [
|
464 |
-
"from chatbot import ChatBot\n",
|
465 |
"\n",
|
466 |
"PROMPT = \"\"\"\n",
|
467 |
"La siguiente es una conversación entre un HUMANO y un bot EXPERTO en software libre.\n",
|
@@ -488,6 +488,435 @@
|
|
488 |
" break\n",
|
489 |
" print(chatbot.chat(input_text))"
|
490 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
491 |
}
|
492 |
],
|
493 |
"metadata": {
|
|
|
53 |
"1. [¿Qué hay detrás de ChatGPT?](#¿Qué-hay-detrás-de-ChatGPT?)\n",
|
54 |
"2. [Hugging Face](#Hugging-Face)\n",
|
55 |
"3. [Utilizando un modelo de Hugging Face](#Utilizando-un-modelo-de-Hugging-Face)\n",
|
56 |
+
"4. [Personalizando un modelo de Hugging Face](#Personalizando-un-modelo-de-Hugging-Face)"
|
57 |
]
|
58 |
},
|
59 |
{
|
|
|
244 |
},
|
245 |
{
|
246 |
"cell_type": "markdown",
|
247 |
+
"id": "d06c7318",
|
248 |
"metadata": {
|
249 |
"slideshow": {
|
250 |
"slide_type": "subslide"
|
|
|
262 |
{
|
263 |
"cell_type": "code",
|
264 |
"execution_count": null,
|
265 |
+
"id": "0e0d53be",
|
266 |
"metadata": {
|
267 |
"slideshow": {
|
268 |
+
"slide_type": "fragment"
|
269 |
}
|
270 |
},
|
271 |
"outputs": [],
|
|
|
287 |
},
|
288 |
{
|
289 |
"cell_type": "markdown",
|
290 |
+
"id": "022de9b5",
|
291 |
"metadata": {
|
292 |
"slideshow": {
|
293 |
"slide_type": "subslide"
|
|
|
308 |
},
|
309 |
{
|
310 |
"cell_type": "markdown",
|
311 |
+
"id": "6e6b4464",
|
312 |
"metadata": {
|
313 |
"slideshow": {
|
314 |
"slide_type": "subslide"
|
|
|
331 |
{
|
332 |
"cell_type": "code",
|
333 |
"execution_count": null,
|
334 |
+
"id": "c1227c49",
|
335 |
"metadata": {
|
336 |
"slideshow": {
|
337 |
"slide_type": "subslide"
|
|
|
353 |
},
|
354 |
{
|
355 |
"cell_type": "markdown",
|
356 |
+
"id": "82988db2",
|
357 |
"metadata": {
|
358 |
"slideshow": {
|
359 |
"slide_type": "subslide"
|
|
|
370 |
{
|
371 |
"cell_type": "code",
|
372 |
"execution_count": null,
|
373 |
+
"id": "11bec6de",
|
374 |
"metadata": {
|
375 |
"slideshow": {
|
376 |
"slide_type": "fragment"
|
|
|
388 |
},
|
389 |
{
|
390 |
"cell_type": "markdown",
|
391 |
+
"id": "ba05a269",
|
392 |
"metadata": {
|
393 |
"slideshow": {
|
394 |
"slide_type": "subslide"
|
|
|
407 |
{
|
408 |
"cell_type": "code",
|
409 |
"execution_count": null,
|
410 |
+
"id": "dc66f288",
|
411 |
"metadata": {
|
412 |
"slideshow": {
|
413 |
"slide_type": "fragment"
|
|
|
431 |
},
|
432 |
{
|
433 |
"cell_type": "markdown",
|
434 |
+
"id": "98bdd38e",
|
435 |
"metadata": {
|
436 |
"slideshow": {
|
437 |
"slide_type": "subslide"
|
|
|
453 |
{
|
454 |
"cell_type": "code",
|
455 |
"execution_count": null,
|
456 |
+
"id": "a3e12232",
|
457 |
"metadata": {
|
458 |
"slideshow": {
|
459 |
"slide_type": "subslide"
|
|
|
461 |
},
|
462 |
"outputs": [],
|
463 |
"source": [
|
464 |
+
"from chatbot import ChatBot # local module in the repository\n",
|
465 |
"\n",
|
466 |
"PROMPT = \"\"\"\n",
|
467 |
"La siguiente es una conversación entre un HUMANO y un bot EXPERTO en software libre.\n",
|
|
|
488 |
" break\n",
|
489 |
" print(chatbot.chat(input_text))"
|
490 |
]
|
491 |
+
},
|
492 |
+
{
|
493 |
+
"cell_type": "markdown",
|
494 |
+
"id": "6e570fc3",
|
495 |
+
"metadata": {
|
496 |
+
"slideshow": {
|
497 |
+
"slide_type": "slide"
|
498 |
+
}
|
499 |
+
},
|
500 |
+
"source": [
|
501 |
+
"# Personalizando un modelo de Hugging Face"
|
502 |
+
]
|
503 |
+
},
|
504 |
+
{
|
505 |
+
"cell_type": "markdown",
|
506 |
+
"id": "adb09645",
|
507 |
+
"metadata": {
|
508 |
+
"slideshow": {
|
509 |
+
"slide_type": "subslide"
|
510 |
+
}
|
511 |
+
},
|
512 |
+
"source": [
|
513 |
+
"## ¿Cómo se entrenan los LLMs?\n",
|
514 |
+
"\n",
|
515 |
+
"- Para entrenar LLMs se requiere de muchos datos y mucho cómputo.\n",
|
516 |
+
"- El modelo de GPT-3 se estima que tuvo un costo de entrenamiento cercano a los U$D4.6 Millones\n",
|
517 |
+
" - Requirió de varias semanas de entrenamiento\n",
|
518 |
+
" - El corpus reportado es de aproximadamente 500 mil millones (billions) de palabras.\n",
|
519 |
+
" - Varios GPUS para entrenarlo y hardware especializado\n",
|
520 |
+
" - No son modelos que entren en la memoria de una sola GPU."
|
521 |
+
]
|
522 |
+
},
|
523 |
+
{
|
524 |
+
"cell_type": "markdown",
|
525 |
+
"id": "16a26206",
|
526 |
+
"metadata": {
|
527 |
+
"slideshow": {
|
528 |
+
"slide_type": "fragment"
|
529 |
+
}
|
530 |
+
},
|
531 |
+
"source": [
|
532 |
+
"### ¿Y entonces qué puedo hacer?\n",
|
533 |
+
"\n",
|
534 |
+
"- Una ventaja de los LLMs es que el entrenamiento es sobre texto libre, pero se puede **especializar**.\n",
|
535 |
+
"- Uno puede entrenar modelos para diversas tareas especializados en corpus más chico.\n",
|
536 |
+
"- El hecho de que no sea \"desde cero\" ayuda a evitar sobreajuste (overfit) y tiene buen desempeño.\n",
|
537 |
+
"- El procedimiento de **especialización** se conoce como **fine-tuning**."
|
538 |
+
]
|
539 |
+
},
|
540 |
+
{
|
541 |
+
"cell_type": "markdown",
|
542 |
+
"id": "649e2ef4",
|
543 |
+
"metadata": {
|
544 |
+
"slideshow": {
|
545 |
+
"slide_type": "subslide"
|
546 |
+
}
|
547 |
+
},
|
548 |
+
"source": [
|
549 |
+
"## ¿Cómo personalizar un modelo de Hugging Face?\n",
|
550 |
+
"\n",
|
551 |
+
"- Se inicia por algún modelo pre-entrenado para la tarea específica que uno busca (e.g. clasificación, generación, etc).\n",
|
552 |
+
"- Se toma un corpus especializado (anotado, revisado, etc.) y se entrena utilizando dicho corpus.\n",
|
553 |
+
"- Intentaremos [entrenar un modelo para que genere texto](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb) con el estilo del **Martín Fierro**.\n",
|
554 |
+
"- Para hacerlo menos pesado, utilizaremos un modelo más chico `DeepESP/gpt2-spanish` como base.\n"
|
555 |
+
]
|
556 |
+
},
|
557 |
+
{
|
558 |
+
"cell_type": "code",
|
559 |
+
"execution_count": null,
|
560 |
+
"id": "17f2884d",
|
561 |
+
"metadata": {
|
562 |
+
"slideshow": {
|
563 |
+
"slide_type": "fragment"
|
564 |
+
}
|
565 |
+
},
|
566 |
+
"outputs": [],
|
567 |
+
"source": [
|
568 |
+
"import torch\n",
|
569 |
+
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
|
570 |
+
"\n",
|
571 |
+
"BASE_MODEL = \"DeepESP/gpt2-spanish\" # We play with a smaller model\n",
|
572 |
+
"tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
|
573 |
+
"model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)"
|
574 |
+
]
|
575 |
+
},
|
576 |
+
{
|
577 |
+
"cell_type": "markdown",
|
578 |
+
"id": "16690597",
|
579 |
+
"metadata": {
|
580 |
+
"slideshow": {
|
581 |
+
"slide_type": "subslide"
|
582 |
+
}
|
583 |
+
},
|
584 |
+
"source": [
|
585 |
+
"### Probando el Modelo Base\n",
|
586 |
+
"\n",
|
587 |
+
"- Antes de ajustar el modelo vemos cómo se desenvuelve si le damos como entrada el primer verso del \"Martín Fierro\"."
|
588 |
+
]
|
589 |
+
},
|
590 |
+
{
|
591 |
+
"cell_type": "code",
|
592 |
+
"execution_count": null,
|
593 |
+
"id": "322a4a9b",
|
594 |
+
"metadata": {
|
595 |
+
"slideshow": {
|
596 |
+
"slide_type": "fragment"
|
597 |
+
}
|
598 |
+
},
|
599 |
+
"outputs": [],
|
600 |
+
"source": [
|
601 |
+
"torch.manual_seed(42) # To ensure determinism\n",
|
602 |
+
"\n",
|
603 |
+
"input_ids = tokenizer.encode(\"Aquí me pongo a cantar\", return_tensors='pt')\n",
|
604 |
+
"sampling_output = model.generate(input_ids, do_sample=True, max_length=50, top_k=50, top_p=0.95)\n",
|
605 |
+
"output = tokenizer.decode(sampling_output[0], skip_special_tokens=True)\n",
|
606 |
+
"\n",
|
607 |
+
"print(output)"
|
608 |
+
]
|
609 |
+
},
|
610 |
+
{
|
611 |
+
"cell_type": "markdown",
|
612 |
+
"id": "ec722e81",
|
613 |
+
"metadata": {
|
614 |
+
"slideshow": {
|
615 |
+
"slide_type": "subslide"
|
616 |
+
}
|
617 |
+
},
|
618 |
+
"source": [
|
619 |
+
"### Cargando el Dataset\n",
|
620 |
+
"\n",
|
621 |
+
"- Utilizamos la librería [datasets](https://huggingface.co/docs/datasets/index) de Hugging Face para cargar el corpus.\n",
|
622 |
+
"- En el directorio [`./data`](./data) tenemos dos archivos: [`martin-fierro_train.txt`](./data/martin-fierro_train.txt) y [`martin-fierro_validation.txt`](./data/martin-fierro_validation.txt).\n",
|
623 |
+
" - El archivo de entrenamiento es sobre las 12 primeras partes.\n",
|
624 |
+
" - El archivo de validación es sobre la última parte."
|
625 |
+
]
|
626 |
+
},
|
627 |
+
{
|
628 |
+
"cell_type": "code",
|
629 |
+
"execution_count": null,
|
630 |
+
"id": "5a27197e",
|
631 |
+
"metadata": {
|
632 |
+
"slideshow": {
|
633 |
+
"slide_type": "fragment"
|
634 |
+
}
|
635 |
+
},
|
636 |
+
"outputs": [],
|
637 |
+
"source": [
|
638 |
+
"from datasets import load_dataset\n",
|
639 |
+
"\n",
|
640 |
+
"datasets = load_dataset(\"text\", data_files={\"train\": './data/martin-fierro_train.txt',\n",
|
641 |
+
" \"validation\": './data/martin-fierro_validation.txt'})\n",
|
642 |
+
"print('\\n'.join(datasets[\"train\"][:9]['text']))"
|
643 |
+
]
|
644 |
+
},
|
645 |
+
{
|
646 |
+
"cell_type": "markdown",
|
647 |
+
"id": "9504707f",
|
648 |
+
"metadata": {
|
649 |
+
"slideshow": {
|
650 |
+
"slide_type": "subslide"
|
651 |
+
}
|
652 |
+
},
|
653 |
+
"source": [
|
654 |
+
"### Tokenizando los datos\n",
|
655 |
+
"\n",
|
656 |
+
"- La función auxiliar `tokenize` del módulo [`utils`](./utils.py) sirve para tokenizar y codificar el conjunto de datos de manera eficiente mediante el [método `map`](https://huggingface.co/docs/datasets/about_map_batch).\n",
|
657 |
+
"- Lo que devuelve es un nuevo dataset cuyos tokens están convertidos en índices del vocabulario y [máscaras de atención](https://huggingface.co/docs/transformers/glossary#attention-mask)"
|
658 |
+
]
|
659 |
+
},
|
660 |
+
{
|
661 |
+
"cell_type": "code",
|
662 |
+
"execution_count": null,
|
663 |
+
"id": "33059c5f",
|
664 |
+
"metadata": {
|
665 |
+
"scrolled": true,
|
666 |
+
"slideshow": {
|
667 |
+
"slide_type": "fragment"
|
668 |
+
}
|
669 |
+
},
|
670 |
+
"outputs": [],
|
671 |
+
"source": [
|
672 |
+
"from utils import tokenize # local module in the repository\n",
|
673 |
+
"\n",
|
674 |
+
"tokenized_datasets = datasets.map(tokenize(tokenizer), batched=True, num_proc=4, remove_columns=[\"text\"])"
|
675 |
+
]
|
676 |
+
},
|
677 |
+
{
|
678 |
+
"cell_type": "markdown",
|
679 |
+
"id": "81d67b22",
|
680 |
+
"metadata": {
|
681 |
+
"slideshow": {
|
682 |
+
"slide_type": "subslide"
|
683 |
+
}
|
684 |
+
},
|
685 |
+
"source": [
|
686 |
+
"### Agrupando los textos\n",
|
687 |
+
"\n",
|
688 |
+
"- Para entrenar de manera más eficiente es común utilizar lo que se conoce como **mini-batch gradient descent**.\n",
|
689 |
+
"- La idea es tomar los textos de a bloques de un valor máximo.\n",
|
690 |
+
" - El valor máximo estará limitado por la memoria de la unidad de procesamiento (e.g. GPU).\n",
|
691 |
+
"- Utilizamos la función auxiliar `group_texts` del módulo [`utils`](./utils.py).\n",
|
692 |
+
" - La función además establece las etiquetas que utilizará Hugging Face para entrenar.\n",
|
693 |
+
" - En este caso las etiquetas son las mismas palabras, porque busca predecir la palabra siguiente."
|
694 |
+
]
|
695 |
+
},
|
696 |
+
{
|
697 |
+
"cell_type": "code",
|
698 |
+
"execution_count": null,
|
699 |
+
"id": "3100e195",
|
700 |
+
"metadata": {
|
701 |
+
"scrolled": true,
|
702 |
+
"slideshow": {
|
703 |
+
"slide_type": "fragment"
|
704 |
+
}
|
705 |
+
},
|
706 |
+
"outputs": [],
|
707 |
+
"source": [
|
708 |
+
"from functools import partial\n",
|
709 |
+
"from utils import group_texts # local module in the repository\n",
|
710 |
+
"\n",
|
711 |
+
"lm_datasets = tokenized_datasets.map(\n",
|
712 |
+
" partial(group_texts, block_size=128),\n",
|
713 |
+
" batched=True,\n",
|
714 |
+
" batch_size=1024,\n",
|
715 |
+
" num_proc=4,\n",
|
716 |
+
")"
|
717 |
+
]
|
718 |
+
},
|
719 |
+
{
|
720 |
+
"cell_type": "markdown",
|
721 |
+
"id": "d64a23ec",
|
722 |
+
"metadata": {
|
723 |
+
"slideshow": {
|
724 |
+
"slide_type": "subslide"
|
725 |
+
}
|
726 |
+
},
|
727 |
+
"source": [
|
728 |
+
"### Decodificando\n",
|
729 |
+
"\n",
|
730 |
+
"- Podemos ver que los textos pasan a estar agrupados en bloques de 128 tokens.\n",
|
731 |
+
"- Además, vemos que el texto fue reemplazado por números (índices en el vocabulario).\n",
|
732 |
+
"- Por último, si decodificamos estos números, obtenemos el texto original."
|
733 |
+
]
|
734 |
+
},
|
735 |
+
{
|
736 |
+
"cell_type": "code",
|
737 |
+
"execution_count": null,
|
738 |
+
"id": "b9d33b7b",
|
739 |
+
"metadata": {
|
740 |
+
"slideshow": {
|
741 |
+
"slide_type": "fragment"
|
742 |
+
}
|
743 |
+
},
|
744 |
+
"outputs": [],
|
745 |
+
"source": [
|
746 |
+
"print(len(lm_datasets['train'][0]['input_ids']))\n",
|
747 |
+
"print(lm_datasets['train'][0]['input_ids'][:10])"
|
748 |
+
]
|
749 |
+
},
|
750 |
+
{
|
751 |
+
"cell_type": "code",
|
752 |
+
"execution_count": null,
|
753 |
+
"id": "7dfb316d",
|
754 |
+
"metadata": {
|
755 |
+
"scrolled": false,
|
756 |
+
"slideshow": {
|
757 |
+
"slide_type": "fragment"
|
758 |
+
}
|
759 |
+
},
|
760 |
+
"outputs": [],
|
761 |
+
"source": [
|
762 |
+
"print(tokenizer.decode(lm_datasets[\"train\"][0][\"input_ids\"]))"
|
763 |
+
]
|
764 |
+
},
|
765 |
+
{
|
766 |
+
"cell_type": "markdown",
|
767 |
+
"id": "d7e2032f",
|
768 |
+
"metadata": {
|
769 |
+
"slideshow": {
|
770 |
+
"slide_type": "subslide"
|
771 |
+
}
|
772 |
+
},
|
773 |
+
"source": [
|
774 |
+
"### Compartir Modelo en Hugging Face\n",
|
775 |
+
"\n",
|
776 |
+
"- Una opción a la hora de entrenar un modelo es subirlo a Hugging Face para compartirlo con la comunidad.\n",
|
777 |
+
"- Para eso, una vez que tengan la cuenta de Hugging Face, y creado el modelo, hay que hacer login mediante un [token](https://huggingface.co/settings/tokens)."
|
778 |
+
]
|
779 |
+
},
|
780 |
+
{
|
781 |
+
"cell_type": "code",
|
782 |
+
"execution_count": null,
|
783 |
+
"id": "a8b90ba2",
|
784 |
+
"metadata": {
|
785 |
+
"slideshow": {
|
786 |
+
"slide_type": "fragment"
|
787 |
+
}
|
788 |
+
},
|
789 |
+
"outputs": [],
|
790 |
+
"source": [
|
791 |
+
"from huggingface_hub import notebook_login\n",
|
792 |
+
"\n",
|
793 |
+
"notebook_login()"
|
794 |
+
]
|
795 |
+
},
|
796 |
+
{
|
797 |
+
"cell_type": "markdown",
|
798 |
+
"id": "a6b775d3",
|
799 |
+
"metadata": {
|
800 |
+
"slideshow": {
|
801 |
+
"slide_type": "subslide"
|
802 |
+
}
|
803 |
+
},
|
804 |
+
"source": [
|
805 |
+
"### Entrenamiento\n",
|
806 |
+
"\n",
|
807 |
+
"- Una vez definido el conjunto de datos, pasamos a la parte más intensa computacionalmente, el entrenamiento.\n",
|
808 |
+
"- Podemos decidir guardar el modelo localmente o hacer un backup de cada época del modelo en Hugging Face.\n",
|
809 |
+
"- Definimos las propiedades del entrenamiento mediante [`TrainingArguments`](https://huggingface.co/docs/transformers/v4.28.1/en/main_classes/trainer#transformers.TrainingArguments).\n",
|
810 |
+
"- Definimos el entrenamiento del modelo mediante [`Trainer`](https://huggingface.co/docs/transformers/v4.28.1/en/main_classes/trainer#transformers.Trainer).\n",
|
811 |
+
" - El entrenamiento tardará desde unos segundos hasta varios minutos dependiendo el poder de cómputo."
|
812 |
+
]
|
813 |
+
},
|
814 |
+
{
|
815 |
+
"cell_type": "code",
|
816 |
+
"execution_count": null,
|
817 |
+
"id": "d43c5555",
|
818 |
+
"metadata": {
|
819 |
+
"slideshow": {
|
820 |
+
"slide_type": "subslide"
|
821 |
+
}
|
822 |
+
},
|
823 |
+
"outputs": [],
|
824 |
+
"source": [
|
825 |
+
"from transformers import Trainer, TrainingArguments\n",
|
826 |
+
"\n",
|
827 |
+
"training_args = TrainingArguments(\n",
|
828 |
+
" \"flisol-cba-martinfierro\",\n",
|
829 |
+
" evaluation_strategy=\"epoch\",\n",
|
830 |
+
" num_train_epochs=15,\n",
|
831 |
+
" learning_rate=2e-5,\n",
|
832 |
+
" weight_decay=0.01,\n",
|
833 |
+
" logging_steps=5\n",
|
834 |
+
")\n",
|
835 |
+
"\n",
|
836 |
+
"trainer = Trainer(\n",
|
837 |
+
" model=model,\n",
|
838 |
+
" args=training_args,\n",
|
839 |
+
" train_dataset=lm_datasets[\"train\"],\n",
|
840 |
+
" eval_dataset=lm_datasets[\"validation\"]\n",
|
841 |
+
")\n",
|
842 |
+
"\n",
|
843 |
+
"trainer.train()\n",
|
844 |
+
"trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository"
|
845 |
+
]
|
846 |
+
},
|
847 |
+
{
|
848 |
+
"cell_type": "markdown",
|
849 |
+
"id": "db2099f4",
|
850 |
+
"metadata": {
|
851 |
+
"slideshow": {
|
852 |
+
"slide_type": "subslide"
|
853 |
+
}
|
854 |
+
},
|
855 |
+
"source": [
|
856 |
+
"### Probando el Nuevo Modelo\n",
|
857 |
+
"\n",
|
858 |
+
"- Ahora que tenemos el modelo entrenado, la pregunta es, ¿Cómo se comportará?\n",
|
859 |
+
"- Para ello volvemos a hacer la prueba anterior, quizás esta vez con mejores resultados.\n",
|
860 |
+
" - Para evitar tener que entrenar el modelo nuevamente directamente tomo el [modelo compartido en Hugging Face](https://huggingface.co/crscardellino/flisol-cba-martin-fierro)."
|
861 |
+
]
|
862 |
+
},
|
863 |
+
{
|
864 |
+
"cell_type": "code",
|
865 |
+
"execution_count": null,
|
866 |
+
"id": "6a35e80f",
|
867 |
+
"metadata": {
|
868 |
+
"slideshow": {
|
869 |
+
"slide_type": "fragment"
|
870 |
+
}
|
871 |
+
},
|
872 |
+
"outputs": [],
|
873 |
+
"source": [
|
874 |
+
"import torch\n",
|
875 |
+
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
|
876 |
+
"\n",
|
877 |
+
"BASE_MODEL = \"crscardellino/flisol-cba-martin-fierro\"\n",
|
878 |
+
"tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
|
879 |
+
"model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)\n",
|
880 |
+
"\n",
|
881 |
+
"torch.manual_seed(42) # To ensure determinism\n",
|
882 |
+
"\n",
|
883 |
+
"input_ids = tokenizer.encode(\"Aquí me pongo a cantar\", return_tensors='pt')\n",
|
884 |
+
"sampling_output = model.generate(input_ids, do_sample=True, max_length=50, top_k=50, top_p=0.95)\n",
|
885 |
+
"output = tokenizer.decode(sampling_output[0], skip_special_tokens=True)\n",
|
886 |
+
"\n",
|
887 |
+
"print(output)"
|
888 |
+
]
|
889 |
+
},
|
890 |
+
{
|
891 |
+
"cell_type": "markdown",
|
892 |
+
"id": "f4e33157",
|
893 |
+
"metadata": {
|
894 |
+
"slideshow": {
|
895 |
+
"slide_type": "slide"
|
896 |
+
}
|
897 |
+
},
|
898 |
+
"source": [
|
899 |
+
"# ¡Muchas Gracias!"
|
900 |
+
]
|
901 |
+
},
|
902 |
+
{
|
903 |
+
"cell_type": "markdown",
|
904 |
+
"id": "f04a4e4a",
|
905 |
+
"metadata": {
|
906 |
+
"slideshow": {
|
907 |
+
"slide_type": "subslide"
|
908 |
+
}
|
909 |
+
},
|
910 |
+
"source": [
|
911 |
+
"## ¿Preguntas?\n",
|
912 |
+
"\n",
|
913 |
+
"* Twitter: https://twitter.com/crscardellino\n",
|
914 |
+
"* Mastodon: https://mastodon.social/@crscardellino\n",
|
915 |
+
"* LinkedIn: https://www.linkedin.com/in/crscardellino\n",
|
916 |
+
"* Página Personal: https://crscardellino.ar / https://crscardellino.github.io\n",
|
917 |
+
"* GitHub: https://github.com/crscardellino/\n",
|
918 |
+
"* Código y modelo de la presentación: https://huggingface.co/crscardellino/flisol-cba-martin-fierro/"
|
919 |
+
]
|
920 |
}
|
921 |
],
|
922 |
"metadata": {
|