crscardellino committed
Commit f781de0
1 Parent(s): 534773e

Added device to the steps

Files changed (4):
  1. .flake8 +8 -0
  2. chatbot.py +65 -55
  3. flisol-cordoba-2023.ipynb +35 -31
  4. utils.py +8 -7
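
The thread running through the diffs below is device placement: the model is moved onto a torch.device once, and every encoded prompt is moved to that same device before calling generate(). A minimal sketch of the pattern this commit applies (the model name is the repository's default; the prompt string is only illustrative):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pick a GPU when available, otherwise fall back to CPU (the same default the commit uses).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m").to(device)

# The input tensors must live on the same device as the model before generate().
input_ids = tokenizer.encode("Hola, ¿cómo estás?", return_tensors="pt").to(device)
output = model.generate(input_ids, max_length=input_ids.shape[1] + 20)
print(tokenizer.decode(output[0], skip_special_tokens=True))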
.flake8 ADDED
@@ -0,0 +1,8 @@
+[flake8]
+exclude =
+    __pycache__
+    migrations/
+    .venv/
+    *venv/
+max-line-length = 100
+extend-ignore = E203,E501,E701
chatbot.py CHANGED
@@ -17,8 +17,12 @@ prompt.
 import argparse
 import torch
 
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel,\
-    PreTrainedTokenizerBase
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+)
 from typing import Optional, Union
 
 
@@ -51,42 +55,47 @@ class ChatBot:
     bot_identifier : str
         The string that will identify the bot speaker in the prompt (e.g.
         EXPERT).
+    device: torch.device
+        Device to run the model
     """
 
-    def __init__(self,
-                 base_model: Union[str, PreTrainedModel],
-                 tokenizer: Optional[PreTrainedTokenizerBase] = None,
-                 initial_prompt: Optional[str] = None,
-                 keep_context: bool = False,
-                 creative: bool = False,
-                 max_tokens: int = 50,
-                 human_identifier: str = 'HUMAN',
-                 bot_identifier: str = 'EXPERT'):
+    def __init__(
+        self,
+        base_model: Union[str, PreTrainedModel],
+        tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        initial_prompt: Optional[str] = None,
+        keep_context: bool = False,
+        creative: bool = False,
+        max_tokens: int = 50,
+        human_identifier: str = "HUMAN",
+        bot_identifier: str = "EXPERT",
+        device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
+    ):
         if isinstance(base_model, str):
             self.model = AutoModelForCausalLM.from_pretrained(
-                base_model,
-                low_cpu_mem_usage=True,
-                torch_dtype='auto'
-            )
+                base_model, low_cpu_mem_usage=True, torch_dtype="auto"
+            ).to(device)
             self.tokenizer = AutoTokenizer.from_pretrained(base_model)
         else:
-            assert isinstance(tokenizer, PreTrainedTokenizerBase),\
-                "If the base model is given, the tokenizer should be given as well"
-            self.model = base_model
+            assert isinstance(
+                tokenizer, PreTrainedTokenizerBase
+            ), "If the base model is given, the tokenizer should be given as well"
+            self.model = base_model.to(device)
             self.tokenizer = tokenizer
 
         if initial_prompt is None:
-            with open('./prompt.txt', 'r') as fh:
+            with open("./prompt.txt", "r") as fh:
                 self.initial_prompt = fh.read()
         else:
            self.initial_prompt = initial_prompt
 
         self.keep_context = keep_context
-        self.context = ''
+        self.context = ""
         self.creative = creative
         self.max_tokens = max_tokens
         self.human_identifier = human_identifier
         self.bot_identifier = bot_identifier
+        self.device = device
 
     def chat(self, input_text: str) -> str:
         """
@@ -113,10 +122,10 @@ class ChatBot:
         # start the dialog between the human and the bot. Give space for the
         # model to continue from the prompt
         prompt = self.initial_prompt + self.context
-        prompt += f'{self.human_identifier}: {input_text}\n'
-        prompt += f'{self.bot_identifier}: ' # check the space after the colon
+        prompt += f"{self.human_identifier}: {input_text}\n"
+        prompt += f"{self.bot_identifier}: " # check the space after the colon
 
-        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
+        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
         if self.creative:
             # In case you want the bot to be creative, we sample using `top_k`
             # and `top_p`
@@ -125,13 +134,12 @@
                 do_sample=True,
                 max_length=input_ids.shape[1] + self.max_tokens,
                 top_k=50,
-                top_p=0.95
+                top_p=0.95,
             )[0]
         else:
             # Otherwise we return the most probable token
             output = self.model.generate(
-                input_ids,
-                max_length=input_ids.shape[1] + self.max_tokens
+                input_ids, max_length=input_ids.shape[1] + self.max_tokens
             )[0]
 
         # Decode the output, removing special tokens for the model (like
@@ -139,11 +147,11 @@ class ChatBot:
         decoded_output = self.tokenizer.decode(output, skip_special_tokens=True)
 
         # Trim the output, first by removing the original prompt
-        trimmed_output = decoded_output[len(prompt):]
+        trimmed_output = decoded_output[len(prompt) :]
 
         # Then we find the stop token, in this case the human identifier, and
         # we get up to that point
-        trimmed_output = trimmed_output[:trimmed_output.find(f'{self.human_identifier}:')]
+        trimmed_output = trimmed_output[: trimmed_output.find(f"{self.human_identifier}:")]
 
         if self.keep_context:
             # If we want to keep the context of the conversation we add the
@@ -153,36 +161,38 @@
         return trimmed_output.strip() # we only return the trimmed output
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model-name', '-m',
-                        default='bigscience/bloom-560m',
-                        help="Name of the base model to use for the chatbot")
-    parser.add_argument('--prompt', '-p',
-                        default='./prompt.txt',
-                        help="Path to the file with the prompt to use")
-    parser.add_argument('--keep-context', '-k',
-                        action='store_true',
-                        help="Keep context of the conversation.")
-    parser.add_argument('--creative', '-c',
-                        action='store_true',
-                        help="Make the bot creative when answering.")
-    parser.add_argument('--random-seed', '-r',
-                        default=42,
-                        help="Seed number for the creative bot.",
-                        type=int)
-    parser.add_argument('--human-identifier', '-i',
-                        default='HUMANO',
-                        help="Name of the human identifier.")
-    parser.add_argument('--bot-identifier', '-b',
-                        default='EXPERTO',
-                        help="Name of the bot identifier.")
+    parser.add_argument(
+        "--model-name",
+        "-m",
+        default="bigscience/bloom-560m",
+        help="Name of the base model to use for the chatbot",
+    )
+    parser.add_argument(
+        "--prompt", "-p", default="./prompt.txt", help="Path to the file with the prompt to use"
+    )
+    parser.add_argument(
+        "--keep-context", "-k", action="store_true", help="Keep context of the conversation."
+    )
+    parser.add_argument(
+        "--creative", "-c", action="store_true", help="Make the bot creative when answering."
+    )
+    parser.add_argument(
+        "--random-seed", "-r", default=42, help="Seed number for the creative bot.", type=int
+    )
+    parser.add_argument(
+        "--human-identifier", "-i", default="HUMANO", help="Name of the human identifier."
+    )
+    parser.add_argument(
+        "--bot-identifier", "-b", default="EXPERTO", help="Name of the bot identifier."
+    )
 
     args = parser.parse_args()
 
     torch.manual_seed(args.random_seed)
 
-    with open(args.prompt, 'r') as fh:
+    with open(args.prompt, "r") as fh:
         initial_prompt = fh.read()
 
     chatbot = ChatBot(
@@ -191,12 +201,12 @@ if __name__ == '__main__':
         keep_context=args.keep_context,
         creative=args.creative,
         human_identifier=args.human_identifier,
-        bot_identifier=args.bot_identifier
+        bot_identifier=args.bot_identifier,
     )
 
     print("Write `exit` or `quit` to quit")
     while True:
-        input_text = input('> ')
-        if input_text == 'exit' or input_text == 'quit':
+        input_text = input("> ")
+        if input_text == "exit" or input_text == "quit":
             break
         print(chatbot.chat(input_text))
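
With this change the ChatBot constructor accepts a device argument, defaulting to CUDA when available and CPU otherwise. An illustrative usage sketch (the question and the explicit CPU device are placeholders; ./prompt.txt is read for the initial prompt, as in the script defaults):

import torch
from chatbot import ChatBot

# Explicitly run on CPU; leaving `device` out selects CUDA automatically when available.
bot = ChatBot(
    base_model="bigscience/bloom-560m",
    keep_context=True,
    human_identifier="HUMANO",
    bot_identifier="EXPERTO",
    device=torch.device("cpu"),
)

# The question below is only an example; replace it with your own input.
print(bot.chat("¿Quién escribió el Martín Fierro?"))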
flisol-cordoba-2023.ipynb CHANGED
@@ -272,14 +272,11 @@
     "from IPython.display import display, HTML\n",
     "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
     "\n",
-    "BASE_MODEL = 'bigscience/bloom-3b' # More models at https://huggingface.co/models\n",
+    "BASE_MODEL = \"bigscience/bloom-3b\" # More models at https://huggingface.co/models\n",
     "\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
     "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
-    "model = AutoModelForCausalLM.from_pretrained(\n",
-    "    BASE_MODEL,\n",
-    "    low_cpu_mem_usage=True,\n",
-    "    torch_dtype='auto'\n",
-    ")"
+    "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, low_cpu_mem_usage=True, torch_dtype=\"auto\").to(device)"
    ]
   },
   {
@@ -409,7 +406,7 @@
    ],
    "source": [
     "MAX_TOKENS = 50\n",
-    "input_ids = tokenizer.encode(PROMPT, return_tensors='pt')\n",
+    "input_ids = tokenizer.encode(PROMPT, return_tensors=\"pt\").to(device)\n",
     "greedy_output = model.generate(input_ids, max_length=input_ids.shape[1] + MAX_TOKENS)\n",
     "output = tokenizer.decode(greedy_output[0], skip_special_tokens=True)\n",
     "\n",
@@ -525,17 +522,17 @@
     "\"\"\".strip()\n",
     "\n",
     "chatbot = ChatBot(\n",
-    "    base_model='bigscience/bloom-3b',\n",
+    "    base_model=\"bigscience/bloom-3b\",\n",
     "    initial_prompt=PROMPT,\n",
     "    keep_context=True,\n",
     "    creative=True,\n",
-    "    human_identifier='HUMANO',\n",
-    "    bot_identifier='EXPERTO'\n",
+    "    human_identifier=\"HUMANO\",\n",
+    "    bot_identifier=\"EXPERTO\",\n",
     ")\n",
     "\n",
     "while True:\n",
-    "    input_text = input('> ')\n",
-    "    if input_text == 'exit':\n",
+    "    input_text = input(\"> \")\n",
+    "    if input_text == \"exit\":\n",
     "        break\n",
     "    print(chatbot.chat(input_text))"
    ]
@@ -619,9 +616,9 @@
     "import torch\n",
     "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
     "\n",
-    "BASE_MODEL = 'DeepESP/gpt2-spanish' # We play with a smaller model\n",
+    "BASE_MODEL = \"DeepESP/gpt2-spanish\" # We play with a smaller model\n",
     "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
-    "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)"
+    "model = AutoModelForCausalLM.from_pretrained(BASE_MODEL).to(device)"
    ]
   },
   {
@@ -663,7 +660,7 @@
    "source": [
     "torch.manual_seed(42) # To ensure determinism\n",
     "\n",
-    "input_ids = tokenizer.encode(\"Aquí me pongo a cantar\", return_tensors='pt')\n",
+    "input_ids = tokenizer.encode(\"Aquí me pongo a cantar\", return_tensors=\"pt\").to(device)\n",
     "sampling_output = model.generate(input_ids, do_sample=True, max_length=50, top_k=50, top_p=0.95)\n",
     "output = tokenizer.decode(sampling_output[0], skip_special_tokens=True)\n",
     "\n",
@@ -716,9 +713,14 @@
    "source": [
     "from datasets import load_dataset\n",
     "\n",
-    "datasets = load_dataset('text', data_files={'train': './data/martin-fierro_train.txt',\n",
-    "                                            'validation': './data/martin-fierro_validation.txt'})\n",
-    "print('\\n'.join(datasets['train'][:9]['text']))"
+    "datasets = load_dataset(\n",
+    "    \"text\",\n",
+    "    data_files={\n",
+    "        \"train\": \"./data/martin-fierro_train.txt\",\n",
+    "        \"validation\": \"./data/martin-fierro_validation.txt\",\n",
+    "    },\n",
+    ")\n",
+    "print(\"\\n\".join(datasets[\"train\"][:9][\"text\"]))"
    ]
   },
   {
@@ -750,7 +752,9 @@
    "source": [
     "from utils import tokenize # local module in the repository\n",
     "\n",
-    "tokenized_datasets = datasets.map(tokenize(tokenizer), batched=True, num_proc=4, remove_columns=['text'])"
+    "tokenized_datasets = datasets.map(\n",
+    "    tokenize(tokenizer), batched=True, num_proc=4, remove_columns=[\"text\"]\n",
+    ")"
    ]
   },
   {
@@ -831,8 +835,8 @@
     }
    ],
    "source": [
-    "print(len(lm_datasets['train'][0]['input_ids']))\n",
-    "print(lm_datasets['train'][0]['input_ids'][:10])"
+    "print(len(lm_datasets[\"train\"][0][\"input_ids\"]))\n",
+    "print(lm_datasets[\"train\"][0][\"input_ids\"][:10])"
    ]
   },
   {
@@ -876,7 +880,7 @@
     }
    ],
    "source": [
-    "print(tokenizer.decode(lm_datasets['train'][0]['input_ids']))"
+    "print(tokenizer.decode(lm_datasets[\"train\"][0][\"input_ids\"]))"
    ]
   },
   {
@@ -1022,24 +1026,24 @@
     "from transformers import Trainer, TrainingArguments\n",
     "\n",
     "training_args = TrainingArguments(\n",
-    "    'flisol-cba-martin-fierro',\n",
-    "    evaluation_strategy='epoch',\n",
+    "    \"flisol-cba-martin-fierro\",\n",
+    "    evaluation_strategy=\"epoch\",\n",
     "    num_train_epochs=10,\n",
     "    learning_rate=2e-5,\n",
     "    weight_decay=0.01,\n",
-    "    logging_steps=5\n",
+    "    logging_steps=5,\n",
     ")\n",
     "\n",
     "trainer = Trainer(\n",
     "    model=model,\n",
     "    args=training_args,\n",
-    "    train_dataset=lm_datasets['train'],\n",
-    "    eval_dataset=lm_datasets['validation']\n",
+    "    train_dataset=lm_datasets[\"train\"],\n",
+    "    eval_dataset=lm_datasets[\"validation\"],\n",
    ")\n",
     "\n",
     "trainer.train()\n",
     "trainer.push_to_hub() # This pushes the trained model to Hugging Face model repository\n",
-    "tokenizer.push_to_hub('flisol-cba-martin-fierro')"
+    "tokenizer.push_to_hub(\"flisol-cba-martin-fierro\")"
    ]
   },
   {
@@ -1088,13 +1092,13 @@
     "import torch\n",
     "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
     "\n",
-    "MODEL = 'flisol-cba-martin-fierro'\n",
+    "MODEL = \"flisol-cba-martin-fierro\"\n",
     "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
-    "model = AutoModelForCausalLM.from_pretrained(MODEL)\n",
+    "model = AutoModelForCausalLM.from_pretrained(MODEL).to(device)\n",
     "\n",
     "torch.manual_seed(42) # To ensure determinism\n",
     "\n",
-    "input_ids = tokenizer.encode(\"Aquí me pongo a cantar\", return_tensors='pt')\n",
+    "input_ids = tokenizer.encode(\"Aquí me pongo a cantar\", return_tensors=\"pt\").to(device)\n",
     "sampling_output = model.generate(input_ids, do_sample=True, max_length=50, top_k=50, top_p=0.95)\n",
     "output = tokenizer.decode(sampling_output[0], skip_special_tokens=True)\n",
     "\n",
utils.py CHANGED
@@ -23,8 +23,9 @@ from transformers import PreTrainedTokenizerBase
 from typing import Callable, Dict, List
 
 
-def tokenize(tokenizer: PreTrainedTokenizerBase,
-             end_char: str = '\n') -> Callable[[Dict[str, List[str]]], DatasetDict]:
+def tokenize(
+    tokenizer: PreTrainedTokenizerBase, end_char: str = "\n"
+) -> Callable[[Dict[str, List[str]]], DatasetDict]:
     """
     Helper function that returns a function to use with the `map` method of
     datasets.DatasetDict. It takes a tokenizer and generates a function that
@@ -47,14 +48,14 @@ def tokenize(tokenizer: PreTrainedTokenizerBase,
         The function in charge of the tokenization process.
 
     """
+
     def _tokenize(examples: Dict[str, List[str]]) -> DatasetDict:
-        return tokenizer([f'{e}{end_char}' for e in examples['text']])
+        return tokenizer([f"{e}{end_char}" for e in examples["text"]])
 
     return _tokenize
 
 
-def group_texts(examples: Dict[str, List[int]],
-                block_size: int = 128) -> Dict[str, List[int]]:
+def group_texts(examples: Dict[str, List[int]], block_size: int = 128) -> Dict[str, List[int]]:
     """
     Helper function to concatenate a tokenized dataset (with the function above)
     in chunks of `block_size`. The code was taken from
@@ -80,13 +81,13 @@ def group_texts(examples: Dict[str, List[int]],
     """
     # Concatenate all texts.
     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
-    total_length = len(concatenated_examples['input_ids'])
+    total_length = len(concatenated_examples["input_ids"])
     # We drop the small remainder, we could add padding if the model supported
     # it instead of this drop, you can customize this part to your needs
     total_length = (total_length // block_size) * block_size
     # Split by chunks of block_size length
     result = {
-        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
         for k, t in concatenated_examples.items()
     }
     # labels to be used by the training phase, it copies since the Transformers
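
For context on how these helpers fit together: tokenize() returns a closure meant for datasets.map(), and group_texts() concatenates the tokenized ids and re-splits them into fixed-size blocks with labels copied from input_ids. The lm_datasets used in the notebook is presumably built by chaining the two; a minimal sketch under that assumption (paths and model name follow the repository defaults):

from datasets import load_dataset
from transformers import AutoTokenizer
from utils import group_texts, tokenize

tokenizer = AutoTokenizer.from_pretrained("DeepESP/gpt2-spanish")

datasets = load_dataset(
    "text",
    data_files={
        "train": "./data/martin-fierro_train.txt",
        "validation": "./data/martin-fierro_validation.txt",
    },
)

# Tokenize every line (tokenize() appends end_char, "\n" by default).
tokenized_datasets = datasets.map(
    tokenize(tokenizer), batched=True, num_proc=4, remove_columns=["text"]
)

# Assumption: this is how the notebook builds lm_datasets (not shown in this diff).
# Concatenate and re-split into 128-token blocks; labels are a copy of input_ids.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)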