"""
Utilities file needed for the presentation.

flisol-cba-martin-fierro
Copyright (C) 2023 Cristian Cardellino

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""

from transformers import BatchEncoding, PreTrainedTokenizerBase
from typing import Callable, Dict, List


def tokenize(
    tokenizer: PreTrainedTokenizerBase, end_char: str = "\n"
) -> Callable[[Dict[str, List[str]]], BatchEncoding]:
    """
    Helper function that returns a function to use with the `map` method of
    `datasets.DatasetDict`. It takes a tokenizer and generates a function
    that applies that tokenizer with an optional `end_char` parameter (e.g. a
    newline) that might be needed, for instance, to keep the structure of a
    poem which needs the newline after each sentence. This is needed since
    `datasets.load_dataset` forcibly removes the newline characters.

    Parameters
    ----------
    tokenizer : PreTrainedTokenizerBase
        The tokenizer to use for the tokenization process.
    end_char : str
        The end character to append to each line.

    Returns
    -------
    Callable[[Dict[str, List[str]]], BatchEncoding]
        The function in charge of the tokenization process.
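
    Examples
    --------
    A minimal sketch of the intended usage; the checkpoint name and the data
    file below are placeholders, not part of this module::

        from datasets import load_dataset
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        dataset = load_dataset("text", data_files={"train": "martin_fierro.txt"})
        tokenized = dataset.map(
            tokenize(tokenizer),
            batched=True,
            remove_columns=["text"],
        )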
    """

    def _tokenize(examples: Dict[str, List[str]]) -> BatchEncoding:
        # Re-append `end_char` to every line, since `load_dataset` strips the
        # newline characters, then tokenize the whole batch.
        return tokenizer([f"{e}{end_char}" for e in examples["text"]])

    return _tokenize


def group_texts(examples: Dict[str, List[int]], block_size: int = 128) -> Dict[str, List[int]]:
    """
    Helper function to concatenate a tokenized dataset (tokenized with the
    function above) in chunks of `block_size`. The code was taken from
    https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb

    Parameters
    ----------
    examples : Dict[str, List[int]]
        This is actually a `LazyBatch` from the `datasets` library, given by
        the `DatasetDict.map` method. It should be the dataset returned after
        tokenization with the function returned by `tokenize`. It should have
        2 main keys: 'input_ids' and 'attention_mask'.
    block_size : int
        The size of the block to use in the training process. If the total
        length of the group of texts is not divisible by the block size, the
        remaining data is ignored for simplicity.

    Returns
    -------
    Dict[str, List[int]]
        The dictionary that will provide the new dataset divided into chunks
        of `block_size`.
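
    Examples
    --------
    A minimal sketch of the intended usage; `tokenized` stands for the
    `DatasetDict` produced by mapping the function returned by `tokenize`
    over the raw text dataset::

        lm_dataset = tokenized.map(group_texts, batched=True)

        # A different block size can be passed through `fn_kwargs`.
        lm_dataset = tokenized.map(
            group_texts,
            batched=True,
            fn_kwargs={"block_size": 256},
        )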
    """
    # Concatenate every tokenized example into a single long sequence per key
    # (e.g. 'input_ids' and 'attention_mask').
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples["input_ids"])

    # Drop the remainder so the total length is an exact multiple of
    # `block_size`.
    total_length = (total_length // block_size) * block_size

    # Split each concatenated sequence into chunks of `block_size`.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # For causal language modeling the labels are the input ids themselves;
    # the model shifts them internally when computing the loss.
    result["labels"] = result["input_ids"].copy()
    return result