File size: 3,893 Bytes
2b999a0
23c9c81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b999a0
 
 
 
 
 
 
f781de0
 
 
2b999a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f781de0
2b999a0
f781de0
2b999a0
 
 
 
f781de0
2b999a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f781de0
2b999a0
 
 
 
 
f781de0
2b999a0
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Utilities file needed for the presentation.

    flisol-cba-martin-fierro
    Copyright (C) 2023 Cristian Cardellino

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

from itertools import chain
from typing import Callable, Dict, List

from datasets import DatasetDict
from transformers import PreTrainedTokenizerBase


def tokenize(
    tokenizer: PreTrainedTokenizerBase, end_char: str = "\n"
) -> Callable[[Dict[str, List[str]]], DatasetDict]:
    """
    Build a batch-tokenization function suitable for the `map` method of
    datasets.DatasetDict.

    The returned function appends `end_char` to every entry found under the
    "text" key of the batch and then calls `tokenizer` on the resulting list.
    Re-adding a terminator (a newline by default) is useful when the line
    structure of the text matters (e.g. a poem, which needs the newline after
    each verse), since `datasets.load_dataset` strips the newline characters.

    Parameters
    ----------
    tokenizer : PreTrainedTokenizerBase
        The tokenizer to call on each batch of lines.
    end_char : str
        The string appended to each line before tokenization. Pass an empty
        string to tokenize the lines unchanged.

    Returns
    -------
    Callable[[Dict[str, List[str]]], DatasetDict]
        A function taking a batch with a "text" key and returning whatever
        the `tokenizer` call returns for the terminated lines.

    """

    def _tokenize(batch: Dict[str, List[str]]) -> DatasetDict:
        # Restore the line terminator on every entry before tokenizing.
        terminated_lines = []
        for line in batch["text"]:
            terminated_lines.append(f"{line}{end_char}")
        return tokenizer(terminated_lines)

    return _tokenize


def group_texts(
    examples: Dict[str, List[List[int]]], block_size: int = 128
) -> Dict[str, List[List[int]]]:
    """
    Helper function to concatenate a tokenized dataset (with the function above)
    and split it in chunks of `block_size`. The code was adapted from
    https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb

    Parameters
    ----------
    examples : Dict[str, List[List[int]]]
        A batch mapping each column name to a list of per-example sequences.
        In practice this is presumably the batch object handed over by
        `DatasetDict.map` after tokenization with the function returned by
        `tokenize`. It must contain the 'input_ids' key (typically alongside
        'attention_mask'); every column is concatenated and chunked the same
        way, using the length of 'input_ids' as reference.
    block_size : int
        The size of the block to use in the training process. If the total
        length of the group of texts is not divisible by the block size, the
        remaining data is ignored for simplicity.

    Returns
    -------
    Dict[str, List[List[int]]]
        A dictionary with the same keys as `examples`, each holding the
        concatenated column divided in chunks of `block_size`, plus a
        'labels' key holding a copy of the 'input_ids' chunk list.

    Raises
    ------
    KeyError
        If `examples` has no 'input_ids' key.
    ZeroDivisionError
        If `block_size` is 0.
    """
    # Concatenate all texts. `chain.from_iterable` flattens in linear time,
    # whereas `sum(list_of_lists, [])` re-copies the accumulated list on every
    # addition and becomes quadratic on large batches.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples["input_ids"])
    # We drop the small remainder, we could add padding if the model supported
    # it instead of this drop, you can customize this part to your needs
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size length
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # labels to be used by the training phase, it copies since the Transformers
    # library will be in charge of making the shift to the right
    result["labels"] = result["input_ids"].copy()
    return result