crscardellino committed on
Commit 2b999a0
1 Parent(s): 72b71cf

utils module

Files changed (1)
  1. utils.py +79 -0
utils.py ADDED
@@ -0,0 +1,79 @@
+ """
+ Various utilities needed for the presentation.
+ """
+
+ from datasets import DatasetDict
+ from transformers import PreTrainedTokenizerBase
+ from typing import Callable, Dict, List
+
+
+ def tokenize(tokenizer: PreTrainedTokenizerBase,
+              end_char: str = '\n') -> Callable[[Dict[str, List[str]]], DatasetDict]:
+     """
+     Helper function that returns a function to use with the `map` method of
+     `datasets.DatasetDict`. It takes a tokenizer and generates a function that
+     applies that tokenizer with an optional `end_char` parameter (e.g. a
+     newline) that might be needed (e.g. when trying to tokenize a poem while
+     keeping its structure, which needs the newline after each line). This is
+     needed since the function `datasets.load_dataset` forcibly removes the
+     newline characters.
+
+     Parameters
+     ----------
+     tokenizer : PreTrainedTokenizerBase
+         The tokenizer to use for the tokenization process.
+     end_char : str
+         The end character to append to each line.
+
+     Returns
+     -------
+     Callable[[Dict[str, List[str]]], DatasetDict]
+         The function in charge of the tokenization process.
+
+     """
+     def _tokenize(examples: Dict[str, List[str]]) -> DatasetDict:
+         return tokenizer([f'{e}{end_char}' for e in examples['text']])
+
+     return _tokenize
+
+
+ def group_texts(examples: Dict[str, List[List[int]]],
+                 block_size: int = 128) -> Dict[str, List[List[int]]]:
+     """
+     Helper function to concatenate a dataset tokenized with the function above
+     and split it into chunks of `block_size`. The code was taken from
+     https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb
+
+     Parameters
+     ----------
+     examples : Dict[str, List[List[int]]]
+         This is actually a `LazyBatch` from the `datasets` library, which is
+         given by the `DatasetDict.map` method. It should be the dataset returned
+         after tokenization with the function returned by `tokenize`. It should
+         have 2 main keys: 'input_ids' and 'attention_mask'.
+     block_size : int
+         The size of the blocks to use in the training process. If the total
+         length of the grouped texts is not divisible by the block size, the
+         remaining data is dropped for simplicity.
+
+     Returns
+     -------
+     Dict[str, List[List[int]]]
+         The dictionary that will provide the new dataset divided into chunks of
+         `block_size`.
+     """
+     # Concatenate all texts.
+     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+     total_length = len(concatenated_examples['input_ids'])
+     # We drop the small remainder; we could add padding instead of this drop
+     # if the model supported it. You can customize this part to your needs.
+     total_length = (total_length // block_size) * block_size
+     # Split by chunks of block_size length.
+     result = {
+         k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
+         for k, t in concatenated_examples.items()
+     }
+     # Labels to be used in the training phase. The input ids are copied since
+     # the Transformers library will be in charge of shifting them to the right.
+     result["labels"] = result["input_ids"].copy()
+     return result
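
Usage note (not part of the commit): a minimal sketch of how these two helpers are meant to plug into `DatasetDict.map` for causal language modeling. It assumes a GPT-2 tokenizer and a local plain-text file; the model name, the 'poems.txt' path, and the `block_size` value are placeholders, not taken from this repository.

from datasets import load_dataset
from transformers import AutoTokenizer

from utils import group_texts, tokenize

# Placeholder tokenizer and data file; adjust to the actual presentation setup.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
dataset = load_dataset('text', data_files={'train': 'poems.txt'})

# Tokenize line by line, restoring the newline that `load_dataset` stripped.
# The 'text' column is dropped so only token ids reach `group_texts`.
tokenized = dataset.map(
    tokenize(tokenizer, end_char='\n'),
    batched=True,
    remove_columns=['text'],
)

# Concatenate everything and regroup it into fixed-size blocks for training.
lm_dataset = tokenized.map(
    lambda examples: group_texts(examples, block_size=128),
    batched=True,
)

Since `group_texts` already adds the copied `labels` column, the resulting splits can be fed directly to a causal language-model training loop or `Trainer`.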