File size: 2,223 Bytes
2095da4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# unused
"""Wikipedia Sentences"""

from __future__ import absolute_import, division, print_function

import os
import json

import datasets


# Human-readable summary attached to the dataset's metadata.
_DESCRIPTION = """\
Dataset of sentences from Wikipedia (from the [Optimus paper](https://arxiv.org/abs/2004.04092)).
Each is of max 64 words & <=256 GPT2 tokens.
Each row is a tokenised sentence.
{'token_ids': '{gpt2 token ids}'}
This is to test the semantics of a Transformer-VAEs latent space by interpolating on sentences.
"""

# NOTE(review): not referenced anywhere in the visible code, and it does not
# match the number of entries in DOWNLOAD_URLS (10) — confirm intended meaning.
NUM_SEGMENTS = 5

# Google Drive share links for the data files, one entry per file.
# NOTE(review): these are share-page ("view") links; confirm they are usable
# as-is by whatever performs the download.
DOWNLOAD_URLS = [
    'https://drive.google.com/file/d/13NnkYAhwszQxc1C5HHfThnF7c1cjzjAD/view?usp=sharing',
    'https://drive.google.com/file/d/14p6FHip_hGTXC-_7SYaK32BpEhZRDJI4/view?usp=sharing',
    'https://drive.google.com/file/d/1IaRfTFh51Wf_zPtK6tjE6xw-up_Z6EyN/view?usp=sharing',
    'https://drive.google.com/file/d/1KGhV397Xfej56uJ9H10xD7tfLdhWlg4q/view?usp=sharing',
    'https://drive.google.com/file/d/1LfsQ1s9wr1mBG3I1bbvnbyrYmnsrXxZt/view?usp=sharing',
    'https://drive.google.com/file/d/1OctFe_JPR0Ajh77FzWdfeYnWZinKl2sW/view?usp=sharing',
    'https://drive.google.com/file/d/1W-Yi8gHCcT8O5F4TcDHScH7pOb0GQZdu/view?usp=sharing',
    'https://drive.google.com/file/d/1jgHjnpe7Vk1pvRgfnH4S4KiRrpUQyqyp/view?usp=sharing',
    'https://drive.google.com/file/d/1oVst8RG8G2d21DL6q4DwO7aJxE1vA2fc/view?usp=sharing',
    'https://drive.google.com/file/d/1qwckIM8YBbU9bnArB6bAoStY3e9I1kqU/view?usp=sharing',
]


class WikiSentences(datasets.GeneratorBasedBuilder):
    """Dataset builder for tokenised sentences from Wikipedia.

    Each example is a dict with a single ``token_ids`` feature holding a list
    of ``int32`` values.

    NOTE(review): no ``_split_generators`` method is defined in this class, so
    it presumably cannot build splits as written — confirm before use.
    """

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="main",
            description="Run through json files one by one.",
        )
    ]

    def _info(self):
        """Return the dataset metadata: description, feature schema and homepage."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    'token_ids': [datasets.Value("int32")],
                }
            ),
            homepage="https://github.com/Fraser-Greenlee/transformer-vae",
        )

    def _generate_examples(self, filepath):
        """Yield ``(index, example)`` pairs from a JSON-lines file.

        Args:
            filepath: Path to a UTF-8 text file with one JSON document per line.

        Yields:
            Tuples of the zero-based line index and the decoded JSON value.

        If the active config has a non-``None`` ``max_num_samples`` attribute,
        at most that many examples are produced; otherwise every line is read.
        """
        # The config declared in BUILDER_CONFIGS is created without a
        # ``max_num_samples`` field, so look it up defensively instead of
        # assuming it exists (a plain attribute access would raise).
        limit = getattr(self.config, "max_num_samples", None)

        with open(filepath, encoding="utf-8") as json_lines_file:
            for id_, line in enumerate(json_lines_file):
                # Check before yielding so exactly ``limit`` examples are
                # emitted rather than ``limit + 1``.
                if limit is not None and id_ >= limit:
                    break
                yield id_, json.loads(line)