|
|
|
"""Wikipedia Sentences""" |
|
|
|
from __future__ import absolute_import, division, print_function |
|
|
|
import os |
|
import json |
|
|
|
import datasets |
|
|
|
|
|
_DESCRIPTION = """\ |
|
Dataset of sentences from Wikipedia (from the [Optimus paper](https://arxiv.org/abs/2004.04092)). |
|
Each is of mex 64 words & <=256 GPT2 tokens. |
|
Each row is a tokenised sentence. |
|
{'token_ids': '{gpt2 token ids}'} |
|
This is to test the semantics of a Transformer-VAEs latent space by interpolating on sentences. |
|
""" |
|
|
|
# NOTE(review): neither constant below is referenced by the visible code, and
# NUM_SEGMENTS (5) does not match the number of URLs (10) -- confirm intent.
NUM_SEGMENTS = 5

# Google Drive share links for the dataset files, kept in their original order.
# NOTE(review): these are ".../view?usp=sharing" page links; presumably they
# must be converted to direct-download URLs before being handed to a download
# manager -- confirm.
DOWNLOAD_URLS = [
    'https://drive.google.com/file/d/13NnkYAhwszQxc1C5HHfThnF7c1cjzjAD/view?usp=sharing',
    'https://drive.google.com/file/d/14p6FHip_hGTXC-_7SYaK32BpEhZRDJI4/view?usp=sharing',
    'https://drive.google.com/file/d/1IaRfTFh51Wf_zPtK6tjE6xw-up_Z6EyN/view?usp=sharing',
    'https://drive.google.com/file/d/1KGhV397Xfej56uJ9H10xD7tfLdhWlg4q/view?usp=sharing',
    'https://drive.google.com/file/d/1LfsQ1s9wr1mBG3I1bbvnbyrYmnsrXxZt/view?usp=sharing',
    'https://drive.google.com/file/d/1OctFe_JPR0Ajh77FzWdfeYnWZinKl2sW/view?usp=sharing',
    'https://drive.google.com/file/d/1W-Yi8gHCcT8O5F4TcDHScH7pOb0GQZdu/view?usp=sharing',
    'https://drive.google.com/file/d/1jgHjnpe7Vk1pvRgfnH4S4KiRrpUQyqyp/view?usp=sharing',
    'https://drive.google.com/file/d/1oVst8RG8G2d21DL6q4DwO7aJxE1vA2fc/view?usp=sharing',
    'https://drive.google.com/file/d/1qwckIM8YBbU9bnArB6bAoStY3e9I1kqU/view?usp=sharing',
]
|
|
|
|
|
class WikiSentences(datasets.GeneratorBasedBuilder):
    """Sentences from Wikipedia.

    Every example is a dict with a single ``token_ids`` feature holding a list
    of int32 values.
    """

    # NOTE(review): no `_split_generators` method is defined in the visible part
    # of this file. The `datasets` builder API expects subclasses to provide
    # one; presumably it is still to be written -- confirm.

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="main",
            description="Run through json files one by one.",
        )
    ]

    def _info(self):
        """Return the dataset metadata (description, feature schema, homepage)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    'token_ids': [datasets.Value("int32")],
                }
            ),
            homepage="https://github.com/Fraser-Greenlee/transformer-vae",
        )

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from a JSON-lines file.

        Args:
            filepath: Path to a UTF-8 text file with one JSON document per line.

        Yields:
            Tuples of ``(line_index, parsed_json)``. ``line_index`` is the
            zero-based position of the line in the file.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        # The stock `datasets.BuilderConfig` used in BUILDER_CONFIGS does not
        # define `max_num_samples`, so read it defensively; `None` means
        # "no limit". A custom config that sets the attribute is still honoured.
        max_num_samples = getattr(self.config, "max_num_samples", None)

        num_yielded = 0
        with open(filepath, encoding="utf-8") as json_lines_file:
            for line_index, line in enumerate(json_lines_file):
                # Check the cap *before* yielding so that at most
                # `max_num_samples` examples are produced (a cap of 0 yields none).
                if max_num_samples is not None and num_yielded >= max_num_samples:
                    break
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of failing inside `json.loads`.
                if not line.strip():
                    continue
                yield line_index, json.loads(line)
                num_yielded += 1
|
|