# bertin-roberta-base-spanish/utils/generate_datasets.py
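"""Generate perplexity-sampled subsets of Spanish mC4 for BERTIN.

Three concatenated runs below each export 50M training and 5M validation
documents as JSONL, using a custom ./mc4 loading script that subsamples
documents by KenLM perplexity: stepwise, Gaussian, and random sampling.
"""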
import json
import logging
from datasets import load_dataset
from tqdm import tqdm
# Set up logging and a module-level logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level="INFO",
    datefmt="[%X]",
)
logger = logging.getLogger(__name__)
# Download the Spanish KenLM model used to score document perplexity
# (original notebook cell: !wget http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin).
import urllib.request

urllib.request.urlretrieve("http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin", "es.arpa.bin")
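# Stepwise ("steps") sampling: each document's keep probability depends on
# which perplexity bucket it falls into. The `boundaries` values are
# presumably the perplexity quartile boundaries measured over mC4-es, and
# `sampling_factor` scales the per-bucket probabilities. These kwargs are
# consumed by the custom ./mc4 loading script, not by stock `datasets`.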
mc4 = load_dataset(
    "./mc4",
    "es",
    split="train",
    sampling_method="steps",
    perplexity_model="./es.arpa.bin",
    sampling_factor=1.5e5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-train-50M-steps.jsonl", "w") as f:
    # Stream documents to disk, stopping after 50M samples.
    for sample in tqdm(mc4, total=50_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 50_000_000:
            break
mc4val = load_dataset(
    "./mc4",
    "es",
    split="validation",
    sampling_method="steps",
    perplexity_model="./es.arpa.bin",
    sampling_factor=5e5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-validation-5M-steps.jsonl", "w") as f:
    for sample in tqdm(mc4val, total=5_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 5_000_000:
            break
# ------------------
import json
import logging
from datasets import load_dataset
from tqdm import tqdm
# Set up logging and a module-level logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level="INFO",
    datefmt="[%X]",
)
logger = logging.getLogger(__name__)
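# Gaussian sampling: the keep probability follows a Gaussian-shaped function
# of document perplexity, favouring mid-perplexity documents over very
# low-perplexity (repetitive) or very high-perplexity (noisy) ones.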
mc4 = load_dataset(
    "./mc4",
    "es",
    split="train",
    sampling_method="gaussian",
    perplexity_model="../es.arpa.bin",
    sampling_factor=0.78,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-train-50M-gaussian.jsonl", "w") as f:
    for sample in tqdm(mc4, total=50_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 50_000_000:
            break
mc4val = load_dataset(
    "./mc4",
    "es",
    split="validation",
    sampling_method="gaussian",
    perplexity_model="../es.arpa.bin",
    sampling_factor=1,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-validation-5M-gaussian.jsonl", "w") as f:
    for sample in tqdm(mc4val, total=5_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 5_000_000:
            break
# ------------------
import json
import logging
from datasets import load_dataset
from tqdm import tqdm
# Set up logging and a module-level logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level="INFO",
    datefmt="[%X]",
)
logger = logging.getLogger(__name__)
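# Random sampling baseline: documents are kept uniformly at random
# (sampling_factor=0.5), independent of their perplexity.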
mc4 = load_dataset(
    "./mc4",
    "es",
    split="train",
    sampling_method="random",
    perplexity_model="../es.arpa.bin",
    sampling_factor=0.5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-train-50M-random.jsonl", "w") as f:
    for sample in tqdm(mc4, total=50_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 50_000_000:
            break
mc4val = load_dataset(
    "./mc4",
    "es",
    split="validation",
    sampling_method="random",
    perplexity_model="../es.arpa.bin",
    sampling_factor=0.5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)
total = 0
with open("mc4-es-validation-5M-random.jsonl", "w") as f:
    for sample in tqdm(mc4val, total=5_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 5_000_000:
            break
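# ------------------
# A minimal sketch (not part of the original runs): stream one of the
# exported JSONL files back with `datasets`. The file name assumes the
# stepwise run above completed.
mc4_steps = load_dataset(
    "json",
    data_files={"train": "mc4-es-train-50M-steps.jsonl"},
    streaming=True,
)["train"]
logger.info("First sampled document: %s", next(iter(mc4_steps)))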