import json
import logging

from datasets import load_dataset
from tqdm import tqdm

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level="INFO",
    datefmt="[%X]",
)
# Log a small summary on each process:
logger = logging.getLogger(__name__)

# Download the Spanish perplexity model from cc_net
!wget http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin

# "Steps" (quartile-based) perplexity sampling of the train split
mc4 = load_dataset(
    "./mc4", "es",
    split="train",
    sampling_method="steps",
    perplexity_model="./es.arpa.bin",
    sampling_factor=1.5e5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)

total = 0
with open("mc4-es-train-50M-steps.jsonl", "w") as f:
    for sample in tqdm(mc4, total=50_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 50_000_000:
            break

# "Steps" perplexity sampling of the validation split
mc4val = load_dataset(
    "./mc4", "es",
    split="validation",
    sampling_method="steps",
    perplexity_model="./es.arpa.bin",
    sampling_factor=5e5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)

total = 0
with open("mc4-es-validation-5M-steps.jsonl", "w") as f:
    for sample in tqdm(mc4val, total=5_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 5_000_000:
            break

# ------------------

import json
import logging

from datasets import load_dataset
from tqdm import tqdm

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level="INFO",
    datefmt="[%X]",
)
# Log a small summary on each process:
logger = logging.getLogger(__name__)

# Gaussian perplexity sampling of the train split
mc4 = load_dataset(
    "./mc4", "es",
    split="train",
    sampling_method="gaussian",
    perplexity_model="../es.arpa.bin",
    sampling_factor=0.78,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)

total = 0
with open("mc4-es-train-50M-gaussian.jsonl", "w") as f:
    for sample in tqdm(mc4, total=50_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 50_000_000:
            break

# Gaussian perplexity sampling of the validation split
mc4val = load_dataset(
    "./mc4", "es",
    split="validation",
    sampling_method="gaussian",
    perplexity_model="../es.arpa.bin",
    sampling_factor=1,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)

total = 0
with open("mc4-es-validation-5M-gaussian.jsonl", "w") as f:
    for sample in tqdm(mc4val, total=5_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 5_000_000:
            break

# ------------------

import json
import logging

from datasets import load_dataset
from tqdm import tqdm

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    level="INFO",
    datefmt="[%X]",
)
# Log a small summary on each process:
logger = logging.getLogger(__name__)

# Uniform random sampling of the train split (perplexity is not used for weighting)
mc4 = load_dataset(
    "./mc4", "es",
    split="train",
    sampling_method="random",
    perplexity_model="../es.arpa.bin",
    sampling_factor=0.5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)

total = 0
with open("mc4-es-train-50M-random.jsonl", "w") as f:
    for sample in tqdm(mc4, total=50_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 50_000_000:
            break

# Uniform random sampling of the validation split
mc4val = load_dataset(
    "./mc4", "es",
    split="validation",
    sampling_method="random",
    perplexity_model="../es.arpa.bin",
    sampling_factor=0.5,
    boundaries=[536394.99320948, 662247.50212365, 919250.87225178],
    streaming=True,
).shuffle(buffer_size=10000, seed=2021)

total = 0
with open("mc4-es-validation-5M-random.jsonl", "w") as f:
    for sample in tqdm(mc4val, total=5_000_000):
        f.write(json.dumps(sample) + "\n")
        total += 1
        if total >= 5_000_000:
            break
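# ------------------
# The scripts above rely on a locally patched copy of the mC4 loading script
# ("./mc4") that accepts the extra sampling_method / perplexity_model /
# sampling_factor / boundaries arguments. The sketch below illustrates, under
# assumptions, what the three sampling predicates could look like; the names
# (should_keep_steps, should_keep_gaussian, should_keep_random) and the exact
# weighting formulas are hypothetical, not the actual loader code. The
# boundaries are assumed to be perplexity quartiles of the "es" split.

import numpy as np

rng = np.random.default_rng(2021)

QUARTILES = (536394.99320948, 662247.50212365, 919250.87225178)


def should_keep_steps(perplexity, factor=1.5e5, boundaries=QUARTILES):
    # Step-shaped weighting: documents falling in wider (higher-perplexity)
    # quartile ranges receive a proportionally lower keep probability.
    if perplexity <= boundaries[0]:
        width = boundaries[0]
    elif perplexity <= boundaries[1]:
        width = boundaries[1] - boundaries[0]
    elif perplexity <= boundaries[2]:
        width = boundaries[2] - boundaries[1]
    else:
        width = 10 * boundaries[2]
    return rng.uniform() < factor / width


def should_keep_gaussian(perplexity, factor=0.78, center=QUARTILES[1]):
    # Gaussian weighting centered on the median perplexity: documents far from
    # the center (either very clean or very noisy) are kept less often.
    weight = factor * np.exp(-(((perplexity - center) / center) ** 2))
    return rng.uniform() < weight


def should_keep_random(perplexity, factor=0.5):
    # Uniform random sampling baseline; perplexity is ignored.
    return rng.uniform() < factor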