versae committed on
Commit 61f6971
Parents (2): 300e533, 8bd9e95

Merge branch 'main' of https://huggingface.co/flax-community/bertin-roberta-large-spanish into main

Files changed (2)
  1. mc4/mc4.py +52 -31
  2. run_mlm_flax_stream.py +8 -2
mc4/mc4.py CHANGED
@@ -283,27 +283,28 @@ class Mc4(datasets.GeneratorBasedBuilder):
     BUILDER_CONFIG_CLASS = Mc4Config
 
     def __init__(self, *args, writer_batch_size=None, **kwargs):
+        self.data_files = kwargs.pop("data_files", {})
         self.sampling_method = kwargs.pop("sampling_method", None)
+        self.perplexity_model = kwargs.pop("perplexity_model", None)
+        self.sampling_factor = kwargs.pop("sampling_factor", None)
+        self.boundaries = kwargs.pop("boundaries", None)
+        self.seed = kwargs.pop("seed", None)
         if self.sampling_method:
-            seed = kwargs.pop("seed", None)
-            if seed is not None:
-                self.rng = default_rng(seed)
+            if self.seed is not None:
+                self.rng = default_rng(self.seed)
             else:
                 self.rng = default_rng()
-            self.perplexity_model = kwargs.pop("perplexity_model", None)
-            self.sampling_factor = kwargs.pop("sampling_factor", None)
-            self.boundaries = kwargs.pop("boundaries", None)
-            # Loading 5-gram model
-            # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
-            logger.info("loading model = %s", self.perplexity_model)
-            self.pp_model = kenlm.Model(self.perplexity_model)
-            if self.sampling_method == "gaussian":
-                self.should_keep_doc = self._should_keep_doc_gaussian
-            elif self.sampling_method == "random":
+            if self.sampling_method == "random":
                 self.should_keep_doc = self._should_keep_doc_random
             else:
-                self.should_keep_doc = self._should_keep_doc_step
-
+                # Loading 5-gram model
+                # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
+                logger.info("loading model = %s", self.perplexity_model)
+                self.pp_model = kenlm.Model(self.perplexity_model)
+                if self.sampling_method == "gaussian":
+                    self.should_keep_doc = self._should_keep_doc_gaussian
+                else:
+                    self.should_keep_doc = self._should_keep_doc_step
         super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
 
     def get_perplexity(self, doc):
@@ -341,7 +342,9 @@ class Mc4(datasets.GeneratorBasedBuilder):
         return self.rng.uniform() < weighted_perplexity
 
     def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
-        return self.rng.uniform() <= 0.5
+        if factor is None:
+            factor = 0.5
+        return self.rng.uniform() <= factor
 
     def _info(self):
         return datasets.DatasetInfo(
@@ -371,8 +374,18 @@ class Mc4(datasets.GeneratorBasedBuilder):
             for lang in self.config.languages
             for index in range(_N_SHARDS_PER_SPLIT[lang][split])
         ]
-        train_downloaded_files = dl_manager.download(data_urls["train"])
-        validation_downloaded_files = dl_manager.download(data_urls["validation"])
+        if "train" in self.data_files:
+            train_downloaded_files = self.data_files["train"]
+            if not isinstance(train_downloaded_files, (tuple, list)):
+                train_downloaded_files = [train_downloaded_files]
+        else:
+            train_downloaded_files = dl_manager.download(data_urls["train"])
+        if "validation" in self.data_files:
+            validation_downloaded_files = self.data_files["validation"]
+            if not isinstance(validation_downloaded_files, (tuple, list)):
+                validation_downloaded_files = [validation_downloaded_files]
+        else:
+            validation_downloaded_files = dl_manager.download(data_urls["validation"])
         return [
             datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": train_downloaded_files}),
             datasets.SplitGenerator(
@@ -385,21 +398,29 @@ class Mc4(datasets.GeneratorBasedBuilder):
         id_ = 0
         for filepath in filepaths:
             logger.info("generating examples from = %s", filepath)
-            with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
-                if self.sampling_method:
-                    logger.info("sampling method = %s", self.sampling_method)
-                    for line in f:
-                        if line:
-                            example = json.loads(line)
-                            if self.should_keep_doc(
-                                    example["text"],
-                                    factor=self.sampling_factor,
-                                    boundaries=self.boundaries):
-                                yield id_, example
-                                id_ += 1
-                else:
+            if filepath.endswith("jsonl"):
+                with open(filepath, "r", encoding="utf-8") as f:
                     for line in f:
                         if line:
                             example = json.loads(line)
                             yield id_, example
                             id_ += 1
+            else:
+                with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
+                    if self.sampling_method:
+                        logger.info("sampling method = %s", self.sampling_method)
+                        for line in f:
+                            if line:
+                                example = json.loads(line)
+                                if self.should_keep_doc(
+                                        example["text"],
+                                        factor=self.sampling_factor,
+                                        boundaries=self.boundaries):
+                                    yield id_, example
+                                    id_ += 1
+                    else:
+                        for line in f:
+                            if line:
+                                example = json.loads(line)
+                                yield id_, example
+                                id_ += 1
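
Note: the hunks above only keep the signature of get_perplexity, which scores documents with the kenlm 5-gram model loaded in __init__. For reference, a minimal sketch of cc_net-style document scoring with kenlm follows; the helper name doc_perplexity and the per-line normalisation are assumptions for illustration, not the implementation in mc4.py.

import kenlm

def doc_perplexity(model: kenlm.Model, doc: str) -> float:
    # Hypothetical helper: accumulate log10 scores per line and normalise
    # by token count (each line contributes one extra end-of-sentence token).
    log_score, tokens = 0.0, 0
    for line in doc.split("\n"):
        log_score += model.score(line)
        tokens += len(line.split()) + 1
    return 10.0 ** (-log_score / max(tokens, 1))

pp_model = kenlm.Model("es.arpa.bin")  # the 5-gram model referenced above
print(doc_perplexity(pp_model, "Este es un documento de ejemplo."))
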
run_mlm_flax_stream.py CHANGED
@@ -178,10 +178,10 @@ class DataTrainingArguments:
         else:
             if self.train_file is not None:
                 extension = self.train_file.split(".")[-1]
-                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+                assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`train_file` should be a csv, a json (lines) or a txt file."
             if self.validation_file is not None:
                 extension = self.validation_file.split(".")[-1]
-                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+                assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`validation_file` should be a csv, a json (lines) or a txt file."
 
 
 @flax.struct.dataclass
@@ -386,6 +386,11 @@ if __name__ == "__main__":
     # 'text' is found. You can easily tweak this behavior (see below).
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
+        filepaths = {}
+        if data_args.train_file:
+            filepaths["train"] = data_args.train_file
+        if data_args.validation_file:
+            filepaths["validation"] = data_args.validation_file
         dataset = load_dataset(
             data_args.dataset_name,
             data_args.dataset_config_name,
@@ -397,6 +402,7 @@
             boundaries=sampling_args.boundaries,
             perplexity_model=sampling_args.perplexity_model,
             seed=training_args.seed,
+            data_files=filepaths,
         )
 
     if model_args.config_name:
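
Taken together, the two diffs let local files flow from the training script into the dataset builder: --train_file/--validation_file are collected into filepaths, forwarded to the builder as data_files, local *.jsonl shards skip dl_manager.download(), and gzipped shards keep the perplexity-sampling path. Below is a minimal sketch of the resulting load_dataset call; the script path, config name, streaming flag, and concrete sampling values are assumptions not shown in the diffs.

from datasets import load_dataset

dataset = load_dataset(
    "./mc4",                            # local mc4.py script (assumed path)
    "es",                               # language config (assumed)
    streaming=True,                     # assumed, matching the streaming MLM script
    sampling_method="gaussian",         # "random" would skip loading the kenlm model
    sampling_factor=0.5,                # illustrative value
    boundaries=[100.0, 200.0, 300.0],   # illustrative perplexity boundaries
    perplexity_model="es.arpa.bin",     # kenlm 5-gram model referenced in mc4.py
    seed=42,                            # illustrative
    data_files={"train": "train.jsonl", "validation": "validation.jsonl"},
)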