bertin-project
/

bertin-roberta-base-spanish

@@ -399,29 +399,29 @@ class Mc4(datasets.GeneratorBasedBuilder):
         id_ = 0
         for filepath in filepaths:
             logger.info("generating examples from = %s", filepath)
-            if filepath.endswith("json") or filepath.endswith("jsonl"):
                 with open(filepath, "r", encoding="utf-8") as f:
                     for line in f:
                         if line:
                             example = json.loads(line)
                             yield id_, example
                             id_ += 1
-        else:
-            with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
-                if self.sampling_method:
-                    logger.info("sampling method = %s", self.sampling_method)
-                    for line in f:
-                        if line:
-                            example = json.loads(line)
-                            if self.should_keep_doc(
-                                example["text"],
-                                factor=self.sampling_factor,
-                                boundaries=self.boundaries):
                                 yield id_, example
                                 id_ += 1
-                else:
-                    for line in f:
-                        if line:
-                            example = json.loads(line)
-                            yield id_, example
-                            id_ += 1

         id_ = 0
         for filepath in filepaths:
             logger.info("generating examples from = %s", filepath)
+            if filepath.endswith("jsonl"):
                 with open(filepath, "r", encoding="utf-8") as f:
                     for line in f:
                         if line:
                             example = json.loads(line)
                             yield id_, example
                             id_ += 1
+            else:
+                with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
+                    if self.sampling_method:
+                        logger.info("sampling method = %s", self.sampling_method)
+                        for line in f:
+                            if line:
+                                example = json.loads(line)
+                                if self.should_keep_doc(
+                                    example["text"],
+                                    factor=self.sampling_factor,
+                                    boundaries=self.boundaries):
+                                    yield id_, example
+                                    id_ += 1
+                    else:
+                        for line in f:
+                            if line:
+                                example = json.loads(line)
                                 yield id_, example
                                 id_ += 1

run_mlm_flax_stream.py CHANGED Viewed

@@ -178,10 +178,10 @@ class DataTrainingArguments:
         else:
             if self.train_file is not None:
                 extension = self.train_file.split(".")[-1]
-                assert extension in ["csv", "json", "txt", "gz"], "`train_file` should be a csv, a json or a txt file."
             if self.validation_file is not None:
                 extension = self.validation_file.split(".")[-1]
-                assert extension in ["csv", "json", "txt", "gz"], "`validation_file` should be a csv, a json or a txt file."
 @flax.struct.dataclass

         else:
             if self.train_file is not None:
                 extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`train_file` should be a csv, a json (lines) or a txt file."
             if self.validation_file is not None:
                 extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`validation_file` should be a csv, a json (lines) or a txt file."
 @flax.struct.dataclass