Fixes treatment of jsonl
Browse files
- mc4/mc4.py +18 -18
- run_mlm_flax_stream.py +2 -2
mc4/mc4.py
CHANGED
@@ -399,29 +399,29 @@ class Mc4(datasets.GeneratorBasedBuilder):
|
|
399 |
id_ = 0
|
400 |
for filepath in filepaths:
|
401 |
logger.info("generating examples from = %s", filepath)
|
402 |
-
if filepath.endswith("
|
403 |
with open(filepath, "r", encoding="utf-8") as f:
|
404 |
for line in f:
|
405 |
if line:
|
406 |
example = json.loads(line)
|
407 |
yield id_, example
|
408 |
id_ += 1
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
420 |
yield id_, example
|
421 |
id_ += 1
|
422 |
-
else:
|
423 |
-
for line in f:
|
424 |
-
if line:
|
425 |
-
example = json.loads(line)
|
426 |
-
yield id_, example
|
427 |
-
id_ += 1
|
|
|
399 |
id_ = 0
|
400 |
for filepath in filepaths:
|
401 |
logger.info("generating examples from = %s", filepath)
|
402 |
+
if filepath.endswith("jsonl"):
|
403 |
with open(filepath, "r", encoding="utf-8") as f:
|
404 |
for line in f:
|
405 |
if line:
|
406 |
example = json.loads(line)
|
407 |
yield id_, example
|
408 |
id_ += 1
|
409 |
+
else:
|
410 |
+
with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
|
411 |
+
if self.sampling_method:
|
412 |
+
logger.info("sampling method = %s", self.sampling_method)
|
413 |
+
for line in f:
|
414 |
+
if line:
|
415 |
+
example = json.loads(line)
|
416 |
+
if self.should_keep_doc(
|
417 |
+
example["text"],
|
418 |
+
factor=self.sampling_factor,
|
419 |
+
boundaries=self.boundaries):
|
420 |
+
yield id_, example
|
421 |
+
id_ += 1
|
422 |
+
else:
|
423 |
+
for line in f:
|
424 |
+
if line:
|
425 |
+
example = json.loads(line)
|
426 |
yield id_, example
|
427 |
id_ += 1
|
|
|
|
|
|
|
|
|
|
|
|
run_mlm_flax_stream.py
CHANGED
@@ -178,10 +178,10 @@ class DataTrainingArguments:
|
|
178 |
else:
|
179 |
if self.train_file is not None:
|
180 |
extension = self.train_file.split(".")[-1]
|
181 |
-
assert extension in ["csv", "json", "txt", "gz"], "`train_file` should be a csv, a json or a txt file."
|
182 |
if self.validation_file is not None:
|
183 |
extension = self.validation_file.split(".")[-1]
|
184 |
-
assert extension in ["csv", "json", "txt", "gz"], "`validation_file` should be a csv, a json or a txt file."
|
185 |
|
186 |
|
187 |
@flax.struct.dataclass
|
|
|
178 |
else:
|
179 |
if self.train_file is not None:
|
180 |
extension = self.train_file.split(".")[-1]
|
181 |
+
assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`train_file` should be a csv, a json (lines) or a txt file."
|
182 |
if self.validation_file is not None:
|
183 |
extension = self.validation_file.split(".")[-1]
|
184 |
+
assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`validation_file` should be a csv, a json (lines) or a txt file."
|
185 |
|
186 |
|
187 |
@flax.struct.dataclass
|