versae committed on
Commit
7b22f12
1 Parent(s): 7d6bbb2

Fixes treatment of jsonl

Browse files
Files changed (2) hide show
  1. mc4/mc4.py +18 -18
  2. run_mlm_flax_stream.py +2 -2
mc4/mc4.py CHANGED
@@ -399,29 +399,29 @@ class Mc4(datasets.GeneratorBasedBuilder):
399
  id_ = 0
400
  for filepath in filepaths:
401
  logger.info("generating examples from = %s", filepath)
402
- if filepath.endswith("json") or filepath.endswith("jsonl"):
403
  with open(filepath, "r", encoding="utf-8") as f:
404
  for line in f:
405
  if line:
406
  example = json.loads(line)
407
  yield id_, example
408
  id_ += 1
409
- else:
410
- with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
411
- if self.sampling_method:
412
- logger.info("sampling method = %s", self.sampling_method)
413
- for line in f:
414
- if line:
415
- example = json.loads(line)
416
- if self.should_keep_doc(
417
- example["text"],
418
- factor=self.sampling_factor,
419
- boundaries=self.boundaries):
 
 
 
 
 
 
420
  yield id_, example
421
  id_ += 1
422
- else:
423
- for line in f:
424
- if line:
425
- example = json.loads(line)
426
- yield id_, example
427
- id_ += 1
399
  id_ = 0
400
  for filepath in filepaths:
401
  logger.info("generating examples from = %s", filepath)
402
+ if filepath.endswith("jsonl"):
403
  with open(filepath, "r", encoding="utf-8") as f:
404
  for line in f:
405
  if line:
406
  example = json.loads(line)
407
  yield id_, example
408
  id_ += 1
409
+ else:
410
+ with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
411
+ if self.sampling_method:
412
+ logger.info("sampling method = %s", self.sampling_method)
413
+ for line in f:
414
+ if line:
415
+ example = json.loads(line)
416
+ if self.should_keep_doc(
417
+ example["text"],
418
+ factor=self.sampling_factor,
419
+ boundaries=self.boundaries):
420
+ yield id_, example
421
+ id_ += 1
422
+ else:
423
+ for line in f:
424
+ if line:
425
+ example = json.loads(line)
426
  yield id_, example
427
  id_ += 1
 
 
 
 
 
 
run_mlm_flax_stream.py CHANGED
@@ -178,10 +178,10 @@ class DataTrainingArguments:
178
  else:
179
  if self.train_file is not None:
180
  extension = self.train_file.split(".")[-1]
181
- assert extension in ["csv", "json", "txt", "gz"], "`train_file` should be a csv, a json or a txt file."
182
  if self.validation_file is not None:
183
  extension = self.validation_file.split(".")[-1]
184
- assert extension in ["csv", "json", "txt", "gz"], "`validation_file` should be a csv, a json or a txt file."
185
 
186
 
187
  @flax.struct.dataclass
178
  else:
179
  if self.train_file is not None:
180
  extension = self.train_file.split(".")[-1]
181
+ assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`train_file` should be a csv, a json (lines) or a txt file."
182
  if self.validation_file is not None:
183
  extension = self.validation_file.split(".")[-1]
184
+ assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`validation_file` should be a csv, a json (lines) or a txt file."
185
 
186
 
187
  @flax.struct.dataclass