"""
Converts all raw dataset formats into a unified JSONL training format.
Output schema per line:
{"input": "...", "target": "...", "source": "fce|wi_locness|jfleg|synthetic"}
Datasets handled:
- FCE v2.1 (BEA-2019 format): data/raw/fce/json/*.json
- W&I+LOCNESS v2.1 (BEA-2019 format): data/raw/wi+locness/json/*.json
- JFLEG: data/raw/jfleg/*.src + *.ref*
Run: python scripts/preprocess_data.py
"""
import json
import os
import random
from pathlib import Path


def apply_bea19_edits(text: str, edits_block: list) -> str:
    """
    Apply BEA-2019 character-level edits to produce corrected text.
    edits_block format: [[annotator_id, [(start, end, replacement, [error_type]), ...]], ...]
    We use the first annotator's corrections.
    Edits are applied in reverse order to preserve character offsets.
    """
    if not edits_block:
        return text
    # Take the first annotator's edits
    annotator_edits = edits_block[0][1]
    # Sort by start position descending so applying one edit never shifts
    # the character offsets of the edits that precede it
    sorted_edits = sorted(annotator_edits, key=lambda e: e[0], reverse=True)
    result = text
    for edit in sorted_edits:
        start, end, replacement = edit[0], edit[1], edit[2]
        # Skip null replacements (noop / detection-only edits)
        if replacement is None:
            continue
        result = result[:start] + replacement + result[end:]
    return result


def process_bea19_json(json_path: str, source_name: str, out_file):
    """
    Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS).
    Each line is a JSON object with 'text' and 'edits' fields.
    Produces (input=original, target=corrected) pairs.
    """
    count = 0
    with open(json_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            original = obj["text"]
            edits = obj.get("edits", [])
            corrected = apply_bea19_edits(original, edits)
            # Only include pairs where the annotator actually changed something
            if original.strip() != corrected.strip() and corrected.strip():
                out_file.write(json.dumps({
                    "input": original,
                    "target": corrected,
                    "source": source_name,
                }) + "\n")
                count += 1
    return count


def process_fce(raw_dir: str, out_file) -> int:
    """Process all FCE JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f"  ⚠ FCE directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "fce", out_file)
        print(f"  {json_file.name}: {n} pairs")
        total += n
    return total


def process_wi_locness(raw_dir: str, out_file) -> int:
    """Process all W&I+LOCNESS JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f"  ⚠ W&I+LOCNESS directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "wi_locness", out_file)
        print(f"  {json_file.name}: {n} pairs")
        total += n
    return total


def process_jfleg(raw_dir: str, out_file) -> int:
    """
    JFLEG: .src files (original) and .ref0..ref3 (4 human corrections).
    Each reference becomes a separate training pair.
    """
    total = 0
    src_files = sorted(Path(raw_dir).glob("*.src"))
    if not src_files:
        print(f"  ⚠ JFLEG directory empty or not found: {raw_dir}")
        return 0
    for src_file in src_files:
        refs = [src_file.with_suffix(f".ref{i}") for i in range(4)]
        with open(src_file) as sf:
            src_lines = sf.readlines()
        for ref_path in refs:
            if not ref_path.exists():
                continue
            with open(ref_path) as rf:
                ref_lines = rf.readlines()
            for src, ref in zip(src_lines, ref_lines):
                src, ref = src.strip(), ref.strip()
                # Keep only pairs where the reference actually differs
                if src and ref and src != ref:
                    out_file.write(json.dumps({
                        "input": src,
                        "target": ref,
                        "source": "jfleg",
                    }) + "\n")
                    total += 1
    return total


def create_splits(train_path: str, val_ratio: float = 0.1):
    """Split train.jsonl into disjoint train/val/test sets (seeded shuffle)."""
    random.seed(42)
    with open(train_path) as f:
        lines = f.readlines()
    random.shuffle(lines)
    val_size = int(len(lines) * val_ratio)
    val_lines = lines[:val_size]
    train_lines = lines[val_size:]
    # Carve a small test split out of val so the two sets do not overlap
    test_size = min(len(val_lines) // 2, 500)
    test_lines = val_lines[:test_size]
    val_lines = val_lines[test_size:]
    with open(train_path, "w") as f:
        f.writelines(train_lines)
    val_path = train_path.replace("train.jsonl", "val.jsonl")
    with open(val_path, "w") as f:
        f.writelines(val_lines)
    test_path = train_path.replace("train.jsonl", "test.jsonl")
    with open(test_path, "w") as f:
        f.writelines(test_lines)
    return len(train_lines), len(val_lines), len(test_lines)


def main():
    os.makedirs("data/processed", exist_ok=True)
    print("=== Preprocessing datasets into unified JSONL ===\n")
    total = 0
    with open("data/processed/train.jsonl", "w") as out:
        # FCE
        print("Processing FCE...")
        n = process_fce("data/raw/fce", out)
        print(f"  Total FCE: {n} pairs\n")
        total += n
        # W&I+LOCNESS
        print("Processing W&I+LOCNESS...")
        n = process_wi_locness("data/raw/wi+locness", out)
        print(f"  Total W&I+LOCNESS: {n} pairs\n")
        total += n
        # JFLEG
        print("Processing JFLEG...")
        n = process_jfleg("data/raw/jfleg", out)
        print(f"  Total JFLEG: {n} pairs\n")
        total += n
    print(f"Total examples in train.jsonl: {total}")
    # Create train/val/test splits (requires the file above to be closed)
    print("\nSplitting into train/val/test...")
    n_train, n_val, n_test = create_splits("data/processed/train.jsonl")
    print(f"  Train: {n_train} | Val: {n_val} | Test: {n_test}")
    print("\n✓ Preprocessing complete.")
    print("  data/processed/train.jsonl")
    print("  data/processed/val.jsonl")
    print("  data/processed/test.jsonl")


if __name__ == "__main__":
    main()