| """ |
| Converts all raw dataset formats into unified JSONL training format. |
| Output schema per line: |
| {"input": "...", "target": "...", "source": "fce|wi_locness|jfleg|synthetic"} |
| |
| Datasets handled: |
| - FCE v2.1 (BEA-2019 format): data/raw/fce/json/*.json |
| - W&I+LOCNESS v2.1 (BEA-2019 format): data/raw/wi+locness/json/*.json |
| - JFLEG: data/raw/jfleg/*.src + *.ref* |
| |
| Run: python scripts/preprocess_data.py |
| """ |
|
|
| import json |
| import os |
| from pathlib import Path |
|
|
|
|
| def apply_bea19_edits(text: str, edits_block: list) -> str: |
| """ |
| Apply BEA-2019 character-level edits to produce corrected text. |
| |
| edits_block format: [annotator_id, [(start, end, replacement, [error_type]), ...]] |
| We use the first annotator's corrections. |
| Edits are applied in reverse order to preserve character offsets. |
| """ |
| if not edits_block or len(edits_block) == 0: |
| return text |
|
|
| |
| annotator_edits = edits_block[0][1] |
|
|
| |
| sorted_edits = sorted(annotator_edits, key=lambda e: e[0], reverse=True) |
|
|
| result = text |
| for edit in sorted_edits: |
| start = edit[0] |
| end = edit[1] |
| replacement = edit[2] |
|
|
| |
| if replacement is None: |
| continue |
|
|
| result = result[:start] + replacement + result[end:] |
|
|
| return result |
|
|
|
|
| def process_bea19_json(json_path: str, source_name: str, out_file): |
| """ |
| Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS). |
| Each line is a JSON object with 'text' and 'edits' fields. |
| Produces (input=original, target=corrected) pairs. |
| """ |
| count = 0 |
| with open(json_path) as f: |
| for line in f: |
| line = line.strip() |
| if not line: |
| continue |
| obj = json.loads(line) |
| original = obj["text"] |
| edits = obj.get("edits", []) |
| corrected = apply_bea19_edits(original, edits) |
|
|
| |
| if original.strip() != corrected.strip() and corrected.strip(): |
| out_file.write(json.dumps({ |
| "input": original, |
| "target": corrected, |
| "source": source_name, |
| }) + "\n") |
| count += 1 |
| return count |
|
|
|
|
| def process_fce(raw_dir: str, out_file) -> int: |
| """Process all FCE JSON files.""" |
| total = 0 |
| json_dir = Path(raw_dir) / "json" |
| if not json_dir.exists(): |
| print(f" ⚠ FCE directory not found: {json_dir}") |
| return 0 |
| for json_file in sorted(json_dir.glob("*.json")): |
| n = process_bea19_json(str(json_file), "fce", out_file) |
| print(f" {json_file.name}: {n} pairs") |
| total += n |
| return total |
|
|
|
|
| def process_wi_locness(raw_dir: str, out_file) -> int: |
| """Process all W&I+LOCNESS JSON files.""" |
| total = 0 |
| json_dir = Path(raw_dir) / "json" |
| if not json_dir.exists(): |
| print(f" ⚠ W&I+LOCNESS directory not found: {json_dir}") |
| return 0 |
| for json_file in sorted(json_dir.glob("*.json")): |
| n = process_bea19_json(str(json_file), "wi_locness", out_file) |
| print(f" {json_file.name}: {n} pairs") |
| total += n |
| return total |
|
|
|
|
| def process_jfleg(raw_dir: str, out_file) -> int: |
| """ |
| JFLEG: .src files (original) and .ref0..ref3 (4 human corrections). |
| Each reference becomes a separate training pair. |
| """ |
| total = 0 |
| src_files = list(Path(raw_dir).glob("*.src")) |
| if not src_files: |
| print(f" ⚠ JFLEG directory empty or not found: {raw_dir}") |
| return 0 |
| for src_file in src_files: |
| refs = [src_file.with_suffix(f".ref{i}") for i in range(4)] |
| with open(src_file) as sf: |
| src_lines = sf.readlines() |
| for ref_path in refs: |
| if ref_path.exists(): |
| with open(ref_path) as rf: |
| ref_lines = rf.readlines() |
| for src, ref in zip(src_lines, ref_lines): |
| src, ref = src.strip(), ref.strip() |
| if src and ref and src != ref: |
| out_file.write(json.dumps({ |
| "input": src, |
| "target": ref, |
| "source": "jfleg", |
| }) + "\n") |
| total += 1 |
| return total |
|
|
|
|
| def create_splits(train_path: str, val_ratio: float = 0.1): |
| """Split train.jsonl into train and val sets.""" |
| import random |
| random.seed(42) |
|
|
| with open(train_path) as f: |
| lines = f.readlines() |
|
|
| random.shuffle(lines) |
| val_size = int(len(lines) * val_ratio) |
| val_lines = lines[:val_size] |
| train_lines = lines[val_size:] |
|
|
| with open(train_path, "w") as f: |
| f.writelines(train_lines) |
|
|
| val_path = train_path.replace("train.jsonl", "val.jsonl") |
| with open(val_path, "w") as f: |
| f.writelines(val_lines) |
|
|
| |
| test_size = min(len(val_lines) // 2, 500) |
| test_lines = val_lines[:test_size] |
| test_path = train_path.replace("train.jsonl", "test.jsonl") |
| with open(test_path, "w") as f: |
| f.writelines(test_lines) |
|
|
| return len(train_lines), len(val_lines), len(test_lines) |
|
|
|
|
| def main(): |
| os.makedirs("data/processed", exist_ok=True) |
|
|
| print("=== Preprocessing datasets into unified JSONL ===\n") |
| total = 0 |
|
|
| with open("data/processed/train.jsonl", "w") as out: |
| |
| print("Processing FCE...") |
| n = process_fce("data/raw/fce", out) |
| print(f" Total FCE: {n} pairs\n") |
| total += n |
|
|
| |
| print("Processing W&I+LOCNESS...") |
| n = process_wi_locness("data/raw/wi+locness", out) |
| print(f" Total W&I+LOCNESS: {n} pairs\n") |
| total += n |
|
|
| |
| print("Processing JFLEG...") |
| n = process_jfleg("data/raw/jfleg", out) |
| print(f" Total JFLEG: {n} pairs\n") |
| total += n |
|
|
| print(f"Total examples in train.jsonl: {total}") |
|
|
| |
| print("\nSplitting into train/val/test...") |
| n_train, n_val, n_test = create_splits("data/processed/train.jsonl") |
| print(f" Train: {n_train} | Val: {n_val} | Test: {n_test}") |
|
|
| print("\n✓ Preprocessing complete.") |
| print(" data/processed/train.jsonl") |
| print(" data/processed/val.jsonl") |
| print(" data/processed/test.jsonl") |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|