"""
Converts all raw dataset formats into a unified JSONL training format.
Output schema per line:
{"input": "...", "target": "...", "source": "fce|wi_locness|jfleg|synthetic"}
Datasets handled:
- FCE v2.1 (BEA-2019 format): data/raw/fce/json/*.json
- W&I+LOCNESS v2.1 (BEA-2019 format): data/raw/wi+locness/json/*.json
- JFLEG: data/raw/jfleg/*.src + *.ref*
Run: python scripts/preprocess_data.py
"""
import json
import os
import random
from pathlib import Path


def apply_bea19_edits(text: str, edits_block: list) -> str:
    """
    Apply BEA-2019 character-level edits to produce corrected text.
    edits_block format: [[annotator_id, [(start, end, replacement, [error_type]), ...]], ...]
    We use the first annotator's corrections.
    Edits are applied in reverse order to preserve character offsets.
    """
    if not edits_block:
        return text
    # Take the first annotator's edits
    annotator_edits = edits_block[0][1]
    # Sort by start position descending so applying one edit never shifts
    # the character offsets of the edits that precede it
    sorted_edits = sorted(annotator_edits, key=lambda e: e[0], reverse=True)
    result = text
    for edit in sorted_edits:
        start, end, replacement = edit[0], edit[1], edit[2]
        # Skip null replacements (noop / detection-only edits)
        if replacement is None:
            continue
        result = result[:start] + replacement + result[end:]
    return result


def process_bea19_json(json_path: str, source_name: str, out_file):
    """
    Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS).
    Each line is a JSON object with 'text' and 'edits' fields.
    Produces (input=original, target=corrected) pairs.
    """
    count = 0
    with open(json_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            original = obj["text"]
            edits = obj.get("edits", [])
            corrected = apply_bea19_edits(original, edits)
            # Only include pairs where the annotator actually changed something
            if original.strip() != corrected.strip() and corrected.strip():
                out_file.write(json.dumps({
                    "input": original,
                    "target": corrected,
                    "source": source_name,
                }) + "\n")
                count += 1
    return count


def process_fce(raw_dir: str, out_file) -> int:
    """Process all FCE JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f"  ⚠ FCE directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "fce", out_file)
        print(f"  {json_file.name}: {n} pairs")
        total += n
    return total


def process_wi_locness(raw_dir: str, out_file) -> int:
    """Process all W&I+LOCNESS JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f"  ⚠ W&I+LOCNESS directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "wi_locness", out_file)
        print(f"  {json_file.name}: {n} pairs")
        total += n
    return total


def process_jfleg(raw_dir: str, out_file) -> int:
    """
    JFLEG: .src files (original) and .ref0..ref3 (4 human corrections).
    Each reference becomes a separate training pair.
    """
    total = 0
    src_files = sorted(Path(raw_dir).glob("*.src"))
    if not src_files:
        print(f"  ⚠ JFLEG directory empty or not found: {raw_dir}")
        return 0
    for src_file in src_files:
        refs = [src_file.with_suffix(f".ref{i}") for i in range(4)]
        with open(src_file) as sf:
            src_lines = sf.readlines()
        for ref_path in refs:
            if not ref_path.exists():
                continue
            with open(ref_path) as rf:
                ref_lines = rf.readlines()
            for src, ref in zip(src_lines, ref_lines):
                src, ref = src.strip(), ref.strip()
                # Keep only pairs where the reference actually differs
                if src and ref and src != ref:
                    out_file.write(json.dumps({
                        "input": src,
                        "target": ref,
                        "source": "jfleg",
                    }) + "\n")
                    total += 1
    return total


def create_splits(train_path: str, val_ratio: float = 0.1):
    """Split train.jsonl into disjoint train/val/test sets (seeded shuffle)."""
    random.seed(42)
    with open(train_path) as f:
        lines = f.readlines()
    random.shuffle(lines)
    val_size = int(len(lines) * val_ratio)
    val_lines = lines[:val_size]
    train_lines = lines[val_size:]
    # Carve a small test split out of val so the two sets do not overlap
    test_size = min(len(val_lines) // 2, 500)
    test_lines = val_lines[:test_size]
    val_lines = val_lines[test_size:]
    with open(train_path, "w") as f:
        f.writelines(train_lines)
    val_path = train_path.replace("train.jsonl", "val.jsonl")
    with open(val_path, "w") as f:
        f.writelines(val_lines)
    test_path = train_path.replace("train.jsonl", "test.jsonl")
    with open(test_path, "w") as f:
        f.writelines(test_lines)
    return len(train_lines), len(val_lines), len(test_lines)


def main():
    os.makedirs("data/processed", exist_ok=True)
    print("=== Preprocessing datasets into unified JSONL ===\n")
    total = 0
    with open("data/processed/train.jsonl", "w") as out:
        # FCE
        print("Processing FCE...")
        n = process_fce("data/raw/fce", out)
        print(f"  Total FCE: {n} pairs\n")
        total += n
        # W&I+LOCNESS
        print("Processing W&I+LOCNESS...")
        n = process_wi_locness("data/raw/wi+locness", out)
        print(f"  Total W&I+LOCNESS: {n} pairs\n")
        total += n
        # JFLEG
        print("Processing JFLEG...")
        n = process_jfleg("data/raw/jfleg", out)
        print(f"  Total JFLEG: {n} pairs\n")
        total += n
    print(f"Total examples in train.jsonl: {total}")
    # Create train/val/test splits (requires the file above to be closed)
    print("\nSplitting into train/val/test...")
    n_train, n_val, n_test = create_splits("data/processed/train.jsonl")
    print(f"  Train: {n_train} | Val: {n_val} | Test: {n_test}")
    print("\n✓ Preprocessing complete.")
    print("  data/processed/train.jsonl")
    print("  data/processed/val.jsonl")
    print("  data/processed/test.jsonl")


if __name__ == "__main__":
    main()