#!/usr/bin/env bash
# Download all training data sources
# Run: bash scripts/download_datasets.sh
#
# All paths below are relative to the current working directory, so invoke
# the script from the directory that should contain `data/`.
#
# What it does:
#   1. Creates the data/raw/* directories the datasets are expected in.
#   2. Clones the JFLEG repository (skipped when a clone is already present)
#      and copies its test-split *.src / *.ref* files into data/raw/jfleg/.
#   3. Prints instructions for the datasets that need a manual download.
#
# Requires: git
set -euo pipefail

readonly RAW_DIR="data/raw"
readonly JFLEG_URL="https://github.com/keisks/jfleg.git"
readonly JFLEG_REPO="${RAW_DIR}/jfleg_repo"
readonly JFLEG_DIR="${RAW_DIR}/jfleg"

mkdir -p \
  "${RAW_DIR}/wi_locness" \
  "${JFLEG_DIR}" \
  "${RAW_DIR}/gyafc" \
  "${RAW_DIR}/custom_dyslexia"

echo "=== Downloading JFLEG (JHU Fluency-Extended GUG) ==="
if [[ -d "${JFLEG_REPO}" ]]; then
  echo " ✓ JFLEG already exists"
else
  # Clone into a scratch directory and rename it only on success, so an
  # interrupted clone can never be mistaken for a finished one on a later run.
  jfleg_partial="${JFLEG_REPO}.partial"
  rm -rf -- "${jfleg_partial}"
  git clone "${JFLEG_URL}" "${jfleg_partial}"
  mv -- "${jfleg_partial}" "${JFLEG_REPO}"
  echo " ✓ JFLEG downloaded"
fi

# Copy on every run (not only right after a fresh clone) so that an emptied
# data/raw/jfleg/ is repopulated from the existing checkout.
shopt -s nullglob
jfleg_files=("${JFLEG_REPO}"/test/*.src "${JFLEG_REPO}"/test/*.ref*)
shopt -u nullglob

# The copy stays best-effort (it never aborts the script), but problems are
# reported on stderr instead of being silently discarded.
if (( ${#jfleg_files[@]} == 0 )); then
  echo "warning: no JFLEG test files found under ${JFLEG_REPO}/test" >&2
elif ! cp -- "${jfleg_files[@]}" "${JFLEG_DIR}/"; then
  echo "warning: failed to copy JFLEG files into ${JFLEG_DIR}/" >&2
fi

echo ""
echo "=== Manual Downloads Required ==="
echo ""
echo "W&I+LOCNESS (35k pairs, gold standard GEC):"
echo " → Register at: https://www.cl.cam.ac.uk/research/nl/bea2019st/"
echo " → Place files in: ${RAW_DIR}/wi_locness/"
echo ""
echo "GYAFC (105k pairs, formality transfer):"
echo " → Request access at: https://github.com/raosudha89/GYAFC-corpus"
echo " → Place files in: ${RAW_DIR}/gyafc/"
echo ""
echo "=== Dataset download complete ==="
echo "Check manually downloaded datasets before proceeding."