| """
|
| =============================================================================
|
| FULL PERMUTATION MISSPELLINGS GENERATOR (Google Colab Edition)
|
| =============================================================================
|
|
|
| Purpose:
|
| Generate ALL possible letter permutations of each word from words.txt
|
| and write them as misspelling=correction pairs.
|
|
|
| WARNING — READ BEFORE RUNNING
|
| This is computationally EXTREME. A single 10-letter word has 3,628,800
|
| permutations. A 12-letter word has 479,001,600. For 466k words, the full
|
| output could be PETABYTES. You WILL need to limit word length.
|
|
|
| =============================================================================
|
| HOW TO USE ON GOOGLE COLAB
|
| =============================================================================
|
|
|
| 1. Open Google Colab → https://colab.research.google.com
|
| 2. Create a new notebook (Python 3)
|
|
|
| 3. Upload your words.txt:
|
| ─────────────────────────────────────
|
| # CELL 1: Upload words.txt
|
| from google.colab import files
|
| uploaded = files.upload() # click "Choose Files" → select words.txt
|
| ─────────────────────────────────────
|
|
|
| 4. Copy-paste this ENTIRE script into a new cell and run it.
|
|
|
| 5. Download the result:
|
| ─────────────────────────────────────
|
| # CELL 3: Download the output
|
| files.download('misspellings_permutations.txt')
|
| ─────────────────────────────────────
|
|
|
| =============================================================================
|
| OR: Use Google Drive for large files
|
| =============================================================================
|
|
|
| # Mount Google Drive (you get 15 GB free)
|
| from google.colab import drive
|
| drive.mount('/content/drive')
|
|
|
| # Then set OUTPUT_PATH below to:
|
| OUTPUT_PATH = '/content/drive/MyDrive/misspellings_permutations.txt'
|
|
|
| =============================================================================
|
| CONFIGURATION — Adjust these before running!
|
| =============================================================================
|
| """
|
|
|
| import os
|
| import sys
|
| import time
|
| import math
|
| from itertools import permutations
|
|
|
|
|
|
|
# Input word list, read one entry per line.
WORDS_PATH = "words.txt"
# Destination file for the generated misspelling=correction pairs.
OUTPUT_PATH = "misspellings_permutations.txt"

# Inclusive bounds on the length of the words that get permuted.
# Output grows factorially with word length, so raise MAX_WORD_LEN with care.
MIN_WORD_LEN = 3
MAX_WORD_LEN = 7

# When True, entries containing any non-alphabetic character are skipped.
ONLY_ALPHA = True
# A progress line is printed each time this many words have been processed.
BATCH_LOG = 5000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def estimate_output(words):
    """Estimate how many lines and how many gigabytes generation will produce.

    For each word, the number of distinct arrangements of its lower-cased
    letters is the multinomial coefficient n! / (c1! * c2! * ...), where the
    c's are the per-letter counts. One is subtracted because the lower-cased
    word itself is never emitted as a misspelling.

    The size is derived from the real line layout -- typo, '=', word and a
    newline, encoded as UTF-8 -- rather than from a flat per-line average, so
    the figure stays accurate whatever word-length range is configured.

    Args:
        words: Iterable of words that will be permuted.

    Returns:
        A ``(total_perms, est_gb)`` tuple: the number of output lines and the
        estimated size of those lines in GiB (header lines are not counted).
    """
    total_perms = 0
    est_bytes = 0
    for word in words:
        # Count over the lower-cased form: that is the string whose letters
        # get permuted, and lower-casing can change the length.
        lower = word.lower()
        unique_perms = math.factorial(len(lower))
        for ch in set(lower):
            # Each partial quotient of a multinomial is an integer, so floor
            # division is exact here.
            unique_perms //= math.factorial(lower.count(ch))
        misspellings = unique_perms - 1

        # Every permutation has the same encoded length as `lower`; the
        # extra 2 bytes are the '=' separator and the trailing newline.
        line_bytes = len(lower.encode('utf-8')) + len(word.encode('utf-8')) + 2

        total_perms += misspellings
        est_bytes += misspellings * line_bytes

    est_gb = est_bytes / (1024 ** 3)
    return total_perms, est_gb
|
|
|
|
|
def generate_unique_permutations(word):
    """
    Return the set of distinct rearrangements of the lower-cased *word*.

    Collecting the arrangements in a set collapses the duplicates that
    repeated letters would otherwise produce. The lower-cased word itself
    is left out, so every returned string differs from it.
    """
    lowered = word.lower()
    arrangements = set(map(''.join, permutations(lowered)))
    return arrangements - {lowered}
|
|
|
|
|
def is_pure_alpha(word):
    """Tell whether *word* is non-empty and consists solely of alphabetic characters."""
    alphabetic_only = word.isalpha()
    return alphabetic_only
|
|
|
|
|
def _read_raw_words(path):
    """Return the non-blank lines of *path* with surrounding whitespace removed."""
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        stripped = (line.strip() for line in f)
        return [entry for entry in stripped if entry]


def _filter_words(raw_words):
    """Keep entries that satisfy ONLY_ALPHA and the MIN/MAX_WORD_LEN bounds."""
    return [
        w for w in raw_words
        if (not ONLY_ALPHA or is_pure_alpha(w))
        and MIN_WORD_LEN <= len(w) <= MAX_WORD_LEN
    ]


def _write_permutations(words, out, start):
    """Write every ``typo=word`` pair to *out*, logging progress; return the line count."""
    word_count = len(words)
    total_written = 0

    for done, word in enumerate(words, start=1):
        perms = generate_unique_permutations(word)

        # Sorted so the output is deterministic; one batched call per word
        # instead of one write() per line.
        out.writelines(f"{typo}={word}\n" for typo in sorted(perms))
        total_written += len(perms)

        if done % BATCH_LOG == 0:
            elapsed = time.time() - start
            pct = done / word_count * 100
            rate = done / elapsed if elapsed > 0 else 0
            # Flush first so the on-disk size reflects what has been written.
            out.flush()
            cur_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)
            print(f" [{pct:5.1f}%] {done:>7,}/{word_count:,} words |"
                  f" {total_written:>12,} lines | {cur_size:.2f} GB |"
                  f" {rate:.0f} words/sec")

    return total_written


def main():
    """Generate the permutation misspellings file from the configured word list.

    Steps: read WORDS_PATH, filter the entries by ONLY_ALPHA and the
    MIN/MAX_WORD_LEN bounds, estimate the output size, then write one
    ``misspelling=correction`` line per permutation to OUTPUT_PATH while
    printing progress every BATCH_LOG words.

    NOTE: permutations that happen to be other valid words (anagrams) are
    not filtered out; they are emitted like any other permutation.

    Raises:
        SystemExit: with code 1 when WORDS_PATH does not exist, when no word
            passes the filters, or when the estimated output exceeds 70 GB.
    """
    if not os.path.exists(WORDS_PATH):
        print(f"ERROR: '{WORDS_PATH}' not found!")
        print("Make sure you uploaded words.txt or set WORDS_PATH correctly.")
        sys.exit(1)

    print(f"Reading words from: {WORDS_PATH}")
    raw_words = _read_raw_words(WORDS_PATH)
    print(f"Total raw entries: {len(raw_words):,}")

    words = _filter_words(raw_words)

    # Only claim "alpha-only" when that filter was actually applied.
    alpha_note = "alpha-only, " if ONLY_ALPHA else ""
    print(f"Filtered to {len(words):,} words ({alpha_note}len {MIN_WORD_LEN}-{MAX_WORD_LEN})")

    if not words:
        print("No words matched the filter. Adjust MIN/MAX_WORD_LEN.")
        sys.exit(1)

    print("\nEstimating output size (this may take a moment)...")
    total_perms, est_gb = estimate_output(words)
    print(f" Estimated permutations : {total_perms:,}")
    print(f" Estimated file size : {est_gb:.2f} GB")

    # Refuse to start a run that would not fit on the disk.
    if est_gb > 70:
        print(f"\n WARNING: Estimated output ({est_gb:.1f} GB) exceeds Colab disk (~78 GB).")
        print(" Reduce MAX_WORD_LEN or the script will crash when disk fills up.")
        print(" Aborting. Set MAX_WORD_LEN lower and re-run.")
        sys.exit(1)

    print(f"\nProceeding with generation -> {OUTPUT_PATH}")
    print("=" * 60)

    start = time.time()

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.write("# Auto-generated FULL PERMUTATION misspellings\n")
        out.write(f"# Config: word length {MIN_WORD_LEN}-{MAX_WORD_LEN}\n")
        out.write("# Format: misspelling=correction\n\n")
        total_written = _write_permutations(words, out, start)

    elapsed = time.time() - start
    final_size = os.path.getsize(OUTPUT_PATH) / (1024 ** 3)

    print()
    print("=" * 60)
    print(f" DONE in {elapsed:.1f}s ({elapsed/60:.1f} min)")
    print(f" Words processed : {len(words):,}")
    print(f" Lines written : {total_written:,}")
    print(f" Output file : {OUTPUT_PATH}")
    print(f" File size : {final_size:.2f} GB")
    print("=" * 60)
|
|
|
|
|
# Run the generator when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
|
|
|