# ...existing code...
import re
from pathlib import Path

# Matches a word followed by one or more whitespace-separated repeats of
# itself ("word word", "the The the"). Group 1 captures the first occurrence.
# Compiled once here so the pattern is not looked up again for every file.
REPEATED_WORD_RE = re.compile(r"\b(\w+)(?:\s+\1\b)+", flags=re.IGNORECASE)


def find_repeated_words(text: str) -> list:
    """Return the first word of every run of adjacent repeated words in *text*.

    Comparison is case-insensitive and the repeats may be separated by any
    whitespace, including line breaks. Each run contributes exactly one
    entry, spelled the way its first occurrence appears in the text.

    >>> find_repeated_words("The the cat sat sat down")
    ['The', 'sat']
    """
    return REPEATED_WORD_RE.findall(text)


p = Path("data/cache")
# Sort so the report order is stable from run to run; glob() yields entries
# in whatever order the directory listing returns them.
for f in sorted(p.glob("*.txt")):
    try:
        text = f.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # One unreadable or non-UTF-8 file must not abort the whole scan.
        print(f"{f.name} could not be read: {exc}")
        continue
    matches = find_repeated_words(text)
    if matches:
        # Only the first ten hits are shown to keep the report short.
        print(f"{f.name} has repeated words sample: {matches[:10]}")
    else:
        print(f"{f.name} looks ok")