# crossword / wordmaker.py
# Ali Abid
# first commit
# 2a68adc
from nltk import FreqDist
from nltk.corpus import brown
from nltk.stem.lancaster import LancasterStemmer
import json
# Cap on how many of the most frequent corpus words the commented-out
# Brown-corpus generator in this file would consider.
WORD_LIMIT = 10000

# Inclusive bounds on the length of a word accepted into the word list.
MIN_WORD_SIZE = 4
MAX_WORD_SIZE = 10
# Disabled alternative generator: takes the WORD_LIMIT most frequent
# Brown-corpus tokens, keeps the alphabetic ones of allowed length, and writes
# the shortest word for each Lancaster stem to words.json.
# stem = LancasterStemmer()
# frequency_list = FreqDist(i.lower() for i in brown.words())
# words = [
#     w.lower()
#     for w, _ in frequency_list.most_common()[:WORD_LIMIT]
#     if w.isalpha() and len(w) >= MIN_WORD_SIZE and len(w) <= MAX_WORD_SIZE
# ]
# stem_to_words = {}
# for word in words:
#     stemmed = stem.stem(word)
#     if stemmed not in stem_to_words:
#         stem_to_words[stemmed] = set()
#     stem_to_words[stemmed].add(word)
# final_words = []
# for stem, words in stem_to_words.items():
#     shortest = min(words, key=len)
#     final_words.append(shortest)
# with open("words.json", "w") as f:
#     f.write(json.dumps(final_words))
def _normalize_answer(raw):
    """Return the crossword word derived from a Jeopardy answer, or None.

    The answer is lower-cased and a single leading article ("the " or "a ")
    is removed.  The remainder is accepted only if it consists solely of
    letters and its length lies between MIN_WORD_SIZE and MAX_WORD_SIZE
    (inclusive); otherwise None is returned.
    """
    word = raw.lower()
    # Strip the article BEFORE the isalpha() check: an answer that still
    # carries its article contains a space and would always be rejected,
    # which made the article handling unreachable.
    if word.startswith("the "):
        word = word[4:]
    elif word.startswith("a "):
        word = word[2:]
    if not word.isalpha():
        return None
    if len(word) < MIN_WORD_SIZE or len(word) > MAX_WORD_SIZE:
        return None
    return word


# JSON text is UTF-8; do not depend on the platform's default encoding.
with open("jeopardy.json", "r", encoding="utf-8") as f:
    jeopardy = json.load(f)

# A set removes duplicate answers that occur in several rows.
answers = set()
for row in jeopardy:
    answer = _normalize_answer(row["answer"])
    if answer is not None:
        answers.add(answer)

with open("words.json", "w", encoding="utf-8") as f:
    # Sort so the generated file is identical from run to run; set iteration
    # order for strings varies between interpreter invocations.
    json.dump(sorted(answers), f)