"""Build ``words.json`` from the single-word answers found in ``jeopardy.json``.

Each row's ``"answer"`` field is lower-cased, stripped of one leading article
("the " or "a "), and kept only if what remains is purely alphabetic and its
length lies between MIN_WORD_SIZE and MAX_WORD_SIZE (inclusive).  The unique
surviving words are written to ``words.json`` as a JSON array.
"""

import json

# NOTE: the nltk names below are referenced only by the disabled generator
# further down; they are kept so that code can be re-enabled as-is.
from nltk import FreqDist
from nltk.corpus import brown
from nltk.stem.lancaster import LancasterStemmer

WORD_LIMIT = 10000
MIN_WORD_SIZE, MAX_WORD_SIZE = 4, 10

# Leading articles removed from an answer before it is validated.
_ARTICLES = ("the ", "a ")

# Disabled earlier generator, kept for reference: it takes the WORD_LIMIT most
# frequent Brown-corpus words, filters them by the same alphabetic/length
# rules, groups them by Lancaster stem, and writes the shortest word of each
# stem group to words.json.
#
# stem = LancasterStemmer()
# frequency_list = FreqDist(i.lower() for i in brown.words())
# words = [
#     w.lower()
#     for w, _ in frequency_list.most_common()[:WORD_LIMIT]
#     if w.isalpha() and len(w) >= MIN_WORD_SIZE and len(w) <= MAX_WORD_SIZE
# ]
# stem_to_words = {}
# for word in words:
#     stemmed = stem.stem(word)
#     if stemmed not in stem_to_words:
#         stem_to_words[stemmed] = set()
#     stem_to_words[stemmed].add(word)
# final_words = []
# for stem, words in stem_to_words.items():
#     shortest = min(words, key=len)
#     final_words.append(shortest)
# with open("words.json", "w") as f:
#     f.write(json.dumps(final_words))


def _normalize_answer(raw):
    """Return the cleaned single-word form of *raw*, or ``None`` to reject it.

    The text is lower-cased and at most one leading article from ``_ARTICLES``
    is removed.  The result is rejected when it is not purely alphabetic
    (which also rejects anything still containing a space, digit, or
    punctuation, and the empty string) or when its length falls outside
    ``MIN_WORD_SIZE..MAX_WORD_SIZE``.
    """
    word = raw.lower()

    # The article must be stripped BEFORE the isalpha() check: "the beatles"
    # contains a space, so validating first would reject it and the strip
    # would never run.
    for article in _ARTICLES:
        if word.startswith(article):
            word = word[len(article):]
            break  # strip a single article only

    if not word.isalpha():
        return None
    if not MIN_WORD_SIZE <= len(word) <= MAX_WORD_SIZE:
        return None
    return word


with open("jeopardy.json", "r", encoding="utf-8") as f:
    jeopardy = json.load(f)

answers = set()
for row in jeopardy:
    answer = _normalize_answer(row["answer"])
    if answer is not None:
        answers.add(answer)

# Sorted so the output file is identical from run to run; set iteration
# order for strings is not stable across interpreter invocations.
with open("words.json", "w", encoding="utf-8") as f:
    json.dump(sorted(answers), f)