KAI MAURIN-JONES committed
Commit
d358aa1
1 parent: 29ca9e3

solving function updated: the multi-mode `bias` parameter ('entropy' / 'common' / 'rare' / 'no_bias') is removed; next guesses now always maximize entropy, with ties broken by the most frequent two-letter prefixes/suffixes via the new `get_gram_freq()` helper

__pycache__/wordle_functions.cpython-310.pyc ADDED
Binary file (30 kB)
 
wordle_functions.py CHANGED
@@ -301,14 +301,13 @@ def get_word_distribution(word_list: list, sort: str = "descending"):
301
  ############################################################################################################################################################
302
 
303
  def wordle_wizard(word_list: list, max_guesses: int = None,
304
- guess: str = None, target: str = None, bias: bool = True,
305
  random_guess: bool = False, random_target: bool = False,
306
  verbose: bool = False, drama: float = None,
307
  return_stats: bool = False, record: bool = False, hf_mod: bool = True):
308
  """
309
  Mimicking the popular web game, this function matches a current word to a target word automatically, in the most statistically optimal way possible.
310
 
311
- ------
312
  Parameters:
313
  ------
314
  `word_list`: list
@@ -317,15 +316,6 @@ def wordle_wizard(word_list: list, max_guesses: int = None,
317
  a string -- must be the same length as `target_word`
318
  `target`: str
319
  a string -- must be the same length as `opening_word`
320
- `bias`: str ['entropy', 'common', 'rare', None]
321
- 'entropy' biases next word guesses to be the ones with the highest impact on the range of next possible guesses. Entropy values associated with each word are normalized across the list.
322
-
323
- 'common' biases next word guesses to be words that are more commonly used
324
-
325
- 'rare' biases next word guesses to be words that are more rarely used
326
-
327
- 'no_bias' chooses a next guess at random of all available guesses
328
-
329
  `max_guesses`: int
330
  the maximum number of attempts allowed to solve the Wordle
331
  `random_guess`: bool
@@ -341,13 +331,15 @@ def wordle_wizard(word_list: list, max_guesses: int = None,
341
  `record`: bool
342
  if True, creates a .txt file with the same information printed according to the indicated verbosity
343
 
344
- ------
345
  Returns:
346
  ------
347
  `stats_dict`: dict
348
  dictionary containing various statistics about the function's performance trying to solve the puzzle
349
  """
350
 
351
  sugg_words = []
352
 
353
  for i in range(0, 20):
@@ -658,100 +650,55 @@ def wordle_wizard(word_list: list, max_guesses: int = None,
658
 
659
  else:
660
 
661
- if bias == "entropy":
662
-
663
- best_next_guesses = list(potential_next_guesses)
664
- # print (best_next_guesses)
665
- word_ratings = get_word_entropy(best_next_guesses, word_list, normalized = True, ascending = False) # "internal" ratings
666
-
667
- # Get max rated word
668
- max_rating = -np.inf
669
- for word, rating in word_ratings:
670
- if rating > max_rating:
671
- max_rating = rating
672
-
673
- for word, rating in word_ratings:
674
- if rating == max_rating:
675
- guess = word
676
-
677
- guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])
678
-
679
- if return_stats == False:
680
- if verbose == True:
681
- if len(word_ratings) <= 40:
682
- print(f"All potential next guesses:\n\t{word_ratings}\n")
683
- print(f"Words guessed so far:\n\t{guessed_words}.\n")
684
- record_list.append(f"Potential next guesses: {word_ratings}\n")
685
- record_list.append(f"Words guessed so far: {guessed_words}.\n")
686
- else:
687
- print(f"The top 40 potential next guesses are:\n\t{word_ratings[:40]}\n")
688
- print(f"Words guessed so far:\n\t{guessed_words}.\n")
689
- record_list.append(f"The top 40 potential next guesses are: {word_ratings[:40]}\n")
690
- record_list.append(f"Words guessed so far: {guessed_words}.\n")
691
-
692
- if bias == "no_bias":
693
- best_next_guesses = set()
694
- for word in potential_next_guesses:
695
- for letter, freq in word_list_sorted_counts:
696
- if letter not in dont_guess_again:
697
- if len(next_letters) > 0:
698
- if letter in next_letters:
699
- if letter in word:
700
- best_next_guesses.add(word)
701
- break
702
- else:
703
- if letter in word:
704
- best_next_guesses.add(word)
705
- break
706
-
707
- if return_stats == False:
708
- if verbose == True:
709
- if len(best_next_guesses) <= 40:
710
- print(f"Potential next guesses:\n\t{best_next_guesses}\n")
711
- print(f"Words guessed so far:\n\t{guessed_words}.\n")
712
- record_list.append(f"Potential next guesses: {best_next_guesses}\n")
713
- record_list.append(f"Words guessed so far: {guessed_words}.\n")
714
-
715
- if bias == ("common" or "rare"):
716
- found_words = []
717
- for word in word_list:
718
- if word in nltk_counts.keys():
719
- found_words.append(word)
720
-
721
- found_words_sorted = sorted(found_words, key = operator.itemgetter(1), reverse = True) # sorted descending
722
-
723
- rated_words = []
724
- for word in potential_next_guesses:
725
- for tup in found_words_sorted:
726
- if tup[0] == word:
727
- rated_words.append(tup)
728
-
729
- rated_words = sorted(rated_words, key = operator.itemgetter(1), reverse = True) # sorted descending
730
-
731
- if bias == "common":
732
- guess = rated_words[0][0] # word in first position // most frequent word
733
-
734
- if return_stats == False:
735
- if verbose == True:
736
- if len(potential_next_guesses) <= 40:
737
- print(f"Potential next guesses:\n\t{rated_words}\n")
738
- print(f"Words guessed so far:\n\t{guessed_words}.\n")
739
- record_list.append(f"Potential next guesses: {potential_next_guesses}\n")
740
- record_list.append(f"Words guessed so far: {guessed_words}.\n")
741
-
742
- if bias == "rare":
743
- guess = rated_words[-1][0] # word in last position // least frequent word
744
-
745
- if return_stats == False:
746
- if verbose == True:
747
- if len(potential_next_guesses) <= 40:
748
- print(f"Potential next guesses:\n\t{rated_words}\n")
749
- print(f"Words guessed so far:\n\t{guessed_words}.\n")
750
- record_list.append(f"Potential next guesses: {potential_next_guesses}\n")
751
- record_list.append(f"Words guessed so far: {guessed_words}.\n")
752
-
753
- # guess = list(best_next_guesses)[0]
754
- guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])
755
 
756
  #### Guess has now been made -- what to do next
757
  if guess_num == max_guesses: # if at max guesses allowed
@@ -850,9 +797,6 @@ def wordle_wizard(word_list: list, max_guesses: int = None,
850
  # average_entropy = 95
851
  luck = round(1 - ((((guess_num / expected_guesses) * (stats_dict['avg_intermediate_guess_entropy'] / 100)) / max_guesses) * 5), 2)
852
  stats_dict['luck'] = luck
853
-
854
- stats_dict['bias'] = bias
855
-
856
 
857
  if record == True:
858
  if verbose == True:
@@ -864,7 +808,6 @@ def wordle_wizard(word_list: list, max_guesses: int = None,
864
  for line in record_list:
865
  fout.write(line + "\n") # write
866
 
867
-
868
  # if guess_num <= len(guess):
869
  if guess_num <= 6:
870
  stats_dict['valid_success'] = True
@@ -877,11 +820,64 @@ def wordle_wizard(word_list: list, max_guesses: int = None,
877
  # return stats_dict
878
  if hf_mod == True:
879
  return record_list
880
-
881
  ############################################################################################################################################################
882
  ############################################################################################################################################################
883
  ############################################################################################################################################################
884
  ############################################################################################################################################################
885
 
886
  def compare_wordle(word_list: list, max_guesses: int = None, guess_list: list = None,
887
  player: str = None, target: str = None,
 
301
  ############################################################################################################################################################
302
 
303
  def wordle_wizard(word_list: list, max_guesses: int = None,
304
+ guess: str = None, target: str = None,
305
  random_guess: bool = False, random_target: bool = False,
306
  verbose: bool = False, drama: float = None,
307
  return_stats: bool = False, record: bool = False, hf_mod: bool = True):
308
  """
309
  Mimicking the popular web game, this function matches a current word to a target word automatically, in the most statistically optimal way possible.
310
 
311
  Parameters:
312
  ------
313
  `word_list`: list
 
316
  a string -- must be the same length as `target_word`
317
  `target`: str
318
  a string -- must be the same length as `opening_word`
319
  `max_guesses`: int
320
  the maximum number of attempts allowed to solve the Wordle
321
  `random_guess`: bool
 
331
  `record`: bool
332
  if True, creates a .txt file with the same information printed according to the indicated verbosity
333
 
334
  Returns:
335
  ------
336
  `stats_dict`: dict
337
  dictionary containing various statistics about the function's performance trying to solve the puzzle
338
  """
339
 
340
+ guess = guess.lower()
341
+ target = target.lower()
342
+
343
  sugg_words = []
344
 
345
  for i in range(0, 20):
 
650
 
651
  else:
652
 
653
+ best_next_guesses = list(potential_next_guesses)
654
+ # print (best_next_guesses)
655
+ word_ratings = get_word_entropy(best_next_guesses, word_list, normalized = True, ascending = False) # "internal" ratings
656
+
657
+ # Get max rating of all words
658
+ max_rating = -np.inf
659
+ for word, rating in word_ratings:
660
+ if rating > max_rating:
661
+ max_rating = rating
662
+
663
+ # collect the best-rated words (all tied for max entropy among the next guesses) into a list
664
+ best_of_the_best_1 = []
665
+ for word, rating in word_ratings:
666
+ if rating == max_rating:
667
+ best_of_the_best_1.append(word)
668
+
669
+ # only use the ten most frequent prefixes and suffixes for the bias; beyond that, the impact is negligible
670
+ test_starts = get_gram_freq(word_list = word_list, letters_length = 2, position = "start", search = None)[:10]
671
+ test_ends = get_gram_freq(word_list = word_list, letters_length = 2, position = "end", search = None)[:10]
672
+
673
+ # list of the best words that also have the best suffixes and prefixes
674
+ best_of_the_best_2 = []
675
+ for start_gram, start_count in test_starts:
676
+ for end_gram, end_count in test_ends:
677
+ for word in best_of_the_best_1:
678
+ if word[:2] == start_gram and word[-2:] == end_gram:
679
+ best_of_the_best_2.append(word)
680
+
681
+ if len(best_of_the_best_2) > 0:
682
+ guess = best_of_the_best_2[0]
683
+ else:
684
+ guess = best_of_the_best_1[0] # they're all equally the best of the best possible guesses so just pick the first
685
+
686
+ # guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])
687
+
688
+ if return_stats == False:
689
+ if verbose == True:
690
+ if len(word_ratings) <= 40:
691
+ print(f"All potential next guesses:\n\t{word_ratings}\n")
692
+ print(f"Words guessed so far:\n\t{guessed_words}.\n")
693
+ record_list.append(f"Potential next guesses: {word_ratings}\n")
694
+ record_list.append(f"Words guessed so far: {guessed_words}.\n")
695
+ else:
696
+ print(f"The top 40 potential next guesses are:\n\t{word_ratings[:40]}\n")
697
+ print(f"Words guessed so far:\n\t{guessed_words}.\n")
698
+ record_list.append(f"The top 40 potential next guesses are: {word_ratings[:40]}\n")
699
+ record_list.append(f"Words guessed so far: {guessed_words}.\n")
700
+
701
+ guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])
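The hunk above replaces the old multi-mode bias logic with a single strategy: keep every candidate tied for maximum entropy, then prefer candidates whose first and last two letters are among the ten most frequent bigrams in the word list. A minimal standalone sketch of that tie-break (rate() is a stand-in for this module's get_word_entropy(), and top_bigrams() / pick_guess() are hypothetical names, not part of wordle_functions.py):

from collections import Counter

def top_bigrams(words, position="start", n=10):
    # counts of 2-letter prefixes or suffixes, mirroring get_gram_freq()
    grams = Counter(w[:2] if position == "start" else w[-2:] for w in words)
    return {g for g, _ in grams.most_common(n)}

def pick_guess(candidates, rate, word_list):
    best = max(rate(w) for w in candidates)
    tied = [w for w in candidates if rate(w) == best]          # best_of_the_best_1
    starts = top_bigrams(word_list, "start")
    ends = top_bigrams(word_list, "end")
    # best_of_the_best_2: tied words with both a common prefix and a common suffix
    preferred = [w for w in tied if w[:2] in starts and w[-2:] in ends]
    return preferred[0] if preferred else tied[0]

# toy run; len(set(w)) stands in for normalized entropy
words = ["shale", "shame", "crate", "zizit"]
print(pick_guess(words, rate=lambda w: len(set(w)), word_list=words))  # -> 'shale'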
702
 
703
  #### Guess has now been made -- what to do next
704
  if guess_num == max_guesses: # if at max guesses allowed
 
797
  # average_entropy = 95
798
  luck = round(1 - ((((guess_num / expected_guesses) * (stats_dict['avg_intermediate_guess_entropy'] / 100)) / max_guesses) * 5), 2)
799
  stats_dict['luck'] = luck
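A quick sanity check on the luck formula with hypothetical values (expected_guesses is computed earlier in the function and not shown in this hunk): with guess_num = 4, expected_guesses = 4, avg_intermediate_guess_entropy = 95, and max_guesses = 6, luck = round(1 - (((4/4) * 0.95) / 6) * 5, 2) = round(1 - 0.7917, 2) = 0.21.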
800
 
801
  if record == True:
802
  if verbose == True:
 
808
  for line in record_list:
809
  fout.write(line + "\n") # write
810
 
 
  # if guess_num <= len(guess):
812
  if guess_num <= 6:
813
  stats_dict['valid_success'] = True
 
820
  # return stats_dict
821
  if hf_mod == True:
822
  return record_list
823
+
824
  ############################################################################################################################################################
825
  ############################################################################################################################################################
826
  ############################################################################################################################################################
827
  ############################################################################################################################################################
828
+
829
+ def get_gram_freq(word_list: list, letters_length: int = 2, position: str = "start", search: str = None):
830
+ """
831
+ Given a word list, a gram length in letters, a word position to anchor the gram ("start" or "end"),
831
+ and an optional gram to search for within the list, this function builds a frequency distribution of all n-grams
832
+ from the passed word list and returns it in descending order.
834
+
835
+ Parameters:
836
+ ------
837
+ `word_list`: list
838
+ list of words of the same length
839
+ `letters_length`: int
840
+ number of letters in succession. Size/length of "gram". Must be between 1 and length of words in word list
841
+ `position`: str
842
+ Whether to start the gram from the start of the word (like a prefix) or the end of the word (like a suffix)
843
+ `search`: str
844
+ If != None, string of characters to search for within the generated list. If string not found in list, function will print an error message.
845
+
846
+ Returns:
847
+ ------
848
+ `tup`: tuple
849
+ If search != None, will return a tuple with the passed search criteria, and its count
850
+ `sorted_gram_list`: list
851
+ List of tuples in the form (gram, count) for each gram of the given size in the passed word_list
852
+ """
853
+
854
+ gram_freq_dist = {}
855
+
856
+ for word in word_list:
857
+ if position == "start":
858
+ gram = word[:letters_length] # first letters_length letters
859
+ if position == "end":
860
+ gram = word[-(letters_length):] # last letters_length letters
861
+
862
+ if gram not in gram_freq_dist:
863
+ gram_freq_dist[gram] = 1
864
+ else:
865
+ gram_freq_dist[gram] += 1
866
+
867
+ sorted_gram_dist = sorted(gram_freq_dist.items(), key = operator.itemgetter(1), reverse = True)
868
+
869
+ if search:
870
+ nos = []
871
+ for tup in sorted_gram_dist:
872
+ if tup[0] == search:
873
+ return tup
874
+ else:
875
+ nos.append("not here")
876
+
877
+ if len(nos) == len(sorted_gram_dist):
878
+ print ("Search criteria not found in list. Please enter a gram from within the list.")
879
+ else:
880
+ return sorted_gram_dist
881
 
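A quick usage sketch of the new helper, with a toy word list standing in for the notebook's official_words (expected output shown in comments):

words = ["crate", "crane", "shale", "shame"]
print(get_gram_freq(word_list = words, letters_length = 2, position = "start"))
# -> [('cr', 2), ('sh', 2)]
print(get_gram_freq(word_list = words, letters_length = 2, position = "end", search = "le"))
# -> ('le', 1)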
882
  def compare_wordle(word_list: list, max_guesses: int = None, guess_list: list = None,
883
  player: str = None, target: str = None,
wordle_testing.ipynb ADDED
@@ -0,0 +1,1654 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Imports"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "import numpy as np\n",
18
+ "import random\n",
19
+ "import operator\n",
20
+ "import time\n",
21
+ "import pandas as pd\n",
22
+ "from wordle_functions import *"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "markdown",
27
+ "metadata": {},
28
+ "source": [
29
+ "## Importing datasets"
30
+ ]
31
+ },
32
+ {
33
+ "attachments": {},
34
+ "cell_type": "markdown",
35
+ "metadata": {},
36
+ "source": [
37
+ "### official words\n",
38
+ "- official wordle word list"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 2,
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "name": "stdout",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "2310\n"
51
+ ]
52
+ },
53
+ {
54
+ "data": {
55
+ "text/plain": [
56
+ "['wince', 'thyme', 'mower', 'horde', 'heard']"
57
+ ]
58
+ },
59
+ "execution_count": 2,
60
+ "metadata": {},
61
+ "output_type": "execute_result"
62
+ }
63
+ ],
64
+ "source": [
65
+ "### Official list\n",
66
+ "official_words = []\n",
67
+ "\n",
68
+ "with open(\"data/official_words_processed.txt\", \"r\", encoding = \"utf-8\") as f:\n",
69
+ " for word in f.read().split(\"\\n\"):\n",
70
+ " official_words.append(word)\n",
71
+ "\n",
72
+ "f.close() # closes connection to file\n",
73
+ "\n",
74
+ "print(len(official_words))\n",
75
+ "official_words[:5]"
76
+ ]
77
+ },
78
+ {
79
+ "attachments": {},
80
+ "cell_type": "markdown",
81
+ "metadata": {},
82
+ "source": [
83
+ "### alternative list 1\n",
84
+ "- an alternate list of 5-letter words found on the web"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 3,
90
+ "metadata": {},
91
+ "outputs": [
92
+ {
93
+ "ename": "FileNotFoundError",
94
+ "evalue": "[Errno 2] No such file or directory: 'data/alt_words_1.txt'",
95
+ "output_type": "error",
96
+ "traceback": [
97
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
98
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
99
+ "\u001b[1;32m/Users/kmaurinjones/Desktop/data_science/data_science_projects/wordle_wizard/wordle_testing.ipynb Cell 7\u001b[0m in \u001b[0;36m<cell line: 4>\u001b[0;34m()\u001b[0m\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kmaurinjones/Desktop/data_science/data_science_projects/wordle_wizard/wordle_testing.ipynb#W6sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39m### Official list\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kmaurinjones/Desktop/data_science/data_science_projects/wordle_wizard/wordle_testing.ipynb#W6sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m alt_words_1 \u001b[39m=\u001b[39m []\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/kmaurinjones/Desktop/data_science/data_science_projects/wordle_wizard/wordle_testing.ipynb#W6sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(\u001b[39m\"\u001b[39;49m\u001b[39mdata/alt_words_1.txt\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mr\u001b[39;49m\u001b[39m\"\u001b[39;49m, encoding \u001b[39m=\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39mutf-8\u001b[39;49m\u001b[39m\"\u001b[39;49m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kmaurinjones/Desktop/data_science/data_science_projects/wordle_wizard/wordle_testing.ipynb#W6sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m \u001b[39mfor\u001b[39;00m word \u001b[39min\u001b[39;00m f\u001b[39m.\u001b[39mread()\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m):\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/kmaurinjones/Desktop/data_science/data_science_projects/wordle_wizard/wordle_testing.ipynb#W6sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m alt_words_1\u001b[39m.\u001b[39mappend(word)\n",
100
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/alt_words_1.txt'"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "### Official list\n",
106
+ "alt_words_1 = []\n",
107
+ "\n",
108
+ "with open(\"data/alt_words_1.txt\", \"r\", encoding = \"utf-8\") as f:\n",
109
+ " for word in f.read().split(\"\\n\"):\n",
110
+ " alt_words_1.append(word)\n",
111
+ "\n",
112
+ "f.close() # closes connection to file\n",
113
+ "\n",
114
+ "print(len(alt_words_1))\n",
115
+ "alt_words_1[:5]"
116
+ ]
117
+ },
118
+ {
119
+ "attachments": {},
120
+ "cell_type": "markdown",
121
+ "metadata": {},
122
+ "source": [
123
+ "### nltk grand corpus\n",
124
+ "- Amalgamation of all words in various NLTK corpora to have as big a dataset as possible\n",
125
+ "- Developed manually"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": null,
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "### grand corpus tokens\n",
135
+ "nltk_tokens = []\n",
136
+ "\n",
137
+ "with open(\"data/nltk_grand_corpus_tokens_5.txt\", \"r\", encoding = \"utf-8\") as f:\n",
138
+ " for word in f.read().split(\"\\n\"):\n",
139
+ " nltk_tokens.append(word)\n",
140
+ "\n",
141
+ "f.close() # closes connection to file\n",
142
+ "\n",
143
+ "print(len(nltk_tokens))\n",
144
+ "nltk_tokens[:5]"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "markdown",
149
+ "metadata": {},
150
+ "source": [
151
+ "### nltk grand corpus types and counts"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "### grand corpus types and counts\n",
161
+ "nltk_counts = {}\n",
162
+ "\n",
163
+ "with open(\"data/nltk_grand_corpus_types_and_counts_5.txt\", \"r\", encoding = \"utf-8\") as f:\n",
164
+ " for line in f.read().split(\"\\n\"):\n",
165
+ " if len(line.split(\"\\t\")) == 2:\n",
166
+ " word = line.split(\"\\t\")[0]\n",
167
+ " count = line.split(\"\\t\")[1]\n",
168
+ " nltk_counts[word] = count\n",
169
+ " else:\n",
170
+ " continue\n",
171
+ "\n",
172
+ "f.close() # closes connection to file\n",
173
+ "\n",
174
+ "print(len(nltk_counts))\n",
175
+ "nltk_counts['which']"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": null,
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "### Official list\n",
185
+ "official_words = []\n",
186
+ "\n",
187
+ "with open(\"data/official_words_processed.txt\", \"r\", encoding = \"utf-8\") as f:\n",
188
+ " for word in f.read().split(\"\\n\"):\n",
189
+ " if len(word) > 0: # there's one blank entry at the start\n",
190
+ " official_words.append(word)\n",
191
+ "\n",
192
+ "f.close() # closes connection to file\n",
193
+ "\n",
194
+ "print(len(official_words))\n",
195
+ "official_words[:10]"
196
+ ]
197
+ },
198
+ {
199
+ "attachments": {},
200
+ "cell_type": "markdown",
201
+ "metadata": {},
202
+ "source": [
203
+ "## Wordle functions + Testing"
204
+ ]
205
+ },
206
+ {
207
+ "attachments": {},
208
+ "cell_type": "markdown",
209
+ "metadata": {},
210
+ "source": [
211
+ "### Testing `wordle_wizard()`"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 13,
217
+ "metadata": {},
218
+ "outputs": [
219
+ {
220
+ "name": "stdout",
221
+ "output_type": "stream",
222
+ "text": [
223
+ "-----------------------------\n",
224
+ "\n",
225
+ "Guess 1: 'poesy'\n",
226
+ "Letters in correct positions:\n",
227
+ "\t[]\n",
228
+ "\n",
229
+ "Letters in incorrect positions:\n",
230
+ "\t[('e', 2)]\n",
231
+ "\n",
232
+ "Letters to guess again:\n",
233
+ "\t['e']\n",
234
+ "\n",
235
+ "Letters to not guess again:\n",
236
+ "\t['o', 'p', 's', 'y']\n",
237
+ "\n",
238
+ "At this point:\n",
239
+ "\t1905, 82.47% of total words have been eliminated, and\n",
240
+ "\t405, 17.53% of total words remain possible.\n",
241
+ "\n",
242
+ "The top 40 potential next guesses are:\n",
243
+ "\t[('alter', 100.0), ('later', 100.0), ('irate', 98.33), ('renal', 94.34), ('learn', 94.34), ('react', 91.3), ('crate', 91.3), ('trace', 91.3), ('cater', 91.3), ('trade', 88.34), ('leant', 88.3), ('heart', 88.13), ('earth', 88.13), ('hater', 88.13), ('aider', 86.22), ('alien', 86.18), ('crane', 85.68), ('tamer', 85.55), ('grate', 85.35), ('realm', 85.1), ('regal', 84.89), ('glare', 84.89), ('lager', 84.89), ('large', 84.89), ('eclat', 84.76), ('blare', 83.81), ('baler', 83.81), ('inter', 83.81), ('liner', 83.35), ('after', 82.43), ('flare', 81.93), ('feral', 81.93), ('delta', 81.81), ('dealt', 81.81), ('taker', 81.72), ('lathe', 81.6), ('water', 81.14), ('trice', 80.31), ('afire', 80.31), ('ramen', 79.93)]\n",
244
+ "\n",
245
+ "Words guessed so far:\n",
246
+ "\t['poesy'].\n",
247
+ "\n",
248
+ "Next guess:\n",
249
+ "\t'alter'\n",
250
+ "\n",
251
+ "-----------------------------\n",
252
+ "\n",
253
+ "Guess 2: 'alter'\n",
254
+ "Letters in correct positions:\n",
255
+ "\t[]\n",
256
+ "\n",
257
+ "Letters in incorrect positions:\n",
258
+ "\t[('a', 0), ('e', 2), ('t', 2), ('e', 3), ('r', 4)]\n",
259
+ "\n",
260
+ "Letters to guess again:\n",
261
+ "\t['a', 'e', 'r', 't']\n",
262
+ "\n",
263
+ "Letters to not guess again:\n",
264
+ "\t['l', 'o', 'p', 's', 'y']\n",
265
+ "\n",
266
+ "At this point:\n",
267
+ "\t2301, 99.61% of total words have been eliminated, and\n",
268
+ "\t9, 0.39% of total words remain possible.\n",
269
+ "\n",
270
+ "All potential next guesses:\n",
271
+ "\t[('irate', 100.0), ('crate', 70.91), ('trace', 70.91), ('react', 70.91), ('trade', 58.69), ('heart', 57.83), ('earth', 57.83), ('grate', 46.3), ('terra', 0.0)]\n",
272
+ "\n",
273
+ "Words guessed so far:\n",
274
+ "\t['poesy', 'alter'].\n",
275
+ "\n",
276
+ "Next guess:\n",
277
+ "\t'irate'\n",
278
+ "\n",
279
+ "-----------------------------\n",
280
+ "\n",
281
+ "Guess 3: 'irate'\n",
282
+ "Letters in correct positions:\n",
283
+ "\t[('r', 1), ('a', 2), ('t', 3), ('e', 4)]\n",
284
+ "\n",
285
+ "Letters in incorrect positions:\n",
286
+ "\t[('a', 0), ('e', 2), ('t', 2), ('e', 3), ('r', 4)]\n",
287
+ "\n",
288
+ "Letters to guess again:\n",
289
+ "\t['a', 'e', 'r', 't']\n",
290
+ "\n",
291
+ "Letters to not guess again:\n",
292
+ "\t['i', 'l', 'o', 'p', 's', 'y']\n",
293
+ "\n",
294
+ "At this point:\n",
295
+ "\t2308, 99.91% of total words have been eliminated, and\n",
296
+ "\t2, 0.09% of total words remain possible.\n",
297
+ "\n",
298
+ "All potential next guesses:\n",
299
+ "\t[('crate', 100.0), ('grate', 0.0)]\n",
300
+ "\n",
301
+ "Words guessed so far:\n",
302
+ "\t['poesy', 'alter', 'irate'].\n",
303
+ "\n",
304
+ "Next guess:\n",
305
+ "\t'crate'\n",
306
+ "\n",
307
+ "-----------------------------\n",
308
+ "\n",
309
+ "Guess 4: 'crate'\n",
310
+ "Letters in correct positions:\n",
311
+ "\t[('r', 1), ('a', 2), ('t', 3), ('e', 4)]\n",
312
+ "\n",
313
+ "Letters in incorrect positions:\n",
314
+ "\t[('a', 0), ('e', 2), ('t', 2), ('e', 3), ('r', 4)]\n",
315
+ "\n",
316
+ "Letters to guess again:\n",
317
+ "\t['a', 'e', 'r', 't']\n",
318
+ "\n",
319
+ "Letters to not guess again:\n",
320
+ "\t['c', 'i', 'l', 'o', 'p', 's', 'y']\n",
321
+ "\n",
322
+ "At this point:\n",
323
+ "\t2309, 99.96% of total words have been eliminated, and\n",
324
+ "\t1, 0.04% of total words remain possible.\n",
325
+ "\n",
326
+ "The only remaining possible word is:\n",
327
+ "\t'grate'\n",
328
+ "\n",
329
+ "Next guess:\n",
330
+ "\t'grate'\n",
331
+ "\n",
332
+ "-----------------------------\n",
333
+ "\n",
334
+ "Guess 5: 'grate'\n",
335
+ "\n",
336
+ "Congratulations! The Wordle has been solved in 5 guesses!\n",
337
+ "There were still 1 guesses remaining.\n",
338
+ "\n",
339
+ "The target word was 'grate'.\n",
340
+ "\n",
341
+ "-----------------------------\n"
342
+ ]
343
+ }
344
+ ],
345
+ "source": [
346
+ "test_1 = wordle_wizard(word_list = official_words, max_guesses = 6, \n",
347
+ " guess = \"paint\", target = \"force\",\n",
348
+ " random_guess = True, random_target = True, \n",
349
+ " verbose = True, drama = 0, return_stats = False, record = False)"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": null,
355
+ "metadata": {},
356
+ "outputs": [],
357
+ "source": [
358
+ "for val in [False, True]:\n",
359
+ " wordle_wizard(word_list = official_words, max_guesses = 6, \n",
360
+ " guess = \"arose\", target = \"syrup\", bias = 'entropy', \n",
361
+ " random_guess = False, random_target = False, \n",
362
+ " verbose = val, drama = 0, return_stats = False, record = True)"
363
+ ]
364
+ },
365
+ {
366
+ "attachments": {},
367
+ "cell_type": "markdown",
368
+ "metadata": {},
369
+ "source": [
370
+ "### Testing on 3-letter words"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": null,
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "### 3 letters\n",
380
+ "words_3_letters = []\n",
381
+ "words_3_types_counts = {}\n",
382
+ "\n",
383
+ "with open(\"data/nltk_grand_corpus_types_and_counts_3.txt\", \"r\", encoding = \"utf-8\") as f:\n",
384
+ " for line in f.read().split(\"\\n\"):\n",
385
+ " word_freq = line.split(\"\\t\")\n",
386
+ " if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line\n",
387
+ " word = word_freq[0]\n",
388
+ " freq = word_freq[1]\n",
389
+ " if word.isascii() == True:\n",
390
+ " words_3_letters.append(word)\n",
391
+ " words_3_types_counts[word] = freq\n",
392
+ "\n",
393
+ "f.close() # closes connection to file\n",
394
+ "\n",
395
+ "print(len(words_3_letters))\n",
396
+ "print(words_3_letters[:5])\n",
397
+ "words_3_types_counts['the']"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": null,
403
+ "metadata": {},
404
+ "outputs": [],
405
+ "source": [
406
+ "for val in [False, True]:\n",
407
+ " wordle_wizard(word_list = words_3_letters, max_guesses = 6, \n",
408
+ " guess = \"the\", target = \"his\", bias = 'entropy', \n",
409
+ " random_guess = False, random_target = False, \n",
410
+ " verbose = val, drama = 0, return_stats = False, record = True)"
411
+ ]
412
+ },
413
+ {
414
+ "attachments": {},
415
+ "cell_type": "markdown",
416
+ "metadata": {},
417
+ "source": [
418
+ "### Testing on 4-letter words"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": null,
424
+ "metadata": {},
425
+ "outputs": [],
426
+ "source": [
427
+ "### 3 letters\n",
428
+ "words_4_letters = []\n",
429
+ "words_4_types_counts = {}\n",
430
+ "\n",
431
+ "with open(\"data/nltk_grand_corpus_types_and_counts_4.txt\", \"r\", encoding = \"utf-8\") as f:\n",
432
+ " for line in f.read().split(\"\\n\"):\n",
433
+ " word_freq = line.split(\"\\t\")\n",
434
+ " if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line\n",
435
+ " word = word_freq[0]\n",
436
+ " freq = word_freq[1]\n",
437
+ " if word.isascii() == True:\n",
438
+ " words_4_letters.append(word)\n",
439
+ " words_4_types_counts[word] = freq\n",
440
+ "\n",
441
+ "f.close() # closes connection to file\n",
442
+ "\n",
443
+ "print(len(words_4_letters))\n",
444
+ "print(words_4_letters[:5])\n",
445
+ "words_4_types_counts['that']"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": null,
451
+ "metadata": {},
452
+ "outputs": [],
453
+ "source": [
454
+ "for val in [False, True]:\n",
455
+ " wordle_wizard(word_list = words_4_letters, max_guesses = 6, \n",
456
+ " guess = \"have\", target = \"this\", bias = 'entropy', \n",
457
+ " random_guess = False, random_target = False, \n",
458
+ " verbose = val, drama = 0, return_stats = False, record = True)"
459
+ ]
460
+ },
461
+ {
462
+ "attachments": {},
463
+ "cell_type": "markdown",
464
+ "metadata": {},
465
+ "source": [
466
+ "### Testing on 6-letter words"
467
+ ]
468
+ },
469
+ {
470
+ "cell_type": "code",
471
+ "execution_count": null,
472
+ "metadata": {},
473
+ "outputs": [],
474
+ "source": [
475
+ "### 6 letters\n",
476
+ "words_6_letters = []\n",
477
+ "words_6_types_counts = {}\n",
478
+ "\n",
479
+ "with open(\"data/nltk_grand_corpus_types_and_counts_6.txt\", \"r\", encoding = \"utf-8\") as f:\n",
480
+ " for line in f.read().split(\"\\n\"):\n",
481
+ " word_freq = line.split(\"\\t\")\n",
482
+ " if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line\n",
483
+ " word = word_freq[0]\n",
484
+ " freq = word_freq[1]\n",
485
+ " if word.isascii() == True:\n",
486
+ " words_6_letters.append(word)\n",
487
+ " words_6_types_counts[word] = freq\n",
488
+ "\n",
489
+ "f.close() # closes connection to file\n",
490
+ "\n",
491
+ "print(len(words_6_letters))\n",
492
+ "print(words_6_letters[:5])\n",
493
+ "words_6_types_counts[words_6_letters[0]]"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": null,
499
+ "metadata": {},
500
+ "outputs": [],
501
+ "source": [
502
+ "for val in [False, True]:\n",
503
+ " wordle_wizard(word_list = words_6_letters, max_guesses = 6, \n",
504
+ " guess = \"little\", target = \"before\", bias = 'entropy', \n",
505
+ " random_guess = False, random_target = False, \n",
506
+ " verbose = val, drama = 0, return_stats = False, record = True)"
507
+ ]
508
+ },
509
+ {
510
+ "attachments": {},
511
+ "cell_type": "markdown",
512
+ "metadata": {},
513
+ "source": [
514
+ "### Testing on 7-letter words"
515
+ ]
516
+ },
517
+ {
518
+ "cell_type": "code",
519
+ "execution_count": null,
520
+ "metadata": {},
521
+ "outputs": [],
522
+ "source": [
523
+ "### 7 letters\n",
524
+ "words_7_letters = []\n",
525
+ "words_7_types_counts = {}\n",
526
+ "\n",
527
+ "with open(\"data/nltk_grand_corpus_types_and_counts_7.txt\", \"r\", encoding = \"utf-8\") as f:\n",
528
+ " for line in f.read().split(\"\\n\"):\n",
529
+ " word_freq = line.split(\"\\t\")\n",
530
+ " if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line\n",
531
+ " word = word_freq[0]\n",
532
+ " freq = word_freq[1]\n",
533
+ " if word.isascii() == True:\n",
534
+ " words_7_letters.append(word)\n",
535
+ " words_7_types_counts[word] = freq\n",
536
+ "\n",
537
+ "f.close() # closes connection to file\n",
538
+ "\n",
539
+ "print(len(words_7_letters))\n",
540
+ "print(words_7_letters[:5])\n",
541
+ "words_7_types_counts[words_7_letters[0]]"
542
+ ]
543
+ },
544
+ {
545
+ "cell_type": "code",
546
+ "execution_count": null,
547
+ "metadata": {},
548
+ "outputs": [],
549
+ "source": [
550
+ "for val in [False, True]:\n",
551
+ " wordle_wizard(word_list = words_7_letters, max_guesses = 6, \n",
552
+ " guess = \"because\", target = \"through\", bias = 'entropy', \n",
553
+ " random_guess = True, random_target = True, \n",
554
+ " verbose = val, drama = 0, return_stats = False, record = True)"
555
+ ]
556
+ },
557
+ {
558
+ "attachments": {},
559
+ "cell_type": "markdown",
560
+ "metadata": {},
561
+ "source": [
562
+ "### Testing on 8-letter words"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": null,
568
+ "metadata": {},
569
+ "outputs": [],
570
+ "source": [
571
+ "### 8 letters\n",
572
+ "words_8_letters = []\n",
573
+ "words_8_types_counts = {}\n",
574
+ "\n",
575
+ "with open(\"data/nltk_grand_corpus_types_and_counts_8.txt\", \"r\", encoding = \"utf-8\") as f:\n",
576
+ " for line in f.read().split(\"\\n\"):\n",
577
+ " word_freq = line.split(\"\\t\")\n",
578
+ " if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line\n",
579
+ " word = word_freq[0]\n",
580
+ " freq = word_freq[1]\n",
581
+ " if word.isascii() == True:\n",
582
+ " words_8_letters.append(word)\n",
583
+ " words_8_types_counts[word] = freq\n",
584
+ "\n",
585
+ "f.close() # closes connection to file\n",
586
+ "\n",
587
+ "print(len(words_8_letters))\n",
588
+ "print(words_8_letters[:5])\n",
589
+ "words_8_types_counts[words_8_letters[0]]"
590
+ ]
591
+ },
592
+ {
593
+ "cell_type": "code",
594
+ "execution_count": null,
595
+ "metadata": {},
596
+ "outputs": [],
597
+ "source": [
598
+ "for val in [False, True]:\n",
599
+ " wordle_wizard(word_list = words_8_letters, max_guesses = 6, \n",
600
+ " guess = \"trinidad\", target = \"together\", bias = 'entropy', \n",
601
+ " random_guess = False, random_target = False, \n",
602
+ " verbose = val, drama = 0, return_stats = False, record = True)"
603
+ ]
604
+ },
605
+ {
606
+ "attachments": {},
607
+ "cell_type": "markdown",
608
+ "metadata": {},
609
+ "source": [
610
+ "### Testing on 9-letter words"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": null,
616
+ "metadata": {},
617
+ "outputs": [],
618
+ "source": [
619
+ "### 9 letters\n",
620
+ "words_9_letters = []\n",
621
+ "words_9_types_counts = {}\n",
622
+ "\n",
623
+ "with open(\"data/nltk_grand_corpus_types_and_counts_9.txt\", \"r\", encoding = \"utf-8\") as f:\n",
624
+ " for line in f.read().split(\"\\n\"):\n",
625
+ " word_freq = line.split(\"\\t\")\n",
626
+ " if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line\n",
627
+ " word = word_freq[0]\n",
628
+ " freq = word_freq[1]\n",
629
+ " if word.isascii() == True:\n",
630
+ " words_9_letters.append(word)\n",
631
+ " words_9_types_counts[word] = freq\n",
632
+ "\n",
633
+ "f.close() # closes connection to file\n",
634
+ "\n",
635
+ "print(len(words_9_letters))\n",
636
+ "print(words_9_letters[:5])\n",
637
+ "words_9_types_counts[words_9_letters[0]]"
638
+ ]
639
+ },
640
+ {
641
+ "cell_type": "code",
642
+ "execution_count": null,
643
+ "metadata": {},
644
+ "outputs": [],
645
+ "source": [
646
+ "for val in [False, True]:\n",
647
+ " wordle_wizard(word_list = words_9_letters, max_guesses = 6, \n",
648
+ " guess = \"something\", target = \"character\", bias = 'entropy', \n",
649
+ " random_guess = True, random_target = False, \n",
650
+ " verbose = val, drama = 0, return_stats = False, record = True)"
651
+ ]
652
+ },
653
+ {
654
+ "attachments": {},
655
+ "cell_type": "markdown",
656
+ "metadata": {},
657
+ "source": [
658
+ "### Testing on 10-letter words"
659
+ ]
660
+ },
661
+ {
662
+ "cell_type": "code",
663
+ "execution_count": null,
664
+ "metadata": {},
665
+ "outputs": [],
666
+ "source": [
667
+ "### 10 letters\n",
668
+ "words_10_letters = []\n",
669
+ "words_10_types_counts = {}\n",
670
+ "\n",
671
+ "with open(\"data/nltk_grand_corpus_types_and_counts_10.txt\", \"r\", encoding = \"utf-8\") as f:\n",
672
+ " for line in f.read().split(\"\\n\"):\n",
673
+ " word_freq = line.split(\"\\t\")\n",
674
+ " if len(word_freq) == 2: # how many items are in each line, NOT the len of the word in the line\n",
675
+ " word = word_freq[0]\n",
676
+ " freq = word_freq[1]\n",
677
+ " if word.isascii() == True:\n",
678
+ " words_10_letters.append(word)\n",
679
+ " words_10_types_counts[word] = freq\n",
680
+ "\n",
681
+ "f.close() # closes connection to file\n",
682
+ "\n",
683
+ "print(len(words_10_letters))\n",
684
+ "print(words_10_letters[:5])\n",
685
+ "words_10_types_counts[words_10_letters[0]]"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": null,
691
+ "metadata": {},
692
+ "outputs": [],
693
+ "source": [
694
+ "for val in [False, True]:\n",
695
+ " wordle_wizard(word_list = words_10_letters, max_guesses = 6, \n",
696
+ " guess = \"characters\", target = \"theologies\", bias = 'entropy', \n",
697
+ " random_guess = True, random_target = False, \n",
698
+ " verbose = val, drama = 0, return_stats = False, record = True)"
699
+ ]
700
+ },
701
+ {
702
+ "cell_type": "code",
703
+ "execution_count": null,
704
+ "metadata": {},
705
+ "outputs": [],
706
+ "source": [
707
+ "wordle_wizard(word_list = official_words, max_guesses = 5, \n",
708
+ " guess = \"quote\", target = \"silly\", bias = 'entropy', \n",
709
+ " random_guess = False, random_target = False, \n",
710
+ " verbose = True, drama = 0, return_stats = False, record = False)"
711
+ ]
712
+ },
713
+ {
714
+ "attachments": {},
715
+ "cell_type": "markdown",
716
+ "metadata": {},
717
+ "source": [
718
+ "### `compare_wordle()` testing"
719
+ ]
720
+ },
721
+ {
722
+ "cell_type": "code",
723
+ "execution_count": null,
724
+ "metadata": {},
725
+ "outputs": [],
726
+ "source": [
727
+ "df = pd.read_csv(\"compared_data/wordle_humans - Sheet1.csv\")\n",
728
+ "print(df.shape)\n",
729
+ "df"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "code",
734
+ "execution_count": null,
735
+ "metadata": {},
736
+ "outputs": [],
737
+ "source": [
738
+ "df = pd.read_csv(\"compared_data/wordle_humans - Sheet1.csv\")\n",
739
+ "df\n",
740
+ "convert_row(df, 37)"
741
+ ]
742
+ },
743
+ {
744
+ "cell_type": "code",
745
+ "execution_count": null,
746
+ "metadata": {},
747
+ "outputs": [],
748
+ "source": [
749
+ "### TESTING DF INTERPRETATION\n",
750
+ "\n",
751
+ "df = pd.read_csv(\"compared_data/wordle_humans - Sheet1.csv\")\n",
752
+ "\n",
753
+ "row = 37\n",
754
+ "\n",
755
+ "print(convert_row(df, row))\n",
756
+ "player = convert_row(df, row)[0]\n",
757
+ "target_word = convert_row(df, row)[1]\n",
758
+ "guess_list = convert_row(df, row)[2]\n",
759
+ "\n",
760
+ "compare_wordle(word_list = official_words, max_guesses = 6, \n",
761
+ " guess_list = guess_list, player = player, target = target_word,\n",
762
+ " verbose = False, return_stats = True, record = False)"
763
+ ]
764
+ },
765
+ {
766
+ "attachments": {},
767
+ "cell_type": "markdown",
768
+ "metadata": {},
769
+ "source": [
770
+ "## Comparing player solutions against wizard solutions"
771
+ ]
772
+ },
773
+ {
774
+ "cell_type": "code",
775
+ "execution_count": null,
776
+ "metadata": {},
777
+ "outputs": [],
778
+ "source": [
779
+ "def create_compared_df(player_df, to_csv: bool = False, show_shapes: bool = False):\n",
780
+ " \"\"\"\n",
781
+ " Creates master df of player wordle scores compared to how wordle_wizard would perform on the same puzzles\n",
782
+ "\n",
783
+ " Parameters:\n",
784
+ " -----\n",
785
+ " `player_df`: Pandas dataFrame object\n",
786
+ " df of player scores of wordle puzzles\n",
787
+ " `to_csv`: bool\n",
788
+ " If True, writes returned df to csv\n",
789
+ " `show_shapes`: bool\n",
790
+ " If True, prints shape of new df before and after deleting duplicate rows (created by wordle_wizard running the same puzzles multiple times)\n",
791
+ " \n",
792
+ " Returns:\n",
793
+ " -----\n",
794
+ " `df_master`: Pandas dataFrame object\n",
795
+ " df of player scores and wordle_wizard scores of wordle puzzles\n",
796
+ " \"\"\"\n",
797
+ "\n",
798
+ " stats_master = {}\n",
799
+ " excepts = []\n",
800
+ " for row in player_df.index:\n",
801
+ " player = convert_row(player_df, row)[0]\n",
802
+ " target_word = convert_row(player_df, row)[1]\n",
803
+ " guess_list = convert_row(player_df, row)[2]\n",
804
+ " try:\n",
805
+ " complete = compare_wordle(word_list = official_words, max_guesses = 6, \n",
806
+ " guess_list = guess_list, player = player, target = target_word,\n",
807
+ " verbose = True, return_stats = True, record = False)\n",
808
+ " for metric, results in complete.items():\n",
809
+ " if metric in stats_master:\n",
810
+ " for result in results:\n",
811
+ " stats_master[metric].append(result)\n",
812
+ " else:\n",
813
+ " stats_master[metric] = []\n",
814
+ " for result in results:\n",
815
+ " stats_master[metric].append(result)\n",
816
+ " except:\n",
817
+ " AttributeError\n",
818
+ " excepts.append(guess_list)\n",
819
+ "\n",
820
+ " df_master = pd.DataFrame(stats_master)\n",
821
+ " print(df_master.columns.tolist())\n",
822
+ "\n",
823
+ " # Re-organizing columns to a more logical order (for viewing)\n",
824
+ " df_master = df_master[['first_guess', 'target_word', 'player', 'num_guesses', 'expected_guesses', 'luck', 'first_guess_vowels', 'first_guess_consonants',\n",
825
+ " 'target_vowels', 'target_consonants', 'first_guess_entropy', 'target_entropy',\n",
826
+ " 'target_guessed', 'mid_guesses_avg_vows', 'mid_guesses_avg_cons', 'avg_perf_letters',\n",
827
+ " 'avg_wrong_pos_letters', 'avg_wrong_letters', 'avg_remaining', 'avg_intermediate_guess_entropy',\n",
828
+ " 'valid_success']]\n",
829
+ "\n",
830
+ " # print(excepts)\n",
831
+ " if show_shapes == True:\n",
832
+ " print(df_master.shape) # check shape before deleting dups\n",
833
+ "\n",
834
+ " # Delete duplicate rows (some created by process)\n",
835
+ " df_master.drop_duplicates(inplace = True)\n",
836
+ " \n",
837
+ " if to_csv == True:\n",
838
+ " df_master.to_csv('compared_data/players_compared.csv') # write new data to csv\n",
839
+ " \n",
840
+ " if show_shapes == True:\n",
841
+ " print(df_master.shape) # check shape after deleting dups\n",
842
+ " \n",
843
+ " return df_master.reset_index().drop(columns = \"index\")"
844
+ ]
845
+ },
846
+ {
847
+ "cell_type": "code",
848
+ "execution_count": null,
849
+ "metadata": {},
850
+ "outputs": [],
851
+ "source": [
852
+ "test_word = \"test 1 \"\n",
853
+ "test_word.strip().lower()"
854
+ ]
855
+ },
856
+ {
857
+ "cell_type": "code",
858
+ "execution_count": null,
859
+ "metadata": {},
860
+ "outputs": [],
861
+ "source": [
862
+ "df = pd.read_csv(\"compared_data/wordle_humans - Sheet1.csv\")\n",
863
+ "\n",
864
+ "df_master = create_compared_df(df, to_csv = True, show_shapes = True)\n",
865
+ "df_master"
866
+ ]
867
+ },
868
+ {
869
+ "cell_type": "code",
870
+ "execution_count": null,
871
+ "metadata": {},
872
+ "outputs": [],
873
+ "source": [
874
+ "print(df_master.query(\"player == 'aidan'\")['num_guesses'].mean())\n",
875
+ "print(df_master.query(\"player == 'aidan'\").shape)\n",
876
+ "df_master.query(\"player == 'aidan'\").head()"
877
+ ]
878
+ },
879
+ {
880
+ "cell_type": "code",
881
+ "execution_count": null,
882
+ "metadata": {},
883
+ "outputs": [],
884
+ "source": [
885
+ "print(df_master.query(\"player == 'dad'\")['num_guesses'].mean())\n",
886
+ "print(df_master.query(\"player == 'dad'\").shape)\n",
887
+ "df_master.query(\"player == 'dad'\").head()"
888
+ ]
889
+ },
890
+ {
891
+ "cell_type": "code",
892
+ "execution_count": null,
893
+ "metadata": {},
894
+ "outputs": [],
895
+ "source": [
896
+ "print(df_master.query(\"player == 'diane'\")['num_guesses'].mean())\n",
897
+ "print(df_master.query(\"player == 'diane'\").shape)\n",
898
+ "df_master.query(\"player == 'diane'\").head()"
899
+ ]
900
+ },
901
+ {
902
+ "cell_type": "code",
903
+ "execution_count": null,
904
+ "metadata": {},
905
+ "outputs": [],
906
+ "source": [
907
+ "print(df_master.query(\"player == 'wizard'\")['num_guesses'].mean())\n",
908
+ "print(df_master.query(\"player == 'wizard'\").shape)\n",
909
+ "df_master.query(\"player == 'wizard'\").head(40)"
910
+ ]
911
+ },
912
+ {
913
+ "attachments": {},
914
+ "cell_type": "markdown",
915
+ "metadata": {},
916
+ "source": [
917
+ "## Prefix/Suffix bias"
918
+ ]
919
+ },
920
+ {
921
+ "cell_type": "code",
922
+ "execution_count": null,
923
+ "metadata": {},
924
+ "outputs": [],
925
+ "source": [
926
+ "def get_gram_freq(word_list: list, letters_length: int = 2, position: bool = \"start\", search: any = None):\n",
927
+ " \"\"\"\n",
928
+ " Given a word list, a selected number of letter, a selected word position to start from (\"start\" or \"end\"),\n",
929
+ " and an optional gram to search within the list, this function will get a frequency distribution of all n-grams\n",
930
+ " from the passed word list and returned a frequency distribution in descending order.\n",
931
+ "\n",
932
+ " Parameters:\n",
933
+ " ------\n",
934
+ " `word_list`: list\n",
935
+ " list of words of the same \n",
936
+ " `letters_length`: int\n",
937
+ " number of letters in succession. Size/length of \"gram\". Must be between 1 and length of words in word list\n",
938
+ " `position`: bool\n",
939
+ " Whether to start the gram from the start of the word (like a prefix) or the end of the word (like a suffix)\n",
940
+ " `search`: str\n",
941
+ " If != None, string of characters to search for within the generated list. If string not found in list, function will print an error message.\n",
942
+ "\n",
943
+ " Returns:\n",
944
+ " ------\n",
945
+ " `tup`: tuple\n",
946
+ " If search != None, will return a tuple with the passed search criteria, and its count\n",
947
+ " `sorted_gram_list`: list\n",
948
+ " List of tuples in the form of (gram, count) for each combination of the gram size in the pass word_list\n",
949
+ " \"\"\"\n",
950
+ "\n",
951
+ " gram_freq_dist = {}\n",
952
+ "\n",
953
+ " for word in word_list:\n",
954
+ " if position == \"start\":\n",
955
+ " gram = word[:letters_length] # first 2 letters\n",
956
+ " if position == \"end\":\n",
957
+ " gram = word[-(letters_length):] # first 2 letters\n",
958
+ "\n",
959
+ " if gram not in gram_freq_dist:\n",
960
+ " gram_freq_dist[gram] = 1\n",
961
+ " else:\n",
962
+ " gram_freq_dist[gram] += 1\n",
963
+ "\n",
964
+ " sorted_gram_dist = sorted(gram_freq_dist.items(), key = operator.itemgetter(1), reverse = True)\n",
965
+ "\n",
966
+ " if search:\n",
967
+ " nos = []\n",
968
+ " for tup in sorted_gram_dist:\n",
969
+ " if tup[0] == search:\n",
970
+ " return tup\n",
971
+ " else:\n",
972
+ " nos.append(\"not here\")\n",
973
+ " \n",
974
+ " if len(nos) == len(sorted_gram_dist):\n",
975
+ " print (\"Search criteria not found in list. Please enter a gram from within the list.\")\n",
976
+ " else:\n",
977
+ " return sorted_gram_dist\n",
978
+ "\n",
979
+ "get_gram_freq(word_list = official_words, letters_length = 2, position = \"start\", search = None)[:10]"
980
+ ]
981
+ },
982
+ {
983
+ "cell_type": "code",
984
+ "execution_count": null,
985
+ "metadata": {},
986
+ "outputs": [],
987
+ "source": [
988
+ "test_starts = get_gram_freq(word_list = official_words, letters_length = 2, position = \"start\", search = None)[:10]\n",
989
+ "test_ends = get_gram_freq(word_list = official_words, letters_length = 2, position = \"end\", search = None)[:10]\n",
990
+ "\n",
991
+ "test_words = official_words\n",
992
+ "\n",
993
+ "for start_gram, start_count in test_starts:\n",
994
+ " for end_gram, end_count in test_ends:\n",
995
+ " for word in [\"natal\", 'fatal']:\n",
996
+ " # for word in test_words:\n",
997
+ " if word[:2] == start_gram and word[-2:] == end_gram:\n",
998
+ " print (word, start_gram, end_gram)"
999
+ ]
1000
+ },
1001
+ {
1002
+ "cell_type": "code",
1003
+ "execution_count": null,
1004
+ "metadata": {},
1005
+ "outputs": [],
1006
+ "source": [
1007
+ "def wordle_wizard(word_list: list, max_guesses: int = None, \n",
1008
+ " guess: str = None, target: str = None,\n",
1009
+ " random_guess: bool = False, random_target: bool = False, \n",
1010
+ " verbose: bool = False, drama: float = None, \n",
1011
+ " return_stats: bool = False, record: bool = False, hf_mod: bool = True):\n",
1012
+ " \"\"\"\n",
1013
+ " Mimicking the popular web game, this function matches a current word to a target word automatically, in the most statistically optimal way possible.\n",
1014
+ "\n",
1015
+ " Parameters:\n",
1016
+ " ------\n",
1017
+ " `word_list`: list\n",
1018
+ " list of valid words to be considered\n",
1019
+ " `guess`: str\n",
1020
+ " a string -- must be the same length as `target_word`\n",
1021
+ " `target`: str\n",
1022
+ " a string -- must be the same length as `opening_word`\n",
1023
+ " `max_guesses`: int\n",
1024
+ " the maximum number of attempts allowed to solve the Wordle\n",
1025
+ " `random_guess`: bool\n",
1026
+ " if True, randomly chooses a starting word from all words within `word_list`. If False, passed starting word must be used instead\n",
1027
+ " `random_target`: bool\n",
1028
+ " if True, randomly chooses a target word from all words within `word_list`. If False, passed target word must be used instead\n",
1029
+ " `verbose`: bool\n",
1030
+ " if True, prints progress and explanation of how function solves the puzzle. If False, prints only the guessed word at each guess.\n",
1031
+ " `drama`: float or int\n",
1032
+ " if int provided, each guess' output is delayed by that number of seconds, else each output is shown as quickly as possible. For ~dRaMaTiC eFfEcT~\n",
1033
+ " `return_stats`: bool\n",
1034
+ " if True, prints nothing and returns a dictionary of various statistics about the function's performance trying to solve the puzzle\n",
1035
+ " `record`: bool\n",
1036
+ " if True, creates a .txt file with the same information printed according to the indicated verbosity\n",
1037
+ "\n",
1038
+ " Returns:\n",
1039
+ " ------\n",
1040
+ " `stats_dict`: dict\n",
1041
+ " dictionary containing various statistics about the function's performance trying to solve the puzzle\n",
1042
+ " \"\"\"\n",
1043
+ "\n",
1044
+ " guess = guess.lower()\n",
1045
+ " target = target.lower()\n",
1046
+ "\n",
1047
+ " sugg_words = []\n",
1048
+ "\n",
1049
+ " for i in range(0, 20):\n",
1050
+ " ran_int = random.randint(0, len(word_list) - 1)\n",
1051
+ " word = word_list[ran_int]\n",
1052
+ " sugg_words.append(word)\n",
1053
+ "\n",
1054
+ " if guess not in word_list:\n",
1055
+ " print (\"Guess word not in passed word list.\\nOnly words within the given word list are valid.\")\n",
1056
+ " print (f\"Here are some examples of valid words from the passed word list.\\n\\t{sugg_words[:10]}\")\n",
1057
+ " return None\n",
1058
+ " \n",
1059
+ " if target not in word_list:\n",
1060
+ " print (\"Target word not in passed word list.\\nOnly words within the given word list are valid.\")\n",
1061
+ " print (f\"Here are some examples of valid words from the passed word list.\\n\\t{sugg_words[-10:]}\")\n",
1062
+ " return None\n",
1063
+ "\n",
1064
+ " if random_guess == True:\n",
1065
+ " randomint_guess = random.randint(0, len(word_list) - 1)\n",
1066
+ " guess = word_list[randomint_guess]\n",
1067
+ "\n",
1068
+ " if random_target == True:\n",
1069
+ " randomint_target = random.randint(0, len(word_list) - 1)\n",
1070
+ " target = word_list[randomint_target]\n",
1071
+ "\n",
1072
+ " stats_dict = {}\n",
1073
+ " stats_dict['first_guess'] = guess\n",
1074
+ " stats_dict['target_word'] = target\n",
1075
+ " stats_dict['first_guess_vowels'] = float(count_vows_cons(guess, y_vow = True)['vows'])\n",
1076
+ " stats_dict['first_guess_consonants'] = float(count_vows_cons(guess, y_vow = True)['cons'])\n",
1077
+ " stats_dict['target_vowels'] = float(count_vows_cons(target, y_vow = True)['vows'])\n",
1078
+ " stats_dict['target_consonants'] = float(count_vows_cons(target, y_vow = True)['cons'])\n",
1079
+ " \n",
1080
+ " # get entropy of the first guess word and target word in the entire word_list\n",
1081
+ " for tup in get_word_entropy(word_list, word_list, normalized = True):\n",
1082
+ " if tup[0] == guess:\n",
1083
+ " stats_dict['first_guess_entropy'] = tup[1]\n",
1084
+ " if tup[0] == target:\n",
1085
+ " stats_dict['target_entropy'] = tup[1]\n",
1086
+ "\n",
1087
+ " guess_entropies = []\n",
1088
+ " guess_entropies.append(stats_dict['first_guess_entropy'])\n",
1089
+ "\n",
1090
+ " # luck_guess_1 = round(1 - ((1 / len(word_list)) * guess_entropies[0] / 100), 2) * 100\n",
1091
+ "\n",
1092
+ " english_alphabet = \"abcdefghijklmnopqrstuvwxyz\"\n",
1093
+ "\n",
1094
+ " word_list_sorted_counts = get_letter_counts(english_alphabet, word_list, sort = \"descending\")\n",
1095
+ " \n",
1096
+ " wordlen = len(guess)\n",
1097
+ " letter_positions = set(i for i in range(0, wordlen))\n",
1098
+ "\n",
1099
+ " guess_set = set()\n",
1100
+ " perfect_dict = {}\n",
1101
+ " wrong_pos_dict = {}\n",
1102
+ " wrong_pos_set = set()\n",
1103
+ " dont_guess_again = set()\n",
1104
+ "\n",
1105
+ " guessed_words = [] # running set of guessed words\n",
1106
+ " guess_num = 0 # baseline for variable\n",
1107
+ " dont_guess_words = set()\n",
1108
+ " incorrect_positions = []\n",
1109
+ " reduction_per_guess = []\n",
1110
+ "\n",
1111
+ " if max_guesses == None: # if no value is passed, default is len(guess)\n",
1112
+ " max_guesses = wordlen\n",
1113
+ " else: # else it is the value passed\n",
1114
+ " max_guesses = max_guesses\n",
1115
+ "\n",
1116
+ " perfect_letts_per_guess = []\n",
1117
+ " wrong_pos_per_guess = []\n",
1118
+ " wrong_letts_per_guess = []\n",
1119
+ "\n",
1120
+ " record_list = []\n",
1121
+ "\n",
1122
+ " while guess: # while there is any guess -- there are conditions to break it at the bottom\n",
1123
+ "\n",
1124
+ " guess_num += 1\n",
1125
+ "\n",
1126
+ " guessed_words.append(guess)\n",
1127
+ "\n",
1128
+ " if drama:\n",
1129
+ " time.sleep(drama)\n",
1130
+ "\n",
1131
+ " # guess_num += 1 # each time the guess is processed\n",
1132
+ " if return_stats == False:\n",
1133
+ " if guess_num == 1:\n",
1134
+ " print(\"-----------------------------\\n\")\n",
1135
+ " record_list.append(\"-----------------------------\\n\")\n",
1136
+ " \n",
1137
+ " if return_stats == False:\n",
1138
+ " print(f\"Guess {guess_num}: '{guess}'\")\n",
1139
+ " record_list.append(f\"Guess {guess_num}: '{guess}'\")\n",
1140
+ "\n",
1141
+ " if guess == target:\n",
1142
+ " stats_dict['target_guessed'] = True\n",
1143
+ " if return_stats == False:\n",
1144
+ " if guess_num == 1:\n",
1145
+ " print(f\"Congratulations! The Wordle has been solved in {guess_num} guess, that's amazingly lucky!\")\n",
1146
+ " print(f\"The target word was {target}\")\n",
1147
+ " record_list.append(f\"Congratulations! The Wordle has been solved in {guess_num} guess, that's amazingly lucky!\")\n",
1148
+ " record_list.append(f\"The target word was '{target}'.\")\n",
1149
+ " perfect_letts_per_guess.append(5)\n",
1150
+ " wrong_pos_per_guess.append(0)\n",
1151
+ " wrong_letts_per_guess.append(0)\n",
1152
+ " break\n",
1153
+ "\n",
1154
+ " guess_set = set()\n",
1155
+ " wrong_pos_set = set()\n",
1156
+ "\n",
1157
+ " #### Step 2 -- ALL PERFECT\n",
1158
+ " for i in letter_positions: # number of letters in each word (current word and target word)\n",
1159
+ " guess_set.add(guess[i])\n",
1160
+ "\n",
1161
+ " if guess[i] not in perfect_dict:\n",
1162
+ " perfect_dict[guess[i]] = set()\n",
1163
+ " if guess[i] not in wrong_pos_dict:\n",
1164
+ " wrong_pos_dict[guess[i]] = set()\n",
1165
+ "\n",
1166
+ " ### EVALUATE CURRENT GUESS\n",
1167
+ " if guess[i] == target[i]: # letter == correct and position == correct\n",
1168
+ " perfect_dict[guess[i]].add(i)\n",
1169
+ "\n",
1170
+ " if (guess[i] != target[i] and guess[i] in target): # letter == correct and position != correct\n",
1171
+ " wrong_pos_dict[guess[i]].add(i)\n",
1172
+ " wrong_pos_set.add(guess[i])\n",
1173
+ "\n",
1174
+ " if guess[i] not in target: # if letter is not relevant at all\n",
1175
+ " dont_guess_again.add(guess[i])\n",
1176
+ "\n",
1177
+ " #### Step 3 -- ALL PERFECT\n",
1178
+ " next_letters = set()\n",
1179
+ " for letter, positions in perfect_dict.items():\n",
1180
+ " if len(positions) > 0:\n",
1181
+ " next_letters.add(letter)\n",
1182
+ "\n",
1183
+ " for letter, positions in wrong_pos_dict.items():\n",
1184
+ " if len(positions) > 0:\n",
1185
+ " next_letters.add(letter)\n",
1186
+ "\n",
1187
+ " #### List of tuples of correct letter positions in new valid words. Eg: [('e', 2), ('a', 3)]\n",
1188
+ " perfect_letters = []\n",
1189
+ " for letter, positions in perfect_dict.items():\n",
1190
+ " for pos in positions:\n",
1191
+ " if len(positions) > 0:\n",
1192
+ " perfect_letters.append((letter, pos))\n",
1193
+ "\n",
1194
+ " #### all words that have correct letters in same spots\n",
1195
+ " words_matching_correct_all = []\n",
1196
+ " for word in word_list:\n",
1197
+ " word_set = set()\n",
1198
+ " for letter, pos in perfect_letters:\n",
1199
+ " if pos < len(word):\n",
1200
+ " if word[pos] == letter:\n",
1201
+ " words_matching_correct_all.append(word)\n",
1202
+ "\n",
1203
+ " #### excluding words with letters in known incorrect positions\n",
1204
+ " for letter, positions in wrong_pos_dict.items():\n",
1205
+ " for pos in positions:\n",
1206
+ " if len(positions) > 0:\n",
1207
+ " if (letter, pos) not in incorrect_positions:\n",
1208
+ " incorrect_positions.append((letter, pos))\n",
1209
+ "\n",
1210
+ " # sorting lists of tuples just to make them look nice in the printout\n",
1211
+ " incorrect_positions = sorted(incorrect_positions, key = operator.itemgetter(1), reverse = False)\n",
1212
+ " perfect_letters = sorted(perfect_letters, key = operator.itemgetter(1), reverse = False)\n",
1213
+ "\n",
1214
+ " #### all words that have correct letters in incorrect spots -- so they can be excluded efficiently\n",
1215
+ " \n",
1216
+ " # print(incorrect_positions)\n",
1217
+ " \n",
1218
+ " for word in word_list:\n",
1219
+ " word_set = set()\n",
1220
+ " for letter, pos in incorrect_positions:\n",
1221
+ " if pos < len(word):\n",
1222
+ " if word[pos] == letter:\n",
1223
+ " dont_guess_words.add(word)\n",
1224
+ " for word in word_list:\n",
1225
+ " word_set = set()\n",
1226
+ " for letter, pos in incorrect_positions:\n",
1227
+ " if pos < len(word):\n",
1228
+ " if word[pos] == letter:\n",
1229
+ " dont_guess_words.add(word)\n",
1230
+ "\n",
1231
+ " for bad_letter in dont_guess_again:\n",
1232
+ " for word in word_list:\n",
1233
+ " if (bad_letter in word and word not in dont_guess_words):\n",
1234
+ " dont_guess_words.add(word)\n",
1235
+ "\n",
1236
+ " if return_stats == False:\n",
1237
+ " if verbose == True:\n",
1238
+ " print(f\"Letters in correct positions:\\n\\t{perfect_letters}\\n\")\n",
1239
+ " print(f\"Letters in incorrect positions:\\n\\t{incorrect_positions}\\n\")\n",
1240
+ " print (f\"Letters to guess again:\\n\\t{sorted(list(next_letters), reverse = False)}\\n\")\n",
1241
+ " print(f\"Letters to not guess again:\\n\\t{sorted(list(dont_guess_again), reverse = False)}\\n\") # works\n",
1242
+ " \n",
1243
+ " if len(perfect_letters) == 0:\n",
1244
+ " record_list.append(f\"Letters in correct positions: None\\n\")\n",
1245
+ " else:\n",
1246
+ " record_list.append(f\"Letters in correct positions: {perfect_letters}\\n\")\n",
1247
+ " \n",
1248
+ " if len(incorrect_positions) == 0:\n",
1249
+ " record_list.append(f\"Letters in incorrect positions: None\\n\")\n",
1250
+ " else:\n",
1251
+ " record_list.append(f\"Letters in incorrect positions: {incorrect_positions}\\n\")\n",
1252
+ " \n",
1253
+ " if len(next_letters) == 0:\n",
1254
+ " record_list.append(f\"Letters to guess again: None\\n\")\n",
1255
+ " else:\n",
1256
+ " record_list.append(f\"Letters to guess again: {sorted(list(next_letters), reverse = False)}\\n\")\n",
1257
+ " # if\n",
1258
+ " # record_list.append(f\"Letters to not guess again: {sorted(list(dont_guess_again), reverse = False)}\\n\") # works\n",
1259
+ "\n",
1260
+ " # Returns True\n",
1261
+ " # print(A.issubset(B)) # \"if everything in A is in B\", returns Bool\n",
1262
+ "\n",
1263
+ " perfect_letts_per_guess.append(len(perfect_letters))\n",
1264
+ " wrong_pos_per_guess.append(len(incorrect_positions))\n",
1265
+ " wrong_letts_per_guess.append(len(dont_guess_again))\n",
1266
+ "\n",
1267
+ " potential_next_guesses = set()\n",
1268
+ " middle_set = set()\n",
1269
+ "\n",
1270
+ " if len(perfect_letters) == 0 and len(incorrect_positions) == 0: # if there are NEITHER perfect letters, NOR incorrect positions, ....\n",
1271
+ " for word in word_list:\n",
1272
+ " if word not in dont_guess_words:\n",
1273
+ " if word not in guessed_words:\n",
1274
+ " potential_next_guesses.add(word)\n",
1275
+ " \n",
1276
+ " # print(f\"GUESS {guess_num} : TEST 1-1\")\n",
1277
+ "\n",
1278
+ " if len(perfect_letters) == 0 and len(incorrect_positions) != 0: # if there are no perfect letters whatsoever, but there ARE incorrect positions ....\n",
1279
+ " for word in word_list:\n",
1280
+ " for incor_letter, incor_pos in incorrect_positions:\n",
1281
+ " if incor_pos < len(word):\n",
1282
+ " if word[incor_pos] != incor_letter:\n",
1283
+ " if word not in dont_guess_words: # just in case\n",
1284
+ " word_set = set()\n",
1285
+ " for letter in word:\n",
1286
+ " word_set.add(letter)\n",
1287
+ " \n",
1288
+ " if next_letters.issubset(word_set):\n",
1289
+ " if word not in guessed_words:\n",
1290
+ " if len(dont_guess_again) > 0:\n",
1291
+ " for bad_letter in dont_guess_again:\n",
1292
+ " if bad_letter not in word:\n",
1293
+ " # potential_next_guesses.append(word)\n",
1294
+ " potential_next_guesses.add(word)\n",
1295
+ " else:\n",
1296
+ " potential_next_guesses.add(word)\n",
1297
+ " \n",
1298
+ " # print(f\"GUESS {guess_num} : TEST 2-1\")\n",
1299
+ "\n",
1300
+ " else:\n",
1301
+ " for word in word_list:\n",
1302
+ " if word not in dont_guess_words: # just in case\n",
1303
+ " word_set = set()\n",
1304
+ " for letter in word:\n",
1305
+ " word_set.add(letter)\n",
1306
+ " if next_letters.issubset(word_set):\n",
1307
+ " if word not in guessed_words:\n",
1308
+ " # print (\"TEST 3-2\")\n",
1309
+ "\n",
1310
+ " if len(dont_guess_again) > 0:\n",
1311
+ " for bad_letter in dont_guess_again:\n",
1312
+ " if bad_letter not in word:\n",
1313
+ " middle_set.add(word)\n",
1314
+ " else:\n",
1315
+ " middle_set.add(word)\n",
1316
+ " for word in middle_set:\n",
1317
+ " dummy_list = []\n",
1318
+ " for good_lett, good_pos in perfect_letters:\n",
1319
+ " if word[good_pos] == good_lett:\n",
1320
+ " dummy_list.append(1)\n",
1321
+ " if len(dummy_list) == len(perfect_letters):\n",
1322
+ " potential_next_guesses.add(word)\n",
1323
+ " for word in middle_set:\n",
1324
+ " dummy_list = []\n",
1325
+ " for bad_lett, bad_pos in incorrect_positions:\n",
1326
+ " if bad_pos < len(word):\n",
1327
+ " if word[bad_pos] == bad_lett:\n",
1328
+ " dummy_list.append(1)\n",
1329
+ " if len(dummy_list) > 0:\n",
1330
+ " potential_next_guesses.remove(word)\n",
1331
+ " \n",
1332
+ " # print(f\"GUESS {guess_num} : TEST 3-1\")\n",
1333
+ "\n",
1334
+ " if return_stats == False:\n",
1335
+ " if verbose == True:\n",
1336
+ " print(f\"At this point:\")\n",
1337
+ " print(f\"\\t{len(word_list) - len(potential_next_guesses)}, {round((len(word_list) - len(potential_next_guesses)) / len(word_list) * 100, 2)}% of total words have been eliminated, and\")\n",
1338
+ " print(f\"\\t{len(potential_next_guesses)}, {round(len(potential_next_guesses) / len(word_list) * 100, 2)}% of total words remain possible.\\n\")\n",
1339
+ " # record_list.append(f\"At this point:\")\n",
1340
+ " record_list.append(f\"{len(word_list) - len(potential_next_guesses)} ({round((len(word_list) - len(potential_next_guesses)) / len(word_list) * 100, 2)}% of all) words have been eliminated, and {len(potential_next_guesses)} ({round(len(potential_next_guesses) / len(word_list) * 100, 2)}% of all) words are still possible.\\n\")\n",
1341
+ " \n",
1342
+ " reduction_per_guess.append(len(potential_next_guesses))\n",
1343
+ " \n",
1344
+ " #### Guessing next word\n",
1345
+ " if len(potential_next_guesses) == 1:\n",
1346
+ "\n",
1347
+ " if return_stats == False:\n",
1348
+ " if verbose == True:\n",
1349
+ " print(f\"The only remaining possible word is:\\n\\t'{list(potential_next_guesses)[0]}'\\n\")\n",
1350
+ " record_list.append(f\"The only remaining possible word is: '{list(potential_next_guesses)[0]}'\\n\")\n",
1351
+ " \n",
1352
+ " guess = list(potential_next_guesses)[0]\n",
1353
+ " guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])\n",
1354
+ "\n",
1355
+ " else:\n",
1356
+ "\n",
1357
+ " best_next_guesses = list(potential_next_guesses) \n",
1358
+ " # print (best_next_guesses)\n",
1359
+ " word_ratings = get_word_entropy(best_next_guesses, word_list, normalized = True, ascending = False) # \"internal\" ratings\n",
1360
+ " \n",
1361
+ " # Get max rating of all words\n",
1362
+ " max_rating = -np.inf\n",
1363
+ " for word, rating in word_ratings:\n",
1364
+ " if rating > max_rating:\n",
1365
+ " max_rating = rating\n",
1366
+ "\n",
1367
+ " # add best rated words (all equally best entropy in next guess list) to set\n",
1368
+ " best_of_the_best_1 = []\n",
1369
+ " for word, rating in word_ratings:\n",
1370
+ " if rating == max_rating:\n",
1371
+ " best_of_the_best_1.append(word)\n",
1372
+ "\n",
1373
+ " # only using top ten most frequent prefixes suffixes to bias. After that it the impact is especially negligible\n",
1374
+ " test_starts = get_gram_freq(word_list = word_list, letters_length = 2, position = \"start\", search = None)[:10]\n",
1375
+ " test_ends = get_gram_freq(word_list = word_list, letters_length = 2, position = \"end\", search = None)[:10]\n",
1376
+ "\n",
1377
+ " # list of the best words that also have the best suffixes and prefixes\n",
1378
+ " best_of_the_best_2 = []\n",
1379
+ " for start_gram, start_count in test_starts:\n",
1380
+ " for end_gram, end_count in test_ends:\n",
1381
+ " for word in test_words:\n",
1382
+ " if word[:2] == start_gram and word[-2:] == end_gram:\n",
1383
+ " best_of_the_best_2.append(word)\n",
1384
+ "\n",
1385
+ " if len(best_of_the_best_2) > 0:\n",
1386
+ " guess = best_of_the_best_1[0]\n",
1387
+ " else:\n",
1388
+ " guess = best_of_the_best_2[0] # they're all equally the best of the best possible guesses so just pick the first\n",
1389
+ " \n",
1390
+ " # guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])\n",
1391
+ "\n",
1392
+ " if return_stats == False:\n",
1393
+ " if verbose == True:\n",
1394
+ " if len(word_ratings) <= 40:\n",
1395
+ " print(f\"All potential next guesses:\\n\\t{word_ratings}\\n\")\n",
1396
+ " print(f\"Words guessed so far:\\n\\t{guessed_words}.\\n\")\n",
1397
+ " record_list.append(f\"Potential next guesses: {word_ratings}\\n\")\n",
1398
+ " record_list.append(f\"Words guessed so far: {guessed_words}.\\n\")\n",
1399
+ " else:\n",
1400
+ " print(f\"The top 40 potential next guesses are:\\n\\t{word_ratings[:40]}\\n\")\n",
1401
+ " print(f\"Words guessed so far:\\n\\t{guessed_words}.\\n\")\n",
1402
+ " record_list.append(f\"The top 40 potential next guesses are: {word_ratings[:40]}\\n\")\n",
1403
+ " record_list.append(f\"Words guessed so far: {guessed_words}.\\n\")\n",
1404
+ "\n",
1405
+ " guess_entropies.append(get_word_entropy([guess], word_list, normalized = True, ascending = False)[0][1])\n",
1406
+ "\n",
1407
+ " #### Guess has now been made -- what to do next\n",
1408
+ " if guess_num == max_guesses: # if at max guesses allowed\n",
1409
+ " guessed_words.append(guess)\n",
1410
+ " stats_dict['target_guessed'] = False\n",
1411
+ " if return_stats == False:\n",
1412
+ " if verbose == True:\n",
1413
+ " # print(\"-----------------------------\\n\")\n",
1414
+ " print(f\"Unfortunately, the Wordle could not be solved in {max_guesses} guesses.\\n\")\n",
1415
+ " print(f\"The target word was '{target}'. Better luck next time!\\n\")\n",
1416
+ " print(\"-----------------------------\\n\")\n",
1417
+ " record_list.append(f\"Unfortunately, the Wordle could not be solved in {max_guesses} guesses.\\n\")\n",
1418
+ " record_list.append(f\"The target word was '{target}'. Better luck next time!\\n\")\n",
1419
+ " record_list.append(\"-----------------------------\\n\")\n",
1420
+ " else:\n",
1421
+ " print(f\"\\nUnfortunately, the Wordle could not be solved in {max_guesses} guesses.\")\n",
1422
+ " print(f\"The target word was '{target}'. Better luck next time!\\n\")\n",
1423
+ " record_list.append(f\"Unfortunately, the Wordle could not be solved in {max_guesses} guesses.\")\n",
1424
+ " record_list.append(f\"The target word was '{target}'. Better luck next time!\\n\")\n",
1425
+ " break\n",
1426
+ " else: # if not at max guesses yet allowed\n",
1427
+ " # stats_dict['target_guessed'] = False\n",
1428
+ " if return_stats == False:\n",
1429
+ " if verbose == True:\n",
1430
+ " print(f\"Next guess:\\n\\t'{guess}'\")\n",
1431
+ " print(\"\\n-----------------------------\\n\")\n",
1432
+ " record_list.append(f\"Next guess: '{guess}'\")\n",
1433
+ " record_list.append(\"-----------------------------\\n\")\n",
1434
+ "\n",
1435
+ " if guess == target:\n",
1436
+ " guess_num += 1\n",
1437
+ " guessed_words.append(guess)\n",
1438
+ " stats_dict['target_guessed'] = True\n",
1439
+ "\n",
1440
+ " if return_stats == False:\n",
1441
+ " print(f\"Guess {guess_num}: '{guess}'\\n\")\n",
1442
+ " print(f\"Congratulations! The Wordle has been solved in {guess_num} guesses!\")\n",
1443
+ " record_list.append(f\"Guess {guess_num}: '{guess}'\\n\")\n",
1444
+ " record_list.append(f\"Congratulations! The Wordle has been solved in {guess_num} guesses!\")\n",
1445
+ "\n",
1446
+ " if max_guesses - guess_num == 0:\n",
1447
+ " print(f\"Lucky! It was the last guess.\")\n",
1448
+ " record_list.append(f\"Lucky! It was the last guess.\")\n",
1449
+ " else:\n",
1450
+ " print(f\"There were still {max_guesses - guess_num} guesses remaining.\")\n",
1451
+ " record_list.append(f\"There were still {max_guesses - guess_num} guesses remaining.\")\n",
1452
+ "\n",
1453
+ " if return_stats == False: \n",
1454
+ " # stats_dict['target_guessed'] = True \n",
1455
+ " print(f\"\\nThe target word was '{target}'.\")\n",
1456
+ " print(\"\\n-----------------------------\")\n",
1457
+ " record_list.append(f\"The target word was '{target}'.\")\n",
1458
+ " record_list.append(\"-----------------------------\")\n",
1459
+ " break\n",
1460
+ "\n",
1461
+ " #### STATS STUFF \n",
1462
+ " mid_guesses_vows = 0\n",
1463
+ " mid_guesses_cons = 0\n",
1464
+ " avg_perf_letters = 0\n",
1465
+ " avg_wrong_pos_letters = 0\n",
1466
+ " avg_wrong_letters = 0\n",
1467
+ "\n",
1468
+ " for i, word in enumerate(guessed_words):\n",
1469
+ " mid_guesses_vows += count_vows_cons(word, y_vow = True)['vows']\n",
1470
+ " mid_guesses_cons += count_vows_cons(word, y_vow = True)['cons']\n",
1471
+ " \n",
1472
+ " for i in range(0, len(guessed_words) - 1):\n",
1473
+ " avg_perf_letters += perfect_letts_per_guess[i]\n",
1474
+ " avg_wrong_pos_letters += wrong_pos_per_guess[i]\n",
1475
+ " avg_wrong_letters += wrong_letts_per_guess[i]\n",
1476
+ "\n",
1477
+ " stats_dict['mid_guesses_avg_vows'] = float(round(mid_guesses_vows / len(guessed_words), 2))\n",
1478
+ " stats_dict['mid_guesses_avg_cons'] = float(round(mid_guesses_cons / len(guessed_words), 2))\n",
1479
+ "\n",
1480
+ " stats_dict['avg_perf_letters'] = float(round(np.mean(avg_perf_letters), 2))\n",
1481
+ " stats_dict['avg_wrong_pos_letters'] = float(round(np.mean(avg_wrong_pos_letters), 2))\n",
1482
+ " stats_dict['avg_wrong_letters'] = float(round(np.mean(avg_wrong_letters), 2))\n",
1483
+ " \n",
1484
+ " # average number of words remaining after each guess -- the higher this is, the luckier the person got (the lower, the more guesses it took)\n",
1485
+ " stats_dict['avg_remaining'] = float(round(np.mean(reduction_per_guess), 2))\n",
1486
+ "\n",
1487
+ " # avg entropy of each guessed word relative to all other words possible at that moment -- this should consistently be 100 for the algorithm, but will be different for user\n",
1488
+ " if len(guess_entropies) > 1: # in case of guessing it correctly on the first try\n",
1489
+ " sum_entropies = 0\n",
1490
+ " for entropy in guess_entropies:\n",
1491
+ " sum_entropies += entropy\n",
1492
+ "\n",
1493
+ " average_entropy = float(round(sum_entropies / len(guess_entropies), 2))\n",
1494
+ " stats_dict['avg_intermediate_guess_entropy'] = average_entropy\n",
1495
+ " else:\n",
1496
+ " stats_dict['avg_intermediate_guess_entropy'] = float(100)\n",
1497
+ "\n",
1498
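+ "    # baseline number of guesses against which luck is normalized below\n",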
+ " expected_guesses = 3.85\n",
1499
+ "\n",
1500
+ " # guess_num = 3\n",
1501
+ " # average_entropy = 95\n",
1502
+ " luck = round(1 - ((((guess_num / expected_guesses) * (stats_dict['avg_intermediate_guess_entropy'] / 100)) / max_guesses) * 5), 2)\n",
1503
+ " stats_dict['luck'] = luck\n",
1504
+ "\n",
1505
+ " if record == True:\n",
1506
+ " if verbose == True:\n",
1507
+ " with open(f\"solutions/{guessed_words[0]}_{target}_wizard_detailed.txt\", \"w\") as fout:\n",
1508
+ " for line in record_list:\n",
1509
+ " fout.write(line + \"\\n\") # write each line of list of printed text to .txt file\n",
1510
+ " else:\n",
1511
+ " with open(f\"solutions/{guessed_words[0]}_{target}_wizard_summary.txt\", \"w\") as fout:\n",
1512
+ " for line in record_list:\n",
1513
+ " fout.write(line + \"\\n\") # write\n",
1514
+ "\n",
1515
+ " # if guess_num <= len(guess):\n",
1516
+ " if guess_num <= 6:\n",
1517
+ " stats_dict['valid_success'] = True\n",
1518
+ " else:\n",
1519
+ " stats_dict['valid_success'] = False\n",
1520
+ "\n",
1521
+ " stats_dict['num_guesses'] = float(guess_num)\n",
1522
+ "\n",
1523
+ " # if return_stats == True:\n",
1524
+ " # return stats_dict\n",
1525
+ " if hf_mod == True:\n",
1526
+ " return record_list"
1527
+ ]
1528
+ },
1529
+ {
1530
+ "cell_type": "code",
1531
+ "execution_count": null,
1532
+ "metadata": {},
1533
+ "outputs": [],
1534
+ "source": [
1535
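+ "# quick test run; the mixed-case target exercises the solver's input lowercasing\n",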
+ "test_1 = wordle_wizard(word_list = official_words, max_guesses = 6, \n",
1536
+ " guess = \"quota\", target = \"fatAl\",\n",
1537
+ " random_guess = False, random_target = False, \n",
1538
+ " verbose = True, drama = 0, return_stats = False, record = False)"
1539
+ ]
1540
+ },
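+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# a minimal sketch (assuming `official_words` and `get_word_entropy` are available, as above):\n",
+ "# rank a few hypothetical opening words by their normalized entropy within the full word list\n",
+ "candidates = [\"quota\", \"crane\", \"adieu\"]\n",
+ "print(get_word_entropy(candidates, official_words, normalized = True, ascending = False))"
+ ]
+ },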
1541
+ {
1542
+ "cell_type": "code",
1543
+ "execution_count": null,
1544
+ "metadata": {},
1545
+ "outputs": [],
1546
+ "source": [
1547
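+ "# frequency distributions of two-letter prefixes and suffixes across the word list --\n",
+ "# the same signal wordle_wizard uses (via get_gram_freq) to break ties between equal-entropy guesses\n",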
+ "suffix_freq_dist = {}\n",
1548
+ "prefix_freq_dist = {}\n",
1549
+ "\n",
1550
+ "for word in official_words:\n",
1551
+ " prefix = word[:2] # first 2 letters\n",
1552
+ " suffix = word[-2:] # last 2 letters\n",
1553
+ " if prefix not in prefix_freq_dist:\n",
1554
+ " prefix_freq_dist[prefix] = 1\n",
1555
+ " else:\n",
1556
+ " prefix_freq_dist[prefix] += 1\n",
1557
+ "\n",
1558
+ " if suffix not in suffix_freq_dist:\n",
1559
+ " suffix_freq_dist[suffix] = 1\n",
1560
+ " else:\n",
1561
+ " suffix_freq_dist[suffix] += 1\n",
1562
+ "\n",
1563
+ "suffix_types = [key for key in suffix_freq_dist.keys()]\n",
1564
+ "prefix_types = [key for key in prefix_freq_dist.keys()]\n",
1565
+ "\n",
1566
+ "sorted_prefix_dist = sorted(prefix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)\n",
1567
+ "sorted_suffix_dist = sorted(suffix_freq_dist.items(), key = operator.itemgetter(1), reverse = True)\n",
1568
+ "\n",
1569
+ "print(\"Prefixes:\")\n",
1570
+ "print(len(sorted_prefix_dist))\n",
1571
+ "print(sorted_prefix_dist[:10])\n",
1572
+ "print(\"-----\")\n",
1573
+ "print(\"Suffixes:\")\n",
1574
+ "print(len(sorted_suffix_dist))\n",
1575
+ "print(sorted_suffix_dist[:10])\n",
1576
+ "\n",
1577
+ "for tup in sorted_prefix_dist:\n",
1578
+ " if tup[0] in [\"ho\", 'jo', 'go']:\n",
1579
+ " print (tup)"
1580
+ ]
1581
+ },
1582
+ {
1583
+ "cell_type": "code",
1584
+ "execution_count": null,
1585
+ "metadata": {},
1586
+ "outputs": [],
1587
+ "source": [
1588
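+ "# frequency distribution of all 3-letter grams, via a sliding window over each word\n",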
+ "grams_freq_dist = {}\n",
1589
+ "gram_len = 3\n",
1590
+ "\n",
1591
+ "for word in official_words:\n",
1592
+ " for i in range(0, len(word) - (gram_len - 1)): # so it doesn't index out of range\n",
1593
+ " gram = word[i:i + gram_len]\n",
1594
+ "\n",
1595
+ " if gram not in grams_freq_dist:\n",
1596
+ " grams_freq_dist[gram] = 1\n",
1597
+ " else:\n",
1598
+ " grams_freq_dist[gram] += 1\n",
1599
+ "\n",
1600
+ "print(len(grams_freq_dist))\n",
1601
+ "sorted_gram_dist = sorted(grams_freq_dist.items(), key = operator.itemgetter(1), reverse = True)\n",
1602
+ "sorted_gram_dist[:15]"
1603
+ ]
1604
+ },
1605
+ {
1606
+ "cell_type": "code",
1607
+ "execution_count": null,
1608
+ "metadata": {},
1609
+ "outputs": [],
1610
+ "source": []
1611
+ },
1612
+ {
1613
+ "cell_type": "code",
1614
+ "execution_count": null,
1615
+ "metadata": {},
1616
+ "outputs": [],
1617
+ "source": []
1618
+ },
1619
+ {
1620
+ "cell_type": "code",
1621
+ "execution_count": null,
1622
+ "metadata": {},
1623
+ "outputs": [],
1624
+ "source": []
1625
+ }
1626
+ ],
1627
+ "metadata": {
1628
+ "kernelspec": {
1629
+ "display_name": "base",
1630
+ "language": "python",
1631
+ "name": "python3"
1632
+ },
1633
+ "language_info": {
1634
+ "codemirror_mode": {
1635
+ "name": "ipython",
1636
+ "version": 3
1637
+ },
1638
+ "file_extension": ".py",
1639
+ "mimetype": "text/x-python",
1640
+ "name": "python",
1641
+ "nbconvert_exporter": "python",
1642
+ "pygments_lexer": "ipython3",
1643
+ "version": "3.10.4"
1644
+ },
1645
+ "orig_nbformat": 4,
1646
+ "vscode": {
1647
+ "interpreter": {
1648
+ "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf"
1649
+ }
1650
+ }
1651
+ },
1652
+ "nbformat": 4,
1653
+ "nbformat_minor": 2
1654
+ }