Ujjwal123 commited on
Commit
076da67
1 Parent(s): 3de893e

second pass model integrated

Browse files
Files changed (6) hide show
  1. BPSolver_inf.py +246 -16
  2. Dockerfile +4 -0
  3. Models_inf.py +66 -4
  4. Normal_utils_inf.py +100 -3
  5. Solver_inf.py +14 -30
  6. main.py +53 -23
BPSolver_inf.py CHANGED
@@ -1,5 +1,6 @@
1
  import math
2
  import string
 
3
  from collections import defaultdict
4
  from copy import deepcopy
5
 
@@ -9,6 +10,8 @@ from tqdm import trange
9
 
10
  from Utils_inf import print_grid, get_word_flips
11
  from Solver_inf import Solver
 
 
12
  # the probability of each alphabetical character in the crossword
13
  UNIGRAM_PROBS = [('A', 0.0897379968935765), ('B', 0.02121248877769636), ('C', 0.03482206634145926), ('D', 0.03700942543460491), ('E', 0.1159773210750429), ('F', 0.017257461694024614), ('G', 0.025429024796296124), ('H', 0.033122967601502), ('I', 0.06800036223479956), ('J', 0.00294611331754349), ('K', 0.013860682888259786), ('L', 0.05130800574373874), ('M', 0.027962776827660175), ('N', 0.06631994270448001), ('O', 0.07374646543246745), ('P', 0.026750756212433214), ('Q', 0.001507814175439393), ('R', 0.07080460813737305), ('S', 0.07410988246048224), ('T', 0.07242993582154593), ('U', 0.0289272388037645), ('V', 0.009153522059555467), ('W', 0.01434705167591524), ('X', 0.003096729223103298), ('Y', 0.01749958208224007), ('Z', 0.002659777584995724)]
14
 
@@ -18,12 +21,12 @@ LETTER_SMOOTHING_FACTOR = [0.0, 0.0, 0.04395604395604396, 0.0001372495196266813,
18
 
19
  class BPVar:
20
  def __init__(self, name, variable, candidates, cells):
21
- self.name = name
22
  cells_by_position = {}
23
- for cell in cells:
24
- cells_by_position[cell.position] = cell
25
  cell._connect(self)
26
- self.length = len(cells)
27
  self.ordered_cells = [cells_by_position[pos] for pos in variable['cells']]
28
  self.candidates = candidates
29
  self.words = self.candidates['words']
@@ -85,6 +88,7 @@ class BPCell:
85
  self.log_probs = log_softmax(sum(self.directional_scores))
86
 
87
  def propagate(self):
 
88
  try:
89
  for i, v in enumerate(self.crossing_vars):
90
  v._propagate_to_var(self, self.directional_scores[1-i])
@@ -98,7 +102,8 @@ class BPSolver(Solver):
98
  model_path,
99
  ans_tsv_path,
100
  dense_embd_path,
101
- max_candidates = 5000,
 
102
  process_id = 0,
103
  model_type = 'bert',
104
  **kwargs):
@@ -111,6 +116,8 @@ class BPSolver(Solver):
111
  model_type = model_type,
112
  **kwargs)
113
  self.crossword = crossword
 
 
114
 
115
  # our answer set
116
  self.answer_set = set()
@@ -130,12 +137,27 @@ class BPSolver(Solver):
130
  self.bp_cells_by_clue[clue].append(cell)
131
  self.bp_vars = []
132
  for key, value in self.crossword.variables.items():
 
 
 
 
 
 
133
  var = BPVar(key, value, self.candidates[key], self.bp_cells_by_clue[key])
 
 
 
134
  self.bp_vars.append(var)
 
 
 
 
 
 
135
 
136
  def solve(self, num_iters=10, iterative_improvement_steps=5, return_greedy_states = False, return_ii_states = False):
137
  # run solving for num_iters iterations
138
- print('beginning BP iterations')
139
  for _ in trange(num_iters):
140
  for var in self.bp_vars:
141
  var.propagate()
@@ -145,7 +167,7 @@ class BPSolver(Solver):
145
  cell.propagate()
146
  for var in self.bp_vars:
147
  var.sync_state()
148
- print('done BP iterations')
149
 
150
  # Get the current based grid based on greedy selection from the marginals
151
  if return_greedy_states:
@@ -153,16 +175,168 @@ class BPSolver(Solver):
153
  else:
154
  grid = self.greedy_sequential_word_solution()
155
  all_grids = []
156
- grid = self.greedy_sequential_word_solution()
157
- # print('=====Greedy search grid=====')
158
- # print_grid(grid)
 
 
 
 
 
 
 
 
 
 
159
 
160
- if iterative_improvement_steps < 1:
 
 
 
 
 
 
 
161
  if return_greedy_states or return_ii_states:
162
- return grid, all_grids
163
  else:
164
- return grid
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
 
 
 
 
 
 
 
 
 
 
166
  def greedy_sequential_word_solution(self, return_grids = False):
167
  all_grids = []
168
  # after we've run BP, we run a simple greedy search to get the final.
@@ -181,7 +355,6 @@ class BPSolver(Solver):
181
  best_index = best_per_var.index(max([x for x in best_per_var if x is not None]))
182
  best_var = self.bp_vars[best_index]
183
  best_word = best_var.words[best_var.log_probs.argmax()]
184
- # print('greedy filling in', best_word)
185
  for i, cell in enumerate(best_var.ordered_cells):
186
  letter = best_word[i]
187
  grid[cell.position[0]][cell.position[1]] = letter
@@ -201,14 +374,71 @@ class BPSolver(Solver):
201
  best_var.words = []
202
  best_var.log_probs = best_var.log_probs[[]]
203
  best_per_var[best_index] = None
 
 
204
  for cell in self.bp_cells:
205
  if cell.position in unfilled_cells:
 
206
  grid[cell.position[0]][cell.position[1]] = string.ascii_uppercase[cell.log_probs.argmax()]
207
-
208
  for var, (words, log_probs) in zip(self.bp_vars, cache): # restore state
209
  var.words = words
210
  var.log_probs = log_probs
211
  if return_grids:
212
  return grid, all_grids
213
  else:
214
- return grid
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import math
2
  import string
3
+ import re
4
  from collections import defaultdict
5
  from copy import deepcopy
6
 
 
10
 
11
  from Utils_inf import print_grid, get_word_flips
12
  from Solver_inf import Solver
13
+ from Models_inf import setup_t5_reranker, t5_reranker_score_with_clue
14
+
15
  # the probability of each alphabetical character in the crossword
16
  UNIGRAM_PROBS = [('A', 0.0897379968935765), ('B', 0.02121248877769636), ('C', 0.03482206634145926), ('D', 0.03700942543460491), ('E', 0.1159773210750429), ('F', 0.017257461694024614), ('G', 0.025429024796296124), ('H', 0.033122967601502), ('I', 0.06800036223479956), ('J', 0.00294611331754349), ('K', 0.013860682888259786), ('L', 0.05130800574373874), ('M', 0.027962776827660175), ('N', 0.06631994270448001), ('O', 0.07374646543246745), ('P', 0.026750756212433214), ('Q', 0.001507814175439393), ('R', 0.07080460813737305), ('S', 0.07410988246048224), ('T', 0.07242993582154593), ('U', 0.0289272388037645), ('V', 0.009153522059555467), ('W', 0.01434705167591524), ('X', 0.003096729223103298), ('Y', 0.01749958208224007), ('Z', 0.002659777584995724)]
17
 
 
21
 
22
  class BPVar:
23
  def __init__(self, name, variable, candidates, cells):
24
+ self.name = name # key from crossword.variables i.e. 1A, 2D, 3A
25
  cells_by_position = {}
26
+ for cell in cells: # every cells or letter box that a particular variable or filling takes into consideration
27
+ cells_by_position[cell.position] = cell # cell.position (0,0) -> cell -> BPCell
28
  cell._connect(self)
29
+ self.length = len(cells) # obviously the length of the answer
30
  self.ordered_cells = [cells_by_position[pos] for pos in variable['cells']]
31
  self.candidates = candidates
32
  self.words = self.candidates['words']
 
88
  self.log_probs = log_softmax(sum(self.directional_scores))
89
 
90
  def propagate(self):
91
+ # assert len(self.crossing_vars) == 2
92
  try:
93
  for i, v in enumerate(self.crossing_vars):
94
  v._propagate_to_var(self, self.directional_scores[1-i])
 
102
  model_path,
103
  ans_tsv_path,
104
  dense_embd_path,
105
+ reranker_path,
106
+ max_candidates = 100,
107
  process_id = 0,
108
  model_type = 'bert',
109
  **kwargs):
 
116
  model_type = model_type,
117
  **kwargs)
118
  self.crossword = crossword
119
+ self.reranker_path = reranker_path
120
+ self.reranker_model_type = 't5-small'
121
 
122
  # our answer set
123
  self.answer_set = set()
 
137
  self.bp_cells_by_clue[clue].append(cell)
138
  self.bp_vars = []
139
  for key, value in self.crossword.variables.items():
140
+ # if key == '1A':
141
+ # print('-'*100)
142
+ # print(self.candidates[key]['words'])
143
+ # print(self.candidates[key]['bit_array'].shape)
144
+ # print(self.candidates[key]['weights'])
145
+ # print('-'*100)
146
  var = BPVar(key, value, self.candidates[key], self.bp_cells_by_clue[key])
147
+ # print('*'*100)
148
+ # print(self.bp_cells_by_clue[key])
149
+ # print('*'*100)
150
  self.bp_vars.append(var)
151
+
152
def extract_float(self, input_string):
    """Pull every decimal number (e.g. '97.5') out of *input_string*.

    Used by solve() to parse letter/word accuracy percentages out of the
    log string returned by evaluate(). Integers without a decimal point
    are intentionally not matched.
    """
    decimal_pattern = re.compile(r"\d+\.\d+")
    return [float(token) for token in decimal_pattern.findall(input_string)]
157
 
158
  def solve(self, num_iters=10, iterative_improvement_steps=5, return_greedy_states = False, return_ii_states = False):
159
  # run solving for num_iters iterations
160
+ print('\nBeginning Belief Propagation Iteration Steps: ')
161
  for _ in trange(num_iters):
162
  for var in self.bp_vars:
163
  var.propagate()
 
167
  cell.propagate()
168
  for var in self.bp_vars:
169
  var.sync_state()
170
+ print('Belief Propagation Iteration Complete\n')
171
 
172
  # Get the current based grid based on greedy selection from the marginals
173
  if return_greedy_states:
 
175
  else:
176
  grid = self.greedy_sequential_word_solution()
177
  all_grids = []
178
+
179
+ # properly save all the outputs results:
180
+ output_results = {}
181
+ output_results['first pass model'] = {}
182
+ output_results['first pass model']['grid'] = grid
183
+
184
+ # save first pass model grid, and letter accuracies
185
+ _, accu_log = self.evaluate(grid, False)
186
+ [ori_letter_accu, ori_word_accu] = self.extract_float(accu_log)
187
+ output_results['first pass model']['letter accuracy'] = ori_letter_accu
188
+ output_results['first pass model']['word accuracy'] = ori_word_accu
189
+
190
+ print("First pass model result was", grid,ori_letter_accu,ori_word_accu)
191
 
192
+ output_results['second pass model'] = {}
193
+ output_results['second pass model']['final grid'] = [] # just for the sake of the api
194
+ output_results['second pass model']['final grid'] = grid # just for the sake of the api
195
+ output_results['second pass model']['all grids'] = []
196
+ output_results['second pass model']['all letter accuracy'] = []
197
+ output_results['second pass model']['all word accuracy'] = []
198
+
199
+ if iterative_improvement_steps < 1 or ori_letter_accu == 100.0:
200
  if return_greedy_states or return_ii_states:
201
+ return output_results, all_grids
202
  else:
203
+ return output_results
204
+
205
+ '''
206
+ Iterative Improvement with t5-small starts from here.
207
+ '''
208
+ self.reranker, self.tokenizer = setup_t5_reranker(self.reranker_path, self.reranker_model_type)
209
+
210
+
211
+
212
+ for i in range(iterative_improvement_steps):
213
+ grid, did_iterative_improvement_make_edit = self.iterative_improvement(grid)
214
+
215
+ _, accu_log = self.evaluate(grid, False)
216
+ [temp_letter_accu, temp_word_accu] = self.extract_float(accu_log)
217
+ print(f"{i+1}th iteration: {accu_log}")
218
+
219
+ # save grid & accuracies at each iteration
220
+ output_results['second pass model']['all grids'].append(grid)
221
+ output_results['second pass model']['all letter accuracy'].append(temp_letter_accu)
222
+ output_results['second pass model']['all word accuracy'].append(temp_word_accu)
223
+
224
+ if not did_iterative_improvement_make_edit or temp_letter_accu == 100.0:
225
+ break
226
+
227
+ if return_ii_states:
228
+ all_grids.append(deepcopy(grid))
229
+
230
+ temp_lett_accu_list = output_results['second pass model']['all letter accuracy'].copy()
231
+ ii_max_index = temp_lett_accu_list.index(max(temp_lett_accu_list))
232
+
233
+ output_results['second pass model']['final grid'] = output_results['second pass model']['all grids'][ii_max_index]
234
+ output_results['second pass model']['final letter'] = output_results['second pass model']['all letter accuracy'][ii_max_index]
235
+ output_results['second pass model']['final word'] = output_results['second pass model']['all word accuracy'][ii_max_index]
236
+
237
+ if return_greedy_states or return_ii_states:
238
+ return output_results, all_grids
239
+ else:
240
+ return output_results
241
+
242
def get_candidate_replacements(self, uncertain_answers, grid):
    """Build candidate letter-replacement sets for the iterative-improvement pass.

    Args:
        uncertain_answers: dict mapping clue text -> current (suspect) fill,
            as produced by get_uncertain_answers().
        grid: current 2-D list-of-lists of letters.

    Returns:
        A list of replacement sets; each set is a list of (cell, letter)
        pairs that are applied to the grid together.
    """
    candidate_replacements = []
    replacement_id_set = set()

    # 1) dictionary-based flips: alternate spellings for each uncertain answer
    for clue in uncertain_answers.keys():
        initial_word = uncertain_answers[clue]
        clue_flips = get_word_flips(initial_word, 10)  # flip then segment
        clue_positions = [key for key, value in self.crossword.variables.items() if value['clue'] == clue]
        if not clue_positions:
            # no grid slot matches this clue text; nothing to flip
            continue
        cells = []
        for clue_position in clue_positions:
            cells = sorted([cell for cell in self.bp_cells if clue_position in cell.crossing_clues], key=lambda c: c.position)
            if len(cells) == len(initial_word):
                break
        # NOTE(review): if no position matches the word length, `cells` keeps
        # the last candidate slot; mismatched flips are filtered out below.
        for flip in clue_flips:
            # Defensive skip instead of the previous
            # `import pdb; pdb.set_trace()` + assert, which crashed (or
            # dropped into a debugger) on any length mismatch.
            if len(flip) != len(cells):
                continue
            for i in range(len(flip)):
                if flip[i] != initial_word[i]:
                    # one single-cell replacement per flip: the first differing letter
                    candidate_replacements.append([(cells[i], flip[i])])
                    break

    # 2) letter-uncertainty flips: e.g. if we said P but G also had some
    #    probability mass (> 1%), try G too
    for cell in self.bp_cells:
        probs = np.exp(cell.log_probs)
        above_threshold = list(probs > 0.01)
        new_characters = ['ABCDEFGHIJKLMNOPQRSTUVWXYZ'[i] for i in range(26) if above_threshold[i]]
        # ignore a letter if it is the same as the current solution
        new_characters = [x for x in new_characters if x != grid[cell.position[0]][cell.position[1]]]
        for new_character in new_characters:
            replacement_id = '_'.join([str(cell.position), new_character])
            if replacement_id not in replacement_id_set:
                candidate_replacements.append([(cell, new_character)])
                replacement_id_set.add(replacement_id)

    # 3) composite flips: pair two single-cell flips on different cells that
    #    share a clue (< 4 distinct crossing clues between them)
    composite_replacements = []
    for i in range(len(candidate_replacements)):
        for j in range(i + 1, len(candidate_replacements)):
            flip1, flip2 = candidate_replacements[i], candidate_replacements[j]
            if flip1[0][0] != flip2[0][0]:
                if len(set(flip1[0][0].crossing_clues + flip2[0][0].crossing_clues)) < 4:  # shared clue
                    composite_replacements.append(flip1 + flip2)

    candidate_replacements += composite_replacements

    return candidate_replacements
303
+
304
def get_uncertain_answers(self, grid):
    """Collect the currently-filled answers that look unreliable.

    Reads the argmax fill for every variable off *grid*, records it on each
    member cell's `prediction` dict (keyed by clue text), and returns the
    subset of {clue text: fill} pairs whose fill is not in self.answer_set.

    Side effect: mutates `cell.prediction` for every cell of every variable.
    """
    original_qa_pairs = {}  # the original puzzle preds that we will try to improve
    # first save what the argmax word-level prediction was for each grid cell just to make life easier
    for var in self.crossword.variables:
        # read the current word off the grid
        cells = self.crossword.variables[var]["cells"]
        word = []
        for cell in cells:
            word.append(grid[cell[0]][cell[1]])
        word = ''.join(word)
        for cell in self.bp_cells:  # loop through all cells
            if cell.position in cells:  # if this cell is in the word we are currently handling
                # save {clue, answer} pair into this cell
                # NOTE(review): assumes BPCell exposes a dict attribute
                # `prediction` — confirm against BPCell.__init__
                cell.prediction[self.crossword.variables[var]['clue']] = word
                original_qa_pairs[self.crossword.variables[var]['clue']] = word

    uncertain_answers = {}

    # find uncertain answers
    # right now the heuristic we use is any answer that is not in the answer set
    for clue in original_qa_pairs.keys():
        if original_qa_pairs[clue] not in self.answer_set:
            uncertain_answers[clue] = original_qa_pairs[clue]

    return uncertain_answers
329
 
330
def score_grid(self, grid):
    """Return the total T5-reranker score of every answer currently in *grid*.

    For each clue, the fill is read off the grid in cell-position order and
    scored jointly with its clue text; the sum over all clues is the grid's
    overall score (higher is better).
    """
    clue_texts, fills = [], []
    for var_key, cell_group in self.bp_cells_by_clue.items():
        ordered_cells = sorted(list(cell_group), key=lambda c: c.position)
        fills.append(''.join(grid[c.position[0]][c.position[1]] for c in ordered_cells))
        clue_texts.append(self.crossword.variables[var_key]['clue'])
    scores = t5_reranker_score_with_clue(self.reranker, self.tokenizer, self.reranker_model_type, clue_texts, fills)
    return sum(scores)
339
+
340
  def greedy_sequential_word_solution(self, return_grids = False):
341
  all_grids = []
342
  # after we've run BP, we run a simple greedy search to get the final.
 
355
  best_index = best_per_var.index(max([x for x in best_per_var if x is not None]))
356
  best_var = self.bp_vars[best_index]
357
  best_word = best_var.words[best_var.log_probs.argmax()]
 
358
  for i, cell in enumerate(best_var.ordered_cells):
359
  letter = best_word[i]
360
  grid[cell.position[0]][cell.position[1]] = letter
 
374
  best_var.words = []
375
  best_var.log_probs = best_var.log_probs[[]]
376
  best_per_var[best_index] = None
377
+
378
+ unfilled_cells_count = 0
379
  for cell in self.bp_cells:
380
  if cell.position in unfilled_cells:
381
+ unfilled_cells_count += 1
382
  grid[cell.position[0]][cell.position[1]] = string.ascii_uppercase[cell.log_probs.argmax()]
383
+
384
  for var, (words, log_probs) in zip(self.bp_vars, cache): # restore state
385
  var.words = words
386
  var.log_probs = log_probs
387
  if return_grids:
388
  return grid, all_grids
389
  else:
390
+ return grid
391
+
392
def iterative_improvement(self, grid):
    """Run one local-search pass over *grid* using the T5 reranker.

    Generates candidate letter replacements for uncertain regions, keeps
    every replacement whose reranked grid score improves by more than 0.5,
    then greedily applies a non-conflicting subset of the best ones.

    Returns:
        (new_grid, True) if at least one edit was applied,
        (grid, False) otherwise (the input grid, unchanged).
    """
    # check the grid for uncertain areas and save those words to be analyzed
    # in local search, aka looking for alternate candidates
    uncertain_answers = self.get_uncertain_answers(grid)
    self.candidate_replacements = self.get_candidate_replacements(uncertain_answers, grid)

    original_grid_score = self.score_grid(grid)
    possible_edits = []
    for replacements in self.candidate_replacements:
        # apply this replacement set to a copy and rescore the whole grid
        modified_grid = deepcopy(grid)
        for cell, letter in replacements:
            modified_grid[cell.position[0]][cell.position[1]] = letter
        modified_grid_score = self.score_grid(modified_grid)
        # variables (slots) touched by this replacement set
        variables = set(sum([cell.crossing_vars for cell, _ in replacements], []))
        for var in variables:
            # NOTE(review): these fills / clue_index were only used by
            # (now removed) debug prints; the loop has no other effect
            original_fill = ''.join([grid[cell.position[0]][cell.position[1]] for cell in var.ordered_cells])
            modified_fill = ''.join([modified_grid[cell.position[0]][cell.position[1]] for cell in var.ordered_cells])
            clue_index = list(set(var.ordered_cells[0].crossing_clues).intersection(*[set(cell.crossing_clues) for cell in var.ordered_cells]))[0]
        # keep edits that improve the reranker score by a margin (> 0.5)
        if modified_grid_score - original_grid_score > 0.5:
            possible_edits.append((modified_grid, modified_grid_score, replacements))

    if len(possible_edits) > 0:
        variables_modified = set()
        # best-scoring edits first
        possible_edits = sorted(possible_edits, key=lambda x: x[1], reverse=True)
        selected_edits = []
        for edit in possible_edits:
            replacements = edit[2]
            variables = set(sum([cell.crossing_vars for cell, _ in replacements], []))
            # we can do multiple updates at once if they don't share clues
            if len(variables_modified.intersection(variables)) == 0:
                variables_modified.update(variables)
                selected_edits.append(edit)

        # apply all selected, mutually non-conflicting edits to a fresh copy
        new_grid = deepcopy(grid)
        for edit in selected_edits:
            replacements = edit[2]
            for cell, letter in replacements:
                new_grid[cell.position[0]][cell.position[1]] = letter
            variables = set(sum([cell.crossing_vars for cell, _ in replacements], []))
            for var in variables:
                # debug-only fills, kept to preserve the original code exactly
                original_fill = ''.join([grid[cell.position[0]][cell.position[1]] for cell in var.ordered_cells])
                modified_fill = ''.join([new_grid[cell.position[0]][cell.position[1]] for cell in var.ordered_cells])
        return new_grid, True
    else:
        return grid, False
Dockerfile CHANGED
@@ -34,4 +34,8 @@ ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scor
34
  ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/dpr_biencoder_trained_500k.bin $HOME/app/Inference_components/
35
  ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/embeddings_all_answers_json_0.pkl $HOME/app/Inference_components/
36
 
 
 
 
 
37
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
34
  ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/dpr_biencoder_trained_500k.bin $HOME/app/Inference_components/
35
  ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/embeddings_all_answers_json_0.pkl $HOME/app/Inference_components/
36
 
37
+ ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/t5_small_new_dataset_2EPOCHS/config.json $HOME/app/Inference_components/t5_small_new_dataset_2EPOCHS/
38
+ ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/t5_small_new_dataset_2EPOCHS/generation_config.json $HOME/app/Inference_components/t5_small_new_dataset_2EPOCHS/
39
+ ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/t5_small_new_dataset_2EPOCHS/model.safetensors $HOME/app/Inference_components/t5_small_new_dataset_2EPOCHS/
40
+
41
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
Models_inf.py CHANGED
@@ -7,8 +7,6 @@ import string
7
  import sys
8
  from typing import List, Tuple, Dict
9
  import re
10
- import math
11
- import collections
12
 
13
  import numpy as np
14
  import unicodedata
@@ -21,7 +19,10 @@ from Options_inf import setup_args_gpu, print_args, set_encoder_params_from_stat
21
  from Faiss_Indexers_inf import DenseIndexer, DenseFlatIndexer
22
  from Data_utils_inf import Tensorizer
23
  from Model_utils_inf import load_states_from_checkpoint, get_model_obj
24
-
 
 
 
25
 
26
  SEGMENTER_CACHE = {}
27
  RERANKER_CACHE = {}
@@ -37,6 +38,64 @@ def setup_closedbook(model_path, ans_tsv_path, dense_embd_path, process_id, mode
37
  )
38
  return dpr
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def preprocess_clue_fn(clue):
41
  clue = str(clue)
42
 
@@ -202,6 +261,7 @@ class DenseRetriever(object):
202
  query_vectors.extend(out.cpu().split(1, dim=0))
203
 
204
  query_tensor = torch.cat(query_vectors, dim=0)
 
205
  assert query_tensor.size(0) == len(questions)
206
  return query_tensor
207
 
@@ -353,9 +413,11 @@ class DPRForCrossword(object):
353
  if max_answers > self.len_all_passages:
354
  max_answers = self.len_all_passages
355
 
 
356
  # get top k results
357
  top_ids_and_scores = self.retriever.get_top_docs(questions_tensor.numpy(), max_answers)
358
-
 
359
  if not output_strings:
360
  return top_ids_and_scores
361
  else:
 
7
  import sys
8
  from typing import List, Tuple, Dict
9
  import re
 
 
10
 
11
  import numpy as np
12
  import unicodedata
 
19
  from Faiss_Indexers_inf import DenseIndexer, DenseFlatIndexer
20
  from Data_utils_inf import Tensorizer
21
  from Model_utils_inf import load_states_from_checkpoint, get_model_obj
22
+ from transformers import T5ForConditionalGeneration, AutoTokenizer
23
+ import time
24
+ from wordsegment import load, segment
25
+ load()
26
 
27
  SEGMENTER_CACHE = {}
28
  RERANKER_CACHE = {}
 
38
  )
39
  return dpr
40
 
41
def setup_t5_reranker(reranker_path, reranker_model_type = 't5-small'):
    """Load the fine-tuned T5 reranker checkpoint and its tokenizer.

    The tokenizer comes from the base model name, the weights from the local
    checkpoint path. The model is put in eval mode on GPU when available,
    otherwise CPU. Returns (model, tokenizer).
    """
    reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_type)
    reranker_model = T5ForConditionalGeneration.from_pretrained(reranker_path)
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    reranker_model.eval().to(target_device)
    return reranker_model, reranker_tokenizer
46
+
47
def post_process_clue(clue):
    """Normalize a clue string for the T5 reranker.

    Runs the standard clue preprocessing, then strips one trailing
    ellipsis/period variant ('. .', ' ..', '..', or '.') if present.

    Fix: the original `clue[-1] == '.'` raised IndexError when the
    preprocessed clue came back empty; `endswith` is safe on ''.
    """
    clue = preprocess_clue_fn(clue)
    # longest suffixes first, and only one suffix is stripped (matches the
    # original elif chain)
    for suffix in ('. .', ' ..', '..', '.'):
        if clue.endswith(suffix):
            clue = clue[:-len(suffix)]
            break
    return clue
58
+
59
def t5_reranker_score_with_clue(model, tokenizer, model_type, clues, possibly_ungrammatical_fills):
    """Score each (clue, fill) pair with the T5 reranker.

    Args:
        model, tokenizer: as returned by setup_t5_reranker().
        model_type: 't5-small' triggers wordsegment-based answer splitting.
        clues: list of clue strings.
        possibly_ungrammatical_fills: list of answer strings (same length).

    Returns:
        A list of per-pair scores: -loss * answer_token_length (a
        length-scaled log-probability; higher is better). Results are
        memoized in the module-level RERANKER_CACHE.
    """
    global RERANKER_CACHE
    results = []
    device = model.device

    fills = possibly_ungrammatical_fills.copy()

    if model_type == 't5-small':
        # t5-small was fine-tuned on whitespace-segmented answers
        fills = [" ".join(segment(answer.lower())) for answer in possibly_ungrammatical_fills]

    # Fix: eval mode set once, instead of redundantly inside the loop.
    model.eval()

    for clue, possibly_ungrammatical_fill in zip(clues, fills):
        if not possibly_ungrammatical_fill.islower():
            possibly_ungrammatical_fill = possibly_ungrammatical_fill.lower()

        clue = post_process_clue(clue)

        # Fix: tuple key instead of string concatenation, which could
        # collide (e.g. 'ab'+'c' vs 'a'+'bc').
        cache_key = (clue, possibly_ungrammatical_fill)
        if cache_key in RERANKER_CACHE:
            results.append(RERANKER_CACHE[cache_key])
            continue

        with torch.no_grad(), torch.inference_mode():
            # move all the input tensors to the model's device
            inputs = tokenizer(["Q: " + clue], return_tensors='pt')['input_ids'].to(device)
            labels = tokenizer([possibly_ungrammatical_fill], return_tensors='pt')['input_ids'].to(device)

            loss = model(inputs, labels = labels)
            answer_length = labels.shape[1]
            # scale by token length so long answers aren't penalized
            logprob = -loss[0].item() * answer_length
            results.append(logprob)
            RERANKER_CACHE[cache_key] = logprob

    return results
98
+
99
  def preprocess_clue_fn(clue):
100
  clue = str(clue)
101
 
 
261
  query_vectors.extend(out.cpu().split(1, dim=0))
262
 
263
  query_tensor = torch.cat(query_vectors, dim=0)
264
+ print("CLUE Vector Shape", query_tensor.shape)
265
  assert query_tensor.size(0) == len(questions)
266
  return query_tensor
267
 
 
413
  if max_answers > self.len_all_passages:
414
  max_answers = self.len_all_passages
415
 
416
+ start_time = time.time()
417
  # get top k results
418
  top_ids_and_scores = self.retriever.get_top_docs(questions_tensor.numpy(), max_answers)
419
+ end_time = time.time()
420
+ print("\n\nTime taken by FAISS INDEXER: ", end_time - start_time)
421
  if not output_strings:
422
  return top_ids_and_scores
423
  else:
Normal_utils_inf.py CHANGED
@@ -1,7 +1,6 @@
1
  import puz
2
- import re
3
- import unicodedata
4
- import sys
5
 
6
  def puz_to_json(fname):
7
  """ Converts a puzzle in .puz format to .json format
@@ -63,3 +62,101 @@ def puz_to_pairs(filepath):
63
 
64
  return [(k, v) for k, v in pairs.items()]
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import puz
2
+ import json
3
+ import requests
 
4
 
5
  def puz_to_json(fname):
6
  """ Converts a puzzle in .puz format to .json format
 
62
 
63
  return [(k, v) for k, v in pairs.items()]
64
 
65
def _parse_clue_answer_list(clue_strings, answer_strings):
    """Map 'NUM. text' clue strings + answers to {num: [text, answer]}."""
    parsed = {}
    for clue, ans in zip(clue_strings, answer_strings):
        split_clue = clue.split(" ")
        clue_num = split_clue[0][:-1]  # "12." -> "12"
        clue_text = " ".join(split_clue[1:]).replace("[", "").replace("]", "")
        parsed[clue_num] = [clue_text, ans]
    return parsed


def json_CA_json_converter(json_file_path, is_path):
    """Convert an xwordinfo-style crossword JSON into the solver's format.

    Args:
        json_file_path: a path to a JSON file (is_path=True) or an
            already-loaded dict (is_path=False).
        is_path: whether the first argument is a filesystem path.

    Returns:
        dict with 'metadata', 'clues' (across/down -> {num: [clue, answer]})
        and 'grid' (rows of "BLACK" or [gridnum_str_or_empty, letter]),
        or None if conversion fails.
    """
    try:
        if is_path:
            with open(json_file_path, "r") as file:
                data = json.load(file)
        else:
            data = json_file_path

        json_conversion_dict = {}

        rows = data["size"]["rows"]
        cols = data["size"]["cols"]
        date = data["date"]

        clues = data["clues"]
        answers = data["answers"]

        json_conversion_dict["metadata"] = {"date": date, "rows": rows, "cols": cols}

        json_conversion_dict["clues"] = {
            "across": _parse_clue_answer_list(clues["across"], answers["across"]),
            "down": _parse_clue_answer_list(clues["down"], answers["down"]),
        }

        grid_info = data["grid"]
        grid_num = data["gridnums"]

        grid_info_list = []
        for i in range(rows):
            row_list = []
            for j in range(cols):
                # Fix: row-major stride is the COLUMN count; the original
                # `i * rows + j` mis-indexed every non-square grid.
                idx = i * cols + j
                if grid_info[idx] == ".":
                    row_list.append("BLACK")
                elif grid_num[idx] == 0:
                    # unnumbered white square
                    row_list.append(["", grid_info[idx]])
                else:
                    row_list.append([str(grid_num[idx]), grid_info[idx]])
            grid_info_list.append(row_list)

        json_conversion_dict["grid"] = grid_info_list

        return json_conversion_dict

    # Fix: narrow, informative handler instead of a bare `except:` that
    # swallowed everything (including KeyboardInterrupt) and misspelled
    # its message; still best-effort — returns None on failure.
    except (KeyError, IndexError, TypeError, ValueError, OSError) as err:
        print(f"ERROR converting puzzle JSON: {err}")
130
+
131
def fetch_nyt_crossword(dateStr):
    '''
    Fetch NYT puzzle from a specific date.

    dateStr is mm/dd/yyyy. Downloads the xwordinfo JSON, converts it with
    json_CA_json_converter, unescapes HTML quote entities in every clue,
    and returns the puzzle dict (or None on a failed request).
    '''
    headers = {
        'Referer': 'https://www.xwordinfo.com/JSON/'
    }
    url = 'https://www.xwordinfo.com/JSON/Data.ashx?date=' + dateStr

    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}.")
        return None

    # the payload uses single quotes; normalize before json.loads
    jsonText = response.content.decode('utf-8').replace("'", '"')
    grid_data = json.loads(jsonText)
    puzzle_data = json_CA_json_converter(grid_data, False)

    # unescape HTML quote entities in every clue, both directions
    for dim in ['across', 'down']:
        for grid_num in puzzle_data['clues'][dim].keys():
            clue_text, answer_text = puzzle_data['clues'][dim][grid_num]
            clue_text = clue_text.replace("&quot;", "'").replace("&#39;", "'")
            puzzle_data['clues'][dim][grid_num] = [clue_text, answer_text]

    return puzzle_data
Solver_inf.py CHANGED
@@ -16,7 +16,7 @@ class Solver:
16
  crossword (Crossword): puzzle to solve
17
  max_candidates (int): number of answer candidates to consider per clue
18
  """
19
- def __init__(self, crossword, model_path, ans_tsv_path, dense_embd_path, max_candidates=1000, process_id = 0, model_type = 'bert'):
20
  self.crossword = crossword
21
  self.max_candidates = max_candidates
22
  self.process_id = process_id
@@ -46,10 +46,10 @@ class Solver:
46
  clue = clue[:match.start()] + self.crossword.variables[var]['clue'] + clue[match.end():]
47
  all_clues[idx] = clue
48
 
49
- # print("MODEL PATH: ", type(self.dense_embd_glob))
50
  # get predictions
51
  dpr = setup_closedbook(self.model_path, self.ans_tsv_path, self.dense_embd_glob, self.process_id, self.model_type)
52
- all_words, all_scores = answer_clues(dpr, all_clues, max_answers=self.max_candidates, output_strings=True)
 
53
  for index, var in enumerate(self.crossword.variables):
54
  length = len(self.crossword.variables[var]["gold"])
55
  self.candidates[var] = {"words": [], "bit_array": None, "weights": {}}
@@ -63,21 +63,13 @@ class Solver:
63
  keep_positions.append(word_index)
64
  words = [words[i] for i in keep_positions]
65
  scores = [scores[i] for i in keep_positions]
 
66
  scores = list(-np.log(softmax(np.array(scores) / 0.75)))
67
 
68
  for word, score in zip(words, scores):
69
  self.candidates[var]["weights"][word] = score
70
-
71
- # for debugging purposes, print the rank of the gold answer on our candidate list
72
- # the gold answer is otherwise *not* used in any way during solving
73
- # if self.crossword.variables[var]["gold"] in words:
74
- # print(clue, self.crossword.variables[var]["gold"], words.index(self.crossword.variables[var]["gold"]))
75
- # else:
76
- # print('not found', clue, self.crossword.variables[var]["gold"])
77
 
78
- # fill up some data structures used later in solving
79
- for word, score in zip(words, scores):
80
- self.candidates[var]["weights"][word] = score
81
  weights = self.candidates[var]["weights"]
82
  self.candidates[var]["words"] = sorted(weights, key=weights.get)
83
  self.candidates[var]["bit_array"] = np.zeros((len(chars), length, len(self.candidates[var]["words"])))
@@ -94,8 +86,7 @@ class Solver:
94
  # cleanup a bit
95
  del dpr
96
 
97
- def evaluate(self, solution):
98
- # print puzzle accuracy results given a generated solution
99
  letters_correct = 0
100
  letters_total = 0
101
  for i in range(len(self.crossword.letter_grid)):
@@ -110,20 +101,13 @@ class Solver:
110
  matching_cells = [self.crossword.letter_grid[cell[0]][cell[1]] == solution[cell[0]][cell[1]] for cell in cells]
111
  if len(cells) == sum(matching_cells):
112
  words_correct += 1
113
- else:
114
- # print('evaluation: correct word', ''.join([self.crossword.letter_grid[cell[0]][cell[1]] for cell in cells]), 'our prediction:', ''.join([solution[cell[0]][cell[1]] for cell in cells]))
115
- pass
116
  words_total += 1
 
 
 
 
 
 
 
117
 
118
- print("Letters Correct: {}/{} | Words Correct: {}/{}".format(int(letters_correct), int(letters_total), int(words_correct), int(words_total)))
119
- print("Letters Correct: {}% | Words Correct: {}%".format(float(letters_correct/letters_total*100), float(words_correct/words_total*100)))
120
-
121
- info = {
122
- "total_letters" : int(letters_total),
123
- "total_words" : int(words_total),
124
- "correct_letters" : int(letters_correct),
125
- "correct_words" : int(words_correct),
126
- "correct_letters_percent" : float(letters_correct/letters_total*100),
127
- "correct_words_percent" : float(words_correct/words_total*100),
128
- }
129
- return info
 
16
  crossword (Crossword): puzzle to solve
17
  max_candidates (int): number of answer candidates to consider per clue
18
  """
19
+ def __init__(self, crossword, model_path, ans_tsv_path, dense_embd_path, max_candidates = 100, process_id = 0, model_type = 'bert'):
20
  self.crossword = crossword
21
  self.max_candidates = max_candidates
22
  self.process_id = process_id
 
46
  clue = clue[:match.start()] + self.crossword.variables[var]['clue'] + clue[match.end():]
47
  all_clues[idx] = clue
48
 
 
49
  # get predictions
50
  dpr = setup_closedbook(self.model_path, self.ans_tsv_path, self.dense_embd_glob, self.process_id, self.model_type)
51
+ all_words, all_scores = answer_clues(dpr, all_clues, max_answers = self.max_candidates, output_strings=True)
52
+
53
  for index, var in enumerate(self.crossword.variables):
54
  length = len(self.crossword.variables[var]["gold"])
55
  self.candidates[var] = {"words": [], "bit_array": None, "weights": {}}
 
63
  keep_positions.append(word_index)
64
  words = [words[i] for i in keep_positions]
65
  scores = [scores[i] for i in keep_positions]
66
+
67
  scores = list(-np.log(softmax(np.array(scores) / 0.75)))
68
 
69
  for word, score in zip(words, scores):
70
  self.candidates[var]["weights"][word] = score
 
 
 
 
 
 
 
71
 
72
+
 
 
73
  weights = self.candidates[var]["weights"]
74
  self.candidates[var]["words"] = sorted(weights, key=weights.get)
75
  self.candidates[var]["bit_array"] = np.zeros((len(chars), length, len(self.candidates[var]["words"])))
 
86
  # cleanup a bit
87
  del dpr
88
 
89
+ def evaluate(self, solution, print_log = True):
 
90
  letters_correct = 0
91
  letters_total = 0
92
  for i in range(len(self.crossword.letter_grid)):
 
101
  matching_cells = [self.crossword.letter_grid[cell[0]][cell[1]] == solution[cell[0]][cell[1]] for cell in cells]
102
  if len(cells) == sum(matching_cells):
103
  words_correct += 1
 
 
 
104
  words_total += 1
105
+
106
+ letter_frac_log = "Letters Correct: {}/{} | Words Correct: {}/{}".format(int(letters_correct), int(letters_total), int(words_correct), int(words_total))
107
+ letter_acc_log = "Letters Correct: {}% | Words Correct: {}%".format(float(letters_correct/letters_total*100), float(words_correct/words_total*100))
108
+
109
+ if print_log:
110
+ print(letter_frac_log)
111
+ print(letter_acc_log)
112
 
113
+ return letter_frac_log, letter_acc_log
 
 
 
 
 
 
 
 
 
 
 
main.py CHANGED
@@ -7,9 +7,30 @@ from Strict_json import json_CA_json_converter
7
  import asyncio
8
  from fastapi.middleware.cors import CORSMiddleware
9
 
10
- MODEL_PATH_DISTIL = os.path.join("Inference_components","distilbert_EPOCHs_7_COMPLETE.bin")
11
- ANS_TSV_PATH_DISTIL = os.path.join("Inference_components","all_answer_list.tsv")
12
- DENSE_EMBD_PATH_DISTIL = os.path.join("Inference_components","distilbert_7_epochs_embeddings.pkl")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  app = FastAPI()
15
 
@@ -22,27 +43,35 @@ app.add_middleware(
22
  )
23
 
24
  async def solve_puzzle(json):
25
- puzzle = json_CA_json_converter(json, False)
26
- crossword = Crossword(puzzle)
27
-
28
- # Perform asynchronous operations using asyncio.gather or asyncio.create_task
29
- async def solve_async():
30
- return await asyncio.to_thread(BPSolver, crossword, model_path=MODEL_PATH_DISTIL,
31
- ans_tsv_path=ANS_TSV_PATH_DISTIL,
32
- dense_embd_path=DENSE_EMBD_PATH_DISTIL,
33
- max_candidates=40000,
34
- model_type='distilbert')
35
-
36
- solver = await solve_async()
37
-
38
- # Run solve method asynchronously
39
- async def solve_method_async():
40
- return await asyncio.to_thread(solver.solve, num_iters=100, iterative_improvement_steps=0)
 
 
41
 
42
- solution = await solve_method_async()
43
- evaluation = await asyncio.to_thread(solver.evaluate, solution)
 
 
 
 
44
 
45
- return solution, evaluation
 
 
46
 
47
 
48
  fifo_queue = asyncio.Queue()
@@ -55,8 +84,9 @@ async def worker():
55
  job_id, job, args, future = await fifo_queue.get()
56
  jobs[job_id]["status"] = "processing"
57
  result = await job(*args)
 
58
  jobs[job_id]["result"] = result
59
- jobs[job_id]["status"] = "completed"
60
  future.set_result(job_id)
61
 
62
  @app.on_event("startup")
 
7
  import asyncio
8
  from fastapi.middleware.cors import CORSMiddleware
9
 
10
+ MODEL_CONFIG = {
11
+ 'bert':
12
+ {
13
+ 'MODEL_PATH' : "./Inference_components/dpr_biencoder_trained_EPOCH_2_COMPLETE.bin",
14
+ 'ANS_TSV_PATH': "./Inference_components/all_answer_list.tsv",
15
+ 'DENSE_EMBD_PATH': "./Inference_components/embeddings_BERT_EPOCH_2_COMPLETE0.pkl"
16
+ },
17
+ 'distilbert':
18
+ {
19
+ 'MODEL_PATH': "./Inference_components/distilbert_EPOCHs_7_COMPLETE.bin",
20
+ 'ANS_TSV_PATH': "./Inference_components/all_answer_list.tsv",
21
+ 'DENSE_EMBD_PATH': "./Inference_components/distilbert_7_epochs_embeddings.pkl"
22
+ },
23
+ 't5_small':
24
+ {
25
+ 'MODEL_PATH': './Inference_components/t5_small_new_dataset_2EPOCHS/'
26
+ }
27
+ }
28
+
29
+ choosen_model_path = MODEL_CONFIG['distilbert']['MODEL_PATH']
30
+ ans_list_path = MODEL_CONFIG['distilbert']['ANS_TSV_PATH']
31
+ dense_embedding_path = MODEL_CONFIG['distilbert']['DENSE_EMBD_PATH']
32
+ second_pass_model_path = MODEL_CONFIG['t5_small']['MODEL_PATH']
33
+
34
 
35
  app = FastAPI()
36
 
 
43
  )
44
 
45
  async def solve_puzzle(json):
46
+ try:
47
+
48
+ puzzle = json_CA_json_converter(json, False)
49
+ crossword = Crossword(puzzle)
50
+
51
+ async def solve_async():
52
+ return await asyncio.to_thread(BPSolver, crossword,
53
+ model_path = choosen_model_path,
54
+ ans_tsv_path = ans_list_path,
55
+ dense_embd_path = dense_embedding_path,
56
+ reranker_path = second_pass_model_path,
57
+ max_candidates = 40000,
58
+ model_type = 'distilbert')
59
+
60
+ solver = await solve_async()
61
+
62
+ async def solve_method_async():
63
+ return await asyncio.to_thread(solver.solve,num_iters=60, iterative_improvement_steps=3)
64
 
65
+ solution = await solve_method_async()
66
+
67
+ evaluation1 = await asyncio.to_thread(solver.evaluate, solution['first pass model']['grid'])
68
+ evaluation2 = await asyncio.to_thread(solver.evaluate, solution['second pass model']['final grid'])
69
+
70
+ return solution['second pass model']['final grid'], evaluation1, solution['second pass model']['final grid'], evaluation2
71
 
72
+ except Exception as e:
73
+ print(f"An error occurred: {e}")
74
+ return None, None, None
75
 
76
 
77
  fifo_queue = asyncio.Queue()
 
84
  job_id, job, args, future = await fifo_queue.get()
85
  jobs[job_id]["status"] = "processing"
86
  result = await job(*args)
87
+ print(result)
88
  jobs[job_id]["result"] = result
89
+ jobs[job_id]["status"] = "completed" if result[1] else "failed"
90
  future.set_result(job_id)
91
 
92
  @app.on_event("startup")