Spaces:

Ujjwal123
/

EZ-Crossword

Running

App Files Files Community

Ujjwal123 committed on Dec 5, 2023

Commit

a04b340

•

1 Parent(s): acb82e9

copied the whole api code from django and updated the dockerfile

Browse files

Files changed (21) hide show

.gitignore +1 -0
BPSolver_inf.py +212 -0
Crossword_inf.py +56 -0
Data_utils_inf.py +175 -0
Dockerfile +6 -0
Faiss_Indexers_inf.py +214 -0
Inference_components/test.py +0 -1
Model_utils_inf.py +160 -0
Models_inf.py +391 -0
Normal_utils_inf.py +65 -0
Options_inf.py +275 -0
Solver_inf.py +129 -0
Strict_json.py +57 -0
Utils_inf.py +89 -0
extractpuzzle.py +792 -0
main.py +27 -2
models/__init__.py +38 -0
models/biencoder.py +427 -0
models/hf_models.py +368 -0
requirements.txt +6 -0
words_alpha.txt +0 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 *.png
 *.jpg
 *.mp4

+Inference_components/
 *.png
 *.jpg
 *.mp4

BPSolver_inf.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import math
+import string
+from collections import defaultdict
+from copy import deepcopy
+import numpy as np
+from scipy.special import log_softmax, softmax
+from tqdm import trange
+from Utils_inf import print_grid, get_word_flips
+from Solver_inf import Solver
+# Unigram letter model: (letter, probability) pairs for 'A'..'Z', i.e. the probability of
+# each alphabetical character in the crossword. BPVar._postprocess mixes it into letter marginals.
+UNIGRAM_PROBS = [('A', 0.0897379968935765), ('B', 0.02121248877769636), ('C', 0.03482206634145926), ('D', 0.03700942543460491), ('E', 0.1159773210750429), ('F', 0.017257461694024614), ('G', 0.025429024796296124), ('H', 0.033122967601502), ('I', 0.06800036223479956), ('J', 0.00294611331754349), ('K', 0.013860682888259786), ('L', 0.05130800574373874), ('M', 0.027962776827660175), ('N', 0.06631994270448001), ('O', 0.07374646543246745), ('P', 0.026750756212433214), ('Q', 0.001507814175439393), ('R', 0.07080460813737305), ('S', 0.07410988246048224), ('T', 0.07242993582154593), ('U', 0.0289272388037645), ('V', 0.009153522059555467), ('W', 0.01434705167591524), ('X', 0.003096729223103298), ('Y', 0.01749958208224007), ('Z', 0.002659777584995724)]
+# LETTER_SMOOTHING_FACTOR controls how much we interpolate with the unigram LM. It is indexed by
+# answer length (23 entries, so lengths 0..22 are covered; longer answers would raise IndexError).
+# TODO this should be tuned.
+# Right now it is set according to the probability that the answer is not in the answer set
+LETTER_SMOOTHING_FACTOR = [0.0, 0.0, 0.04395604395604396, 0.0001372495196266813, 0.0005752186417796561, 0.0019841824329989103, 0.0048042463338563764, 0.013325257419745608, 0.027154447774285505, 0.06513517299341645, 0.12527790128946198, 0.22003002358996354, 0.23172376584839494, 0.254873006497342, 0.3985086992543496, 0.2764976958525346, 0.672645739910314, 0.6818181818181818, 0.8571428571428571, 0.8245614035087719, 0.8, 0.71900826446281, 0.0]
+class BPVar:
+    """Variable node of the belief-propagation graph: one clue together with its candidate fills.
+    Holds a log-probability per candidate word and exchanges per-letter messages with the
+    BPCell objects the clue covers.
+    """
+    def __init__(self, name, variable, candidates, cells):
+        """Connect the variable to its cells and initialise the candidate distribution.
+        name: identifier of the clue (stored as-is).
+        variable: dict whose 'cells' entry lists the cell positions in word order.
+        candidates: dict read for 'words', 'weights' (cost per word) and 'bit_array'.
+        cells: BPCell objects covered by this clue, in any order.
+        """
+        self.name = name
+        cells_by_position = {}
+        for cell in cells:
+            cells_by_position[cell.position] = cell
+            cell._connect(self)
+        self.length = len(cells)
+        # re-order the cells so that index i corresponds to the i-th letter of the answer
+        self.ordered_cells = [cells_by_position[pos] for pos in variable['cells']]
+        self.candidates = candidates
+        self.words = self.candidates['words']
+        self.word_indices = np.array([[string.ascii_uppercase.index(l) for l in fill] for fill in self.candidates['words']]) # words x length of letter indices
+        self.scores = -np.array([self.candidates['weights'][fill] for fill in self.candidates['words']]) # the incoming 'weights' are costs
+        # the prior stays fixed; log_probs is refreshed by sync_state() after each round of messages
+        self.prior_log_probs = log_softmax(self.scores)
+        self.log_probs = log_softmax(self.scores)
+        # one incoming message (a score per candidate word) for every cell of the word
+        self.directional_scores = [np.zeros(len(self.log_probs)) for _ in range(len(self.ordered_cells))]
+    def _propagate_to_var(self, other, belief_state):
+        """Receive a per-letter message from cell `other` and turn it into a per-word score."""
+        assert other in self.ordered_cells
+        other_idx = self.ordered_cells.index(other)
+        letter_scores = belief_state
+        # each candidate word is scored by the message value of the letter it puts in that cell
+        self.directional_scores[other_idx] = letter_scores[self.word_indices[:, other_idx]]
+    def _postprocess(self, all_letter_probs):
+        """Interpolate every letter distribution with the unigram model, weighted by answer length."""
+        # unigram smoothing
+        unigram_probs = np.array([x[1] for x in UNIGRAM_PROBS])
+        for i in range(len(all_letter_probs)):
+            all_letter_probs[i] = (1 - LETTER_SMOOTHING_FACTOR[self.length]) * all_letter_probs[i] + LETTER_SMOOTHING_FACTOR[self.length] * unigram_probs
+        return all_letter_probs
+    def sync_state(self):
+        """Recompute the word posterior from the prior plus all incoming cell messages."""
+        self.log_probs = log_softmax(sum(self.directional_scores) + self.prior_log_probs)
+    def propagate(self):
+        """Send a log letter-distribution message to every cell of the word."""
+        all_letter_probs = []
+        for i in range(len(self.ordered_cells)):
+            # remove cell i's own incoming message so it is not echoed back to that cell
+            word_scores = self.log_probs - self.directional_scores[i]
+            word_probs = softmax(word_scores)
+            # presumably bit_array is (letters, positions, words) one-hot, so this sums word
+            # probability per letter at position i -- TODO confirm against the candidate builder
+            letter_probs = (self.candidates['bit_array'][:, i] * np.expand_dims(word_probs, axis=0)).sum(axis=1) + 1e-8
+            all_letter_probs.append(letter_probs)
+        all_letter_probs = self._postprocess(all_letter_probs) # unigram postprocessing
+        for i, cell in enumerate(self.ordered_cells):
+            cell._propagate_to_cell(self, np.log(all_letter_probs[i]))
+class BPCell:
+    """Factor node of the belief-propagation graph: a single grid square shared by crossing clues."""
+    def __init__(self, position, clue_pair):
+        """Create a cell at `position` with a uniform distribution over the 26 letters.
+        clue_pair: identifiers of the clues passing through this cell (stored as-is).
+        """
+        self.crossing_clues = clue_pair
+        self.position = tuple(position)
+        self.letters = list(string.ascii_uppercase)
+        self.log_probs = np.log(np.array([1./len(self.letters) for _ in range(len(self.letters))]))
+        # crossing_vars[i] and directional_scores[i] belong together: the variable and its last message
+        self.crossing_vars = []
+        self.directional_scores = []
+        self.prediction = {}
+    def _connect(self, other):
+        """Register a crossing BPVar; a cell may be connected to at most two variables."""
+        self.crossing_vars.append(other)
+        self.directional_scores.append(None)
+        assert len(self.crossing_vars) <= 2
+    def _propagate_to_cell(self, other, belief_state):
+        """Store the log letter-distribution message sent by the crossing variable `other`."""
+        assert other in self.crossing_vars
+        other_idx = self.crossing_vars.index(other)
+        self.directional_scores[other_idx] = belief_state
+    def sync_state(self):
+        """Combine all incoming messages into the cell's letter posterior."""
+        # NOTE: every crossing variable must have sent a message first, otherwise a None is summed
+        self.log_probs = log_softmax(sum(self.directional_scores))
+    def propagate(self):
+        """Forward to each crossing variable the message received from the other one."""
+        assert len(self.crossing_vars) == 2
+        for i, v in enumerate(self.crossing_vars):
+            # index 1-i selects the message that came from the opposite variable
+            v._propagate_to_var(self, self.directional_scores[1-i])
+class BPSolver(Solver):
+    def __init__(self,
+                 crossword,
+                 model_path,
+                 ans_tsv_path,
+                 dense_embd_path,
+                 max_candidates = 5000,
+                 process_id = 0,
+                 model_type = 'bert',
+                 **kwargs):
+        super().__init__(crossword,
+                         model_path,
+                         ans_tsv_path,
+                         dense_embd_path,
+                         max_candidates = max_candidates,
+                         process_id  = process_id,
+                         model_type = model_type,
+                         **kwargs)
+        self.crossword = crossword
+        # our answer set
+        self.answer_set = set()
+        with open(ans_tsv_path, 'r') as rf:
+            for line in rf:
+                w = ''.join([c.upper() for c in (line.split('\t')[-1]).upper() if c in string.ascii_uppercase])
+                self.answer_set.add(w)
+        self.reset()
+    def reset(self):
+        self.bp_cells = []
+        self.bp_cells_by_clue = defaultdict(lambda: [])
+        for position, clue_pair in self.crossword.grid_cells.items():
+            cell = BPCell(position, clue_pair)
+            self.bp_cells.append(cell)
+            for clue in clue_pair:
+                self.bp_cells_by_clue[clue].append(cell)
+        self.bp_vars = []
+        for key, value in self.crossword.variables.items():
+            var = BPVar(key, value, self.candidates[key], self.bp_cells_by_clue[key])
+            self.bp_vars.append(var)
+    def solve(self, num_iters=10, iterative_improvement_steps=5, return_greedy_states = False, return_ii_states = False):
+        # run solving for num_iters iterations
+        print('beginning BP iterations')
+        for _ in trange(num_iters):
+            for var in self.bp_vars:
+                var.propagate()
+            for cell in self.bp_cells:
+                cell.sync_state()
+            for cell in self.bp_cells:
+                cell.propagate()
+            for var in self.bp_vars:
+                var.sync_state()
+        print('done BP iterations')
+        # Get the current based grid based on greedy selection from the marginals
+        if return_greedy_states:
+            grid, all_grids = self.greedy_sequential_word_solution(return_grids = True)
+        else:
+            grid = self.greedy_sequential_word_solution()
+            all_grids = []
+        grid = self.greedy_sequential_word_solution()
+        # print('=====Greedy search grid=====')
+        # print_grid(grid)
+        if iterative_improvement_steps < 1:
+            if return_greedy_states or return_ii_states:
+                return grid, all_grids
+            else:
+                return grid
+    def greedy_sequential_word_solution(self, return_grids = False):
+        all_grids = []
+        # after we've run BP, we run a simple greedy search to get the final.
+        # We repeatedly pick the highest-log-prob candidate across all clues which fits the grid, and fill it.
+        # at the end, if you have any cells left (due to missing gold candidates) just fill it with the argmax on that letter.
+        cache = [(deepcopy(var.words), deepcopy(var.log_probs)) for var in self.bp_vars]
+        grid = [["" for _ in row] for row in self.crossword.letter_grid]
+        unfilled_cells = set([cell.position for cell in self.bp_cells])
+        for var in self.bp_vars:
+            # postprocess log probs to estimate probability that you don't have the right candidate
+            var.log_probs = var.log_probs + math.log(1 - LETTER_SMOOTHING_FACTOR[var.length])
+        best_per_var = [var.log_probs.max() for var in self.bp_vars]
+        while not all([x is None for x in best_per_var]):
+            all_grids.append(deepcopy(grid))
+            best_index = best_per_var.index(max([x for x in best_per_var if x is not None]))
+            best_var = self.bp_vars[best_index]
+            best_word = best_var.words[best_var.log_probs.argmax()]
+            # print('greedy filling in', best_word)
+            for i, cell in enumerate(best_var.ordered_cells):
+                letter = best_word[i]
+                grid[cell.position[0]][cell.position[1]] = letter
+                if cell.position in unfilled_cells:
+                    unfilled_cells.remove(cell.position)
+                for var in cell.crossing_vars:
+                    if var != best_var:
+                        cell_index = var.ordered_cells.index(cell)
+                        keep_indices = [j for j in range(len(var.words)) if var.words[j][cell_index] == letter]
+                        var.words = [var.words[j] for j in keep_indices]
+                        var.log_probs = var.log_probs[keep_indices]
+                        var_index = self.bp_vars.index(var)
+                        if len(keep_indices) > 0:
+                            best_per_var[var_index] = var.log_probs.max()
+                        else:
+                            best_per_var[var_index] = None
+            best_var.words = []
+            best_var.log_probs = best_var.log_probs[[]]
+            best_per_var[best_index] = None
+        for cell in self.bp_cells:
+            if cell.position in unfilled_cells:
+                grid[cell.position[0]][cell.position[1]] = string.ascii_uppercase[cell.log_probs.argmax()]
+        for var, (words, log_probs) in zip(self.bp_vars, cache): # restore state
+            var.words = words
+            var.log_probs = log_probs
+        if return_grids:
+            return grid, all_grids
+        else:
+            return grid

Crossword_inf.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from Utils_inf import clean
+class Crossword:
+    def __init__(self, data):
+        self.initialize_grids(grid=data["grid"])
+        self.initialize_clues(clues=data["clues"])
+        self.initialize_variables()
+    def initialize_grids(self, grid):
+        self.letter_grid = [[grid[j][i][1] if type(grid[j][i]) == list else "" for i in
+                             range(len(grid[0]))] for j in range(len(grid))]
+        self.number_grid = [[grid[j][i][0] if type(grid[j][i]) == list else "" for i in
+                             range(len(grid[0]))] for j in range(len(grid))]
+        self.grid_cells = {}
+    def initialize_clues(self, clues):
+        self.across = clues["across"]
+        self.down = clues["down"]
+    def initialize_variable(self, position, clues, across=True):
+        row, col = position
+        cell_number = self.number_grid[row][col]
+        assert cell_number in clues, print("Missing clue")
+        word_id = cell_number + "A" if across else cell_number + "D"
+        clue = clean(clues[cell_number][0])
+        answer = clean(clues[cell_number][1])
+        for idx in range(len(answer)):
+            cell = (row, col + idx) if across else (row + idx, col)
+            if cell in self.grid_cells:
+                self.grid_cells[cell].append(word_id)
+            else:
+                self.grid_cells[cell] = [word_id]
+            if word_id in self.variables:
+                self.variables[word_id]["cells"].append(cell)
+            else:
+                self.variables[word_id] = {"clue": clue, "gold": answer, "cells": [cell], "crossing": []}
+    def initialize_crossing(self):
+        for word_id in self.variables:
+            cells = self.variables[word_id]["cells"]
+            crossing_ids = []
+            for cell in cells:
+                crossing_ids += list(filter(lambda x: x!= word_id, self.grid_cells[cell]))
+            self.variables[word_id]["crossing"] = crossing_ids
+    def initialize_variables(self):
+        self.variables = {}
+        for row in range(len(self.number_grid)):
+            for col in range(len(self.number_grid[0])):
+                cell_number = self.number_grid[row][col]
+                if self.number_grid[row][col] != "":
+                    if cell_number in self.across:
+                        self.initialize_variable((row, col), self.across, across=True)
+                    if cell_number in self.down:
+                        self.initialize_variable((row, col), self.down, across=False)
+        self.initialize_crossing()

Data_utils_inf.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import json
+import logging
+import math
+import pickle
+import random
+from typing import List, Iterator, Callable
+from torch import Tensor as T
+logger = logging.getLogger()
+def read_serialized_data_from_files(paths: List[str]) -> List:
+    results = []
+    for i, path in enumerate(paths):
+        with open(path, "rb") as reader:
+            logger.info("Reading file %s", path)
+            data = pickle.load(reader)
+            results.extend(data)
+            logger.info("Aggregated data size: {}".format(len(results)))
+    logger.info("Total data size: {}".format(len(results)))
+    return results
+def read_data_from_json_files(paths: List[str], upsample_rates: List = None) -> List:
+    results = []
+    if upsample_rates is None:
+        upsample_rates = [1] * len(paths)
+    assert len(upsample_rates) == len(
+        paths
+    ), "up-sample rates parameter doesn't match input files amount"
+    for i, path in enumerate(paths):
+        with open(path, "r", encoding="utf-8") as f:
+            logger.info("Reading file %s" % path)
+            data = json.load(f)
+            upsample_factor = int(upsample_rates[i])
+            data = data * upsample_factor
+            results.extend(data)
+            logger.info("Aggregated data size: {}".format(len(results)))
+    return results
+class ShardedDataIterator(object):
+    """
+    General purpose data iterator to be used for Pytorch's DDP mode where every node should handle its own part of
+    the data.
+    Instead of cutting data shards by their min size, it sets the amount of iterations by the maximum shard size.
+    It fills the extra sample by just taking first samples in a shard.
+    It can also optionally enforce identical batch size for all iterations (might be useful for DP mode).
+    """
+    def __init__(
+        self,
+        data: list,
+        shard_id: int = 0,
+        num_shards: int = 1,
+        batch_size: int = 1,
+        shuffle=True,
+        shuffle_seed: int = 0,
+        offset: int = 0,
+        strict_batch_size: bool = False,
+    ):
+        self.data = data
+        total_size = len(data)
+        self.shards_num = max(num_shards, 1)
+        self.shard_id = max(shard_id, 0)
+        samples_per_shard = math.ceil(total_size / self.shards_num)
+        self.shard_start_idx = self.shard_id * samples_per_shard
+        self.shard_end_idx = min(self.shard_start_idx + samples_per_shard, total_size)
+        if strict_batch_size:
+            self.max_iterations = math.ceil(samples_per_shard / batch_size)
+        else:
+            self.max_iterations = int(samples_per_shard / batch_size)
+        logger.debug(
+            "samples_per_shard=%d, shard_start_idx=%d, shard_end_idx=%d, max_iterations=%d",
+            samples_per_shard,
+            self.shard_start_idx,
+            self.shard_end_idx,
+            self.max_iterations,
+        )
+        self.iteration = offset  # to track in-shard iteration status
+        self.shuffle = shuffle
+        self.batch_size = batch_size
+        self.shuffle_seed = shuffle_seed
+        self.strict_batch_size = strict_batch_size
+    def total_data_len(self) -> int:
+        return len(self.data)
+    def iterate_data(self, epoch: int = 0) -> Iterator[List]:
+        if self.shuffle:
+            # to be able to resume, same shuffling should be used when starting from a failed/stopped iteration
+            epoch_rnd = random.Random(self.shuffle_seed + epoch)
+            epoch_rnd.shuffle(self.data)
+        # if resuming iteration somewhere in the middle of epoch, one needs to adjust max_iterations
+        max_iterations = self.max_iterations - self.iteration
+        shard_samples = self.data[self.shard_start_idx : self.shard_end_idx]
+        for i in range(
+            self.iteration * self.batch_size, len(shard_samples), self.batch_size
+        ):
+            items = shard_samples[i : i + self.batch_size]
+            if self.strict_batch_size and len(items) < self.batch_size:
+                logger.debug("Extending batch to max size")
+                items.extend(shard_samples[0 : self.batch_size - len(items)])
+            self.iteration += 1
+            yield items
+        # some shards may done iterating while the others are at the last batch. Just return the first batch
+        while self.iteration < max_iterations:
+            logger.debug("Fulfilling non complete shard=".format(self.shard_id))
+            self.iteration += 1
+            batch = shard_samples[0 : self.batch_size]
+            yield batch
+        logger.debug(
+            "Finished iterating, iteration={}, shard={}".format(
+                self.iteration, self.shard_id
+            )
+        )
+        # reset the iteration status
+        self.iteration = 0
+    def get_iteration(self) -> int:
+        return self.iteration
+    def apply(self, visitor_func: Callable):
+        for sample in self.data:
+            visitor_func(sample)
+def normalize_question(question: str) -> str:
+    if question[-1] == "?":
+        question = question[:-1]
+    return question
+class Tensorizer(object):
+    """
+    Component for all text to model input data conversions and related utility methods.
+    Every method here raises NotImplementedError; concrete tensorizers must override them.
+    """
+    # Note: title, if present, is supposed to be put before text (i.e. optional title + document body)
+    def text_to_tensor(
+        self, text: str, title: str = None, add_special_tokens: bool = True
+    ):
+        """Convert `text` (with an optional leading `title`) to model input; abstract."""
+        raise NotImplementedError
+    def get_pair_separator_ids(self) -> T:
+        """Return the separator token ids as a tensor; abstract."""
+        raise NotImplementedError
+    def get_pad_id(self) -> int:
+        """Return the padding token id; abstract."""
+        raise NotImplementedError
+    def get_attn_mask(self, tokens_tensor: T):
+        """Build an attention mask for `tokens_tensor`; abstract."""
+        raise NotImplementedError
+    def is_sub_word_id(self, token_id: int):
+        """Tell whether `token_id` is a sub-word piece; abstract."""
+        raise NotImplementedError
+    def to_string(self, token_ids, skip_special_tokens=True):
+        """Convert token ids back to text; abstract."""
+        raise NotImplementedError
+    def set_pad_to_max(self, pad: bool):
+        """Switch padding to the maximum length on or off; abstract."""
+        raise NotImplementedError

Dockerfile CHANGED Viewed

@@ -20,4 +20,10 @@ WORKDIR $HOME/app
 COPY --chown=user . $HOME/app/
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

 COPY --chown=user . $HOME/app/
+# Download via the Hub's /resolve/ endpoint: /blob/ URLs return the HTML file-viewer page, not the file itself.
+ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/all_answer_list.tsv $HOME/app/Inference_components/
+ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/distilbert_7_epochs_embeddings.pkl $HOME/app/Inference_components/
+ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/distilbert_EPOCHs_7_COMPLETE.bin $HOME/app/Inference_components/
+ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/dpr_biencoder_trained_500k.bin $HOME/app/Inference_components/
+ADD --chown=user https://huggingface.co/prajesh069/clue-answer.multi-answer-scoring.dual-bert-encoder/resolve/main/embeddings_all_answers_json_0.pkl $HOME/app/Inference_components/
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

Faiss_Indexers_inf.py ADDED Viewed

	@@ -0,0 +1,214 @@

+import os
+import time
+import logging
+import pickle
+from typing import List, Tuple, Iterator
+import faiss
+import numpy as np
+logger = logging.getLogger()
+class DenseIndexer(object):
+    def __init__(self, buffer_size: int = 50000):
+        self.buffer_size = buffer_size
+        self.index_id_to_db_id = []
+        self.index = None
+    def index_data(self, vector_files: List[str]):
+        start_time = time.time()
+        buffer = []
+        for i, item in enumerate(iterate_encoded_files(vector_files)):
+            db_id, doc_vector = item
+            buffer.append((db_id, doc_vector))
+            if 0 < self.buffer_size == len(buffer):
+                # indexing in batches is beneficial for many faiss index types
+                self._index_batch(buffer)
+                logger.info(
+                    "data indexed %d, used_time: %f sec.",
+                    len(self.index_id_to_db_id),
+                    time.time() - start_time,
+                )
+                buffer = []
+        self._index_batch(buffer)
+        indexed_cnt = len(self.index_id_to_db_id)
+        logger.info("Total data indexed %d", indexed_cnt)
+        logger.info("Data indexing completed.")
+    def _index_batch(self, data: List[Tuple[object, np.array]]):
+        raise NotImplementedError
+    def search_knn(
+        self, query_vectors: np.array, top_docs: int
+    ) -> List[Tuple[List[object], List[float]]]:
+        raise NotImplementedError
+    def serialize(self, file: str):
+        logger.info("Serializing index to %s", file)
+        if os.path.isdir(file):
+            index_file = os.path.join(file, "index.dpr")
+            meta_file = os.path.join(file, "index_meta.dpr")
+        else:
+            index_file = file + ".index.dpr"
+            meta_file = file + ".index_meta.dpr"
+        faiss.write_index(self.index, index_file)
+        with open(meta_file, mode="wb") as f:
+            pickle.dump(self.index_id_to_db_id, f)
+    def deserialize_from(self, file: str):
+        logger.info("Loading index from %s", file)
+        if os.path.isdir(file):
+            index_file = os.path.join(file, "index.dpr")
+            meta_file = os.path.join(file, "index_meta.dpr")
+        else:
+            index_file = file + ".index.dpr"
+            meta_file = file + ".index_meta.dpr"
+        self.index = faiss.read_index(index_file)
+        logger.info(
+            "Loaded index of type %s and size %d", type(self.index), self.index.ntotal
+        )
+        with open(meta_file, "rb") as reader:
+            self.index_id_to_db_id = pickle.load(reader)
+        assert (
+            len(self.index_id_to_db_id) == self.index.ntotal
+        ), "Deserialized index_id_to_db_id should match faiss index size"
+    def _update_id_mapping(self, db_ids: List):
+        self.index_id_to_db_id.extend(db_ids)
+class DenseFlatIndexer(DenseIndexer):
+    def __init__(self, vector_sz: int, buffer_size: int = 50000):
+        super(DenseFlatIndexer, self).__init__(buffer_size=buffer_size)
+        #res = faiss.StandardGpuResources()
+        #cpu_index = faiss.IndexFlatIP(vector_sz)
+        #self.index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
+        self.index = faiss.IndexFlatIP(vector_sz)
+        self.all_vectors = None
+    def _index_batch(self, data: List[Tuple[object, np.array]]):
+        db_ids = [t[0] for t in data]
+        vectors = [np.reshape(t[1], (1, -1)) for t in data]
+        vectors = np.concatenate(vectors, axis=0)
+        self._update_id_mapping(db_ids)
+        self.index.add(vectors)
+        #if self.all_vectors is None:
+        #    self.all_vectors = vectors
+        #else:
+        #    self.all_vectors = np.concatenate((self.all_vectors, vectors), axis=0)
+    def search_knn(
+        self, query_vectors: np.array, top_docs: int
+    ) -> List[Tuple[List[object], List[float]]]:
+        scores, indexes = self.index.search(query_vectors, top_docs)
+        # convert to external ids
+        db_ids = [
+            [self.index_id_to_db_id[i] for i in query_top_idxs]
+            for query_top_idxs in indexes
+        ]
+        result = [(db_ids[i], scores[i]) for i in range(len(db_ids))]
+        return result
+class DenseHNSWFlatIndexer(DenseIndexer):
+    """
+    Efficient index for retrieval. Note: default settings are for high accuracy but also high RAM usage
+    """
+    def __init__(
+        self,
+        vector_sz: int,
+        buffer_size: int = 50000,
+        store_n: int = 512,
+        ef_search: int = 128,
+        ef_construction: int = 200,
+    ):
+        super(DenseHNSWFlatIndexer, self).__init__(buffer_size=buffer_size)
+        # IndexHNSWFlat supports L2 similarity only
+        # so we have to apply DOT -> L2 similarity space conversion with the help of an extra dimension
+        index = faiss.IndexHNSWFlat(vector_sz + 1, store_n)
+        index.hnsw.efSearch = ef_search
+        index.hnsw.efConstruction = ef_construction
+        self.index = index
+        # max squared norm over all indexed vectors; must be set before _index_batch runs
+        self.phi = None
+    def index_data(self, vector_files: List[str]):
+        """Compute phi over the whole data first, then index it with the base-class loop."""
+        self._set_phi(vector_files)
+        super(DenseHNSWFlatIndexer, self).index_data(vector_files)
+    def _set_phi(self, vector_files: List[str]):
+        """
+        Calculates the max norm from the whole data and assign it to self.phi: necessary to transform IP -> L2 space
+        :param vector_files: file names to get passages vectors from
+        :return:
+        """
+        phi = 0
+        for i, item in enumerate(iterate_encoded_files(vector_files)):
+            id, doc_vector = item
+            # squared L2 norm of the vector (no square root is taken)
+            norms = (doc_vector ** 2).sum()
+            phi = max(phi, norms)
+        logger.info("HNSWF DotProduct -> L2 space phi={}".format(phi))
+        self.phi = phi
+    def _index_batch(self, data: List[Tuple[object, np.array]]):
+        """Append the auxiliary sqrt(phi - |v|^2) coordinate to each vector and add the batch."""
+        # max norm is required before putting all vectors in the index to convert inner product similarity to L2
+        if self.phi is None:
+            raise RuntimeError(
+                "Max norm needs to be calculated from all data at once,"
+                "results will be unpredictable otherwise."
+                "Run `_set_phi()` before calling this method."
+            )
+        db_ids = [t[0] for t in data]
+        vectors = [np.reshape(t[1], (1, -1)) for t in data]
+        norms = [(doc_vector ** 2).sum() for doc_vector in vectors]
+        aux_dims = [np.sqrt(self.phi - norm) for norm in norms]
+        hnsw_vectors = [
+            np.hstack((doc_vector, aux_dims[i].reshape(-1, 1)))
+            for i, doc_vector in enumerate(vectors)
+        ]
+        hnsw_vectors = np.concatenate(hnsw_vectors, axis=0)
+        self._update_id_mapping(db_ids)
+        self.index.add(hnsw_vectors)
+    def search_knn(
+        self, query_vectors: np.array, top_docs: int
+    ) -> List[Tuple[List[object], List[float]]]:
+        """Search with queries padded by a zero auxiliary coordinate; returns (db_ids, scores) per query."""
+        aux_dim = np.zeros(len(query_vectors), dtype="float32")
+        query_nhsw_vectors = np.hstack((query_vectors, aux_dim.reshape(-1, 1)))
+        logger.info("query_hnsw_vectors %s", query_nhsw_vectors.shape)
+        scores, indexes = self.index.search(query_nhsw_vectors, top_docs)
+        # convert to external ids
+        db_ids = [
+            [self.index_id_to_db_id[i] for i in query_top_idxs]
+            for query_top_idxs in indexes
+        ]
+        result = [(db_ids[i], scores[i]) for i in range(len(db_ids))]
+        return result
+    def deserialize_from(self, file: str):
+        """Load the index as in the base class and clear phi."""
+        super(DenseHNSWFlatIndexer, self).deserialize_from(file)
+        # phi is not stored with the index, so _index_batch raises until it is recomputed
+        self.phi = None
+def iterate_encoded_files(vector_files: str) -> Iterator[Tuple[object, np.array]]:
+    # for i, file in enumerate(vector_files):
+    logger.info("Reading file %s", vector_files)
+    with open(vector_files, "rb") as reader:
+        doc_vectors = pickle.load(reader)
+        for doc in doc_vectors:
+            db_id, doc_vector = doc
+            yield db_id, doc_vector

Inference_components/test.py DELETED Viewed

	@@ -1 +0,0 @@
1	- print('hello')

Model_utils_inf.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import collections
+import glob
+import logging
+import os
+from typing import List
+import torch
+from torch import nn
+from torch.optim.lr_scheduler import LambdaLR
+from torch.serialization import default_restore_location
+logger = logging.getLogger()
+CheckpointState = collections.namedtuple(
+    "CheckpointState",
+    [
+        "model_dict",
+        "optimizer_dict",
+        "scheduler_dict",
+        "offset",
+        "epoch",
+        "encoder_params",
+    ],
+)
+def setup_for_distributed_mode(
+    model: nn.Module,
+    optimizer: torch.optim.Optimizer,
+    device: object,
+    n_gpu: int = 1,
+    local_rank: int = -1,
+    fp16: bool = False,
+    fp16_opt_level: str = "O1",
+) -> (nn.Module, torch.optim.Optimizer):
+    """Move `model` to `device` and wrap it for mixed-precision / multi-GPU execution.
+    fp16: initialise the model and optimizer through apex amp (ImportError if apex is missing).
+    n_gpu > 1: wrap the model in torch.nn.DataParallel.
+    local_rank != -1: wrap the model in DistributedDataParallel on that device id.
+    Returns the (possibly wrapped) model and the (possibly amp-initialised) optimizer.
+    """
+    model.to(device)
+    if fp16:
+        try:
+            # apex is optional and only imported when fp16 is requested
+            import apex
+            from apex import amp
+            apex.amp.register_half_function(torch, "einsum")
+        except ImportError:
+            raise ImportError(
+                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
+            )
+        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)
+    if n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+    if local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model,
+            device_ids=[local_rank],
+            output_device=local_rank,
+            find_unused_parameters=True,
+        )
+    return model, optimizer
+def move_to_cuda(sample):
+    if len(sample) == 0:
+        return {}
+    def _move_to_cuda(maybe_tensor):
+        if torch.is_tensor(maybe_tensor):
+            return maybe_tensor.cuda()
+        elif isinstance(maybe_tensor, dict):
+            return {key: _move_to_cuda(value) for key, value in maybe_tensor.items()}
+        elif isinstance(maybe_tensor, list):
+            return [_move_to_cuda(x) for x in maybe_tensor]
+        elif isinstance(maybe_tensor, tuple):
+            return [_move_to_cuda(x) for x in maybe_tensor]
+        else:
+            return maybe_tensor
+    return _move_to_cuda(sample)
+def move_to_device(sample, device):
+    if len(sample) == 0:
+        return {}
+    def _move_to_device(maybe_tensor, device):
+        if torch.is_tensor(maybe_tensor):
+            return maybe_tensor.to(device)
+        elif isinstance(maybe_tensor, dict):
+            return {
+                key: _move_to_device(value, device)
+                for key, value in maybe_tensor.items()
+            }
+        elif isinstance(maybe_tensor, list):
+            return [_move_to_device(x, device) for x in maybe_tensor]
+        elif isinstance(maybe_tensor, tuple):
+            return [_move_to_device(x, device) for x in maybe_tensor]
+        else:
+            return maybe_tensor
+    return _move_to_device(sample, device)
+def get_schedule_linear(optimizer, warmup_steps, training_steps, last_epoch=-1):
+    """Create a schedule with a learning rate that decreases linearly after
+    linearly increasing during a warmup period.
+    """
+    def lr_lambda(current_step):
+        if current_step < warmup_steps:
+            return float(current_step) / float(max(1, warmup_steps))
+        return max(
+            0.0,
+            float(training_steps - current_step)
+            / float(max(1, training_steps - warmup_steps)),
+        )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+def init_weights(modules: List):
+    for module in modules:
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+def get_model_obj(model: nn.Module):
+    return model.module if hasattr(model, "module") else model
+def get_model_file(args, file_prefix) -> str:
+    if args.model_file and os.path.exists(args.model_file):
+        return args.model_file
+    out_cp_files = (
+        glob.glob(os.path.join(args.output_dir, file_prefix + "*"))
+        if args.output_dir
+        else []
+    )
+    logger.info("Checkpoint files %s", out_cp_files)
+    model_file = None
+    if len(out_cp_files) > 0:
+        model_file = max(out_cp_files, key=os.path.getctime)
+    return model_file
+def load_states_from_checkpoint(model_file: str) -> CheckpointState:
+    logger.info("Reading saved model from s", model_file)
+    if isinstance(model_file, tuple):
+        model_file = model_file[0]
+    state_dict = torch.load(
+        model_file, map_location=lambda s, l: default_restore_location(s, "cpu")
+    )
+    logger.info("model_state_dict keys %s", state_dict.keys())
+    return CheckpointState(**state_dict)

Models_inf.py ADDED Viewed

	@@ -0,0 +1,391 @@

+# This file contains the inference code for loading and running the closed-book and open-book QA models
+import os
+import csv
+import glob
+import gzip
+import string
+import sys
+from typing import List, Tuple, Dict
+import re
+import math
+import collections
+import numpy as np
+import unicodedata
+import torch
+from torch import Tensor as T
+from torch import nn
+from models import init_biencoder_components
+from Options_inf import setup_args_gpu, print_args, set_encoder_params_from_state
+from Faiss_Indexers_inf import DenseIndexer, DenseFlatIndexer
+from Data_utils_inf import Tensorizer
+from Model_utils_inf import load_states_from_checkpoint, get_model_obj
+SEGMENTER_CACHE = {}
+RERANKER_CACHE = {}
+def setup_closedbook(model_path, ans_tsv_path, dense_embd_path, process_id, model_type):
+    dpr = DPRForCrossword(
+        model_path,
+        ans_tsv_path,
+        dense_embd_path,
+        retrievalmodel = False,
+        process_id=process_id,
+        model_type = model_type
+    )
+    return dpr
+def preprocess_clue_fn(clue):
+    """Normalise the text of one crossword clue and return the rewritten string.
+    The input is coerced with ``str()`` and then passed through a fixed
+    sequence of rewrites: accent stripping, punctuation/symbol normalisation,
+    abbreviation expansion, and conversion of a few clue conventions
+    ("X-Y link", "X follower", "X preceder", "Partner of X", trailing "?",
+    bracketed text) into blank-style or tagged forms. The rewrites are applied
+    in the order written, so later patterns see the output of earlier ones.
+    """
+    clue = str(clue)
+    # https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string
+    # Decompose to NFD and drop combining marks (category 'Mn'), i.e. strip accents.
+    clue = ''.join(c for c in unicodedata.normalize('NFD', clue) if unicodedata.category(c) != 'Mn')
+    # Normalise quote-like characters, ellipses, dashes and a few symbols to plain text.
+    clue = re.sub("\x17|\x18|\x93|\x94|“|”|''|\"\"", "\"", clue)
+    clue = re.sub("\x85|…", "...", clue)
+    clue = re.sub("\x91|\x92|‘|’", "'", clue)
+    clue = re.sub("‚", ",", clue)
+    clue = re.sub("—|–", "-", clue)
+    clue = re.sub("¢", " cents", clue)
+    # Drops inverted punctuation, curly braces and a leading ';'.
+    clue = re.sub("¿|¡|^;|\{|\}", "", clue)
+    clue = re.sub("÷", "division", clue)
+    clue = re.sub("°", " degrees", clue)
+    # NOTE(review): the pattern matches the '£' sign but the inserted word is
+    # "Euros" -- confirm which currency wording is intended.
+    euro = re.search("^£[0-9]+(,*[0-9]*){0,}| £[0-9]+(,*[0-9]*){0,}", clue)
+    if euro:
+        # Insert " Euros" right after the matched amount, then tidy the
+        # million/billion/K orderings and finally remove the '£' sign itself.
+        num = clue[:euro.end()]
+        rest_clue = clue[euro.end():]
+        clue = num + " Euros" + rest_clue
+        clue = re.sub(", Euros", " Euros", clue)
+        clue = re.sub("Euros [Mm]illion", "million Euros", clue)
+        clue = re.sub("Euros [Bb]illion", "billion Euros", clue)
+        clue = re.sub("Euros[Kk]", "K Euros", clue)
+        clue = re.sub(" K Euros", "K Euros", clue)
+        clue = re.sub("£", "", clue)
+    # Strip a trailing parenthesised number or number pair such as " (5)" or " (3, 4)".
+    clue = re.sub(" *\(\d{1,},*\)$| *\(\d{1,},* \d{1,}\)$", "", clue)
+    # Decode the three HTML entities that are handled here.
+    clue = re.sub("&amp;", "&", clue)
+    clue = re.sub("&lt;", "<", clue)
+    clue = re.sub("&gt;", ">", clue)
+    # Expand abbreviations; the ": Abbr"-style suffixes are handled before the bare forms.
+    clue = re.sub("e\.g\.|for ex\.", "for example", clue)
+    clue = re.sub(": [Aa]bbreviat\.|: [Aa]bbrev\.|: [Aa]bbrv\.|: [Aa]bbrv|: [Aa]bbr\.|: [Aa]bbr", " abbreviation", clue)
+    clue = re.sub("abbr\.|abbrv\.", "abbreviation", clue)
+    clue = re.sub("Abbr\.|Abbrv\.", "Abbreviation", clue)
+    clue = re.sub("\(anag\.\)|\(anag\)", "(anagram)", clue)
+    clue = re.sub("org\.", "organization", clue)
+    clue = re.sub("Org\.", "Organization", clue)
+    clue = re.sub("Grp\.|Gp\.", "Group", clue)
+    clue = re.sub("grp\.|gp\.", "group", clue)
+    clue = re.sub(": Sp\.", " (Spanish)", clue)
+    clue = re.sub("\(Sp\.\)|Sp\.", "(Spanish)", clue)
+    clue = re.sub("Ave\.", "Avenue", clue)
+    clue = re.sub("Sch\.", "School", clue)
+    clue = re.sub("sch\.", "school", clue)
+    clue = re.sub("Agcy\.", "Agency", clue)
+    clue = re.sub("agcy\.", "agency", clue)
+    clue = re.sub("Co\.", "Company", clue)
+    clue = re.sub("co\.", "company", clue)
+    clue = re.sub("No\.", "Number", clue)
+    clue = re.sub("no\.", "number", clue)
+    clue = re.sub(": [Vv]ar\.", " variable", clue)
+    clue = re.sub("Subj\.", "Subject", clue)
+    clue = re.sub("subj\.", "subject", clue)
+    clue = re.sub("Subjs\.", "Subjects", clue)
+    clue = re.sub("subjs\.", "subjects", clue)
+    # A clue containing '|' directly followed by an uppercase letter gets its pipes padded with spaces.
+    theme_clue = re.search("^.+\|[A-Z]{1,}", clue)
+    if theme_clue:
+        clue = re.sub("\|", " | ", clue)
+    # "Partner of X" -> " X and ___" (the leading space left by the removal is kept).
+    if "Partner of" in clue:
+        clue = re.sub("Partner of", "", clue)
+        clue = clue + " and ___"
+    # "X-Y link" -> "X ___ Y" (only the first two '-'-separated parts are used).
+    link = re.search("^.+-.+ [Ll]ink$", clue)
+    if link:
+        no_link = re.search("^.+-.+ ", clue)
+        x_y = clue[no_link.start():no_link.end() - 1]
+        x_y_lst = x_y.split("-")
+        clue = x_y_lst[0] + " ___ " + x_y_lst[1]
+    # "X follower" -> "X ___".
+    follower = re.search("^.+ [Ff]ollower$", clue)
+    if follower:
+        no_follower = re.search("^.+ ", clue)
+        x = clue[:no_follower.end() - 1]
+        clue = x + " ___"
+    # "X preceder" -> "___ X".
+    preceder = re.search("^.+ [Pp]receder$", clue)
+    if preceder:
+        no_preceder = re.search("^.+ ", clue)
+        x = clue[:no_preceder.end() - 1]
+        clue = "___ " + x
+    # A "--" that is followed by a non-letter or ends the clue turns every "--" into "__".
+    if re.search("--[^A-Za-z]|--$", clue):
+        clue = re.sub("--", "__", clue)
+    # "_-" becomes "__" only when no "_-" is followed by a letter or ends the clue.
+    if not re.search("_-[A-Za-z]|_-$", clue):
+        clue = re.sub("_-", "__", clue)
+    # Any run of two or more underscores is normalised to exactly three.
+    clue = re.sub("_{2,}", "___", clue)
+    clue = re.sub("\?$", " (wordplay)", clue)
+    # Bracketed text without digits: all square brackets are removed and " (nonverbal)" is appended.
+    nonverbal = re.search("\[[^0-9]+,* *[^0-9]*\]", clue)
+    if nonverbal:
+        clue = re.sub("\[|\]", "", clue)
+        clue = clue + " (nonverbal)"
+    # Collapse a clue wrapped in triple quotes (with or without inner padding spaces) to single quoting.
+    if clue[:4] == "\"\"\" " and clue[-4:] == " \"\"\"":
+        clue = "\"" + clue[4:-4] + "\""
+    if clue[:4] == "''' " and clue[-4:] == " '''":
+        clue = "'" + clue[4:-4] + "'"
+    if clue[:3] == "\"\"\"" and clue[-3:] == "\"\"\"":
+        clue = "\"" + clue[3:-3] + "\""
+    if clue[:3] == "'''" and clue[-3:] == "'''":
+        clue = "'" + clue[3:-3] + "'"
+    return clue
+def answer_clues(dpr, clues, max_answers, output_strings=False):
+    clues = [preprocess_clue_fn(c.rstrip()) for c in clues]
+    outputs = dpr.answer_clues_closedbook(clues, max_answers, output_strings=output_strings)
+    return outputs
+class DenseRetriever(object):
+    """
+    Does passage retrieving over the provided index and question encoder
+    """
+    def __init__(
+        self,
+        question_encoder: nn.Module,
+        batch_size: int,
+        tensorizer: Tensorizer,
+        index: DenseIndexer,
+        device=None,
+        model_type = 'bert'
+    ):
+        self.question_encoder = question_encoder
+        self.batch_size = batch_size
+        self.tensorizer = tensorizer
+        self.index = index
+        self.device = device
+        self.model_type = model_type
+    def generate_question_vectors(self, questions: List[str]) -> T:
+        n = len(questions)
+        bsz = self.batch_size
+        query_vectors = []
+        self.question_encoder.eval()
+        with torch.no_grad():
+            for j, batch_start in enumerate(range(0, n, bsz)):
+                batch_token_tensors = [
+                    self.tensorizer.text_to_tensor(q)
+                    for q in questions[batch_start : batch_start + bsz]
+                ]
+                q_ids_batch = torch.stack(batch_token_tensors, dim=0).to(self.device)
+                q_seg_batch = torch.zeros_like(q_ids_batch).to(self.device)
+                # q_attn_mask = self.tensorizer.get_attn_mask(q_ids_batch)
+                q_attn_mask = (q_ids_batch != 0)
+                if self.model_type == 'bert':
+                    _, out, _ = self.question_encoder(q_ids_batch, q_seg_batch, q_attn_mask)
+                elif self.model_type == 'distilbert':
+                    _, out, _ = self.question_encoder(q_ids_batch, q_attn_mask)
+                query_vectors.extend(out.cpu().split(1, dim=0))
+        query_tensor = torch.cat(query_vectors, dim=0)
+        assert query_tensor.size(0) == len(questions)
+        return query_tensor
+    def get_top_docs(self, query_vectors: np.array, top_docs: int = 100) -> List[Tuple[List[object], List[float]]]:
+        """
+        Does the retrieval of the best matching passages given the query vectors batch
+        :param query_vectors:
+        :param top_docs:
+        :return:
+        """
+        results = self.index.search_knn(query_vectors, top_docs)
+        return results
+class FakeRetrieverArgs:
+    """Used to surpress the existing argparse inside DPR so we can have our own argparse"""
+    def __init__(self):
+        self.do_lower_case = False
+        self.pretrained_model_cfg = None
+        self.encoder_model_type = None
+        self.model_file = None
+        self.projection_dim = 0
+        self.sequence_length = 512
+        self.do_fill_lower_case = False
+        self.desegment_valid_fill = False
+        self.no_cuda = True
+        self.local_rank = -1
+        self.fp16 = False
+        self.fp16_opt_level = "O1"
+class DPRForCrossword(object):
+    """Closedbook model for Crossword clue answering"""
+    def __init__(
+        self,
+        model_file,
+        ctx_file,
+        encoded_ctx_file,
+        batch_size = 16,
+        retrievalmodel=False,
+        process_id = 0,
+        model_type = 'bert'
+    ):
+        self.retrievalmodel = retrievalmodel  # am I a wikipedia retrieval model or a closed-book model
+        args = FakeRetrieverArgs()
+        args.model_file = model_file
+        args.ctx_file = ctx_file
+        args.encoded_ctx_file = encoded_ctx_file
+        args.batch_size = batch_size
+        # self.device = torch.device("cuda:"+str(process_id%torch.cuda.device_count()))
+        self.device = 'cpu'
+        self.model_type = model_type
+        setup_args_gpu(args)
+        saved_state = load_states_from_checkpoint(args.model_file)
+        set_encoder_params_from_state(saved_state.encoder_params, args)
+        tensorizer, encoder, _ = init_biencoder_components(args.encoder_model_type, args, inference_only = True)
+        question_encoder = encoder.question_model
+        question_encoder = question_encoder.to(self.device)
+        question_encoder.eval()
+        # load weights from the model file
+        model_to_load = get_model_obj(question_encoder)
+        prefix_len = len("question_model.")
+        question_encoder_state = {
+            key[prefix_len:]: value
+            for (key, value) in saved_state.model_dict.items()
+            if key.startswith("question_model.")
+        }
+        model_to_load.load_state_dict(question_encoder_state, strict = False)
+        vector_size = model_to_load.get_out_size()
+        index = DenseFlatIndexer(vector_size, 50000)
+        self.retriever = DenseRetriever(
+            question_encoder,
+            args.batch_size,
+            tensorizer,
+            index,
+            self.device,
+            self.model_type
+        )
+        # index all passages
+        embd_file_path = args.encoded_ctx_file
+        if isinstance(embd_file_path, str):
+            file_path = embd_file_path
+        else:
+            file_path = embd_file_path[0]
+        self.retriever.index.index_data(file_path)
+        self.all_passages = self.load_passages(args.ctx_file)
+        self.fill2id = {}
+        for key in self.all_passages.keys():
+            self.fill2id[
+                "".join(
+                    [
+                        letter
+                        for letter in self.all_passages[key][1].upper()
+                        if letter in string.ascii_uppercase
+                    ]
+                )
+            ] = key
+        # might as well uppercase and remove non-alphas from the fills before we start to save time later
+        if not retrievalmodel:
+            temp = {}
+            for my_id in self.all_passages.keys():
+                temp[my_id] = "".join([c.upper() for c in self.all_passages[my_id][1] if c.upper() in string.ascii_uppercase])
+            self.len_all_passages = len(list(self.all_passages.values()))
+            self.all_passages = temp
+    @staticmethod
+    def load_passages(ctx_file: str) -> Dict[object, Tuple[str, str]]:
+        docs = {}
+        if isinstance(ctx_file, tuple):
+            ctx_file = ctx_file[0]
+        if ctx_file.endswith(".gz"):
+            with gzip.open(ctx_file, "rt") as tsvfile:
+                reader = csv.reader(
+                    tsvfile,
+                    delimiter="\t",
+                )
+                # file format: doc_id, doc_text, title
+                for row in reader:
+                    if row[0] != "id":
+                        docs[row[0]] = (row[1], row[2])
+        else:
+            with open(ctx_file) as tsvfile:
+                reader = csv.reader(
+                    tsvfile,
+                    delimiter="\t",
+                )
+                # file format: doc_id, doc_text, title
+                for row in reader:
+                    if row[0] != "id":
+                        docs[row[0]] = (row[1], row[2])
+        return docs
+    def answer_clues_closedbook(self, questions, max_answers, output_strings=False):
+        # assumes clues are preprocessed
+        assert self.retrievalmodel == False
+        questions_tensor = self.retriever.generate_question_vectors(questions)
+        if max_answers > self.len_all_passages:
+            max_answers = self.len_all_passages
+        # get top k results
+        top_ids_and_scores = self.retriever.get_top_docs(questions_tensor.numpy(), max_answers)
+        if not output_strings:
+            return top_ids_and_scores
+        else:
+            # get the string forms
+            all_answers = []
+            all_scores = []
+            for ans in top_ids_and_scores:
+                all_answers.append(list(map(self.all_passages.get, ans[0])))
+                all_scores.append(ans[1])
+            return all_answers, all_scores
+    def get_wikipedia_docs(self, questions, max_docs):
+        # assumes clues are preprocessed
+        assert self.retrievalmodel
+        questions_tensor = self.retriever.generate_question_vectors(questions)
+        # get top k results. add 2 in case of duplicates (see below
+        top_ids_and_scores = self.retriever.get_top_docs(questions_tensor.numpy(), max_docs + 2)
+        all_paragraphs = []
+        for ans in top_ids_and_scores:
+            paragraphs = []
+            for i in range(len(ans[0])):
+                id_ = ans[0][i]
+                id_ = id_.replace("wiki:", "")
+                mydocument = self.all_passages[id_]
+                if mydocument in paragraphs:
+                    print("woah, duplicate!!!")
+                    continue
+                paragraphs.append(mydocument)
+            all_paragraphs.append(paragraphs[0:max_docs])
+        return all_paragraphs

Normal_utils_inf.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import puz
+import re
+import unicodedata
+import sys
+def puz_to_json(fname):
+    """ Converts a puzzle in .puz format to .json format
+    """
+    p = puz.read(fname)
+    numbering = p.clue_numbering()
+    grid = [[None for _ in range(p.width)] for _ in range(p.height)]
+    for row_idx in range(p.height):
+        cell = row_idx * p.width
+        row_solution = p.solution[cell:cell + p.width]
+        for col_index, item in enumerate(row_solution):
+            if p.solution[cell + col_index:cell + col_index + 1] == '.':
+                grid[row_idx][col_index] = 'BLACK'
+            else:
+                grid[row_idx][col_index] = ["", row_solution[col_index: col_index + 1]]
+    across_clues = {}
+    for clue in numbering.across:
+        answer = ''.join(p.solution[clue['cell'] + i] for i in range(clue['len']))
+        across_clues[str(clue['num'])] = [clue['clue'] + ' ', ' ' + answer]
+        grid[int(clue['cell'] / p.width)][clue['cell'] % p.width][0] = str(clue['num'])
+    down_clues = {}
+    for clue in numbering.down:
+        answer = ''.join(p.solution[clue['cell'] + i * numbering.width] for i in range(clue['len']))
+        down_clues[str(clue['num'])] = [clue['clue'] + ' ', ' ' + answer]
+        grid[int(clue['cell'] / p.width)][clue['cell'] % p.width][0] = str(clue['num'])
+    mydict = {'metadata': {'date': None, 'rows': p.height, 'cols': p.width}, 'clues': {'across': across_clues, 'down': down_clues}, 'grid': grid}
+    return mydict
+def puz_to_pairs(filepath):
+    """ Takes in a filepath pointing to a .puz file and returns a list of (clue, fill) pairs in a list
+    """
+    p = puz.read(filepath)
+    numbering = p.clue_numbering()
+    grid = [[None for _ in range(p.width)] for _ in range(p.height)]
+    for row_idx in range(p.height):
+        cell = row_idx * p.width
+        row_solution = p.solution[cell:cell + p.width]
+        for col_index, item in enumerate(row_solution):
+            if p.solution[cell + col_index:cell + col_index + 1] == '.':
+                grid[row_idx][col_index] = 'BLACK'
+            else:
+                grid[row_idx][col_index] = ["", row_solution[col_index: col_index + 1]]
+    pairs = {}
+    for clue in numbering.across:
+        answer = ''.join(p.solution[clue['cell'] + i] for i in range(clue['len']))
+        pairs[clue['clue']] = answer
+    for clue in numbering.down:
+        answer = ''.join(p.solution[clue['cell'] + i * numbering.width] for i in range(clue['len']))
+        pairs[clue['clue']] = answer
+    return [(k, v) for k, v in pairs.items()]

Options_inf.py ADDED Viewed

	@@ -0,0 +1,275 @@

+import argparse
+import logging
+import os
+import random
+import socket
+import numpy as np
+import torch
+logger = logging.getLogger()
+def add_tokenizer_params(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--do_lower_case",
+        action="store_true",
+        help="Whether to lower case the input text. True for uncased models, False for cased models.",
+    )
+def add_encoder_params(parser: argparse.ArgumentParser):
+    """
+    Common parameters to initialize an encoder-based model
+    """
+    parser.add_argument(
+        "--pretrained_model_cfg",
+        default=None,
+        type=str,
+        help="config name for model initialization",
+    )
+    parser.add_argument(
+        "--encoder_model_type",
+        default=None,
+        type=str,
+        help="model type. One of [hf_bert, pytext_bert, fairseq_roberta]",
+    )
+    parser.add_argument(
+        "--pretrained_file",
+        type=str,
+        help="Some encoders need to be initialized from a file",
+    )
+    parser.add_argument(
+        "--model_file",
+        default=None,
+        type=str,
+        help="Saved bi-encoder checkpoint file to initialize the model",
+    )
+    parser.add_argument(
+        "--projection_dim",
+        default=0,
+        type=int,
+        help="Extra linear layer on top of standard bert/roberta encoder",
+    )
+    parser.add_argument(
+        "--sequence_length",
+        type=int,
+        default=512,
+        help="Max length of the encoder input sequence",
+    )
+    parser.add_argument(
+        "--do_fill_lower_case",
+        action="store_true",
+        help="Make all fills lower case. e.g. for cased models such as roberta"
+    )
+    parser.add_argument(
+        "--desegment_valid_fill",
+        action="store_true",
+        help="Desegment model fill output for validation"
+    )
+def add_training_params(parser: argparse.ArgumentParser):
+    """
+    Common parameters for training
+    """
+    add_cuda_params(parser)
+    parser.add_argument(
+        "--train_file", default=None, type=str, help="File pattern for the train set"
+    )
+    parser.add_argument("--dev_file", default=None, type=str, help="")
+    parser.add_argument(
+        "--batch_size", default=2, type=int, help="Amount of questions per batch"
+    )
+    parser.add_argument(
+        "--dev_batch_size",
+        type=int,
+        default=4,
+        help="amount of questions per batch for dev set validation",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=0,
+        help="random seed for initialization and dataset shuffling",
+    )
+    parser.add_argument(
+        "--adam_eps", default=1e-8, type=float, help="Epsilon for Adam optimizer."
+    )
+    parser.add_argument(
+        "--adam_betas",
+        default="(0.9, 0.999)",
+        type=str,
+        help="Betas for Adam optimizer.",
+    )
+    parser.add_argument(
+        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
+    )
+    parser.add_argument("--log_batch_step", default=100, type=int, help="")
+    parser.add_argument("--train_rolling_loss_step", default=100, type=int, help="")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="")
+    parser.add_argument(
+        "--learning_rate",
+        default=1e-5,
+        type=float,
+        help="The initial learning rate for Adam.",
+    )
+    parser.add_argument(
+        "--warmup_steps", default=100, type=int, help="Linear warmup over warmup_steps."
+    )
+    parser.add_argument("--dropout", default=0.1, type=float, help="")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--num_train_epochs",
+        default=3.0,
+        type=float,
+        help="Total number of training epochs to perform.",
+    )
+def add_cuda_params(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
+    )
+    parser.add_argument(
+        "--local_rank",
+        type=int,
+        default=-1,
+        help="local_rank for distributed training on gpus",
+    )
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit float precision instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+def add_reader_preprocessing_params(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--gold_passages_src",
+        type=str,
+        help="File with the original dataset passages (json format). Required for train set",
+    )
+    parser.add_argument(
+        "--gold_passages_src_dev",
+        type=str,
+        help="File with the original dataset passages (json format). Required for dev set",
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=16,
+        help="number of parallel processes to binarize reader data",
+    )
+def get_encoder_checkpoint_params_names():
+    return [
+        "do_lower_case",
+        "pretrained_model_cfg",
+        "encoder_model_type",
+        "pretrained_file",
+        "projection_dim",
+        "sequence_length",
+    ]
+def get_encoder_params_state(args):
+    """
+     Selects the param values to be saved in a checkpoint, so that a trained model faile can be used for downstream
+     tasks without the need to specify these parameter again
+    :return: Dict of params to memorize in a checkpoint
+    """
+    params_to_save = get_encoder_checkpoint_params_names()
+    r = {}
+    for param in params_to_save:
+        r[param] = getattr(args, param)
+    return r
+def set_encoder_params_from_state(state, args):
+    if not state:
+        return
+    params_to_save = get_encoder_checkpoint_params_names()
+    override_params = [
+        (param, state[param])
+        for param in params_to_save
+        if param in state and state[param]
+    ]
+    for param, value in override_params:
+        if hasattr(args, param):
+            logger.warning(
+                "Overriding args parameter value from checkpoint state. Param = %s, value = %s",
+                param,
+                value,
+            )
+        setattr(args, param, value)
+    return args
+def set_seed(args):
+    seed = args.seed
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(seed)
+def setup_args_gpu(args):
+    """
+    Setup arguments CUDA, GPU & distributed training
+    """
+    if args.local_rank == -1 or args.no_cuda:  # single-node multi-gpu (or cpu) mode
+        device = torch.device(
+            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
+        )
+        args.n_gpu = torch.cuda.device_count()
+    else:  # distributed mode
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend="nccl")
+        args.n_gpu = 1
+    args.device = device
+    ws = os.environ.get("WORLD_SIZE")
+    args.distributed_world_size = int(ws) if ws else 1
+    logger.info(
+        "Initialized host %s as d.rank %d on device=%s, n_gpu=%d, world size=%d",
+        socket.gethostname(),
+        args.local_rank,
+        device,
+        args.n_gpu,
+        args.distributed_world_size,
+    )
+    logger.info("16-bits training: %s ", args.fp16)
+def print_args(args):
+    logger.info(" **************** CONFIGURATION **************** ")
+    for key, val in sorted(vars(args).items()):
+        keystr = "{}".format(key) + (" " * (30 - len(key)))
+        logger.info("%s -->   %s", keystr, val)
+    logger.info(" **************** CONFIGURATION **************** ")

Solver_inf.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import re
+from collections import defaultdict
+import string
+from scipy.special import softmax
+import numpy as np
+from Models_inf import answer_clues, setup_closedbook
+class Solver:
+    """
+    This class represents an abstraction over different types of crossword solvers. Each puzzle contains
+    a list of clues, which are associated with (weighted) values for each candidate answer.
+    Args:
+        crossword (Crossword): puzzle to solve
+        max_candidates (int): number of answer candidates to consider per clue
+    """
+    def __init__(self, crossword, model_path, ans_tsv_path, dense_embd_path, max_candidates=1000, process_id = 0, model_type = 'bert'):
+        self.crossword = crossword
+        self.max_candidates = max_candidates
+        self.process_id = process_id
+        self.model_path = model_path
+        self.ans_tsv_path = ans_tsv_path
+        self.dense_embd_glob = dense_embd_path,
+        self.model_type = model_type
+        self.get_candidates()
+    def get_candidates(self):
+        # get answers from neural model and fill up data structures with the results
+        chars = string.ascii_uppercase
+        self.char_map = {char: idx for idx, char in enumerate(chars)}
+        self.candidates = {}
+        all_clues = []
+        for var in self.crossword.variables:
+            all_clues.append(self.crossword.variables[var]['clue'])
+        # replaces stuff like "Opposite of 29-across" with "Opposite of X", where X is the clue for 29-across
+        r = re.compile('([0-9]+)[-\s](down|across)', re.IGNORECASE)
+        matches = [(idx, r.search(clue)) for idx, clue in enumerate(all_clues) if r.search(clue) != None]
+        for (idx, match) in matches:
+            clue = all_clues[idx]
+            var = str(match.group(1)) + str(match.group(2)[0]).upper()
+            if var in self.crossword.variables:
+                clue = clue[:match.start()] + self.crossword.variables[var]['clue'] + clue[match.end():]
+                all_clues[idx] = clue
+        # print("MODEL PATH: ", type(self.dense_embd_glob))
+        # get predictions
+        dpr = setup_closedbook(self.model_path, self.ans_tsv_path, self.dense_embd_glob, self.process_id, self.model_type)
+        all_words, all_scores = answer_clues(dpr, all_clues, max_answers=self.max_candidates, output_strings=True)
+        for index, var in enumerate(self.crossword.variables):
+            length = len(self.crossword.variables[var]["gold"])
+            self.candidates[var] = {"words": [], "bit_array": None, "weights": {}}
+            clue = all_clues[index]
+            words, scores = all_words[index], all_scores[index]
+            # remove answers that are not of the correct length
+            keep_positions = []
+            for word_index, word in enumerate(words):
+                if len(word) == length:
+                    keep_positions.append(word_index)
+            words = [words[i] for i in keep_positions]
+            scores = [scores[i] for i in keep_positions]
+            scores = list(-np.log(softmax(np.array(scores) / 0.75)))
+            for word, score in zip(words, scores):
+                self.candidates[var]["weights"][word] = score
+            # for debugging purposes, print the rank of the gold answer on our candidate list
+            # the gold answer is otherwise *not* used in any way during solving
+            # if self.crossword.variables[var]["gold"] in words:
+            #     print(clue, self.crossword.variables[var]["gold"], words.index(self.crossword.variables[var]["gold"]))
+            # else:
+            #     print('not found', clue, self.crossword.variables[var]["gold"])
+            # fill up some data structures used later in solving
+            for word, score in zip(words, scores):
+                self.candidates[var]["weights"][word] = score
+            weights = self.candidates[var]["weights"]
+            self.candidates[var]["words"] = sorted(weights, key=weights.get)
+            self.candidates[var]["bit_array"] = np.zeros((len(chars), length, len(self.candidates[var]["words"])))
+            self.candidates[var]["single_query_cache"] = [defaultdict(lambda:[]) for _ in range(len(chars))]
+            self.candidates[var]["single_query_cache_indices"] = [defaultdict(lambda:[]) for _ in range(len(chars))]
+            for word_idx, word in enumerate(self.candidates[var]["words"]):
+                for pos_idx, char in enumerate(word):
+                    char_idx = self.char_map[char]
+                    self.candidates[var]["bit_array"][char_idx, pos_idx, word_idx] = 1
+                    self.candidates[var]["single_query_cache"][pos_idx][char].append(word)
+                    self.candidates[var]["single_query_cache_indices"][pos_idx][char].append(word_idx)
+                    # NOTE: TODO, it's possible to cache more here in exchange for doing more work at init time
+        # cleanup a bit
+        del dpr
+    def evaluate(self, solution):
+        # print puzzle accuracy results given a generated solution
+        letters_correct = 0
+        letters_total = 0
+        for i in range(len(self.crossword.letter_grid)):
+            for j in range(len(self.crossword.letter_grid[0])):
+                if self.crossword.letter_grid[i][j] != "":
+                    letters_correct += (self.crossword.letter_grid[i][j] == solution[i][j])
+                    letters_total += 1
+        words_correct = 0
+        words_total = 0
+        for var in self.crossword.variables:
+            cells = self.crossword.variables[var]["cells"]
+            matching_cells = [self.crossword.letter_grid[cell[0]][cell[1]] == solution[cell[0]][cell[1]] for cell in cells]
+            if len(cells) == sum(matching_cells):
+                words_correct += 1
+            else:
+                # print('evaluation: correct word', ''.join([self.crossword.letter_grid[cell[0]][cell[1]] for cell in cells]), 'our prediction:', ''.join([solution[cell[0]][cell[1]] for cell in cells]))
+                pass
+            words_total += 1
+        print("Letters Correct: {}/{} | Words Correct: {}/{}".format(int(letters_correct), int(letters_total), int(words_correct), int(words_total)))
+        print("Letters Correct: {}% | Words Correct: {}%".format(float(letters_correct/letters_total*100), float(words_correct/words_total*100)))
+        info = {
+            "total_letters" : int(letters_total),
+            "total_words" : int(words_total),
+            "correct_letters" : int(letters_correct),
+            "correct_words" : int(words_correct),
+            "correct_letters_percent" : float(letters_correct/letters_total*100),
+            "correct_words_percent" : float(words_correct/words_total*100),
+        }
+        return info

Strict_json.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import json
+def json_CA_json_converter(json_file_path, is_path):
+  if is_path:
+    with open(json_file_path, "r") as file:
+      data = json.load(file)
+  else:
+    data = json_file_path
+  json_conversion_dict = {}
+  rows = data['size']['rows']
+  cols = data['size']['cols']
+  clues = data['clues']
+  answers = data['answers']
+  json_conversion_dict['metadata'] = {'rows': rows, 'cols': cols}
+  across_clue_answer = {}
+  down_clue_answer = {}
+  for clue, ans in zip(clues['across'], answers['across']):
+    split_clue = clue.split(' ')
+    clue_num = split_clue[0][:-1]
+    clue_ = " ".join(split_clue[1:])
+    clue_ = clue_.replace("[", '').replace("]", '')
+    across_clue_answer[clue_num] = [clue_, ans]
+  for clue, ans in zip(clues['down'], answers['down']):
+    split_clue = clue.split(' ')
+    clue_num = split_clue[0][:-1]
+    clue_ = " ".join(split_clue[1:])
+    clue_ = clue_.replace("[", '').replace("]", '')
+    down_clue_answer[clue_num] = [clue_, ans]
+  json_conversion_dict['clues'] = {'across' : across_clue_answer, 'down' : down_clue_answer}
+  grid_info = data['grid']
+  grid_num = data['gridnums']
+  grid_info_list = []
+  for i in range(rows):
+    row_list = []
+    for j in range(cols):
+      if grid_info[i * rows + j] == '.':
+        row_list.append('BLACK')
+      else:
+        if grid_num[i * rows + j] == 0:
+          row_list.append(['', grid_info[i * rows + j]])
+        else:
+          row_list.append([str(grid_num[i * rows + j]), grid_info[i * rows + j]])
+    grid_info_list.append(row_list)
+  json_conversion_dict['grid'] = grid_info_list
+  return json_conversion_dict

Utils_inf.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import json
+import puz
+import wordsegment
+import math
+from wordsegment import load, segment, clean
+import os
+load()
+dictionary = set([a.strip() for a in open('./words_alpha.txt','r').readlines()])
+def num_words(fill):
+    '''segment the text into multiple words and count how many words the text has in total'''
+    segmented = segment(fill)
+    prob = 0.0
+    for word in segmented:
+        if word not in dictionary:
+            return 999, -9999999999999
+        prob += math.log(wordsegment.UNIGRAMS[word])
+    return (len(segmented), prob)
+def get_word_flips(fill, num_candidates=10):
+    '''
+    We take as input a word/phrase that is probably mispelled, something like iluveyou. We then try flipping each one of the letters
+    to all other letters. We then segment those texts into multiple words using num_words, e.g., iloveyou -> i love you. We return the candidates
+    that segment into the fewest number of words.
+    '''
+    results = {}
+    min_length = 999
+    fill = clean(fill)
+    for index, char in enumerate(fill):
+        for new_letter in 'abcdefghijklmnopqrstuvwxyz':
+            new_fill = list(fill)
+            new_fill[index] = new_letter
+            new_fill = ''.join(new_fill)
+            curr_num_words, prob = num_words(new_fill)
+            if curr_num_words not in results:
+                results[curr_num_words] = []
+            results[curr_num_words].append((new_fill, prob))
+            if curr_num_words < min_length:
+                min_length = curr_num_words
+    if min_length == 999:
+        return [fill.upper()]
+    all_results = sum([sorted(results[length], key=lambda x:-x[1]) for length in sorted(list(results.keys()))], [])
+    return [a[0].upper() for a in all_results[0:num_candidates]]
+def convert_puz(fname):
+    # requires pypuz library to run
+    # converts a puzzle in .puz format to .json format
+    p = puz.read(fname)
+    numbering = p.clue_numbering()
+    grid = [[None for _ in range(p.width)] for _ in range(p.height)]
+    for row_idx in range(p.height):
+        cell = row_idx * p.width
+        row_solution = p.solution[cell:cell + p.width]
+        for col_index, item in enumerate(row_solution):
+            if p.solution[cell + col_index:cell + col_index + 1] == '.':
+                grid[row_idx][col_index] = 'BLACK'
+            else:
+                grid[row_idx][col_index] = ["", row_solution[col_index: col_index + 1]]
+    across_clues = {}
+    for clue in numbering.across:
+        answer = ''.join(p.solution[clue['cell'] + i] for i in range(clue['len']))
+        across_clues[str(clue['num'])] = [clue['clue'] + ' ', ' ' + answer]
+        grid[int(clue['cell'] / p.width)][clue['cell'] % p.width][0] = str(clue['num'])
+    down_clues = {}
+    for clue in numbering.down:
+        answer = ''.join(p.solution[clue['cell'] + i * numbering.width] for i in range(clue['len']))
+        down_clues[str(clue['num'])] = [clue['clue'] + ' ', ' ' + answer]
+        grid[int(clue['cell'] / p.width)][clue['cell'] % p.width][0] = str(clue['num'])
+    mydict = {'metadata': {'date': None, 'rows': p.height, 'cols': p.width}, 'clues': {'across': across_clues, 'down': down_clues}, 'grid': grid}
+    return mydict
+def clean(text):
+    '''
+    :param text: question or answer text
+    :return: text with line breaks and trailing spaces removed
+    '''
+    return " ".join(text.strip().split())
+def print_grid(letter_grid):
+    for row in letter_grid:
+        row = [" " if val == "" else val for val in row]
+        print("".join(row), flush=True)

extractpuzzle.py ADDED Viewed

	@@ -0,0 +1,792 @@

+import cv2
+import numpy as np
+import math
+from sklearn.linear_model import LinearRegression
+import pytesseract
+import re
+import matplotlib.pyplot as plt
+# Path of the Tesseract executable that pytesseract will invoke.
+# NOTE(review): this is a hard-coded Windows install path - confirm it matches the host this runs on.
+pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
+# Sample input image name.
+# NOTE(review): appears unused by the functions below - confirm before relying on it.
+image_path = "try heree.jpg"
+def first_preprocessing(image):
+    gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
+    canny = cv2.Canny(gray,75,25)
+    contours,hierarchies = cv2.findContours(canny,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE)
+    sorted_contours = sorted(contours,key = cv2.contourArea,reverse = True)
+    largest_contour = sorted_contours[0]
+    box = cv2.boundingRect(sorted_contours[0])
+    x = box[0]
+    y = box[1]
+    w = box[2]
+    h = box[3]
+    result = cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
+    return result
+def remove_head(image):
+    custom_config = r'--oem 3 --psm 6'  # Tesseract OCR configuration
+    detected_text = pytesseract.image_to_string(image, config=custom_config)
+    lines = detected_text.split('\n')
+# Find the first line containing some text
+    line_index = 0
+    for i, line in enumerate(lines):
+        if line.strip() != '':
+            line_index = i
+            break
+    first_newline_idx = detected_text.find('\n')
+    result = cv2.rectangle(image, (0, line_index), (image.shape[1], first_newline_idx), (255,255,255), thickness=cv2.FILLED)
+    return result
+def second_preprocessing(image):
+    gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
+    canny = cv2.Canny(gray,75,25)
+    contours,hierarchies = cv2.findContours(canny,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE)
+    sorted_contours = sorted(contours,key = cv2.contourArea,reverse = True)
+    largest_contour = sorted_contours[0]
+    box2 = cv2.boundingRect(sorted_contours[0])
+    x = box2[0]
+    y = box2[1]
+    w = box2[2]
+    h = box2[3]
+    result2 = cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
+    return result2
+def find_vertical_profile(image):
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
+    vertical_profile = np.sum(binary, axis=0)
+    return vertical_profile
+def detect_steepest_changes(projection_profile, threshold=0.4, start_idx=0, min_valley_width=10, min_search_width=50):
+    differences = np.diff(projection_profile)
+    change_points = np.where(np.abs(differences) > threshold * np.max(np.abs(differences)))[0]
+    left_boundaries = []
+    right_boundaries = []
+    for idx in change_points:
+        if idx <= start_idx:
+            continue
+        if idx - start_idx >= min_search_width:
+            decreasing_profile = projection_profile[idx:]
+            if np.any(decreasing_profile > 0):
+                right_boundary = idx + np.argmin(decreasing_profile)
+                right_boundaries.append(right_boundary)
+            else:
+                continue
+            valley_start = max(start_idx, idx - min_valley_width)
+            valley_start = valley_start-40
+            valley_end = min(idx + min_valley_width, len(projection_profile) - 1)
+            valley = valley_start + np.argmin(projection_profile[valley_start:valley_end])
+            left_boundaries.append(valley)
+            break
+    return left_boundaries, right_boundaries
+def crop_text_columns(image, projection_profile, threshold=0.4):
+    start_idx = 0
+    text_columns = []
+    while True:
+        left_boundaries, right_boundaries = detect_steepest_changes(projection_profile, threshold, start_idx)
+        if not left_boundaries or not right_boundaries:
+            break
+        left = left_boundaries[0]
+        right = right_boundaries[0]
+        text_column = image[:, left:right]
+        text_columns.append(text_column)
+        start_idx = right
+    return text_columns
+def parse_clues(clue_text):
+    lines = clue_text.split('\n')
+    clues = {}
+    number = None
+    column = 0
+    for line in lines:
+        if "column separation" in line:
+            column += 1
+            continue
+        pattern = r"^(\d+(?:\.\d+)?)\s*(.+)"  # Updated pattern to handle decimal point numbers for clues
+        match = re.search(pattern, line)
+        if match:
+            number = float(match.group(1))  # Convert the matched number to float if there is a decimal point
+            if number not in clues:
+                clues[number] = [column,match.group(2).strip()]
+            else:
+                continue
+        elif number is None:
+            continue
+        elif clues[number][0] != column:
+            continue
+        else:
+            clues[number][1] += " " + line.strip()  # Append to the previous clue if it's a multiline clue
+    return clues
+def parse_crossword_clues(text):
+    # Check if "Down" clues are present
+    match = re.search(r'[dD][oO][wW][nN]\n', text)
+    if match:
+        across_clues, down_clues = re.split(r'[dD][oO][wW][nN]\n', text)
+    else:
+        # If "Down" clues are not present, set down_clues to an empty string
+        across_clues, down_clues = text, ""
+    across = parse_clues(across_clues)
+    down = parse_clues(down_clues)
+    return across, down
+def classify_text(filtered_columns):
+    text = ""
+    custom_config = r'--oem 3 --psm 6'
+    for i, column in enumerate(filtered_columns):
+        column2 = cv2.cvtColor(column, cv2.COLOR_BGR2RGB)
+        scale_factor = 2.0  # You can adjust this value
+# Calculate the new dimensions after scaling
+        new_width = int(column2.shape[1] * scale_factor)
+        new_height = int(column2.shape[0] * scale_factor)
+# Resize the image using OpenCV
+        scaled_image = cv2.resize(column2, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
+# Apply image enhancement techniques
+        denoised_image = cv2.fastNlMeansDenoising(scaled_image, None, h=10, templateWindowSize=7, searchWindowSize=21)
+        enhanced_image = cv2.cvtColor(denoised_image, cv2.COLOR_BGR2GRAY)  # Convert to grayscale  # Apply histogram equalization
+        detected_text = pytesseract.image_to_string(enhanced_image, config=custom_config)
+    # print(detected_text)
+        text+=detected_text
+    across_clues, down_clues = parse_crossword_clues(text)
+    return across_clues,down_clues
+def get_text(image):
+    image = cv2.cvtColor(image,cv2.COLOR_GRAY2BGR)
+    result = first_preprocessing(image)
+    result1 = remove_head(result)
+    result2 = second_preprocessing(result1)
+    vertical_profile = find_vertical_profile(result2)
+    combined_columns = crop_text_columns(result2,vertical_profile)
+    across,down = classify_text(combined_columns)
+    return across,down
+################################ Grid Extraction begins here ###########################
+########################################################################################
+# for applying non max suppression of the contours
+def calculate_iou(image, contour1, contour2):
+    # Create masks for each contour
+    mask1 = np.zeros_like(image, dtype=np.uint8)
+    cv2.drawContours(mask1, [contour1], -1, 255, thickness=cv2.FILLED)
+    mask2 = np.zeros_like(image, dtype=np.uint8)
+    cv2.drawContours(mask2, [contour2], -1, 255, thickness=cv2.FILLED)
+    # Find the intersection between the two masks
+    intersection = cv2.bitwise_and(mask1, mask2)
+    # Calculate the intersection area
+    intersection_area = cv2.countNonZero(intersection)
+    # Calculate the union area (Not the accurate one but works alright XD !)
+    union_area = cv2.contourArea(cv2.convexHull(np.concatenate((contour1, contour2))))
+    # Calculate the IoU
+    iou = intersection_area / union_area
+    return iou
+# remove overlapping contours as well as non-square, non-quadrilateral contours
+# this checks every contour against every contour kept so far, so be careful
+def filter_contours(img_gray2, contours, iou_threshold = 0.6, asp_ratio = 1,tolerance = 0.5):
+    # Remove overlapping contours, removing that are not square
+    filtered_contours = []
+    epsilon = 0.02
+    for contour in contours:
+        # Approximate the contour to reduce the number of points
+        epsilon_multiplier = epsilon * cv2.arcLength(contour, True)
+        approximated_contour = cv2.approxPolyDP(contour, epsilon_multiplier, True)
+        # find the aspect ratio of the contour, if it is close to 1 then keep it otherwise discard
+        _,_,w,h = cv2.boundingRect(approximated_contour)
+        if(abs(float(w)/h - asp_ratio) > tolerance ): continue
+        # Calculate the IoU with all existing contours
+        iou_values = [calculate_iou(img_gray2,np.array(approximated_contour), np.array(existing_contour)) for existing_contour in filtered_contours]
+        # If the IoU value with all existing contours is below the threshold, add the current contour
+        if not any(iou_value > iou_threshold for iou_value in iou_values):
+            filtered_contours.append(approximated_contour)
+    return filtered_contours
+# https://stackoverflow.com/questions/383480/intersection-of-two-lines-defined-in-rho-theta-parameterization/383527#383527
+# Define the parametricIntersect function
+def parametricIntersect(r1, t1, r2, t2):
+    ct1 = np.cos(t1)
+    st1 = np.sin(t1)
+    ct2 = np.cos(t2)
+    st2 = np.sin(t2)
+    d = ct1 * st2 - st1 * ct2
+    if d != 0.0:
+        x = int((st2 * r1 - st1 * r2) / d)
+        y = int((-ct2 * r1 + ct1 * r2) / d)
+        return x, y
+    else:
+        return None
+# Group the coordinate to a list such that each point in a list may belong to a line
+def group_lines(coordinates,axis=0,threshold=10):
+    sorted_coordinates = list(sorted(coordinates,key=lambda x: x[axis]))
+    groups = []
+    current_group = []
+    for i in range(len(sorted_coordinates)):
+        if i!=0 and abs(current_group[0][axis] - sorted_coordinates[i][axis]) > threshold: # condition to change the group
+            if len(current_group) > 4:
+                groups.append(current_group)
+                current_group = []
+        current_group.append(sorted_coordinates[i]) # condition to append to the group
+    if(len(current_group) > 4):
+        groups.append(current_group)
+    return groups
+# Use the Grouped Lines to Fit a line using Linear Regression
+def fit_lines(grouped_lines,is_horizontal = False):
+    actual_lines = []
+    for coordinates in grouped_lines:
+        # Converting into numpy array
+        coordinates_arr = np.array(coordinates)
+        # Separate the x and y coordinates
+        x = coordinates_arr[:, 0]
+        y = coordinates_arr[:, 1]
+        # Fit a linear regression model
+        regressor = LinearRegression()
+        regressor.fit(y.reshape(-1, 1), x)
+        # Get the slope and intercept of the fitted line
+        slope = regressor.coef_[0]
+        intercept = regressor.intercept_
+        if(is_horizontal):
+            intercept = np.mean(y)
+        actual_lines.append((slope,intercept))
+    return actual_lines
+# Calculates difference between two consecutive elements in an array
+def average_distance(arr):
+    n = len(arr)
+    distance_sum = 0
+    for i in range(n - 1):
+        distance_sum += abs(arr[i+1] - arr[i])
+    average = distance_sum / (n - 1)
+    return average
+# If two adjacent lines are nearer than some threshold, then merge them
+# Returns results in y = mx + b form
+def average_out_similar_lines(lines_m_c,lines_coord,del_threshold,is_horizontal=False):
+    averaged_lines = []
+    i = 0
+    while(i < len(lines_m_c) - 1):
+        _, intercept1 = lines_m_c[i]
+        _, intercept2 = lines_m_c[i + 1]
+        if abs(intercept2 - intercept1) < del_threshold:
+            new_points = np.array(lines_coord[i] + lines_coord[i+1][:-1])
+            # Separate the x and y coordinates
+            x = new_points[:, 0]
+            y = new_points[:, 1]
+            # Fit a linear regression model
+            regressor = LinearRegression()
+            regressor.fit(y.reshape(-1, 1), x)
+            # Get the slope and intercept of the fitted line
+            slope = regressor.coef_[0]
+            intercept = regressor.intercept_
+            if(is_horizontal):
+                intercept = np.mean(y)
+            averaged_lines.append((slope,intercept))
+            i+=2
+        else:
+            averaged_lines.append(lines_m_c[i])
+            i+=1
+    if(i < len(lines_m_c)):
+        averaged_lines.append(lines_m_c[i])
+    return averaged_lines
+# If two adjacent lines are nearer than some threshold, then merge them
+# Returns results in normalized vector form
+def average_out_similar_lines1(lines_m_c,lines_coord,del_threshold):
+    averaged_lines = []
+    i = 0
+    while(i < len(lines_m_c) - 1):
+        _, intercept1 = lines_m_c[i]
+        _, intercept2 = lines_m_c[i + 1]
+        if abs(intercept2 - intercept1) < del_threshold:
+            new_points = np.array(lines_coord[i] + lines_coord[i+1][:-1])
+            coordinates = np.array(new_points)
+            points = coordinates[:, None, :].astype(np.int32)
+            # Fit a line using linear regression
+            [vx, vy, x, y] = cv2.fitLine(points, cv2.DIST_L2, 0, 0.01, 0.01)
+            averaged_lines.append((vx, vy, x, y))
+            i+=2
+        else:
+            new_points = np.array(lines_coord[i])
+            coordinates = np.array(new_points)
+            points = coordinates[:, None, :].astype(np.int32)
+            # Fit a line using linear regression
+            [vx, vy, x, y] = cv2.fitLine(points, cv2.DIST_L2, 0, 0.01, 0.01)
+            averaged_lines.append((vx, vy, x, y))
+            i+=1
+    if(i < len(lines_m_c)):
+        new_points = np.array(lines_coord[i])
+        coordinates = np.array(new_points)
+        points = coordinates[:, None, :].astype(np.int32)
+        # Fit a line using linear regression
+        [vx, vy, x, y] = cv2.fitLine(points, cv2.DIST_L2, 0, 0.01, 0.01)
+        averaged_lines.append((vx, vy, x, y))
+    return averaged_lines
+def get_square_color(image, box):
+    # Determine the size of the square region
+    square_size = (box[1][0] - box[0][0]) / 3
+    # Determine the coordinates of the square region inside the box
+    top_left = (box[0][0] + square_size, box[0][1] + square_size)
+    bottom_right = (box[0][0] + square_size*2, box[0][1] + square_size*2)
+    # Extract the square region from the image
+    square_region = image[int(top_left[1]):int(bottom_right[1]), int(top_left[0]):int(bottom_right[0])]
+    # Calculate the mean pixel value of the square region
+    mean_value = np.mean(square_region)
+    # Determine whether the square region is predominantly black or white
+    if mean_value < 128:
+        square_color = "."
+    else:
+        square_color = " "
+    return square_color
+# accepts image in grayscale
+def extract_grid(image):
+    # Apply Gaussian blur to reduce noise and improve edge detection
+    blurred = cv2.GaussianBlur(image, (3, 3), 0)
+    # Apply Canny edge detection
+    edges = cv2.Canny(blurred, 50, 150)
+    # Apply dilation to connect nearby edges and make them more contiguous
+    kernel = np.ones((5, 5), np.uint8)
+    dilated = cv2.dilate(edges, kernel, iterations=1)
+    # # Applying canny edge detector
+    # detecting contours on the canny image
+    contours, _ = cv2.findContours(dilated, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
+    # sorting the contours by the descending order area of the contour
+    sorted_contours = list(sorted(contours, key=cv2.contourArea,reverse=True))
+    # filtering out the top 10 largest by applying NMS and only selecting square ones (Apsect ratio 1)
+    filtered_contours = filter_contours(image, sorted_contours[0:10],iou_threshold=0.6,asp_ratio=1,tolerance=0.2)
+    # largest Contour Extraction
+    largest_contour = []
+    if(len(filtered_contours)):
+        largest_contour = filtered_contours[0]
+    else:
+        largest_contour = sorted_contours[0]
+    # --- Performing Perspective warp of the largest contour ---
+    coordinates_list = []
+    if(largest_contour.shape != (4,1,2)):
+        largest_contour = cv2.convexHull(largest_contour)
+        if(largest_contour.shape != (4,1,2)):
+            rect = cv2.minAreaRect(largest_contour)
+            largest_contour = cv2.boxPoints(rect)
+            largest_contour = largest_contour.astype('int')
+    coordinates_list = largest_contour.reshape(4, 2).tolist()
+    # Convert coordinates_list to a numpy array
+    coordinates_array = np.array(coordinates_list)
+    # Find the convex hull of the points
+    hull = cv2.convexHull(coordinates_array)
+    # Find the extreme points of the convex hull
+    extreme_points = np.squeeze(hull)
+    # Sort the extreme points by their x and y coordinates to determine the order
+    sorted_points = extreme_points[np.lexsort((extreme_points[:, 1], extreme_points[:, 0]))]
+    # Extract top left, bottom right, top right, and bottom left points
+    tl = sorted_points[0]
+    tr = sorted_points[1]
+    bl = sorted_points[2]
+    br = sorted_points[3]
+    if(tr[1] < tl[1]):
+        tl,tr = tr,tl
+    if(br[1] < bl[1]):
+        bl,br = br,bl
+    # Define pts1
+    pts1 = [tl, bl, tr, br]
+    # Calculate the bounding rectangle coordinates
+    x, y, w, h = 0,0,400,400
+    # Define pts2 as the corners of the bounding rectangle
+    pts2 = [[3, 3], [400, 3], [3, 400], [400, 400]]
+    # Calculate the perspective transformation matrix
+    matrix = cv2.getPerspectiveTransform(np.float32(pts1), np.float32(pts2))
+    # Apply the perspective transformation to the cropped_image
+    transformed_img = cv2.warpPerspective(image, matrix, (403, 403))
+    cropped_image = transformed_img.copy()
+    plt.figure(figsize=(12,8))
+    plt.axis("off")
+    plt.imsave("noice1.jpg",cv2.cvtColor(cropped_image,cv2.COLOR_GRAY2RGB))
+    # if the largest contour was not exactly quadilateral
+    # -- Performing Hough Transform --
+    similarity_threshold = math.floor(w/30) # Thresholds for filtering Similar Hough Lines
+    # Applying Gaussian Blur to reduce noice and improve dege detection
+    blurred = cv2.GaussianBlur(cropped_image, (5, 5), 0)
+    # Perform Canny edge detection on the GrayScale Image
+    edges = cv2.Canny(blurred, 50, 150)
+    lines = cv2.HoughLines(edges, 1, np.pi/180, 200)
+    # Filter out similar lines
+    filtered_lines = []
+    for line in lines:
+        for r_theta in lines:
+            arr = np.array(r_theta[0], dtype=np.float64)
+            rho, theta = arr
+            is_similar = False
+            for filtered_line in filtered_lines:
+                filtered_rho, filtered_theta = filtered_line
+                # similarity threshold is 10
+                if abs(rho - filtered_rho) < similarity_threshold and abs(theta - filtered_theta) < np.pi/180 * similarity_threshold:
+                    is_similar = True
+                    break
+            if not is_similar:
+                filtered_lines.append((rho, theta))
+    # Filter out the horizontal and the vertical lines
+    horizontal_lines = []
+    vertical_lines = []
+    for rho, theta in filtered_lines:
+        a = np.cos(theta)
+        b = np.sin(theta)
+        x0 = a * rho
+        y0 = b * rho
+        x1 = int(x0 + 1000 * (-b))
+        y1 = int(y0 + 1000 * (a))
+        x2 = int(x0 - 1000 * (-b))
+        y2 = int(y0 - 1000 * (a))
+        slope = (y2 - y1) / (x2 - x1 + 0.0001)
+        # do taninv(0.17) it is nearly equal to 10
+        if( abs(slope) <= 0.18 ):
+            horizontal_lines.append((rho,theta))
+        elif (abs(slope) > 6):
+            vertical_lines.append((rho,theta))
+    # Find the intersection points of horizontal and vertical lines
+    hough_corners = []
+    for h_rho, h_theta in horizontal_lines:
+        for v_rho, v_theta in vertical_lines:
+            x, y = parametricIntersect(h_rho, h_theta, v_rho, v_theta)
+            if x is not None and y is not None:
+                hough_corners.append((x, y))
+    # -- Performing Harris Corner Detection --
+    # Create CLAHE object with specified clip limit
+    clahe = cv2.createCLAHE(clipLimit=3, tileGridSize=(8, 8))
+    clahe_image = clahe.apply(cropped_image)
+    # harris corner detection for CLHAE IMAGE
+    dst = cv2.cornerHarris(clahe_image,2,3,0.04)
+    ret,dst = cv2.threshold(dst,0.1*dst.max(),255,0)
+    dst = np.uint8(dst)
+    dst = cv2.dilate(dst,None)
+    ret, labels, stats, centroids = cv2.connectedComponentsWithStats(dst)
+    criteria = (cv2.TERM_CRITERIA_EPS+cv2.TermCriteria_MAX_ITER,100,0.001)
+    harris_corners = cv2.cornerSubPix(clahe_image,np.float32(centroids),(5,5),(-1,-1),criteria)
+    drawn_image = cv2.cvtColor(cropped_image, cv2.COLOR_GRAY2BGR)
+    for i in harris_corners:
+        x,y = i
+        image2 = cv2.circle(drawn_image, (int(x),int(y)), radius=0, color=(0, 0, 255), thickness=3)
+    # -- Using Regression Model to approximate horizontal and vertical Lines
+    # reducing to 0 decimal places
+    corners1 = list(map(lambda coord: (round(coord[0], 0), round(coord[1], 0)), harris_corners))
+    # adding the corners obtained from hough transform
+    corners1 += hough_corners
+    # removing the duplicate corners
+    corners_no_dup = list(set(corners1))
+    min_cell_width = w/30
+    min_cell_height = h/30
+    # grouping coordinates into probabale array that could fit a horizontal and vertical lien
+    vertical_lines = group_lines(corners_no_dup,0,min_cell_height)
+    horizontal_lines = group_lines(corners_no_dup,1,min_cell_height)
+    actual_vertical_lines = fit_lines(vertical_lines)
+    actual_horizontal_lines = fit_lines(horizontal_lines,is_horizontal=True)
+    # Lines obtained from above method are not appropriate, we have to refine them
+    x_probable = [i[1] for i in actual_horizontal_lines] # looking at the intercepts
+    y_probable = [i[1] for i in actual_vertical_lines]
+    del_x_avg = average_distance(x_probable)
+    del_y_avg = average_distance(y_probable)
+    averaged_horizontal_lines1 = []         # This step here is fishy and needs refinement
+    averaged_vertical_lines1 = []
+    multiplier = 0.95
+    i = 0
+    while(1):
+        averaged_horizontal_lines = average_out_similar_lines(actual_horizontal_lines,horizontal_lines,del_y_avg*multiplier,is_horizontal=True)
+        averaged_vertical_lines = average_out_similar_lines(actual_vertical_lines,vertical_lines,del_x_avg*multiplier,is_horizontal=False)
+        i += 1
+        if(i >= 20 or len(averaged_horizontal_lines) == len(averaged_vertical_lines)):
+            break
+        else:
+            multiplier -= 0.05
+    averaged_horizontal_lines1 = average_out_similar_lines1(actual_horizontal_lines,horizontal_lines,del_y_avg*multiplier)
+    averaged_vertical_lines1 = average_out_similar_lines1(actual_vertical_lines,vertical_lines,del_x_avg*multiplier)
+    # plotting the lines to image to find the intersection points
+    drawn_image6 = np.ones_like(cropped_image)*255
+    for vx,vy,cx,cy in  averaged_horizontal_lines1 + averaged_vertical_lines1:
+        w = cropped_image.shape[1]
+        cv2.line(drawn_image6, (int(cx-vx*w), int(cy-vy*w)), (int(cx+vx*w), int(cy+vy*w)), (0, 0, 255),1,cv2.LINE_AA)
+    # -- Finding Intersection points --
+    # Applying Harris Corner Detection to find the intersection points
+    mesh_image = drawn_image6.copy()
+    dst = cv2.cornerHarris(mesh_image,2,3,0.04)
+    ret,dst = cv2.threshold(dst,0.1*dst.max(),255,0)
+    dst = np.uint8(dst)
+    dst = cv2.dilate(dst,None)
+    ret, labels, stats, centroids = cv2.connectedComponentsWithStats(dst)
+    criteria = (cv2.TERM_CRITERIA_EPS+cv2.TermCriteria_MAX_ITER,100,0.001)
+    harris_corners = cv2.cornerSubPix(mesh_image,np.float32(centroids),(5,5),(-1,-1),criteria)
+    drawn_image = cv2.cvtColor(drawn_image6, cv2.COLOR_GRAY2BGR)
+    harris_corners = list(sorted(harris_corners[1:],key = lambda x : x[1]))
+    # -- Finding out the grid color --
+    grayscale = cropped_image.copy()
+    # Perform adaptive thresholding to obtain binary image
+    _, binary = cv2.threshold(grayscale, 128, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
+    # Perform morphological operations to remove small text regions
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
+    binary = cv2.morphologyEx(binary, cv2.MORPH_ELLIPSE, kernel, iterations=1)
+    # Invert the binary image
+    inverted_binary = cv2.bitwise_not(binary)
+    # Restore the image by blending the inverted binary image with the grayscale image
+    restored_image = cv2.bitwise_or(inverted_binary, grayscale)
+    # Apply morphological opening to remove small black dots
+    kernel_opening = np.ones((3, 3), np.uint8)
+    opened_image = cv2.morphologyEx(restored_image, cv2.MORPH_OPEN, kernel_opening, iterations=1)
+    # Apply morphological closing to further refine the restored image
+    kernel_closing = np.ones((5, 5), np.uint8)
+    refined_image = cv2.morphologyEx(opened_image, cv2.MORPH_CLOSE, kernel_closing, iterations=1)
+    # finding out the grid corner
+    grid = []
+    grid_nums = []
+    across_clue_num = []
+    down_clue_num = []
+    sorted_corners = np.array(list(sorted(harris_corners,key=lambda x:x[1])))
+    if(len(sorted_corners) == len(averaged_horizontal_lines1) * len(averaged_vertical_lines1)):
+        sorted_corners_grouped = []
+        for i in range(0,len(sorted_corners),len(averaged_vertical_lines1)):
+            temp_arr = sorted_corners[i:i+len(averaged_vertical_lines1)]
+            temp_arr = list(sorted(temp_arr,key=lambda x: x[0]))
+            sorted_corners_grouped.append(temp_arr)
+        for h_line_idx in range(0,len(sorted_corners_grouped)-1):
+            for corner_idx in range(0,len(sorted_corners_grouped[h_line_idx])-1):
+                # grabbing the four box coordinates
+                box = [sorted_corners_grouped[h_line_idx][corner_idx],sorted_corners_grouped[h_line_idx][corner_idx+1],
+                    sorted_corners_grouped[h_line_idx+1][corner_idx],sorted_corners_grouped[h_line_idx+1][corner_idx+1]]
+                grid.append(get_square_color(refined_image,box))
+        grid_formatted = []
+        for i in range(0, len(grid), len(averaged_vertical_lines1) - 1):
+            grid_formatted.append(grid[i:i + len(averaged_vertical_lines1) - 1])
+        # if (x,y) is present in these array the cell (x,y) is already accounted as a part of answer of across or down
+        in_horizontal = []
+        in_vertical = []
+        num = 0
+        for x in range(0, len(averaged_vertical_lines1) - 1):
+            for y in range(0, len(averaged_horizontal_lines1) - 1):
+                # if the cell is black there's no need to number
+                if grid_formatted[x][y] == '.':
+                    grid_nums.append(0)
+                    continue
+                # if the cell is part of both horizontal and vertical cell then there's no need to number
+                horizontal_presence = (x, y) in in_horizontal
+                vertical_presence = (x, y) in in_vertical
+                # present in both 1 1
+                if horizontal_presence and vertical_presence:
+                    grid_nums.append(0)
+                    continue
+                # present in one i.e 1 0
+                if not horizontal_presence and vertical_presence:
+                    horizontal_length = 0
+                    temp_horizontal_arr = []
+                    # iterate in x direction until the end of the grid or until a black box is found
+                    while x + horizontal_length < len(averaged_horizontal_lines1) - 1 and grid_formatted[x + horizontal_length][y] != '.':
+                        temp_horizontal_arr.append((x + horizontal_length, y))
+                        horizontal_length += 1
+                    # if horizontal length is greater than 1, then append the temp_horizontal_arr to in_horizontal array
+                    if horizontal_length > 1:
+                        in_horizontal.extend(temp_horizontal_arr)
+                        num += 1
+                        across_clue_num.append(num)
+                        grid_nums.append(num)
+                        continue
+                    grid_nums.append(0)
+                # present in one 1 0
+                if not vertical_presence and horizontal_presence:
+                    # do the same for vertical
+                    vertical_length = 0
+                    temp_vertical_arr = []
+                    # iterate in y direction until the end of the grid or until a black box is found
+                    while y + vertical_length < len(averaged_vertical_lines1) - 1 and grid_formatted[x][y+vertical_length] != '.':
+                        temp_vertical_arr.append((x, y+vertical_length))
+                        vertical_length += 1
+                    # if vertical length is greater than 1, then append the temp_vertical_arr to in_vertical array
+                    if vertical_length > 1:
+                        in_vertical.extend(temp_vertical_arr)
+                        num += 1
+                        down_clue_num.append(num)
+                        grid_nums.append(num)
+                        continue
+                    grid_nums.append(0)
+                if(not horizontal_presence and not vertical_presence):
+                    horizontal_length = 0
+                    temp_horizontal_arr = []
+                    # iterate in x direction until the end of the grid or until a black box is found
+                    while x + horizontal_length < len(averaged_horizontal_lines1) - 1 and grid_formatted[x + horizontal_length][y] != '.':
+                        temp_horizontal_arr.append((x + horizontal_length, y))
+                        horizontal_length += 1
+                    # if horizontal length is greater than 1, then append the temp_horizontal_arr to in_horizontal array
+                    # do the same for vertical
+                    vertical_length = 0
+                    temp_vertical_arr = []
+                    # iterate in y direction until the end of the grid or until a black box is found
+                    while y + vertical_length < len(averaged_vertical_lines1) - 1 and grid_formatted[x][y+vertical_length] != '.':
+                        temp_vertical_arr.append((x, y+vertical_length))
+                        vertical_length += 1
+                    # if vertical length is greater than 1, then append the temp_vertical_arr to in_vertical array
+                    if horizontal_length > 1 and horizontal_length > 1:
+                        in_horizontal.extend(temp_horizontal_arr)
+                        in_vertical.extend(temp_vertical_arr)
+                        num += 1
+                        across_clue_num.append(num)
+                        down_clue_num.append(num)
+                        grid_nums.append(num)
+                    elif vertical_length > 1:
+                        in_vertical.extend(temp_vertical_arr)
+                        num += 1
+                        down_clue_num.append(num)
+                        grid_nums.append(num)
+                    elif horizontal_length > 1:
+                        in_horizontal.extend(temp_horizontal_arr)
+                        num += 1
+                        across_clue_num.append(num)
+                        grid_nums.append(num)
+                    else:
+                        grid_nums.append(0)
+    size = { 'rows' : len(averaged_horizontal_lines1)-1,
+            'cols' : len(averaged_vertical_lines1)-1,
+            }
+    dict = {
+        'size' : size,
+        'grid' : grid,
+        'gridnums': grid_nums,
+        'across_nums': down_clue_num,
+        'down_nums' : across_clue_num,
+        'clues':{
+            'across' : [],
+            'down': []
+        }
+    }
+    return dict
+if __name__ == "__main__":
+    # Manual smoke test: load one sample image with imread flag 0 and print
+    # whatever extract_grid returns for it.
+    sample_path = "D:\\D\\Major Project files\\opencv\\movie.png"
+    sample_image = cv2.imread(sample_path, 0)
+    print(extract_grid(sample_image))
+    # Alternative PIL-based input, kept for reference:
+    # img = Image.open("chalena3.jpg")
+    # img_gray = img.convert("L")
+    # print(extract_grid(img_gray))

main.py CHANGED Viewed

@@ -1,6 +1,31 @@
-from fastapi import FastAPI
 app = FastAPI()
 @app.get("/")
 async def index():
-   return {"message": "Hello World"}

+from fastapi import Request,FastAPI
+import os
+from Crossword_inf import Crossword
+from BPSolver_inf import BPSolver
+from Strict_json import json_CA_json_converter
+import json
+# All inference artefacts are looked up inside this one directory.
+_COMPONENTS_DIR = "Inference_components"
+# BERT bi-encoder: checkpoint, answer list and embedding files (the last is a glob pattern).
+MODEL_PATH = os.path.join(_COMPONENTS_DIR, "dpr_biencoder_trained_500k.bin")
+ANS_TSV_PATH = os.path.join(_COMPONENTS_DIR, "all_answer_list.tsv")
+DENSE_EMBD_PATH = os.path.join(_COMPONENTS_DIR, "embeddings_all_answers_json_0*")
+# DistilBERT bi-encoder: checkpoint, answer list and embedding file.
+MODEL_PATH_DISTIL = os.path.join(_COMPONENTS_DIR, "distilbert_EPOCHs_7_COMPLETE.bin")
+ANS_TSV_PATH_DISTIL = os.path.join(_COMPONENTS_DIR, "all_answer_list.tsv")
+DENSE_EMBD_PATH_DISTIL = os.path.join(_COMPONENTS_DIR, "distilbert_7_epochs_embeddings.pkl")
 app = FastAPI()
+# GET / answers with a fixed greeting payload.
 @app.get("/")
 async def index():
+   return {"message": "Hello World"}
+@app.post("/solve")
+async def solve(request: Request):
+   """Solve the crossword puzzle posted as the JSON body of the request.
+   The body is converted with ``json_CA_json_converter`` and wrapped in a
+   ``Crossword``; a ``BPSolver`` configured with the DistilBERT checkpoint,
+   answer list and embeddings is then run on it.
+   :param request: incoming request whose body is parsed as JSON
+   :return: ``(solution, evaluation)`` where ``evaluation`` is the value of
+      ``solver.evaluate(solution)``
+   """
+   # Named ``payload`` rather than ``json`` so the module-level ``json``
+   # import is not shadowed inside this handler.
+   payload = await request.json()
+   puzzle = json_CA_json_converter(payload, False)
+   crossword = Crossword(puzzle)
+   # NOTE(review): a new BPSolver is constructed on every request; if its
+   # constructor loads the model files, consider building it once - confirm.
+   solver = BPSolver(crossword, model_path = MODEL_PATH_DISTIL, ans_tsv_path = ANS_TSV_PATH_DISTIL, dense_embd_path = DENSE_EMBD_PATH_DISTIL, max_candidates = 40000, model_type = 'distilbert')
+   solution = solver.solve(num_iters = 100, iterative_improvement_steps = 0)
+   return solution, solver.evaluate(solution)

models/__init__.py ADDED Viewed

	@@ -0,0 +1,38 @@

+def init_hf_bert_biencoder(args, **kwargs):
+    """Build the BERT bi-encoder components.
+    ``hf_models`` is imported at call time; ``args`` and ``kwargs`` are
+    forwarded unchanged to ``get_bert_biencoder_components``.
+    """
+    from . import hf_models
+    return hf_models.get_bert_biencoder_components(args, **kwargs)
+def init_hf_distilbert_biencoder(args, **kwargs):
+    """Build the DistilBERT bi-encoder components.
+    ``hf_models`` is imported at call time; ``args`` and ``kwargs`` are
+    forwarded unchanged to ``get_distilbert_biencoder_components``.
+    """
+    from . import hf_models
+    return hf_models.get_distilbert_biencoder_components(args, **kwargs)
+def init_hf_bert_tenzorizer(args, **kwargs):
+    """Build the BERT tensorizer from ``args``.
+    ``hf_models`` is imported at call time. ``kwargs`` is accepted but not
+    forwarded: only ``args`` reaches ``get_bert_tensorizer``.
+    """
+    from . import hf_models
+    return hf_models.get_bert_tensorizer(args)
+def init_hf_distilbert_tenzorizer(args, **kwargs):
+    """Build the DistilBERT tensorizer from ``args``.
+    ``hf_models`` is imported at call time. ``kwargs`` is accepted but not
+    forwarded: only ``args`` reaches ``get_distilbert_tensorizer``.
+    """
+    from . import hf_models
+    return hf_models.get_distilbert_tensorizer(args)
+# Encoder type name -> function that builds the bi-encoder components for it.
+BIENCODER_INITIALIZERS = dict(
+    hf_bert=init_hf_bert_biencoder,
+    hf_distilbert=init_hf_distilbert_biencoder,
+)
+# Encoder type name -> function that builds the tensorizer for it.
+TENSORIZER_INITIALIZERS = dict(
+    hf_bert=init_hf_bert_tenzorizer,
+    hf_distilbert=init_hf_distilbert_tenzorizer,
+)
+def init_comp(initializers_dict, type, args, **kwargs):
+    """Call the initializer registered under ``type``.
+    :param initializers_dict: mapping from type name to initializer callable
+    :param type: key to look up in ``initializers_dict``
+    :param args: passed to the initializer as its positional argument
+    :param kwargs: passed to the initializer as keyword arguments
+    :return: whatever the selected initializer returns
+    :raises RuntimeError: if ``type`` is not a key of ``initializers_dict``
+    """
+    # Reject unknown types up front so errors raised inside an initializer
+    # are never mistaken for a failed lookup.
+    if type not in initializers_dict:
+        raise RuntimeError('unsupported model type: {}'.format(type))
+    initializer = initializers_dict[type]
+    return initializer(args, **kwargs)
+def init_biencoder_components(encoder_type: str, args, **kwargs):
+    """Build the bi-encoder components registered for ``encoder_type``.
+    Delegates to ``init_comp`` using the ``BIENCODER_INITIALIZERS`` registry.
+    """
+    registry = BIENCODER_INITIALIZERS
+    return init_comp(registry, encoder_type, args, **kwargs)
+def init_tenzorizer(encoder_type: str, args, **kwargs):
+    """Build the tensorizer registered for ``encoder_type``.
+    Delegates to ``init_comp`` using the ``TENSORIZER_INITIALIZERS`` registry.
+    """
+    registry = TENSORIZER_INITIALIZERS
+    return init_comp(registry, encoder_type, args, **kwargs)

models/biencoder.py ADDED Viewed

	@@ -0,0 +1,427 @@

+import collections
+import logging
+import random
+from typing import Tuple, List
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor as T
+from torch import nn
+import sys
+import os
+current_dir = os.path.dirname(__file__)
+data_utils_path = os.path.join(current_dir, '..')
+sys.path.append(data_utils_path)
+from Data_utils_inf import Tensorizer
+from Data_utils_inf import normalize_question
+logger = logging.getLogger(__name__)
+# Batch record returned by create_biencoder_input; fields are positional, so
+# their order must match the order the constructors pass them in.
+BiEncoderBatch = collections.namedtuple(
+    typename="BiENcoderInput",
+    field_names=(
+        "question_ids",
+        "question_segments",
+        "context_ids",
+        "ctx_segments",
+        "is_positive",
+        "hard_negatives",
+    ),
+)
+def dot_product_scores(q_vectors: T, ctx_vectors: T) -> T:
+    """
+    Scores every question vector against every context vector by dot product.
+    :param q_vectors: question vectors, n1 x D
+    :param ctx_vectors: context vectors, n2 x D
+    :return: score matrix, n1 x n2
+    """
+    # Swap the first two axes of the contexts so the matrix product pairs
+    # each question row with each context row.
+    return q_vectors @ ctx_vectors.transpose(0, 1)
+def cosine_scores(q_vector: T, ctx_vectors: T):
+    """
+    Cosine similarity between ``q_vector`` and ``ctx_vectors`` along dim 1.
+    Both tensors are handed unchanged to ``F.cosine_similarity(..., dim=1)``.
+    NOTE(review): the earlier comment promised an n1 x n2 result, but dim=1 is
+    reduced and the two inputs are broadcast against each other, so a single
+    question row gives one score per context row - confirm against callers.
+    """
+    similarity = F.cosine_similarity(q_vector, ctx_vectors, dim=1)
+    return similarity
+class BiEncoder(nn.Module):
+    """Bi-Encoder model component. Encapsulates query/question and context/passage encoders."""
+    def __init__(
+        self,
+        question_model: nn.Module,
+        ctx_model: nn.Module,
+        fix_q_encoder: bool = False,
+        fix_ctx_encoder: bool = False,
+    ):
+        """
+        :param question_model: encoder applied to the question inputs
+        :param ctx_model: encoder applied to the context inputs
+        :param fix_q_encoder: run the question encoder under ``torch.no_grad()``
+        :param fix_ctx_encoder: run the context encoder under ``torch.no_grad()``
+        """
+        super(BiEncoder, self).__init__()
+        self.question_model = question_model
+        self.ctx_model = ctx_model
+        self.fix_q_encoder = fix_q_encoder
+        self.fix_ctx_encoder = fix_ctx_encoder
+    @staticmethod
+    def get_representation(
+        sub_model: nn.Module,
+        ids: T,
+        segments: T,
+        attn_mask: T,
+        fix_encoder: bool = False,
+    ) -> Tuple[T, T, T]:
+        """
+        Runs ``sub_model`` on one side of the batch.
+        :param sub_model: encoder, called as ``sub_model(ids, segments, attn_mask)``
+        :param ids: input ids; when None the encoder is not called
+        :param segments: second positional argument for the encoder
+        :param attn_mask: third positional argument for the encoder
+        :param fix_encoder: when True the encoder is called under ``torch.no_grad()``
+        :return: (sequence_output, pooled_output, hidden_states); all None when ``ids`` is None
+        """
+        sequence_output = None
+        pooled_output = None
+        hidden_states = None
+        if ids is not None:
+            if fix_encoder:
+                with torch.no_grad():
+                    sequence_output, pooled_output, hidden_states = sub_model(
+                        ids, segments, attn_mask
+                    )
+                # Outputs produced under no_grad are re-flagged as requiring
+                # grad while the sub-model is in training mode.
+                if sub_model.training:
+                    sequence_output.requires_grad_(requires_grad=True)
+                    pooled_output.requires_grad_(requires_grad=True)
+            else:
+                sequence_output, pooled_output, hidden_states = sub_model(
+                    ids, segments, attn_mask
+                )
+        return sequence_output, pooled_output, hidden_states
+    def forward(
+        self,
+        question_ids: T,
+        question_segments: T,
+        question_attn_mask: T,
+        context_ids: T,
+        ctx_segments: T,
+        ctx_attn_mask: T,
+    ) -> Tuple[T, T]:
+        """
+        Encodes the question side and the context side of a batch.
+        :return: (question pooled output, context pooled output); an entry is
+            None when the matching ids argument is None
+        """
+        _q_seq, q_pooled_out, _q_hidden = self.get_representation(
+            self.question_model,
+            question_ids,
+            question_segments,
+            question_attn_mask,
+            self.fix_q_encoder,
+        )
+        _ctx_seq, ctx_pooled_out, _ctx_hidden = self.get_representation(
+            self.ctx_model,
+            context_ids,
+            ctx_segments,
+            ctx_attn_mask,
+            self.fix_ctx_encoder,
+        )
+        return q_pooled_out, ctx_pooled_out
+    @classmethod
+    def create_biencoder_input(
+        cls,
+        samples: List,
+        tensorizer: Tensorizer,
+        insert_title: bool,
+        num_hard_negatives: int = 0,
+        num_other_negatives: int = 0,
+        shuffle: bool = True,
+        shuffle_positives: bool = False,
+        do_lower_fill: bool = False,
+        desegment_valid_fill: bool =False
+    ) -> BiEncoderBatch:
+        """
+        Creates a batch of the biencoder training tuple.
+        The ``samples`` entries are not modified: negatives are shuffled on
+        copies and lower-casing produces new dicts.
+        :param samples: list of data items (from json) to create the batch for
+        :param tensorizer: components to create model input tensors from a text sequence
+        :param insert_title: enables title insertion at the beginning of the context sequences
+        :param num_hard_negatives: amount of hard negatives per question (taken from samples' pools)
+        :param num_other_negatives: amount of other negatives per question (taken from samples' pools)
+        :param shuffle: shuffles negative passages pools
+        :param shuffle_positives: picks a random positive passage (only when ``shuffle`` is also set)
+        :param do_lower_fill: lower-cases the text of every selected passage
+        :param desegment_valid_fill: accepted but not used by this method
+        :return: BiEncoderBatch tuple
+        """
+        question_tensors = []
+        ctx_tensors = []
+        positive_ctx_indices = []
+        hard_neg_ctx_indices = []
+        for sample in samples:
+            # ctx+ & [ctx-] composition
+            # as of now, take the first(gold) ctx+ only
+            if shuffle and shuffle_positives:
+                positive_ctxs = sample["positive_ctxs"]
+                positive_ctx = positive_ctxs[np.random.choice(len(positive_ctxs))]
+            else:
+                positive_ctx = sample["positive_ctxs"][0]
+            # Work on copies so shuffling below never reorders the caller's lists.
+            neg_ctxs = list(sample["negative_ctxs"])
+            hard_neg_ctxs = list(sample["hard_negative_ctxs"])
+            if do_lower_fill:
+                # New dicts instead of in-place edits keep the sample untouched.
+                positive_ctx = dict(positive_ctx, text=positive_ctx["text"].lower())
+                neg_ctxs = [
+                    {"text": ctx["text"].lower(), "title": ctx["title"]}
+                    for ctx in neg_ctxs
+                ]
+                hard_neg_ctxs = [
+                    {"text": ctx["text"].lower(), "title": ctx["title"]}
+                    for ctx in hard_neg_ctxs
+                ]
+            question = normalize_question(sample["question"])
+            if shuffle:
+                random.shuffle(neg_ctxs)
+                random.shuffle(hard_neg_ctxs)
+            neg_ctxs = neg_ctxs[0:num_other_negatives]
+            hard_neg_ctxs = hard_neg_ctxs[0:num_hard_negatives]
+            all_ctxs = [positive_ctx] + neg_ctxs + hard_neg_ctxs
+            # Hard negatives sit after the positive AND the regular negatives,
+            # so their offset has to skip both.
+            hard_negatives_start_idx = 1 + len(neg_ctxs)
+            hard_negatives_end_idx = hard_negatives_start_idx + len(hard_neg_ctxs)
+            current_ctxs_len = len(ctx_tensors)
+            sample_ctxs_tensors = [
+                tensorizer.text_to_tensor(
+                    ctx["text"], title=ctx["title"] if insert_title else None
+                )
+                for ctx in all_ctxs
+            ]
+            ctx_tensors.extend(sample_ctxs_tensors)
+            positive_ctx_indices.append(current_ctxs_len)
+            hard_neg_ctx_indices.append(
+                list(
+                    range(
+                        current_ctxs_len + hard_negatives_start_idx,
+                        current_ctxs_len + hard_negatives_end_idx,
+                    )
+                )
+            )
+            question_tensors.append(tensorizer.text_to_tensor(question))
+        ctxs_tensor = torch.cat([ctx.view(1, -1) for ctx in ctx_tensors], dim=0)
+        questions_tensor = torch.cat([q.view(1, -1) for q in question_tensors], dim=0)
+        ctx_segments = torch.zeros_like(ctxs_tensor)
+        question_segments = torch.zeros_like(questions_tensor)
+        return BiEncoderBatch(
+            questions_tensor,
+            question_segments,
+            ctxs_tensor,
+            ctx_segments,
+            positive_ctx_indices,
+            hard_neg_ctx_indices,
+        )
+class DistilBertBiEncoder(nn.Module):
+    """Bi-Encoder model component. Encapsulates query/question and context/passage encoders."""
+    def __init__(
+        self,
+        question_model: nn.Module,
+        ctx_model: nn.Module,
+        fix_q_encoder: bool = False,
+        fix_ctx_encoder: bool = False,
+    ):
+        """
+        :param question_model: encoder applied to the question inputs
+        :param ctx_model: encoder applied to the context inputs
+        :param fix_q_encoder: run the question encoder under ``torch.no_grad()``
+        :param fix_ctx_encoder: run the context encoder under ``torch.no_grad()``
+        """
+        super(DistilBertBiEncoder, self).__init__()
+        self.question_model = question_model
+        self.ctx_model = ctx_model
+        self.fix_q_encoder = fix_q_encoder
+        self.fix_ctx_encoder = fix_ctx_encoder
+    @staticmethod
+    def get_representation(
+        sub_model: nn.Module,
+        ids: T,
+        segments: T,
+        attn_mask: T,
+        fix_encoder: bool = False,
+    ) -> Tuple[T, T, T]:
+        """
+        Runs ``sub_model`` on one side of the batch.
+        :param sub_model: encoder, called as ``sub_model(ids, attn_mask)``
+        :param ids: input ids; when None the encoder is not called
+        :param segments: kept for signature parity; not passed to the encoder
+        :param attn_mask: second positional argument for the encoder
+        :param fix_encoder: when True the encoder is called under ``torch.no_grad()``
+        :return: (sequence_output, pooled_output, hidden_states); all None when ``ids`` is None
+        """
+        sequence_output = None
+        pooled_output = None
+        hidden_states = None
+        if ids is not None:
+            if fix_encoder:
+                with torch.no_grad():
+                    # Only ids and the attention mask are forwarded here.
+                    sequence_output, pooled_output, hidden_states = sub_model(
+                        ids, attn_mask
+                    )
+                # Outputs produced under no_grad are re-flagged as requiring
+                # grad while the sub-model is in training mode.
+                if sub_model.training:
+                    sequence_output.requires_grad_(requires_grad=True)
+                    pooled_output.requires_grad_(requires_grad=True)
+            else:
+                sequence_output, pooled_output, hidden_states = sub_model(
+                    ids, attn_mask
+                )
+        return sequence_output, pooled_output, hidden_states
+    def forward(
+        self,
+        question_ids: T,
+        question_segments: T,
+        question_attn_mask: T,
+        context_ids: T,
+        ctx_segments: T,
+        ctx_attn_mask: T,
+    ) -> Tuple[T, T]:
+        """
+        Encodes the question side and the context side of a batch.
+        :return: (question pooled output, context pooled output); an entry is
+            None when the matching ids argument is None
+        """
+        _q_seq, q_pooled_out, _q_hidden = self.get_representation(
+            self.question_model,
+            question_ids,
+            question_segments,
+            question_attn_mask,
+            self.fix_q_encoder,
+        )
+        _ctx_seq, ctx_pooled_out, _ctx_hidden = self.get_representation(
+            self.ctx_model,
+            context_ids,
+            ctx_segments,
+            ctx_attn_mask,
+            self.fix_ctx_encoder,
+        )
+        return q_pooled_out, ctx_pooled_out
+    @classmethod
+    def create_biencoder_input(
+        cls,
+        samples: List,
+        tensorizer: Tensorizer,
+        insert_title: bool,
+        num_hard_negatives: int = 0,
+        num_other_negatives: int = 0,
+        shuffle: bool = True,
+        shuffle_positives: bool = False,
+        do_lower_fill: bool = False,
+        desegment_valid_fill: bool =False
+    ) -> BiEncoderBatch:
+        """
+        Creates a batch of the biencoder training tuple.
+        The ``samples`` entries are not modified: negatives are shuffled on
+        copies and lower-casing produces new dicts.
+        :param samples: list of data items (from json) to create the batch for
+        :param tensorizer: components to create model input tensors from a text sequence
+        :param insert_title: enables title insertion at the beginning of the context sequences
+        :param num_hard_negatives: amount of hard negatives per question (taken from samples' pools)
+        :param num_other_negatives: amount of other negatives per question (taken from samples' pools)
+        :param shuffle: shuffles negative passages pools
+        :param shuffle_positives: picks a random positive passage (only when ``shuffle`` is also set)
+        :param do_lower_fill: lower-cases the text of every selected passage
+        :param desegment_valid_fill: accepted but not used by this method
+        :return: BiEncoderBatch tuple
+        """
+        question_tensors = []
+        ctx_tensors = []
+        positive_ctx_indices = []
+        hard_neg_ctx_indices = []
+        for sample in samples:
+            # ctx+ & [ctx-] composition
+            # as of now, take the first(gold) ctx+ only
+            if shuffle and shuffle_positives:
+                positive_ctxs = sample["positive_ctxs"]
+                positive_ctx = positive_ctxs[np.random.choice(len(positive_ctxs))]
+            else:
+                positive_ctx = sample["positive_ctxs"][0]
+            # Work on copies so shuffling below never reorders the caller's lists.
+            neg_ctxs = list(sample["negative_ctxs"])
+            hard_neg_ctxs = list(sample["hard_negative_ctxs"])
+            if do_lower_fill:
+                # New dicts instead of in-place edits keep the sample untouched.
+                positive_ctx = dict(positive_ctx, text=positive_ctx["text"].lower())
+                neg_ctxs = [
+                    {"text": ctx["text"].lower(), "title": ctx["title"]}
+                    for ctx in neg_ctxs
+                ]
+                hard_neg_ctxs = [
+                    {"text": ctx["text"].lower(), "title": ctx["title"]}
+                    for ctx in hard_neg_ctxs
+                ]
+            question = normalize_question(sample["question"])
+            if shuffle:
+                random.shuffle(neg_ctxs)
+                random.shuffle(hard_neg_ctxs)
+            neg_ctxs = neg_ctxs[0:num_other_negatives]
+            hard_neg_ctxs = hard_neg_ctxs[0:num_hard_negatives]
+            all_ctxs = [positive_ctx] + neg_ctxs + hard_neg_ctxs
+            # Hard negatives sit after the positive AND the regular negatives,
+            # so their offset has to skip both.
+            hard_negatives_start_idx = 1 + len(neg_ctxs)
+            hard_negatives_end_idx = hard_negatives_start_idx + len(hard_neg_ctxs)
+            current_ctxs_len = len(ctx_tensors)
+            sample_ctxs_tensors = [
+                tensorizer.text_to_tensor(
+                    ctx["text"], title=ctx["title"] if insert_title else None
+                )
+                for ctx in all_ctxs
+            ]
+            ctx_tensors.extend(sample_ctxs_tensors)
+            positive_ctx_indices.append(current_ctxs_len)
+            hard_neg_ctx_indices.append(
+                list(
+                    range(
+                        current_ctxs_len + hard_negatives_start_idx,
+                        current_ctxs_len + hard_negatives_end_idx,
+                    )
+                )
+            )
+            question_tensors.append(tensorizer.text_to_tensor(question))
+        ctxs_tensor = torch.cat([ctx.view(1, -1) for ctx in ctx_tensors], dim=0)
+        questions_tensor = torch.cat([q.view(1, -1) for q in question_tensors], dim=0)
+        ctx_segments = torch.zeros_like(ctxs_tensor)
+        question_segments = torch.zeros_like(questions_tensor)
+        return BiEncoderBatch(
+            questions_tensor,
+            question_segments,
+            ctxs_tensor,
+            ctx_segments,
+            positive_ctx_indices,
+            hard_neg_ctx_indices,
+        )
+class BiEncoderNllLoss(object):
+    """Negative log-likelihood loss over question-to-context similarity scores."""
+    def calc(
+        self,
+        q_vectors: T,
+        ctx_vectors: T,
+        positive_idx_per_question: list,
+        hard_negatice_idx_per_question: list = None,
+    ) -> Tuple[T, int]:
+        """
+        Computes nll loss for the given lists of question and ctx vectors.
+        ``hard_negatice_idx_per_question`` is not read here; it is available
+        for loss variants, e.g. a weighted NLL that treats hard and regular
+        negatives differently.
+        :return: a tuple of loss value and amount of correct predictions per batch
+        """
+        scores = self.get_scores(q_vectors, ctx_vectors)
+        # Batched questions: force one row of scores per question.
+        if q_vectors.dim() > 1:
+            scores = scores.view(q_vectors.size(0), -1)
+        log_probs = F.log_softmax(scores, dim=1)
+        targets = torch.tensor(positive_idx_per_question)
+        loss = F.nll_loss(
+            log_probs,
+            targets.to(log_probs.device),
+            reduction="mean",
+        )
+        _, best_idxs = torch.max(log_probs, 1)
+        hits = best_idxs == targets.to(best_idxs.device)
+        correct_predictions_count = hits.sum()
+        return loss, correct_predictions_count
+    @staticmethod
+    def get_scores(q_vector: T, ctx_vectors: T) -> T:
+        """Apply the configured similarity function to the two vector sets."""
+        similarity = BiEncoderNllLoss.get_similarity_function()
+        return similarity(q_vector, ctx_vectors)
+    @staticmethod
+    def get_similarity_function():
+        """Return the similarity function used for scoring (dot product)."""
+        return dot_product_scores

models/hf_models.py ADDED Viewed

	@@ -0,0 +1,368 @@

+import logging
+from typing import Tuple
+import torch
+from torch import Tensor as T
+from torch import nn
+from transformers import BertConfig, BertModel
+from transformers.optimization import AdamW
+from transformers import BertTokenizer
+from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
+import sys
+import os
+current_dir = os.path.dirname(__file__)
+data_utils_path = os.path.join(current_dir, '..')
+sys.path.append(data_utils_path)
+from Data_utils_inf import Tensorizer
+from .biencoder import BiEncoder, DistilBertBiEncoder
+logger = logging.getLogger(__name__)
+def count_parameters(model):
+    """Return the total element count of the model's parameters that require grad."""
+    total = 0
+    for param in model.parameters():
+        if param.requires_grad:
+            total += param.numel()
+    return total
+def get_bert_biencoder_components(args, inference_only: bool = False, **kwargs):
+    """
+    Builds the BERT tensorizer, bi-encoder and (optionally) its optimizer.
+    :param args: settings object; ``pretrained_model_cfg`` and ``projection_dim`` are
+        always read, ``dropout`` and ``fix_ctx_encoder`` are optional, and the
+        optimizer settings are read only when an optimizer is built
+    :param inference_only: when True no optimizer is created
+    :param kwargs: forwarded to ``HFBertEncoder.init_encoder`` for both encoders
+    :return: (tensorizer, biencoder, optimizer); optimizer is None when ``inference_only``
+    """
+    dropout = getattr(args, "dropout", 0.0)
+    # Question and context encoders are built the same way, question first.
+    question_encoder, ctx_encoder = (
+        HFBertEncoder.init_encoder(
+            args.pretrained_model_cfg,
+            projection_dim=args.projection_dim,
+            dropout=dropout,
+            **kwargs
+        )
+        for _ in range(2)
+    )
+    fix_ctx_encoder = getattr(args, "fix_ctx_encoder", False)
+    biencoder = BiEncoder(
+        question_encoder, ctx_encoder, fix_ctx_encoder=fix_ctx_encoder
+    )
+    if inference_only:
+        optimizer = None
+    else:
+        optimizer = get_optimizer(
+            biencoder,
+            learning_rate=args.learning_rate,
+            adam_eps=args.adam_eps,
+            weight_decay=args.weight_decay,
+        )
+    tensorizer = get_bert_tensorizer(args)
+    return tensorizer, biencoder, optimizer
+def get_distilbert_biencoder_components(args, inference_only: bool = False, **kwargs):
+    """
+    Builds the DistilBERT tensorizer, bi-encoder and (optionally) its optimizer.
+    :param args: settings object; ``pretrained_model_cfg`` and ``projection_dim`` are
+        always read, ``dropout`` and ``fix_ctx_encoder`` are optional, and the
+        optimizer settings are read only when an optimizer is built
+    :param inference_only: when True no optimizer is created
+    :param kwargs: forwarded to ``HFDistilBertEncoder.init_encoder`` for both encoders
+    :return: (tensorizer, biencoder, optimizer); optimizer is None when ``inference_only``
+    """
+    dropout = getattr(args, "dropout", 0.0)
+    # Question and context encoders are built the same way, question first.
+    question_encoder, ctx_encoder = (
+        HFDistilBertEncoder.init_encoder(
+            args.pretrained_model_cfg,
+            projection_dim=args.projection_dim,
+            dropout=dropout,
+            **kwargs
+        )
+        for _ in range(2)
+    )
+    fix_ctx_encoder = getattr(args, "fix_ctx_encoder", False)
+    biencoder = DistilBertBiEncoder(
+        question_encoder, ctx_encoder, fix_ctx_encoder = fix_ctx_encoder
+    )
+    if inference_only:
+        optimizer = None
+    else:
+        optimizer = get_optimizer(
+            biencoder,
+            learning_rate=args.learning_rate,
+            adam_eps=args.adam_eps,
+            weight_decay=args.weight_decay,
+        )
+    tensorizer = get_distilbert_tensorizer(args)
+    return tensorizer, biencoder, optimizer
+def get_bert_tensorizer(args, tokenizer=None):
+    """
+    Wraps a BERT tokenizer in a ``BertTensorizer`` of length ``args.sequence_length``.
+    :param args: settings object; ``pretrained_model_cfg`` and ``do_lower_case`` are
+        read only when a tokenizer has to be created
+    :param tokenizer: tokenizer to reuse; a falsy value makes the function create one
+    """
+    tokenizer = tokenizer or get_bert_tokenizer(
+        args.pretrained_model_cfg, do_lower_case=args.do_lower_case
+    )
+    return BertTensorizer(tokenizer, args.sequence_length)
+def get_distilbert_tensorizer(args, tokenizer=None):
+    """
+    Wraps a DistilBERT tokenizer in a ``DistilBertTensorizer`` of length ``args.sequence_length``.
+    :param args: settings object; ``pretrained_model_cfg`` and ``do_lower_case`` are
+        read only when a tokenizer has to be created
+    :param tokenizer: tokenizer to reuse; a falsy value makes the function create one
+    """
+    tokenizer = tokenizer or get_distilbert_tokenizer(
+        args.pretrained_model_cfg, do_lower_case=args.do_lower_case
+    )
+    return DistilBertTensorizer(tokenizer, args.sequence_length)
+def get_optimizer(
+    model: nn.Module,
+    learning_rate: float = 1e-5,
+    adam_eps: float = 1e-8,
+    weight_decay: float = 0.0,
+) -> torch.optim.Optimizer:
+    """
+    Builds an AdamW optimizer with two parameter groups.
+    Parameters whose name contains "bias" or "LayerNorm.weight" get a weight
+    decay of 0.0; every other parameter gets ``weight_decay``.
+    :param model: module whose named parameters are optimized
+    :param learning_rate: passed to AdamW as ``lr``
+    :param adam_eps: passed to AdamW as ``eps``
+    :param weight_decay: decay for the first (decayed) group
+    """
+    no_decay = ("bias", "LayerNorm.weight")
+    decayed_params = []
+    undecayed_params = []
+    # One pass over the parameters, keeping their original order in each group.
+    for name, param in model.named_parameters():
+        if any(marker in name for marker in no_decay):
+            undecayed_params.append(param)
+        else:
+            decayed_params.append(param)
+    optimizer_grouped_parameters = [
+        {"params": decayed_params, "weight_decay": weight_decay},
+        {"params": undecayed_params, "weight_decay": 0.0},
+    ]
+    return AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_eps)
+def get_bert_tokenizer(pretrained_cfg_name: str, do_lower_case: bool = True):
+    """Load a ``BertTokenizer`` for ``pretrained_cfg_name`` with the given ``do_lower_case`` flag."""
+    tokenizer = BertTokenizer.from_pretrained(
+        pretrained_cfg_name, do_lower_case=do_lower_case
+    )
+    return tokenizer
+def get_distilbert_tokenizer(pretrained_cfg_name: str, do_lower_case: bool = True):
+    """Load a ``DistilBertTokenizer`` for ``pretrained_cfg_name`` with the given ``do_lower_case`` flag."""
+    # The Hugging Face tokenizer class is used directly, as for BERT.
+    tokenizer = DistilBertTokenizer.from_pretrained(
+        pretrained_cfg_name, do_lower_case=do_lower_case
+    )
+    return tokenizer
+class HFDistilBertEncoder(DistilBertModel):
+    """DistilBERT encoder whose pooled output is the first-token vector, optionally projected."""
+    def __init__(self, config, project_dim: int = 0):
+        """
+        :param config: DistilBERT configuration; ``hidden_size`` must be positive
+        :param project_dim: output size of the linear projection; 0 disables it
+        """
+        DistilBertModel.__init__(self, config)
+        assert config.hidden_size > 0, "Encoder hidden_size can't be zero"
+        self.encode_proj = (
+            nn.Linear(config.hidden_size, project_dim) if project_dim != 0 else None
+        )
+        self.init_weights()
+    @classmethod
+    def init_encoder(
+        cls, cfg_name: str, projection_dim: int = 0, dropout: float = 0.1, **kwargs
+    ) -> DistilBertModel:
+        """
+        Loads a pretrained encoder.
+        :param cfg_name: checkpoint name or path; the config falls back to
+            "distilbert-base-uncased" when it is falsy
+        :param projection_dim: forwarded to the constructor as ``project_dim``
+        :param dropout: when non-zero, overrides the config's dropout rates
+        :param kwargs: forwarded to ``from_pretrained``
+        """
+        cfg = DistilBertConfig.from_pretrained(cfg_name if cfg_name else "distilbert-base-uncased")
+        if dropout != 0:
+            # DistilBertConfig exposes its rates as ``attention_dropout`` and
+            # ``dropout``; the BERT-style ``*_dropout_prob`` names are not read
+            # by DistilBERT, so setting them left the override without effect.
+            cfg.attention_dropout = dropout
+            cfg.dropout = dropout
+        return cls.from_pretrained(
+            cfg_name, config=cfg, project_dim=projection_dim, **kwargs
+        )
+    def forward(
+        self, input_ids: T, attention_mask: T
+    ) -> Tuple[T, ...]:
+        """
+        :return: (sequence_output, pooled_output, hidden_states); ``hidden_states``
+            is None unless ``config.output_hidden_states`` is set
+        """
+        outputs = super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+        sequence_output = outputs.last_hidden_state
+        # Pooled output is the vector at sequence position 0.
+        pooled_output = sequence_output[:, 0, :]
+        hidden_states = (
+            outputs.hidden_states if self.config.output_hidden_states else None
+        )
+        if self.encode_proj is not None:
+            pooled_output = self.encode_proj(pooled_output)
+        return sequence_output, pooled_output, hidden_states
+    def get_out_size(self):
+        """Return the size of the pooled output: projection width if present, else hidden size."""
+        if self.encode_proj is not None:
+            return self.encode_proj.out_features
+        return self.config.hidden_size
+class HFBertEncoder(BertModel):
+    def __init__(self, config, project_dim: int = 0):
+        BertModel.__init__(self, config)
+        assert config.hidden_size > 0, "Encoder hidden_size can't be zero"
+        self.encode_proj = (
+            nn.Linear(config.hidden_size, project_dim) if project_dim != 0 else None
+        )
+        self.init_weights()
+    @classmethod
+    def init_encoder(
+        cls, cfg_name: str, projection_dim: int = 0, dropout: float = 0.1, **kwargs
+    ) -> BertModel:
+        cfg = BertConfig.from_pretrained(cfg_name if cfg_name else "bert-base-uncased")
+        if dropout != 0:
+            cfg.attention_probs_dropout_prob = dropout
+            cfg.hidden_dropout_prob = dropout
+        return cls.from_pretrained(
+            cfg_name, config=cfg, project_dim=projection_dim, **kwargs
+        )
+    def forward(
+        self, input_ids: T, token_type_ids: T, attention_mask: T
+    ) -> Tuple[T, ...]:
+        if self.config.output_hidden_states:
+            outputs = super().forward(
+                input_ids=input_ids,
+                token_type_ids=token_type_ids,
+                attention_mask=attention_mask,
+            )
+            sequence_output = outputs.last_hidden_state
+            pooled_output = outputs.pooler_output
+            hidden_states = outputs.hidden_states
+        else:
+            hidden_states = None
+            outputs = super().forward(
+                input_ids=input_ids,
+                token_type_ids=token_type_ids,
+                attention_mask=attention_mask,
+            )
+            sequence_output = outputs.last_hidden_state
+            pooled_output = outputs.pooler_output
+        if self.encode_proj:
+            pooled_output = self.encode_proj(pooled_output)
+        return sequence_output, pooled_output, hidden_states
+    def get_out_size(self):
+        if self.encode_proj:
+            return self.encode_proj.out_features
+        return self.config.hidden_size
+class DistilBertTensorizer(Tensorizer):
+    def __init__(
+        self, tokenizer: DistilBertTokenizer, max_length: int, pad_to_max: bool = True
+    ):
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        self.pad_to_max = pad_to_max
+    def text_to_tensor(
+        self, text: str, title: str = None, add_special_tokens: bool = True
+    ):
+        if isinstance(text, float):
+            text = 'nan'
+        text = text.strip()
+        # tokenizer automatic padding is explicitly disabled since its inconsistent behavior
+        if title:
+            token_ids = self.tokenizer.encode(
+                title,
+                text_pair = text,
+                add_special_tokens = add_special_tokens,
+                max_length = self.max_length,
+                pad_to_max_length = False,
+                truncation = True,
+            )
+        else:
+            token_ids = self.tokenizer.encode(
+                text,
+                add_special_tokens = add_special_tokens,
+                max_length = self.max_length,
+                pad_to_max_length = False,
+                truncation = True,
+            )
+        seq_len = self.max_length
+        if self.pad_to_max and len(token_ids) < seq_len:
+            token_ids = token_ids + [self.tokenizer.pad_token_id] * (
+                seq_len - len(token_ids)
+            )
+        if len(token_ids) > seq_len:
+            token_ids = token_ids[0:seq_len]
+            token_ids[-1] = self.tokenizer.sep_token_id
+        return torch.tensor(token_ids)
+class BertTensorizer(Tensorizer):
+    def __init__(
+        self, tokenizer: BertTokenizer, max_length: int, pad_to_max: bool = True
+    ):
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        self.pad_to_max = pad_to_max
+    def text_to_tensor(
+        self, text: str, title: str = None, add_special_tokens: bool = True
+    ):
+        if isinstance(text, float):
+            text = 'nan'
+        text = text.strip()
+        # tokenizer automatic padding is explicitly disabled since its inconsistent behavior
+        if title:
+            token_ids = self.tokenizer.encode(
+                title,
+                text_pair=text,
+                add_special_tokens=add_special_tokens,
+                max_length=self.max_length,
+                pad_to_max_length=False,
+                truncation=True,
+            )
+        else:
+            token_ids = self.tokenizer.encode(
+                text,
+                add_special_tokens=add_special_tokens,
+                max_length=self.max_length,
+                pad_to_max_length=False,
+                truncation=True,
+            )
+        seq_len = self.max_length
+        if self.pad_to_max and len(token_ids) < seq_len:
+            token_ids = token_ids + [self.tokenizer.pad_token_id] * (
+                seq_len - len(token_ids)
+            )
+        if len(token_ids) > seq_len:
+            token_ids = token_ids[0:seq_len]
+            token_ids[-1] = self.tokenizer.sep_token_id
+        return torch.tensor(token_ids)
+    def get_pair_separator_ids(self) -> T:
+        return torch.tensor([self.tokenizer.sep_token_id])
+    def get_pad_id(self) -> int:
+        return self.tokenizer.pad_token_id
+    def get_attn_mask(self, tokens_tensor: T) -> T:
+        return tokens_tensor != self.get_pad_id()
+    def is_sub_word_id(self, token_id: int):
+        token = self.tokenizer.convert_ids_to_tokens([token_id])[0]
+        return token.startswith("##") or token.startswith(" ##")
+    def to_string(self, token_ids, skip_special_tokens=True):
+        return self.tokenizer.decode(token_ids, skip_special_tokens=True)
+    def set_pad_to_max(self, do_pad: bool):
+        self.pad_to_max = do_pad

requirements.txt CHANGED Viewed

@@ -1,2 +1,8 @@
 fastapi== 0.104.1
 uvicorn[standard]

 fastapi== 0.104.1
 uvicorn[standard]
+puzpy
+transformers
+wordsegment
+torch
+faiss

words_alpha.txt ADDED Viewed

The diff for this file is too large to render. See raw diff