Spaces:

ofig
/

live-lm-critic

Runtime error

File size: 5,208 Bytes

b6e5241

"""
Edit distance utils...

Originally by https://worksheets.codalab.org/worksheets/0x8fc01c7fc2b742fdb29c05669f0ad7d2
"""
from collections import defaultdict
import numpy as np
import random
import string
from itertools import permutations

def process_filetype(filetype):
    insert = (filetype // 1000) % 2 == 1
    delete = (filetype // 100) % 2 == 1
    substitute = (filetype // 10) % 2 == 1
    swap = filetype % 2 == 1
    return insert, delete, substitute, swap

def get_all_edit_dist_one(word, filetype = 1111, sub_restrict = None):
    """
    Allowable edit_dist_one perturbations:
        1. Insert any lowercase characer at any position other than the start
        2. Delete any character other than the first one
        3. Substitute any lowercase character for any other lowercase letter other than the start
        4. Swap adjacent characters
    We also include the original word. Filetype determines which of the allowable perturbations to use.
    """
    insert, delete, substitute, swap = process_filetype(filetype)
    #last_mod_pos is last thing you could insert before
    last_mod_pos = len(word) #- 1
    ed1 = set()
    if len(word) <= 2 or word[:1].isupper() or word[:1].isnumeric():
        return ed1
    for pos in range(1, last_mod_pos + 1): #can add letters at the end
        if delete and pos < last_mod_pos:
            deletion = word[:pos] + word[pos + 1:]
            ed1.add(deletion)
        if swap and pos < last_mod_pos - 1:
            #swapping thing at pos with thing at pos + 1
            swaped = word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]
            ed1.add(swaped)
        for letter in string.ascii_lowercase: #+"'-": #no need to add '-, as we want to corrupt good to bad
            if insert:
                #Insert right after pos - 1
                insertion = word[:pos] + letter + word[pos:]
                ed1.add(insertion)
            can_substitute = sub_restrict is None or letter in sub_restrict[word[pos]]
            if substitute and pos < last_mod_pos and can_substitute:
                substitution = word[:pos] + letter + word[pos + 1:]
                ed1.add(substitution)
    #Include original word
    # ed1.add(word)
    return ed1

def get_all_internal_permutations(word):
    if len(word) > 10:
        return set([word])
    first_char = word[0]
    last_char = word[-1]
    internal_chars = word[1:-1]
    internal_permutations = set()
    for int_perm in permutations(internal_chars):
        int_perm_str = ''.join(int_perm)
        perm = '{}{}{}'.format(first_char, int_perm_str, last_char)
        internal_permutations.add(perm)
    return internal_permutations

def sample_random_internal_permutations(word, n_perts = 5):
    #We try swapping everything with the second character...
    if len(word) < 4:
        return set([word])
    #iterate through positions between second and last
    perturbations = set()
    start = word[0]
    end = word[-1]
    middle = word[1:-1]
    for _ in range(n_perts):
        middle_list = list(middle)
        random.shuffle(middle_list)
        mixed_up_middle = ''.join(middle_list)
        perturbations.add('{}{}{}'.format(start, mixed_up_middle, end))
    return perturbations

def get_sorted_word(word):
    if len(word) < 3:
        sorted_word = word
    else:
        sorted_word = '{}{}{}'.format(word[0], ''.join(sorted(word[1:-1])), word[-1])
    return sorted_word

def get_sorted_word_set(word):
    if len(word) < 3:
        sorted_word = word
    else:
        sorted_word = '{}{}{}'.format(word[0], ''.join(sorted(word[1:-1])), word[-1])
    return set([sorted_word])


#Used to create agglomerative clusters.
def preprocess_ed1_neighbors(vocab, sub_restrict = None, filetype = 1111):
    vocab = set([word.lower() for word in vocab])
    typo2words = defaultdict(set)
    for word in vocab:
        ed1_typos = get_all_edit_dist_one(word, filetype = filetype, sub_restrict = sub_restrict)
        for typo in ed1_typos:
            typo2words[typo].add(word)

    word2neighbors = defaultdict(set)
    for typo in typo2words:
        for word in typo2words[typo]:
            word2neighbors[word] = word2neighbors[word].union(typo2words[typo])
    return word2neighbors

#Used to create agglomerative clusters.
def ed1_neighbors_mat(vocab, sub_restrict = None, filetype = 1111):
    vocab = [word.lower() for word in vocab]
    word2idx = dict([(word, i) for i, word in enumerate(vocab)])
    word2neighbors = preprocess_ed1_neighbors(vocab, sub_restrict = sub_restrict, filetype = filetype)
    edges = set()
    for word in word2neighbors:
        for neighbor in word2neighbors[word]:
            edge = [word, neighbor]
            edge.sort()
            edge = tuple(edge)
            edges.add(edge)
    edge_mat = np.zeros((len(vocab), len(vocab)), dtype = int)
    for edge in edges:
        vtx1, vtx2 = edge
        idx1, idx2 = word2idx[vtx1], word2idx[vtx2]
        edge_mat[idx1][idx2] = 1
        edge_mat[idx2][idx1] = 1
    return edge_mat



if __name__ == '__main__':
    while True:
        word = input("Enter a word: ")
        print("Total number of possible perturbations: {}".format(len(get_all_edit_dist_one(word))))