File size: 5,208 Bytes
b6e5241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Edit distance utils...

Originally by https://worksheets.codalab.org/worksheets/0x8fc01c7fc2b742fdb29c05669f0ad7d2
"""
from collections import defaultdict
import numpy as np
import random
import string
from itertools import permutations

def process_filetype(filetype):
    insert = (filetype // 1000) % 2 == 1
    delete = (filetype // 100) % 2 == 1
    substitute = (filetype // 10) % 2 == 1
    swap = filetype % 2 == 1
    return insert, delete, substitute, swap

def get_all_edit_dist_one(word, filetype = 1111, sub_restrict = None):
    """
    Allowable edit_dist_one perturbations:
        1. Insert any lowercase characer at any position other than the start
        2. Delete any character other than the first one
        3. Substitute any lowercase character for any other lowercase letter other than the start
        4. Swap adjacent characters
    We also include the original word. Filetype determines which of the allowable perturbations to use.
    """
    insert, delete, substitute, swap = process_filetype(filetype)
    #last_mod_pos is last thing you could insert before
    last_mod_pos = len(word) #- 1
    ed1 = set()
    if len(word) <= 2 or word[:1].isupper() or word[:1].isnumeric():
        return ed1
    for pos in range(1, last_mod_pos + 1): #can add letters at the end
        if delete and pos < last_mod_pos:
            deletion = word[:pos] + word[pos + 1:]
            ed1.add(deletion)
        if swap and pos < last_mod_pos - 1:
            #swapping thing at pos with thing at pos + 1
            swaped = word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]
            ed1.add(swaped)
        for letter in string.ascii_lowercase: #+"'-": #no need to add '-, as we want to corrupt good to bad
            if insert:
                #Insert right after pos - 1
                insertion = word[:pos] + letter + word[pos:]
                ed1.add(insertion)
            can_substitute = sub_restrict is None or letter in sub_restrict[word[pos]]
            if substitute and pos < last_mod_pos and can_substitute:
                substitution = word[:pos] + letter + word[pos + 1:]
                ed1.add(substitution)
    #Include original word
    # ed1.add(word)
    return ed1

def get_all_internal_permutations(word):
    if len(word) > 10:
        return set([word])
    first_char = word[0]
    last_char = word[-1]
    internal_chars = word[1:-1]
    internal_permutations = set()
    for int_perm in permutations(internal_chars):
        int_perm_str = ''.join(int_perm)
        perm = '{}{}{}'.format(first_char, int_perm_str, last_char)
        internal_permutations.add(perm)
    return internal_permutations

def sample_random_internal_permutations(word, n_perts = 5):
    #We try swapping everything with the second character...
    if len(word) < 4:
        return set([word])
    #iterate through positions between second and last
    perturbations = set()
    start = word[0]
    end = word[-1]
    middle = word[1:-1]
    for _ in range(n_perts):
        middle_list = list(middle)
        random.shuffle(middle_list)
        mixed_up_middle = ''.join(middle_list)
        perturbations.add('{}{}{}'.format(start, mixed_up_middle, end))
    return perturbations

def get_sorted_word(word):
    if len(word) < 3:
        sorted_word = word
    else:
        sorted_word = '{}{}{}'.format(word[0], ''.join(sorted(word[1:-1])), word[-1])
    return sorted_word

def get_sorted_word_set(word):
    if len(word) < 3:
        sorted_word = word
    else:
        sorted_word = '{}{}{}'.format(word[0], ''.join(sorted(word[1:-1])), word[-1])
    return set([sorted_word])


#Used to create agglomerative clusters.
def preprocess_ed1_neighbors(vocab, sub_restrict = None, filetype = 1111):
    vocab = set([word.lower() for word in vocab])
    typo2words = defaultdict(set)
    for word in vocab:
        ed1_typos = get_all_edit_dist_one(word, filetype = filetype, sub_restrict = sub_restrict)
        for typo in ed1_typos:
            typo2words[typo].add(word)

    word2neighbors = defaultdict(set)
    for typo in typo2words:
        for word in typo2words[typo]:
            word2neighbors[word] = word2neighbors[word].union(typo2words[typo])
    return word2neighbors

#Used to create agglomerative clusters.
def ed1_neighbors_mat(vocab, sub_restrict = None, filetype = 1111):
    vocab = [word.lower() for word in vocab]
    word2idx = dict([(word, i) for i, word in enumerate(vocab)])
    word2neighbors = preprocess_ed1_neighbors(vocab, sub_restrict = sub_restrict, filetype = filetype)
    edges = set()
    for word in word2neighbors:
        for neighbor in word2neighbors[word]:
            edge = [word, neighbor]
            edge.sort()
            edge = tuple(edge)
            edges.add(edge)
    edge_mat = np.zeros((len(vocab), len(vocab)), dtype = int)
    for edge in edges:
        vtx1, vtx2 = edge
        idx1, idx2 = word2idx[vtx1], word2idx[vtx2]
        edge_mat[idx1][idx2] = 1
        edge_mat[idx2][idx1] = 1
    return edge_mat



if __name__ == '__main__':
    while True:
        word = input("Enter a word: ")
        print("Total number of possible perturbations: {}".format(len(get_all_edit_dist_one(word))))