catasaurus committed
Commit fc247ed
1 Parent(s): d7f93de

Added application file

Files changed (1)
app.py +191 -0
app.py ADDED
@@ -0,0 +1,191 @@
+ import gradio as gr
+ from collections import Counter
+
+ # Maximum character-count difference allowed when spell-correcting an
+ # unrecognized word to a known number word.
+ SPELL_CORRECT_MIN_CHAR_DIFF = 2
+
+ # Sentinel returned when a phrase cannot be parsed into an integer.
+ TOKENS2INT_ERROR_INT = 32202
+
+ # Number words for 0-19; each word's list index is its value.
+ ONES = [
+     "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
+     "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
+     "sixteen", "seventeen", "eighteen", "nineteen",
+ ]
+
+ # Single characters replaced before the text is split into tokens. A
+ # multi-character key such as "and" can never match here, because
+ # replace_chars looks up one character at a time; "and" is handled in
+ # TOKEN_MAPPING instead.
+ CHAR_MAPPING = {
+     "-": " ",
+     "_": " ",
+ }
+
+ # Whole tokens replaced after splitting: "and" is dropped, and the spoken
+ # digit "oh" becomes "0".
+ TOKEN_MAPPING = {
+     "and": " ",
+     "oh": "0",
+ }
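+ # For example, ''.join(replace_chars("twenty-three")) == "twenty three", and
+ # replace_tokens(["four", "oh", "seven"]) == ["four", "0", "seven"].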
+
+ def find_char_diff(a, b):
+     # Character-count difference between two strings (not edit distance):
+     # for every character that appears in `a`, add how far its count in `a`
+     # is from its count in `b`. Characters appearing only in `b` are ignored.
+     counts_a = Counter(a)
+     counts_b = Counter(b)
+     return sum(abs(count - counts_b[char]) for char, count in counts_a.items())
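+ # For example, find_char_diff("too", "two") == 1 (the extra "o"), while
+ # find_char_diff("fiv", "five") == 0 because the "e" occurs only in the
+ # second argument, which this one-sided count ignores.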
+
+ def tokenize(text):
+     # Lowercase, apply the character and token mappings, then convert each
+     # remaining word into an int.
+     text = text.lower()
+     tokens = replace_tokens(''.join(replace_chars(text)).split())
+     tokens = [tok for tok in tokens if tok != ' ']
+     return [convert_word_to_int(word) for word in tokens]
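+ # For example, tokenize("Twenty-Three") == [20, 3]: the text is lowercased,
+ # "-" becomes a space, and each word is looked up as a number.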
+
+
+ def detokenize(tokens):
+     # Join tokens back into a single string (currently unused by the app).
+     return ' '.join(str(tok) for tok in tokens)
+
+
+ def replace_tokens(tokens, token_mapping=TOKEN_MAPPING):
+     # Replace whole tokens via TOKEN_MAPPING, leaving unknown tokens as-is.
+     return [token_mapping.get(tok, tok) for tok in tokens]
+
+
+ def replace_chars(text, char_mapping=CHAR_MAPPING):
+     # Replace single characters via CHAR_MAPPING, one character at a time.
+     return [char_mapping.get(c, c) for c in text]
+
+ def convert_word_to_int(in_word, numwords={}):
+     # Convert a single word (or digit string) into an int. The mutable
+     # default `numwords` deliberately acts as a cache, so the lookup table
+     # is only built once.
+     tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
+     scales = ["hundred", "thousand", "million", "billion", "trillion"]
+     if not numwords:
+         for idx, word in enumerate(ONES):
+             numwords[word] = idx
+         for idx, word in enumerate(tens):
+             numwords[word] = idx * 10
+         for idx, word in enumerate(scales):
+             numwords[word] = 10 ** (idx * 3 or 2)
+     if in_word in numwords:
+         return numwords[in_word]
+     try:
+         return int(in_word)
+     except ValueError:
+         pass
+     # Spell correction: fall back to the closest known number word, if it
+     # is within SPELL_CORRECT_MIN_CHAR_DIFF by character counts; otherwise
+     # return None and let tokens2int report the error.
+     candidates = [word for word in ONES + tens + scales if word]
+     char_diffs = [find_char_diff(in_word, word) for word in candidates]
+     min_char_diff = min(char_diffs)
+     if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
+         return numwords[candidates[char_diffs.index(min_char_diff)]]
+     return None
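+ # For example, convert_word_to_int("forty") == 40 and
+ # convert_word_to_int("12") == 12, while the typo "sevn" is corrected to 7,
+ # since it differs from "seven" by a single missing "e".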
+
+
+ def tokens2int(tokens):
+     # Combine a list of int tokens into a single int.
+     if any(tok is None for tok in tokens):
+         # At least one word could not be converted; report the sentinel.
+         return TOKENS2INT_ERROR_INT
+     # Classify each token by magnitude: 1 = ones, 2 = tens, 3 = hundred+.
+     types = []
+     for tok in tokens:
+         if tok <= 9:
+             types.append(1)
+         elif tok <= 90:
+             types.append(2)
+         else:
+             types.append(3)
+     if len(tokens) <= 3:
+         current = 0
+         for i, number in enumerate(tokens):
+             if i != 0 and types[i] < types[i - 1] and current != tokens[i - 1] and types[i - 1] != 3:
+                 # A smaller unit after an unmerged larger one: add the pair.
+                 current += tokens[i] + tokens[i - 1]
+             elif current <= tokens[i] and current != 0:
+                 # A larger unit multiplies the running total, e.g. [2, 100] -> 200.
+                 current *= tokens[i]
+             elif 3 not in types and 1 not in types:
+                 # Only tens: read the tokens as concatenated digits.
+                 current = int(''.join(str(tok) for tok in tokens))
+                 break
+             elif '111' in ''.join(str(t) for t in types) and 2 not in types and 3 not in types:
+                 # Three ones in a row, e.g. [4, 0, 7] -> 407.
+                 current = int(''.join(str(tok) for tok in tokens))
+                 break
+             else:
+                 current += number
+     elif 3 not in types and 2 not in types:
+         # A longer run of single digits is read as one digit string.
+         current = int(''.join(str(tok) for tok in tokens))
+     else:
+         # Longer mixed phrases: fold the tokens together pairwise.
+         count = 0
+         current = 0
+         for i, token in enumerate(tokens):
+             count += 1
+             if count == 2:
+                 if types[i - 1] == types[i]:
+                     # Same magnitude: concatenate the pair's digits.
+                     current += int(str(token) + str(tokens[i - 1]))
+                 elif types[i - 1] > types[i]:
+                     # Decreasing magnitude: add the pair.
+                     current += tokens[i - 1] + token
+                 else:
+                     # Increasing magnitude: multiply the pair.
+                     current += tokens[i - 1] * token
+                 count = 0
+             elif i == len(tokens) - 1:
+                 # Odd trailing token.
+                 current += token
+
+     return current
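+ # Sanity checks for the three-or-fewer-token path: tokens2int([20, 3]) == 23,
+ # tokens2int([2, 100]) == 200, and tokens2int([1, 100, 1]) == 101.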
+
+ def text2int(text):
+     # Wrap the whole pipeline: text -> tokens -> single int.
+     return tokens2int(tokenize(text))
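+ # End to end: text2int("two hundred and six") == 206 and
+ # text2int("four oh seven") == 407; a word that cannot be converted or
+ # spell-corrected makes the result TOKENS2INT_ERROR_INT.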
+
+
+ # Expose text2int as a Gradio demo with a text input and a text output.
+ iface = gr.Interface(fn=text2int, inputs="text", outputs="text")
+ iface.launch()
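+ # launch() starts a local web server for the demo (and serves the app when
+ # hosted as a Space); entering "twenty-three" should display 23.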