catasaurus committed on
Commit
dc4f50e
1 Parent(s): fc247ed

Added better spell check

Browse files
Files changed (1) hide show
  1. app.py +39 -3
app.py CHANGED
@@ -1,9 +1,9 @@
1
  import gradio as gr
2
-
3
  #import os
4
  #os.environ['KMP_DUPLICATE_LIB_OK']='True'
5
  #import spacy
6
-
 
7
  # Change this according to what words should be corrected to
8
  SPELL_CORRECT_MIN_CHAR_DIFF = 2
9
 
@@ -25,6 +25,39 @@ TOKEN_MAPPING = {
25
  "and": " ",
26
  "oh":"0",
27
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def find_char_diff(a, b):
30
  # Finds the character difference between two str objects by counting the occurrences of every character. Not edit distance.
@@ -94,12 +127,14 @@ def convert_word_to_int(in_word, numwords={}):
94
  return int(in_word)
95
  except ValueError:
96
  pass
 
97
  # Spell correction using find_char_diff
98
  char_diffs = [find_char_diff(in_word, i) for i in ONES + tens + scales]
99
  min_char_diff = min(char_diffs)
100
  if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
101
  return char_diffs.index(min_char_diff)
102
-
 
103
 
104
  def tokens2int(tokens):
105
  # Takes a list of tokens and returns an int representation of them
@@ -187,5 +222,6 @@ def text2int(text):
187
  return tokens2int(tokenize(text))
188
 
189
 
 
190
  iface = gr.Interface(fn=text2int, inputs="text", outputs="text")
191
  iface.launch()
 
1
  import gradio as gr
 
2
  #import os
3
  #os.environ['KMP_DUPLICATE_LIB_OK']='True'
4
  #import spacy
5
+ import re
6
+ from collections import Counter
7
  # Change this according to what words should be corrected to
8
  SPELL_CORRECT_MIN_CHAR_DIFF = 2
9
 
 
25
  "and": " ",
26
  "oh":"0",
27
  }
28
def words(text):
    """Return every run of word characters in `text`, lower-cased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)
29
+
30
# Frequency table of every word found in numbers.txt; it is the dictionary
# the spell corrector checks candidates against. The file is opened in a
# `with` block so the handle is closed as soon as its contents are read.
with open('numbers.txt') as _numbers_file:
    WORDS = Counter(words(_numbers_file.read()))
31
+
32
def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` in WORDS.

    `N` defaults to the sum of all counts in WORDS; that default is
    evaluated once, when this function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N
35
+
36
def correction(word):
    """Return the candidate for `word` that has the highest P value."""
    options = candidates(word)
    return max(options, key=P)
39
+
40
def candidates(word):
    """Return the possible spelling corrections for `word`.

    Tries, in order, the word itself, the known words one edit away, and
    the known words two edits away; the first non-empty set wins. If none
    of them yields a known word, the result is `[word]` unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away

    # Nothing in the dictionary is close enough: fall back to the input.
    return [word]
43
+
44
def known(words):
    """Return, as a set, the members of `words` that are present in WORDS."""
    return {entry for entry in words if entry in WORDS}
47
+
48
def edits1(word):
    """Return the set of all strings exactly one edit away from `word`.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a lowercase ASCII letter, or
    inserting a lowercase ASCII letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # An insertion is possible at every cut point, including the end.
        results.update(head + ch + tail for ch in alphabet)
        if not tail:
            continue
        # Delete the character right after the cut.
        results.add(head + tail[1:])
        # Replace the character right after the cut.
        results.update(head + ch + tail[1:] for ch in alphabet)
        if len(tail) > 1:
            # Swap the two characters right after the cut.
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results
57
+
58
def edits2(word):
    """Return a generator over every string two edits away from `word`.

    Each result is obtained by applying edits1 to a result of
    edits1(word); the same string may be produced more than once.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))
61
 
62
  def find_char_diff(a, b):
63
  # Finds the character difference between two str objects by counting the occurrences of every character. Not edit distance.
 
127
  return int(in_word)
128
  except ValueError:
129
  pass
130
+ """
131
  # Spell correction using find_char_diff
132
  char_diffs = [find_char_diff(in_word, i) for i in ONES + tens + scales]
133
  min_char_diff = min(char_diffs)
134
  if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
135
  return char_diffs.index(min_char_diff)
136
+ """
137
+ return numwords[correction(in_word)]
138
 
139
  def tokens2int(tokens):
140
  # Takes a list of tokens and returns an int representation of them
 
222
  return tokens2int(tokenize(text))
223
 
224
 
225
+
226
  iface = gr.Interface(fn=text2int, inputs="text", outputs="text")
227
  iface.launch()