Spaces:

catasaurus
/

text2int

Build error

App Files Community

catasaurus committed on Jan 5, 2023

Commit

dc4f50e

•

1 Parent(s): fc247ed

Added better spell check

Browse files

Files changed (1) hide show

app.py +39 -3

app.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import gradio as gr
 #import os
 #os.environ['KMP_DUPLICATE_LIB_OK']='True'
 #import spacy
 # Change this according to what words should be corrected to
 SPELL_CORRECT_MIN_CHAR_DIFF = 2
@@ -25,6 +25,39 @@ TOKEN_MAPPING = {
     "and": " ",
     "oh":"0",
 }
 def find_char_diff(a, b):
     # Finds the character difference between two str objects by counting the occurrences of every character. Not edit distance.
@@ -94,12 +127,14 @@ def convert_word_to_int(in_word, numwords={}):
         return int(in_word)
     except ValueError:
         pass
     # Spell correction using find_char_diff
     char_diffs = [find_char_diff(in_word, i) for i in ONES + tens + scales]
     min_char_diff = min(char_diffs)
     if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
         return char_diffs.index(min_char_diff)
 def tokens2int(tokens):
     # Takes a list of tokens and returns an int representation of them
@@ -187,5 +222,6 @@ def text2int(text):
     return tokens2int(tokenize(text))
 iface = gr.Interface(fn=text2int, inputs="text", outputs="text")
 iface.launch()

 import gradio as gr
 #import os
 #os.environ['KMP_DUPLICATE_LIB_OK']='True'
 #import spacy
+import re
+from collections import Counter
 # Change this according to what words should be corrected to
 SPELL_CORRECT_MIN_CHAR_DIFF = 2
     "and": " ",
     "oh":"0",
 }
+def words(text): return re.findall(r'\w+', text.lower())
+WORDS = Counter(words(open('numbers.txt').read()))
+def P(word, N=sum(WORDS.values())):
+    "Probability of `word`."
+    return WORDS[word] / N
+def correction(word):
+    "Most probable spelling correction for word."
+    return max(candidates(word), key=P)
+def candidates(word):
+    "Generate possible spelling corrections for word."
+    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
+def known(words):
+    "The subset of `words` that appear in the dictionary of WORDS."
+    return set(w for w in words if w in WORDS)
+def edits1(word):
+    "All edits that are one edit away from `word`."
+    letters    = 'abcdefghijklmnopqrstuvwxyz'
+    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
+    deletes    = [L + R[1:]               for L, R in splits if R]
+    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
+    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
+    inserts    = [L + c + R               for L, R in splits for c in letters]
+    return set(deletes + transposes + replaces + inserts)
+def edits2(word):
+    "All edits that are two edits away from `word`."
+    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
 def find_char_diff(a, b):
     # Finds the character difference between two str objects by counting the occurrences of every character. Not edit distance.
         return int(in_word)
     except ValueError:
         pass
+    """
     # Spell correction using find_char_diff
     char_diffs = [find_char_diff(in_word, i) for i in ONES + tens + scales]
     min_char_diff = min(char_diffs)
     if min_char_diff <= SPELL_CORRECT_MIN_CHAR_DIFF:
         return char_diffs.index(min_char_diff)
+    """
+    return numwords[correction(in_word)]
 def tokens2int(tokens):
     # Takes a list of tokens and returns an int representation of them
     return tokens2int(tokenize(text))
 iface = gr.Interface(fn=text2int, inputs="text", outputs="text")
 iface.launch()