# ENGLISH_ASR / text2int.py
# Uploaded by cdactvm ("Upload 6 files", commit 3b58a97, verified)
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# # # Function to convert English number words in text to their numerical representation
# from isNumber import is_number
# def text_to_int (textnum, numwords={}):
# units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
# 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
# 'sixteen', 'seventeen', 'eighteen', 'nineteen']
# tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
# scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion']
# ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
# ordinal_endings = [('ieth', 'y'), ('th', '')]
# if not numwords:
# numwords['and'] = (1, 0)
# for idx, word in enumerate(units): numwords[word] = (1, idx)
# for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
# for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
# textnum = textnum.replace('-', ' ')
# current = result = 0
# curstring = ''
# onnumber = False
# lastunit = False
# lastscale = False
# def is_numword(x):
# if is_number(x):
# return True
# if word in numwords:
# return True
# return False
# def from_numword(x):
# if is_number(x):
# scale = 0
# increment = int(x.replace(',', ''))
# return scale, increment
# return numwords[x]
# for word in textnum.split():
# if word in ordinal_words:
# scale, increment = (1, ordinal_words[word])
# current = current * scale + increment
# if scale > 100:
# result += current
# current = 0
# onnumber = True
# lastunit = False
# lastscale = False
# else:
# for ending, replacement in ordinal_endings:
# if word.endswith(ending):
# word = "%s%s" % (word[:-len(ending)], replacement)
# if (not is_numword(word)) or (word == 'and' and not lastscale):
# if onnumber:
# # Flush the current number we are building
# curstring += repr(result + current) + " "
# curstring += word + " "
# result = current = 0
# onnumber = False
# lastunit = False
# lastscale = False
# else:
# scale, increment = from_numword(word)
# onnumber = True
# if lastunit and (word not in scales):
# # Assume this is part of a string of individual numbers to
# # be flushed, such as a zipcode "one two three four five"
# curstring += repr(result + current)
# result = current = 0
# if scale > 1:
# current = max(1, current)
# current = current * scale + increment
# if scale > 100:
# result += current
# current = 0
# lastscale = False
# lastunit = False
# if word in scales:
# lastscale = True
# elif word in units:
# lastunit = True
# if onnumber:
# curstring += repr(result + current)
# return curstring
# In[5]:
import nbimporter
from isNumber import is_number # Remove or replace this if unnecessary
def text_to_int(textnum, numwords=None):
    """Replace spelled-out English numbers in *textnum* with digits.

    The text is split on whitespace (hyphens count as spaces). Runs of
    number words ("two hundred and five", "twenty-first", "three lakh")
    are collapsed into their integer value; every other word is copied
    through unchanged. Consecutive unit words ("one two three") are
    emitted as separate numbers.

    Args:
        textnum: The input text.
        numwords: Optional word -> (scale, increment) table. When omitted,
            a fresh table is built for this call. An empty dict passed in
            is filled in place; a non-empty dict is used as-is.

    Returns:
        The rewritten text, tokens joined by single spaces.

    Note:
        Tokens that are not in the table are classified with
        ``is_number``, which this function expects to find at module
        level.
    """
    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
             'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen',
             'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen',
             'nineteen']
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
            'eighty', 'ninety']
    # Explicit powers of ten. Deriving them from the list position breaks as
    # soon as a word such as "lakh" (10^5) is inserted between the others.
    scales = {'hundred': 2, 'thousand': 3, 'lakh': 5, 'million': 6,
              'billion': 9, 'trillion': 12}
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]
    unit_words = set(units)

    # Avoid a mutable default argument: state must not leak between calls.
    if numwords is None:
        numwords = {}
    if not numwords:
        numwords['and'] = (1, 0)  # Handle "one hundred and twenty"
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # skip the '' placeholders so '' is never a number word
                numwords[word] = (1, idx * 10)
        for word, exponent in scales.items():
            numwords[word] = (10 ** exponent, 0)

    def lookup(token):
        """Return the (scale, increment) pair for *token*, or None."""
        if token in numwords:
            return numwords[token]
        if is_number(token):
            try:
                # Scale 0 makes a literal number replace the running value.
                return 0, int(token.replace(',', ''))
            except ValueError:
                # NOTE(review): is_number is defined elsewhere; if it accepts
                # tokens int() cannot parse (e.g. decimals), pass them
                # through as plain text instead of crashing.
                return None
        return None

    pieces = []
    current = result = 0
    onnumber = lastunit = lastscale = False

    for word in textnum.replace('-', ' ').split():
        if word in ordinal_words:
            current += ordinal_words[word]
            onnumber = True
            lastunit = lastscale = False
            continue

        # Strip an ordinal suffix only when what remains is a number word,
        # so ordinary words ending in "th" ("month", "with") stay intact.
        entry = None
        for ending, replacement in ordinal_endings:
            if word.endswith(ending):
                stem = word[:-len(ending)] + replacement
                stem_entry = lookup(stem) if stem else None
                if stem_entry is not None:
                    word, entry = stem, stem_entry
                    break
        if entry is None:
            entry = lookup(word)

        # "and" only belongs to a number right after a scale word.
        if entry is None or (word == 'and' and not lastscale):
            if onnumber:
                # Flush the number that was being built.
                pieces.append(repr(result + current))
            pieces.append(word)
            result = current = 0
            onnumber = lastunit = lastscale = False
            continue

        scale, increment = entry
        onnumber = True
        if lastunit and word not in scales:
            # A unit followed by a non-scale word: treat as a digit sequence
            # such as "one two three" and emit each number separately.
            pieces.append(repr(result + current))
            result = current = 0
        if scale > 1:
            current = max(1, current)
        current = current * scale + increment
        # "hundred" must stay in `current` so a following larger scale can
        # multiply it ("two hundred thousand" -> 200000).
        if scale > 100:
            result += current
            current = 0
        lastscale = word in scales
        lastunit = word in unit_words

    if onnumber:
        pieces.append(repr(result + current))
    return " ".join(pieces)
# In[ ]: