#!/usr/bin/env python
# coding: utf-8

# In[5]:

# NOTE(review): nbimporter is imported only for its side effect -- presumably
# so that `isNumber` can be loaded from a notebook; confirm before removing.
import nbimporter
from isNumber import is_number  # used by text_to_int to detect digit tokens


def text_to_int(textnum, numwords=None):
    """Replace spelled-out English numbers in *textnum* with digits.

    The text is split on whitespace (hyphens are treated as spaces) and
    scanned left to right.  Runs of number words are folded into a single
    integer; every other word is copied through unchanged.

    Rules implemented below:

    * ``and`` is absorbed into a number only when it directly follows a
      scale word ("one hundred and twenty"); otherwise it is ordinary text.
    * A number word that follows a unit word (zero..nineteen) and is not a
      scale starts a new number, so "one two three" becomes "1 2 3".
    * Ordinals are reduced to their cardinal value ("sixth" -> 6,
      "twentieth" -> 20).  A trailing "th"/"ieth" is stripped only when the
      remaining stem is itself a number token, so "month" stays "month".
    * Scale words cover hundred, thousand, lakh/lac (10**5), million,
      billion and trillion.
    * Tokens accepted by ``is_number`` are converted with ``int`` after
      removing commas.

    Examples (assuming ``is_number`` rejects alphabetic words):
        text_to_int("two hundred thousand")  -> "200000"
        text_to_int("three million")         -> "3000000"
        text_to_int("sixth month")           -> "6 month"
        text_to_int("one two three")         -> "1 2 3"

    Args:
        textnum: Input text (str).
        numwords: Optional dict mapping a word to ``(scale, increment)``.
            When omitted, or when an empty dict is passed, the built-in
            English table is used (an empty dict is filled in place).

    Returns:
        The converted text as a single-space-separated string.
    """
    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
             'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen',
             'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen',
             'nineteen']
    # Position in the list is the tens digit; slots 0 and 1 are placeholders.
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
            'eighty', 'ninety']
    # Explicit powers of ten.  Deriving them from the list index breaks as
    # soon as a non-multiple-of-three scale such as lakh is inserted.
    scales = {'hundred': 2, 'thousand': 3, 'lakh': 5, 'lac': 5,
              'million': 6, 'billion': 9, 'trillion': 12}
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    if numwords is None:
        numwords = {}
    if not numwords:
        numwords['and'] = (1, 0)  # lets "one hundred and twenty" parse
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # never register the blank placeholders
                numwords[word] = (1, idx * 10)
        for word, exponent in scales.items():
            numwords[word] = (10 ** exponent, 0)

    def parse_token(token):
        """Return (scale, increment) for a number token, else None."""
        if is_number(token):
            try:
                # Scale 0: a literal number replaces the pending sub-total.
                return 0, int(token.replace(',', ''))
            except ValueError:
                # NOTE(review): reached only if is_number() accepts values
                # int() cannot parse (e.g. "3.5"); such tokens are passed
                # through as plain text instead of raising.
                return None
        return numwords.get(token)

    pieces = []           # output tokens, joined with single spaces
    current = result = 0  # sub-total being built / total of flushed groups
    onnumber = lastunit = lastscale = False

    for word in textnum.replace('-', ' ').split():
        if word in ordinal_words:
            current += ordinal_words[word]
            onnumber = True
            lastunit = lastscale = False
            continue

        parsed = parse_token(word)
        if parsed is None:
            # Try the word as a regular ordinal ("fourth", "twentieth"), but
            # keep the stripped stem only if it really is a number token.
            for ending, replacement in ordinal_endings:
                if not word.endswith(ending):
                    continue
                stem = word[:-len(ending)] + replacement
                stem_parsed = parse_token(stem) if stem else None
                if stem_parsed is not None:
                    word, parsed = stem, stem_parsed
                    break

        if parsed is None or (word == 'and' and not lastscale):
            # Plain text: emit any number in progress, then the word itself.
            if onnumber:
                pieces.append(str(result + current))
            pieces.append(word)
            result = current = 0
            onnumber = lastunit = lastscale = False
            continue

        scale, increment = parsed
        onnumber = True
        if lastunit and word not in scales:
            # Sequence of separate numbers, e.g. a zipcode "one two three".
            pieces.append(str(result + current))
            result = current = 0
        if scale > 1:
            current = max(1, current)  # bare "hundred" means "one hundred"
        current = current * scale + increment
        if scale > 100:
            # Only scales above hundred close a group; "hundred" must stay in
            # `current` so that "two hundred thousand" gives 200000.
            result += current
            current = 0
        lastscale = word in scales
        lastunit = word in units

    if onnumber:
        pieces.append(str(result + current))
    return " ".join(pieces)


# In[ ]: