Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[1]: | |
| # # # Function to convert Hindi text to numerical representation | |
| # from isNumber import is_number | |
| # def text_to_int (textnum, numwords={}): | |
| # units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', | |
| # 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', | |
| # 'sixteen', 'seventeen', 'eighteen', 'nineteen'] | |
| # tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'] | |
| # scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion'] | |
| # ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12} | |
| # ordinal_endings = [('ieth', 'y'), ('th', '')] | |
| # if not numwords: | |
| # numwords['and'] = (1, 0) | |
| # for idx, word in enumerate(units): numwords[word] = (1, idx) | |
| # for idx, word in enumerate(tens): numwords[word] = (1, idx * 10) | |
| # for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0) | |
| # textnum = textnum.replace('-', ' ') | |
| # current = result = 0 | |
| # curstring = '' | |
| # onnumber = False | |
| # lastunit = False | |
| # lastscale = False | |
| # def is_numword(x): | |
| # if is_number(x): | |
| # return True | |
| # if word in numwords: | |
| # return True | |
| # return False | |
| # def from_numword(x): | |
| # if is_number(x): | |
| # scale = 0 | |
| # increment = int(x.replace(',', '')) | |
| # return scale, increment | |
| # return numwords[x] | |
| # for word in textnum.split(): | |
| # if word in ordinal_words: | |
| # scale, increment = (1, ordinal_words[word]) | |
| # current = current * scale + increment | |
| # if scale > 100: | |
| # result += current | |
| # current = 0 | |
| # onnumber = True | |
| # lastunit = False | |
| # lastscale = False | |
| # else: | |
| # for ending, replacement in ordinal_endings: | |
| # if word.endswith(ending): | |
| # word = "%s%s" % (word[:-len(ending)], replacement) | |
| # if (not is_numword(word)) or (word == 'and' and not lastscale): | |
| # if onnumber: | |
| # # Flush the current number we are building | |
| # curstring += repr(result + current) + " " | |
| # curstring += word + " " | |
| # result = current = 0 | |
| # onnumber = False | |
| # lastunit = False | |
| # lastscale = False | |
| # else: | |
| # scale, increment = from_numword(word) | |
| # onnumber = True | |
| # if lastunit and (word not in scales): | |
| # # Assume this is part of a string of individual numbers to | |
| # # be flushed, such as a zipcode "one two three four five" | |
| # curstring += repr(result + current) | |
| # result = current = 0 | |
| # if scale > 1: | |
| # current = max(1, current) | |
| # current = current * scale + increment | |
| # if scale > 100: | |
| # result += current | |
| # current = 0 | |
| # lastscale = False | |
| # lastunit = False | |
| # if word in scales: | |
| # lastscale = True | |
| # elif word in units: | |
| # lastunit = True | |
| # if onnumber: | |
| # curstring += repr(result + current) | |
| # return curstring | |
| # In[5]: | |
| import nbimporter | |
| from isNumber import is_number # Remove or replace this if unnecessary | |
def text_to_int(textnum, numwords=None):
    """Replace spelled-out English numbers in *textnum* with digits.

    Words that are not part of a number are passed through unchanged, and
    the resulting tokens are joined with single spaces.  Hyphens are treated
    as spaces, so "twenty-one" is read as "twenty one".  A run of bare units
    ("one two three") is emitted as separate numbers ("1 2 3").

    Args:
        textnum: Input text, e.g. "two lakh fifty thousand rupees".
        numwords: Optional lookup table mapping a word to a
            ``(scale, increment)`` tuple.  If omitted, a fresh default table
            is built.  If an empty dict is passed, it is populated in place
            with the default table.  A non-empty dict is used as is.

    Returns:
        The text with every recognised number replaced by its digits.

    Note:
        Digit tokens are recognised through the external ``is_number``
        helper; its exact behaviour is not visible from this function.
        Table entries take precedence over digit parsing.
    """
    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
             'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
             'sixteen', 'seventeen', 'eighteen', 'nineteen']
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    # Explicit powers of ten: deriving them from the list position breaks as
    # soon as a non-western scale such as "lakh" (10^5) is inserted.
    scales = {'hundred': 2, 'thousand': 3, 'lakh': 5,
              'million': 6, 'billion': 9, 'trillion': 12}
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    if numwords is None:
        numwords = {}
    if not numwords:
        numwords['and'] = (1, 0)  # lets "one hundred and twenty" parse
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # skip the '' placeholders so '' is never a number word
                numwords[word] = (1, idx * 10)
        for word, power in scales.items():
            numwords[word] = (10 ** power, 0)

    def is_numword(token):
        # Table lookup first: it is cheap and covers every spelled-out word.
        return token in numwords or is_number(token)

    def from_numword(token):
        if token in numwords:
            return numwords[token]
        # Digit token such as "1,200": scale 0 discards any partial number.
        return 0, int(token.replace(',', ''))

    pieces = []          # output tokens, joined with single spaces at the end
    current = result = 0
    onnumber = False     # True while a number is being accumulated
    lastunit = False     # previous number word was a unit (0-19)
    lastscale = False    # previous number word was a scale

    for word in textnum.replace('-', ' ').split():
        if word in ordinal_words:
            current += ordinal_words[word]
            onnumber = True
            lastunit = False
            lastscale = False
            continue

        # Strip an ordinal suffix only when what remains is a number word,
        # so ordinary words such as "month" or "with" are left intact.
        for ending, replacement in ordinal_endings:
            if word.endswith(ending):
                candidate = word[:-len(ending)] + replacement
                if is_numword(candidate):
                    word = candidate
                    break

        if not is_numword(word) or (word == 'and' and not lastscale):
            # Plain word: flush any pending number, then pass the word through.
            if onnumber:
                pieces.append(str(result + current))
            pieces.append(word)
            result = current = 0
            onnumber = False
            lastunit = False
            lastscale = False
            continue

        scale, increment = from_numword(word)
        onnumber = True
        if lastunit and word not in scales:
            # Two units in a row ("one two three"): emit them separately.
            pieces.append(str(result + current))
            result = current = 0
        if scale > 1:
            current = max(1, current)  # bare "hundred" means "one hundred"
        current = current * scale + increment
        # Only scales above a hundred close a group; "two hundred thousand"
        # must keep 200 pending so that "thousand" can multiply it.
        if scale > 100:
            result += current
            current = 0
        lastscale = word in scales
        lastunit = word in units

    if onnumber:
        pieces.append(str(result + current))
    return ' '.join(pieces)
| # In[ ]: | |