import spacy import re from word2number import w2n # Load the spacy model with GloVe embeddings nlp = spacy.load("en_core_web_lg") def capture_numbers(input_sentence): ''' This is a function to capture cases of refered numbers either in numeric or free-text form ''' try: # Define the regular expression patterns pattern1 = r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)" # Find all matches in the text matches = re.findall(pattern1, input_sentence) # This part is to capture cases like six point five, 5 point five, six point 5, 5 point 5 pattern_numbers = [] for match in matches: if len(match) == 3: # add the $pattern string to easily specify them in a subsequent step full_string = "{} {} {} {}".format(match[0], match[1], match[2], '$pattern') pattern_numbers.append(full_string) for elem in pattern_numbers: input_sentence = input_sentence.replace(elem, " ") if pattern_numbers: # Remove duplicates with set and convert back to list pattern_final_numbers = list(set(pattern_numbers)) else: pattern_final_numbers = [] # we delete the captured references from the sentence, because if we capture something like seven point five # then spacy will also identify seven and five, which we do not want it to for element in pattern_final_numbers: target_elem = element.replace("$pattern", "").strip() if target_elem in input_sentence: input_sentence = input_sentence.replace(target_elem, " ") # This is for cases of thirty eight or one million and two, etc. # Define a regular expression to match multiword free-text numbers pattern2 = r"(? 
def _magnitude_error(reason):
    '''
    Build the (0, 'MAGNITUDE', reason) error triple returned by the magnitude helpers
    '''
    return (0, 'MAGNITUDE', reason)


def convert_into_numeric(num_list):
    '''
    This is a function to convert the identified numbers into a numeric form

    Args:
        num_list: list with the captured number references. An entry can be
            numeric text ("6.5"), free text ("twenty three") or a decimal
            phrase tagged with a trailing "$pattern" marker
            ("six point five $pattern").

    Returns:
        {'Number': value} when exactly one reference could be converted,
        otherwise an error triple (0, 'MAGNITUDE', code) with code being one of
        'no_magnitude', 'more_magnitude', 'format_error' or 'unknown_error'.
    '''
    if not num_list:
        return _magnitude_error('no_magnitude')

    # Only one number should exist
    if len(num_list) > 1:
        return _magnitude_error('more_magnitude')

    target_num = num_list[0]

    # case it is an integer or float, convert it, otherwise move to the following cases
    try:
        return {'Number': float(target_num)}
    except (TypeError, ValueError):
        pass

    # every remaining branch works on text; anything else (e.g. None) cannot be
    # parsed and must not crash on the substring checks below
    if not isinstance(target_num, str):
        return _magnitude_error('unknown_error')

    # cases like 6,5 are reported as a format error instead of being converted
    if ',' in target_num:
        try:
            float(target_num.replace(',', '.'))
        except ValueError:
            return _magnitude_error('unknown_error')
        return _magnitude_error('format_error')

    # decimal phrases (6 point 5, 6 point five, six point 5, six point five)
    if '$pattern' in target_num:
        # partition (unlike split("$")) cannot fail when the text holds more than one '$'
        phrase = target_num.partition('$pattern')[0]
        conversion = numeric_number_dot_freetext(phrase)
        # the helper reports its own failures as an error triple; hand it back
        # unchanged instead of wrapping it as if it were a number
        if isinstance(conversion, tuple):
            return conversion
        if conversion:
            return {'Number': conversion}
        # a falsy result (the helper returns 0 on failure) is not a usable number
        return _magnitude_error('unknown_error')

    # freetext numbers without patterns (e.g. two, million, twenty three, etc)
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        # best effort: any parsing failure falls through to the retry below
        pass

    # cases like "million and two" or "a million and two": drop the inner ' a '
    # and prepend 'one' so that the w2n library can handle them
    try:
        return {'Number': w2n.word_to_num('one ' + target_num.replace(' a ', ' '))}
    except Exception:
        return _magnitude_error('unknown_error')


def magnitude_binding(input_text):
    '''
    This is a function that binds together all the subcomponents of the magnitude number identification,
    while also controlling for multiple, or zero magnitude references

    Args:
        input_text: the sentence in which the magnitude reference is searched.

    Returns:
        The result of convert_into_numeric when exactly one reference was
        captured, otherwise an error triple (0, 'MAGNITUDE', code) with code
        'no_magnitude', 'more_magnitude' or 'unknown_error'.
    '''
    try:
        # capture the referred magnitudes
        target_numbers = capture_numbers(input_text)
        reference_count = len(target_numbers)

        # we only accept one magnitude reference
        if reference_count == 1:
            return convert_into_numeric(target_numbers)

        # zero references and more than one reference get their own codes
        # (to aid returning the correct prompt)
        if reference_count == 0:
            return _magnitude_error('no_magnitude')
        return _magnitude_error('more_magnitude')
    except Exception:
        # any unexpected failure in the sub-steps is reported, not raised;
        # KeyboardInterrupt / SystemExit are deliberately left to propagate
        return _magnitude_error('unknown_error')