Spaces:

ValadisCERTH
/

NumbersModuleSerco

Sleeping

File size: 10,945 Bytes

import spacy
import re
from word2number import w2n

# Load the spaCy English model once, at import time.
# Try the installed model first and only fall back to downloading it when
# spaCy reports that it is missing (spacy.load raises OSError in that case).
# This avoids re-running the download step on every import/start-up.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")


# "<left> decimal|point|dot|comma <right>", where each side is either a run of
# digits or one or more whitespace-separated words
# (e.g. "six point five", "5 point five", "six point 5", "5 point 5").
_CAPTURE_DECIMAL_RE = re.compile(
    r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"
)

# Number words recognised inside multi-word free-text numbers.
_CAPTURE_NUMBER_WORDS = (
    "zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve"
    "|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty"
    "|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand"
    "|million|billion|trillion"
)

# Two or more number words in a row, optionally joined by "and"
# (e.g. "thirty eight", "one million and two").
_CAPTURE_MULTIWORD_RE = re.compile(
    r"(?<!\w)(?:(?:" + _CAPTURE_NUMBER_WORDS + r")(?:\s(?:and\s)?(?:"
    + _CAPTURE_NUMBER_WORDS + r"))+\s?)+(?!\w*pennies)"
)


def capture_numbers(input_sentence):
    '''
    Capture every number reference in a sentence, in numeric or free-text form.

    Three passes run in order. Whatever a pass captures is removed from the
    sentence before the next pass, so that e.g. "seven point five" is not
    reported again as "seven" and "five" by a later pass:

      1. decimal phrases ("six point five", "5 dot 5", ...). Each one is
         returned as "<left> <separator> <right> $pattern"; the "$pattern"
         suffix marks it for the decimal conversion step.
      2. multi-word free-text numbers ("thirty eight", "one million and two").
         They are returned exactly as matched (a trailing space may be kept).
      3. every remaining token that spaCy flags as `like_num`.

    Duplicates are dropped inside each pass, keeping first-seen order.

    Returns the list of captured strings (pass 1 + pass 2 + pass 3), or the
    integer 0 if anything fails (e.g. the input is not a string).
    '''

    try:
        # Pass 1: decimal phrases.
        pattern_final_numbers = list(dict.fromkeys(
            "{} {} {} {}".format(left, separator, right, '$pattern')
            for left, separator, right in _CAPTURE_DECIMAL_RE.findall(input_sentence)
        ))
        # Remove the matched spans themselves rather than a re-built string,
        # so phrases written with tabs/newlines/double spaces are removed too.
        input_sentence = _CAPTURE_DECIMAL_RE.sub(" ", input_sentence)

        # Pass 2: multi-word free-text numbers.
        multinumber_final_numbers = list(dict.fromkeys(
            _CAPTURE_MULTIWORD_RE.findall(input_sentence)
        ))
        input_sentence = _CAPTURE_MULTIWORD_RE.sub(" ", input_sentence)

        # Pass 3: whatever is left is handed to spaCy, which picks up plain
        # ints/floats as well as single number words (eight, two, hundred).
        doc = nlp(input_sentence)
        spacy_final_numbers = list(dict.fromkeys(
            token.text for token in doc if token.like_num
        ))

        return pattern_final_numbers + multinumber_final_numbers + spacy_final_numbers

    except Exception:
        # Keep the historical failure signal: callers receive 0, not a list.
        return 0


# Single number words that can be resolved without calling word2number.
_NUMBER_WORDS = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
}

# "<integer part> decimal|point|dot|comma <fractional part>", where each part
# is either a run of digits or one or more whitespace-separated words.
_DECIMAL_SPLIT_RE = re.compile(
    r"(\d+|\w+(?:\s+\w+)*)\s+(?:decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"
)


def _resolve_decimal_operand(token):
    '''
    Resolve one side of a decimal phrase to a number.

    Returns a triple (value, width, error):
      - on success `error` is None, `value` is the number and `width` is the
        number of digits it was written with (leading zeros included when the
        operand was given as digits);
      - on failure `value` is None and `error` is the (0, 'MAGNITUDE', code)
        tuple to hand back to the caller.
    '''
    # a single known number word, e.g. "six"
    if token in _NUMBER_WORDS:
        value = _NUMBER_WORDS[token]
        return value, len(str(value)), None

    # plain digits, e.g. "6" or "05"; only digit strings are accepted here so
    # that words such as "nan" or "inf" are never read as numbers
    if token.isdecimal():
        return int(token), len(token), None

    # free text that contains number words, e.g. "bla bla bla seven"
    try:
        value = w2n.word_to_num(token)
    except Exception:
        pass
    else:
        return value, len(str(value)), None

    # free text that contains digits, e.g. "bla bla bla 7"
    digit_runs = re.findall(r'\d+', token)
    if len(digit_runs) == 1:
        return int(digit_runs[0]), len(digit_runs[0]), None
    if digit_runs:
        return None, 0, (0, 'MAGNITUDE', 'more_magnitude')
    return None, 0, (0, 'MAGNITUDE', 'no_magnitude')


def numeric_number_dot_freetext(text):
    '''
    Convert a decimal phrase such as '6 point five', 'six point 5' or
    'six point five' into a float.

    The text is split on the first usable separator word (decimal, point, dot
    or comma). The left side becomes the integer part and the right side the
    fractional part, scaled by the number of digits it was written with, so
    'six point 05' yields 6.05.

    Returns:
      - the float value on success;
      - (0, 'MAGNITUDE', 'more_magnitude') if a side holds several digit groups;
      - (0, 'MAGNITUDE', 'no_magnitude') if a side holds no number at all;
      - (0, 'MAGNITUDE', 'unknown_error') if the final arithmetic fails;
      - the integer 0 if the text has no decimal phrase or anything else fails.
    '''

    try:
        match = _DECIMAL_SPLIT_RE.search(text)
        if not match:
            # the text does not contain a decimal phrase
            return 0

        whole, _, error = _resolve_decimal_operand(match.group(1))
        if error is not None:
            return error

        fraction, fraction_width, error = _resolve_decimal_operand(match.group(2))
        if error is not None:
            return error

        try:
            # shift the fractional part right by as many digits as it has
            return float(whole) + float(fraction) / (10 ** fraction_width)
        except Exception:
            return (0, 'MAGNITUDE', 'unknown_error')

    except Exception:
        return 0


def convert_into_numeric(num_list):
    '''
    Convert the single captured number reference into a numeric form.

    `num_list` is expected to hold exactly one captured string. Three shapes
    are handled:
      - text that `float()` accepts ("5", "6.5"): returned unchanged, i.e. the
        'Number' value is the original text, not a float;
      - a decimal phrase marked with "$pattern" ("six point 5 $pattern"):
        converted with numeric_number_dot_freetext;
      - free-text numbers ("two", "twenty three", "million and two"):
        converted with word2number.

    Returns {'Number': value} on success, otherwise a
    (0, 'MAGNITUDE', code) tuple where code is 'no_magnitude' (empty input),
    'more_magnitudes' (more than one entry), an error code passed through
    from the decimal conversion, or 'unknown_error'.
    '''

    if not num_list:
        return (0, 'MAGNITUDE', 'no_magnitude')

    # only one number should have been captured
    if len(num_list) > 1:
        return (0, 'MAGNITUDE', 'more_magnitudes')

    target_num = num_list[0]

    # case it is already an integer or float written with digits
    try:
        float(target_num)
    except (TypeError, ValueError, OverflowError):
        pass
    else:
        return {'Number': target_num}

    # case of a decimal phrase (6 point 5, 6 point five, six point 5, ...)
    if "$pattern" in target_num:
        phrase = target_num.partition("$")[0]
        num_conversion = numeric_number_dot_freetext(phrase)

        # an error code from the decimal conversion is handed back as is,
        # never wrapped as if it were a number
        if isinstance(num_conversion, tuple):
            return num_conversion

        # a float is always a real result (0.0 included, e.g. "zero point
        # zero"); the integer 0 is the conversion's failure signal
        if isinstance(num_conversion, float) or num_conversion:
            return {'Number': num_conversion}

        return (0, 'MAGNITUDE', 'unknown_error')

    # case of free-text numbers without a pattern (two, million, twenty three)
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        pass

    # last resort for "million and two" / "a million and two": drop any ' a '
    # and put 'one' in front so that word2number can parse the phrase
    try:
        target_num = target_num.replace(" a ", " ")
        return {'Number': w2n.word_to_num("one " + target_num)}
    except Exception:
        return (0, 'MAGNITUDE', 'unknown_error')


def magnitude_binding(input_text):
    '''
    Bind together the subcomponents of the magnitude number identification,
    while controlling for multiple or zero magnitude references.

    The numbers referenced in `input_text` are captured with capture_numbers
    and, when exactly one was found, converted with convert_into_numeric.

    Returns whatever convert_into_numeric returns for a single reference
    ({'Number': value} on success), otherwise a (0, 'MAGNITUDE', code) tuple:
    'no_magnitude' when nothing was captured, 'more_magnitudes' when several
    references were captured, 'unknown_error' for any failure. The codes help
    the caller choose the correct prompt.
    '''

    try:
        # capture the referred magnitudes
        target_numbers = capture_numbers(input_text)

        # capture_numbers signals a failure with the integer 0, not a list
        if not isinstance(target_numbers, list):
            return (0, 'MAGNITUDE', 'unknown_error')

        if not target_numbers:
            return (0, 'MAGNITUDE', 'no_magnitude')

        # we only accept one magnitude reference
        if len(target_numbers) > 1:
            return (0, 'MAGNITUDE', 'more_magnitudes')

        numeric_target_numbers = convert_into_numeric(target_numbers)

        # never hand None to the caller: a missing result is an error
        if numeric_target_numbers is None:
            return (0, 'MAGNITUDE', 'unknown_error')

        return numeric_target_numbers

    except Exception:
        return (0, 'MAGNITUDE', 'unknown_error')