Spaces:

ValadisCERTH
/

NaturalLanguageModule_complete

Runtime error

App Files Files Community

ValadisCERTH committed on May 10, 2023

Commit

b2a1bba

•

1 Parent(s): fe70a3e

Update magnitudeIdentification.py

Browse files

Files changed (1) hide show

magnitudeIdentification.py +296 -187

magnitudeIdentification.py CHANGED Viewed

@@ -1,211 +1,320 @@
 import spacy
 import re
-from datetime import datetime
 # Load the spacy model with GloVe embeddings
 nlp = spacy.load("en_core_web_lg")
-# Define a function to extract dates from text
def extract_dates(text):
    """
    Collect the date mentions found in ``text``, both numeric and free-text.

    The regex patterns are tried first, in the order listed below, and spaCy entities
    labelled ``DATE`` are added afterwards. A candidate is kept only when it is not a
    substring of something that has already been collected.

    Args:
        text: The sentence to scan.

    Returns:
        A list of the distinct date strings, in the order in which they were found.
    """
    # Most patterns carry \b word boundaries so that they do not match inside a longer token
    date_patterns = [
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',  # Matches dates like "01/01/22" or "1-1-2022"
        r'\b\d{1,2}[-/]\d{1,2}\b(?!\d)',  # Matches dates like "01/01" or "1-1"
        r'\b[A-Z][a-z]{2,8} \d{1,2},? \d{2,4}\b',  # Matches dates like "January 1, 2022" or "Feb 28, 22"
        r'\b\d{1,2} [A-Z][a-z]{2,8} \d{2,4}\b',  # Matches dates like "1 January 2022" or "28 Feb 22"
        r'\b[A-Z][a-z]{2,8} \d{2,4}\b',  # Matches dates like "January 2022" or "Feb 22"
        # Matches a month followed by a four-digit year: "05/2018", "5-2018" or "05 2018"
        # (the space-separated form needs a two-digit month). This pattern has no \b anchors.
        r'\d{1,2}[/-]\d{4}|\d{2}\s\d{4}'
    ]

    matches = []

    # Regex candidates first
    for pattern in date_patterns:
        for match in re.findall(pattern, text):
            # Skip anything that is part of a longer date that has already been matched
            if all(match not in m for m in matches):
                matches.append(match)

    # Then whatever spaCy tags as a DATE entity, under the same substring rule
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'DATE' and all(ent.text not in m for m in matches):
            matches.append(ent.text)

    # The substring rule already prevents exact duplicates, so the list is returned as
    # collected; passing it through set() would only shuffle the order between runs.
    return matches
def helper_fix_format_date_sf(input_list):
    """
    Turn the first "key:value, key:value" string of ``input_list`` into the date structure.

    Only ``input_list[0]`` is read; any further elements are ignored.

    Args:
        input_list: A list whose first element looks like "day:1, month:2, year:2022".

    Returns:
        ``[{"date": {"day": ..., "month": ..., "year": ...}}]``. Components present in the
        string are kept as strings; missing components are the integer 0. Any other key
        (for example "time") is dropped.

    Raises:
        IndexError: If ``input_list`` is empty.
        ValueError: If one of the pairs contains no colon.
    """
    pairs_dict = {}

    for pair in input_list[0].split(", "):
        # Split on the first colon only, so a value such as "12:30:45" stays in one piece
        key, value = pair.split(":", 1)
        pairs_dict[key] = value

    # Make sure all three components are present, defaulting to 0
    date_fields = {name: pairs_dict.get(name, 0) for name in ("day", "month", "year")}

    return [{"date": date_fields}]
def _label_date(date_str, formats):
    """Parse ``date_str`` with the first matching format and return its "day:.., month:.., year:.." label."""
    for fmt in formats:
        try:
            dt = datetime.strptime(date_str, fmt)
        except ValueError:
            continue

        # Report only the components the format actually contains; strptime fills the
        # missing ones with defaults that must not be presented as real values.
        parts = []
        if '%d' in fmt:
            parts.append(f'day:{dt.day}')
        if any(code in fmt for code in ('%m', '%b', '%B')):
            parts.append(f'month:{dt.month}')
        if '%y' in fmt or '%Y' in fmt:
            parts.append(f'year:{dt.year}')
        return ', '.join(parts)

    return f'INVALID FORMAT: {date_str}'


def convert_dates(date_list):
    """
    Parse the identified date strings and tag their components (day, month, year).

    Each string is tried against the formats below in order; the first one that parses wins.
    The time of day, when a format contains one, is parsed but not reported.

    Args:
        date_list: The date strings to convert. Only the first converted entry ends up in
            the structured result.

    Returns:
        ``[{"date": {"day": ..., "month": ..., "year": ...}}]`` for the first entry, as built
        by ``helper_fix_format_date_sf``. When the first entry matches no format, the list of
        labels is returned unchanged so that the caller can see its 'INVALID FORMAT: ...'
        marker.

    Raises:
        IndexError: If ``date_list`` is empty.
    """
    # The order matters: the first format that parses is used. Formats written with the
    # "%-m" / "%-d" directives are not listed because strptime rejects them.
    date_formats = (
        '%B %d, %Y',
        '%m-%d-%y',
        '%d/%m',
        '%B %d',
        '%b %d',
        '%B %Y',
        '%Y',
        '%d/%m/%y',
        '%B %d, %y',
        '%b %d, %y',
        '%d-%m-%Y',
        '%d/%m/%Y',
        '%d-%m-%y',
        '%m/%d/%y',
        '%m/%d/%Y',
        '%m-%d-%Y',
        '%d/%m/%Y %H:%M:%S',
        '%d/%m/%y %H:%M:%S',
        '%m/%d/%Y %H:%M:%S',
        '%m/%d/%y %H:%M:%S',
        '%Y-%m-%d',
        '%y-%m-%d',
        '%m-%d-%Y %H:%M:%S',
        '%m-%d-%y %H:%M:%S',
        '%m-%d',
        '%d %b %y',
        '%d %B %Y',
        '%b %Y',
        '%b %d, %Y',
        '%d %B %y',
        # 09 05 2018
        '%d %m %Y',
        # 05/2018, 05-2018, 05 2018, 5/2018, 5-2018, 5 2018
        '%m %Y',
        '%m/%Y',
        '%m-%Y',
        # 05/18, 05-18, 05 18, 5/18, 5-18, 5 18
        '%m/%y',
        '%m-%y',
        '%m %y',
        # 9th May 2018 etc
        '%dth %B %Y',
        '%dth %b %Y',
        '%dst %B %Y',
        '%dst %b %Y',
        '%dnd %B %Y',
        '%dnd %b %Y',
        '%drd %B %Y',
        '%drd %b %Y',
        # August 9 2018, August 9 18, Jan 1 23, etc.
        '%B %d %Y',
        '%B %d %y',
        '%b %d %y',
        '%b %d %Y',
    )

    output_list = [_label_date(date_str, date_formats) for date_str in date_list]

    # Keep the marker visible to the caller: pushing it through the helper would replace it
    # with an all-zero date (or crash on a ':' or ', ' inside the original text).
    if output_list and output_list[0].startswith('INVALID FORMAT'):
        return output_list

    return helper_fix_format_date_sf(output_list)
def _date_not_parsed(entry):
    """Tell whether a converted entry represents a date that could not be parsed."""
    if isinstance(entry, dict):
        # A structured entry whose day, month and year are all 0 carries no date at all
        fields = entry.get("date")
        return isinstance(fields, dict) and not any(fields.values())
    return 'INVALID FORMAT' in entry


def dates_binding(text):
    '''
    Bind together the subcomponents of the date identification, while also controlling for
    multiple, or zero, date references.

    Returns:
        ``[formatted_dates, identified_dates]`` when exactly one date is found and parsed,
        otherwise a ``(0, 'DATES', <code>)`` tuple where the code is one of 'no_date',
        'more_dates', 'wrong_date_format' or 'unknown_error'.
    '''
    try:
        # capture the referred dates
        ident_dates = extract_dates(text)

        # formats like '05 2018' and '09 05 2018' are captured as two separate cases, so a
        # reference that is contained in another, different reference is dropped
        identified_dates = [
            elem for elem in ident_dates
            if not any(elem in other_elem for other_elem in ident_dates if elem != other_elem)
        ]

        # zero references: return the code that prompts for a date
        if not identified_dates:
            return (0, 'DATES', 'no_date')

        # more than one reference: only a single date is accepted
        if len(identified_dates) > 1:
            return (0, 'DATES', 'more_dates')

        formatted_dates = convert_dates(identified_dates)

        # a date that was found but could not be parsed gets its own code
        if _date_not_parsed(formatted_dates[0]):
            return (0, 'DATES', 'wrong_date_format')

        return [formatted_dates, identified_dates]

    except Exception:
        # any unexpected failure of the subcomponents is reported as a code, never raised
        return (0, 'DATES', 'unknown_error')

 import spacy
 import re
+from word2number import w2n
 # Load the spacy model with GloVe embeddings
 nlp = spacy.load("en_core_web_lg")
def capture_numbers(input_sentence):
    '''
    Capture the numbers referred to in a sentence, in numeric or free-text form.

    Three passes are made, and whatever a pass captures is removed from the sentence before
    the next one runs, so that the same reference is not counted twice:
      1. decimal phrases such as "six point five", "5 point five", "six point 5", "5 point 5";
      2. multiword free-text numbers such as "thirty eight" or "million and two";
      3. every remaining token that spaCy considers number-like.

    Returns:
        A list of strings, without duplicates, in order of first appearance within each pass.
        Decimal phrases are rebuilt as "<left> <separator> <right> $pattern"; the trailing
        "$pattern" tag lets a later step recognise them. If anything goes wrong, 0 is
        returned instead of a list.
    '''
    try:
        # Pass 1: "<number> decimal|point|dot|comma <number>"
        decimal_pattern = r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"

        # dict.fromkeys removes duplicates while keeping the order stable between runs
        pattern_final_numbers = list(dict.fromkeys(
            "{} {} {} {}".format(left, separator, right, '$pattern')
            for left, separator, right in re.findall(decimal_pattern, input_sentence)
        ))

        # Delete the captured phrases by their actual span: otherwise spaCy would also report
        # "seven" and "five" for "seven point five". Substituting with the same pattern also
        # removes phrases written with irregular whitespace.
        input_sentence = re.sub(decimal_pattern, " ", input_sentence)

        # Pass 2: two or more consecutive number words, optionally joined by "and"
        number_word = (
            r"(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|"
            r"fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|"
            r"sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)"
        )
        multiword_pattern = (
            r"(?<!\w)(?:" + number_word + r"(?:\s(?:and\s)?" + number_word + r")+\s?)+(?!\w*pennies)"
        )

        # The pattern may swallow one trailing space; strip it so that the same number is not
        # reported twice just because of the whitespace that followed it
        multinumber_final_numbers = list(dict.fromkeys(
            found.strip() for found in re.findall(multiword_pattern, input_sentence)
        ))
        input_sentence = re.sub(multiword_pattern, " ", input_sentence)

        # Pass 3: what is left goes to spaCy, which flags ints, floats and single number words
        doc = nlp(input_sentence)
        spacy_final_numbers = list(dict.fromkeys(token.text for token in doc if token.like_num))

        return pattern_final_numbers + multinumber_final_numbers + spacy_final_numbers

    except Exception:
        # failure sentinel expected by the caller
        return 0
# Number words that can be mapped to a value directly
_NUMBER_WORD_VALUES = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000
}


def _parse_decimal_side(part, cast):
    '''
    Convert one side of a "<whole> point <fraction>" phrase into a number.

    Returns a (value, digits, error) triple. On success error is None, and digits is the
    literal digit run the value was read from (None when the value came from number words).
    On failure value and digits are None and error is the MAGNITUDE tuple to hand back.
    '''
    # a single known number word
    if part in _NUMBER_WORD_VALUES:
        return _NUMBER_WORD_VALUES[part], None, None

    # a plain numeric string
    try:
        return cast(part), (part if part.isdecimal() else None), None
    except ValueError:
        pass

    # free text that ends up being number words, e.g. "bla bla bla seven"
    try:
        return w2n.word_to_num(part), None, None
    except Exception:
        pass

    # free text around digits, e.g. "bla bla bla 7": exactly one digit run is accepted
    digit_runs = re.findall(r'\d+', part)
    if len(digit_runs) == 1:
        return int(digit_runs[0]), digit_runs[0], None
    if digit_runs:
        return None, None, (0, 'MAGNITUDE', 'more_magnitude')
    return None, None, (0, 'MAGNITUDE', 'no_magnitude')


def numeric_number_dot_freetext(text):
    '''
    Convert phrases such as "6 point five", "six point 5" or "six point five" into a float.

    The text before the separator (decimal, point, dot or comma) becomes the whole part and
    the text after it the fractional part.

    Returns:
        The float value on success. A (0, 'MAGNITUDE', <code>) tuple when one of the two
        sides cannot be reduced to a single number. 0 when the text does not contain the
        expected pattern or an unexpected error occurs.
    '''
    pattern = r"(\d+|\w+(?:\s+\w+)*)\s+(?:decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"

    try:
        match = re.search(pattern, text)

        # the input does not match the expected pattern
        if not match:
            return 0

        whole, _, error = _parse_decimal_side(match.group(1), float)
        if error:
            return error

        fraction, fraction_digits, error = _parse_decimal_side(match.group(2), int)
        if error:
            return error

        # The number of decimal places comes from the digits as written, so that the leading
        # zeros of "6 point 05" are not lost; values read from words use their own length.
        if fraction_digits is not None:
            places = len(fraction_digits)
        else:
            places = len(str(fraction))

        try:
            return float(whole) + float(fraction) / (10 ** places)
        except (ArithmeticError, TypeError, ValueError):
            return (0, 'MAGNITUDE', 'unknown_error')

    except Exception:
        # e.g. text is not a string; reported with the same sentinel as "no pattern found"
        return 0
def convert_into_numeric(num_list):
    '''
    Convert the single identified number into a numeric form.

    Args:
        num_list: The captured number strings. Exactly one element is expected.

    Returns:
        {'Number': <value>} on success, otherwise a (0, 'MAGNITUDE', <code>) tuple where the
        code is one of 'no_magnitude', 'more_magnitude', 'format_error' or 'unknown_error'.
    '''
    if not num_list:
        return (0, 'MAGNITUDE', 'no_magnitude')

    # only one number should exist
    if len(num_list) > 1:
        return (0, 'MAGNITUDE', 'more_magnitude')

    target_num = num_list[0]

    # an integer or float written with digits converts directly
    try:
        return {'Number': float(target_num)}
    except (TypeError, ValueError):
        pass

    # cases like 6,5: a number written with a comma is reported as a format error
    if ',' in target_num:
        try:
            float(target_num.replace(",", "."))
        except (AttributeError, TypeError, ValueError):
            return (0, 'MAGNITUDE', 'unknown_error')
        return (0, 'MAGNITUDE', 'format_error')

    # decimal phrases tagged with "$pattern" (6 point 5, 6 point five, six point 5, ...)
    if "$pattern" in target_num:
        phrase = target_num.rpartition("$pattern")[0]
        num_conversion = numeric_number_dot_freetext(phrase)

        # an error code coming from the conversion is passed on, not wrapped as a number
        if isinstance(num_conversion, tuple):
            return num_conversion

        # a successful conversion is always a float, and that includes a legitimate 0.0
        if isinstance(num_conversion, float):
            return {'Number': num_conversion}

        # anything else is the failure sentinel of the conversion
        return (0, 'MAGNITUDE', 'unknown_error')

    # free-text numbers without a pattern (e.g. two, million, twenty three, etc)
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        pass

    # "million and two" or "a million and two": drop any ' a ' and put 'one' in front, so
    # that the w2n library can handle the phrase instead of throwing an error
    try:
        candidate = "one " + target_num.replace(" a ", " ")
        return {'Number': w2n.word_to_num(candidate)}
    except Exception:
        return (0, 'MAGNITUDE', 'unknown_error')
def magnitude_binding(input_text):
    '''
    Bind together the subcomponents of the magnitude number identification, while also
    controlling for multiple, or zero, magnitude references.

    Returns:
        Whatever convert_into_numeric returns when exactly one magnitude is captured,
        otherwise a (0, 'MAGNITUDE', <code>) tuple where the code is 'no_magnitude',
        'more_magnitude' or 'unknown_error'.
    '''
    try:
        # capture the referred magnitudes
        target_numbers = capture_numbers(input_text)

        # capture_numbers reports an internal failure with 0 instead of a list
        if not isinstance(target_numbers, list):
            return (0, 'MAGNITUDE', 'unknown_error')

        # zero references: return the code that prompts for a magnitude
        if not target_numbers:
            return (0, 'MAGNITUDE', 'no_magnitude')

        # more than one reference: only a single magnitude is accepted
        if len(target_numbers) > 1:
            return (0, 'MAGNITUDE', 'more_magnitude')

        return convert_into_numeric(target_numbers)

    except Exception:
        # any unexpected failure of the subcomponents is reported as a code, never raised
        return (0, 'MAGNITUDE', 'unknown_error')