import logging
import re
from datetime import datetime
from typing import List, Optional, Tuple, Union

import spacy

logger = logging.getLogger(__name__)

# Name of the spaCy pipeline used for DATE entity recognition.
_SPACY_MODEL = "en_core_web_lg"

# Load the spaCy model. Loading is attempted first so that the (large) model
# is only downloaded when it is actually missing, instead of on every import.
try:
    nlp = spacy.load(_SPACY_MODEL)
except OSError:
    spacy.cli.download(_SPACY_MODEL)
    nlp = spacy.load(_SPACY_MODEL)

# Regex patterns for common date formats, ordered from most to least specific.
# The order matters: a later (shorter) match is discarded when it is a
# substring of something already matched, so "01/01" is not reported again
# for a text containing "01/01/22".
_DATE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',  # "01/01/22" or "1-1-2022"
        r'\b\d{1,2}[-/]\d{1,2}\b(?!\d)',  # "01/01" or "1-1"
        r'\b[A-Z][a-z]{2,8} \d{1,2},? \d{2,4}\b',  # "January 1, 2022" or "Feb 28, 22"
        r'\b\d{1,2} [A-Z][a-z]{2,8} \d{2,4}\b',  # "1 January 2022" or "28 Feb 22"
        r'\b[A-Z][a-z]{2,8} \d{2,4}\b',  # "January 2022" or "Feb 22"
    )
)

# Output templates describing which components a parsed date carries.
_DMY = 'day:{dt.day}, month:{dt.month}, year:{dt.year}'
# str.format cannot call methods, so the time is rendered through the
# datetime format spec ({dt:%H:%M:%S}) rather than dt.strftime(...).
_DMY_TIME = _DMY + ', time:{dt:%H:%M:%S}'
_DM = 'day:{dt.day}, month:{dt.month}'
_MD = 'month:{dt.month}, day:{dt.day}'
_MY = 'month:{dt.month}, year:{dt.year}'
_Y = 'year:{dt.year}'

# (strptime format, output template) pairs, tried in order; the first format
# that parses the whole string wins. The order therefore decides how
# ambiguous inputs such as "01/02/22" are read (day-first for "/").
_DATE_FORMATS = (
    ('%B %d, %Y', _DMY),
    ('%m-%d-%y', _DMY),
    ('%d/%m', _DM),
    ('%B %d', _DM),
    ('%b %d', _DM),
    ('%B %Y', _MY),
    ('%Y', _Y),
    ('%d/%m/%y', _DMY),
    ('%B %d, %y', _DMY),
    ('%b %d, %y', _DMY),
    ('%d-%m-%Y', _DMY),
    ('%d/%m/%Y', _DMY),
    ('%d-%m-%y', _DMY),
    ('%m/%d/%y', _DMY),
    ('%m/%d/%Y', _DMY),
    ('%m-%d-%Y', _DMY),
    ('%d/%m/%Y %H:%M:%S', _DMY_TIME),
    ('%d/%m/%y %H:%M:%S', _DMY_TIME),
    ('%m/%d/%Y %H:%M:%S', _DMY_TIME),
    ('%m/%d/%y %H:%M:%S', _DMY_TIME),
    ('%Y-%m-%d', _DMY),
    ('%y-%m-%d', _DMY),
    ('%m-%d-%Y %H:%M:%S', _DMY_TIME),
    ('%m-%d-%y %H:%M:%S', _DMY_TIME),
    ('%m-%d', _MD),
    ('%d %b %y', _DMY),
    ('%d %B %Y', _DMY),
    ('%b %Y', _MY),
    ('%b %d, %Y', _DMY),
    ('%d %B %y', _DMY),
)


def _already_covered(candidate: str, found: List[str]) -> bool:
    """Return True if *candidate* is a substring of (or equal to) a found date."""
    return any(candidate in existing for existing in found)


def extract_dates(text: str) -> List[str]:
    """Identify numeric and free-text date references in *text*.

    Dates are collected first with the regex patterns in ``_DATE_PATTERNS``
    and then with spaCy entities labelled ``DATE``. A candidate is skipped
    when it is contained in a date that was already collected.

    Args:
        text: The text to scan.

    Returns:
        The distinct date strings, in the order they were discovered
        (regex matches first, then spaCy entities).
    """
    matches: List[str] = []

    for pattern in _DATE_PATTERNS:
        for candidate in pattern.findall(text):
            if not _already_covered(candidate, matches):
                matches.append(candidate)

    # Use spaCy to pick up date references the regexes do not cover.
    for ent in nlp(text).ents:
        if ent.label_ == 'DATE' and not _already_covered(ent.text, matches):
            matches.append(ent.text)

    # The containment check above already guarantees uniqueness, so no
    # further de-duplication is needed.
    return matches


def _tag_date(date_str: str) -> Optional[str]:
    """Return the tagged description of *date_str*, or None if no format fits."""
    for fmt, template in _DATE_FORMATS:
        try:
            parsed = datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        return template.format(dt=parsed)
    return None


def convert_dates(date_list: List[str]) -> List[str]:
    """Parse each date string and describe its components.

    Every string is tried against ``_DATE_FORMATS`` in order. On success the
    result names the parsed parts, e.g. ``'day:1, month:2, year:2022'``; a
    string that matches no format yields ``'INVALID FORMAT: <string>'``.

    Args:
        date_list: Date strings, typically produced by ``extract_dates``.

    Returns:
        One description per input string, in the same order.
    """
    tagged = []
    for date_str in date_list:
        description = _tag_date(date_str)
        if description is None:
            description = f'INVALID FORMAT: {date_str}'
        tagged.append(description)
    return tagged


def dates_binding(text: str) -> Union[List[str], Tuple[int, str, str]]:
    """Extract and tag the single date referenced in *text*.

    Binds together date extraction and conversion, and reports texts with
    zero, several, or unparseable date references through status tuples.

    Args:
        text: The text expected to contain exactly one date reference.

    Returns:
        A one-element list with the tagged date on success; otherwise a
        ``(0, 'DATES', code)`` tuple where ``code`` is one of ``'no_date'``,
        ``'more_dates'``, ``'wrong_date_format'`` or ``'unknown_error'``.
    """
    try:
        identified_dates = extract_dates(text)

        if not identified_dates:
            return (0, 'DATES', 'no_date')

        # Only a single date reference is accepted.
        if len(identified_dates) > 1:
            return (0, 'DATES', 'more_dates')

        formatted_dates = convert_dates(identified_dates)
        if 'INVALID FORMAT' in formatted_dates[0]:
            return (0, 'DATES', 'wrong_date_format')
        return formatted_dates
    except Exception:
        # Boundary handler: callers expect a status tuple rather than an
        # exception, but the failure is logged so it is not lost silently.
        logger.exception("Unexpected error while processing date references")
        return (0, 'DATES', 'unknown_error')