Spaces:
Sleeping
Sleeping
import spacy | |
import re | |
from datetime import datetime | |
# Load the spaCy model once at import time. The model is downloaded only when
# it is not installed yet, so a warm start does not re-fetch the package.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load signals a missing model package with OSError.
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
# Define a function to extract dates from text | |
def extract_dates(text):
    """
    Collect the date references found in ``text``.

    Numeric and free-text dates are gathered first with a list of regular
    expressions. Entities that the module-level spaCy pipeline ``nlp`` labels
    ``DATE`` are then added, unless they overlap a string already collected.

    Args:
        text: The string to search. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of distinct date strings in order of discovery: regex matches
        first (in pattern order), then spaCy entities.
    """
    if not text:
        return []

    # Ordered from the longest shape to the shortest, so that a short pattern
    # hitting a fragment of an already-collected date is dropped by the
    # substring test below. The month-word patterns accept any capitalised
    # word of 3-9 letters, not only real month names.
    date_patterns = [
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',  # "01/01/22" or "1-1-2022"
        r'\b\d{1,2}[-/]\d{1,2}\b(?!\d)',  # "01/01" or "1-1"
        r'\b[A-Z][a-z]{2,8} \d{1,2},? \d{2,4}\b',  # "January 1, 2022" or "Feb 28, 22"
        r'\b\d{1,2} [A-Z][a-z]{2,8} \d{2,4}\b',  # "1 January 2022" or "28 Feb 22"
        r'\b[A-Z][a-z]{2,8} \d{2,4}\b',  # "January 2022" or "Feb 22"
    ]

    matches = []
    for pattern in date_patterns:
        for match in re.findall(pattern, text):
            # Skip a match that is equal to, or a fragment of, a date that an
            # earlier pattern already captured.
            if all(match not in found for found in matches):
                matches.append(match)

    # Use spaCy to pick up date references the regexes do not cover.
    for ent in nlp(text).ents:
        if ent.label_ != 'DATE':
            continue
        date_str = ent.text
        # Overlap is checked in both directions: an entity that merely wraps
        # an already-collected date (extra leading or trailing words) must not
        # be counted as a second date.
        if all(date_str not in found and found not in date_str for found in matches):
            matches.append(date_str)

    # Both loops above reject anything equal to an existing entry, so the list
    # is already duplicate-free; returning it directly keeps a stable order.
    return matches
# Output templates, rendered with str.format(dt=<datetime>).
_DMY = 'day:{dt.day}, month:{dt.month}, year:{dt.year}'
# str.format cannot call methods, so the time is rendered through the
# datetime format spec rather than a dt.strftime(...) call in the template.
_DMY_TIME = _DMY + ', time:{dt:%H:%M:%S}'
_DM = 'day:{dt.day}, month:{dt.month}'
_MD = 'month:{dt.month}, day:{dt.day}'
_MY = 'month:{dt.month}, year:{dt.year}'
_Y = 'year:{dt.year}'

# (strptime format, output template) pairs, tried in this order; the first
# format that parses the whole string wins, so the order decides how ambiguous
# inputs such as "01/02/22" are read. The non-portable "%-m" / "%-d" entries
# were dropped: they are not documented strptime directives, and a leading
# zero is already optional when parsing with "%m" / "%d".
_DATE_FORMATS = (
    ('%B %d, %Y', _DMY),
    ('%m-%d-%y', _DMY),
    ('%d/%m', _DM),
    ('%B %d', _DM),
    ('%b %d', _DM),
    ('%B %Y', _MY),
    ('%Y', _Y),
    ('%d/%m/%y', _DMY),
    ('%B %d, %y', _DMY),
    ('%b %d, %y', _DMY),
    ('%d-%m-%Y', _DMY),
    ('%d/%m/%Y', _DMY),
    ('%d-%m-%y', _DMY),
    ('%m/%d/%y', _DMY),
    ('%m/%d/%Y', _DMY),
    ('%m-%d-%Y', _DMY),
    ('%d/%m/%Y %H:%M:%S', _DMY_TIME),
    ('%d/%m/%y %H:%M:%S', _DMY_TIME),
    ('%m/%d/%Y %H:%M:%S', _DMY_TIME),
    ('%m/%d/%y %H:%M:%S', _DMY_TIME),
    ('%Y-%m-%d', _DMY),
    ('%y-%m-%d', _DMY),
    ('%m-%d-%Y %H:%M:%S', _DMY_TIME),
    ('%m-%d-%y %H:%M:%S', _DMY_TIME),
    ('%m-%d', _MD),
    ('%d %b %y', _DMY),
    ('%d %B %Y', _DMY),
    ('%b %Y', _MY),
    ('%b %d, %Y', _DMY),
    ('%d %B %y', _DMY),
)


def _parse_date(text, fmt):
    """Parse ``text`` with ``fmt``; raise ValueError when it does not match."""
    if '%Y' in fmt or '%y' in fmt:
        return datetime.strptime(text, fmt)
    # Without a year directive strptime assumes 1900, which is not a leap
    # year, so "29/02" would be rejected. Parse against a leap year instead;
    # the templates paired with these formats never print the year.
    return datetime.strptime(f'{text};2000', f'{fmt};%Y')


def convert_dates(date_list):
    """
    Describe each date string as tagged components (day, month, year, time).

    Every string is tried against the known formats in order, and the first
    format that parses it decides which tags are emitted.

    Args:
        date_list: Iterable of date strings, e.g. the output of
            ``extract_dates``. Surrounding whitespace is ignored.

    Returns:
        A list with one entry per input string, in the same order: either a
        description such as ``'day:1, month:1, year:2022'`` or
        ``'INVALID FORMAT: <original string>'`` when no format matches.
    """
    output_list = []
    for date_str in date_list:
        text = date_str.strip()
        for fmt, out_fmt in _DATE_FORMATS:
            try:
                dt = _parse_date(text, fmt)
            except ValueError:
                continue
            output_list.append(out_fmt.format(dt=dt))
            break
        else:
            # No format parsed the string.
            output_list.append(f'INVALID FORMAT: {date_str}')
    return output_list
def dates_binding(text):
    """
    Run date extraction and conversion on ``text`` and report the outcome.

    Exactly one date reference is accepted. Every other situation is reported
    as a ``(0, 'DATES', <reason>)`` tuple so the caller can pick the matching
    prompt.

    Args:
        text: The text to search for a date reference.

    Returns:
        The one-element list produced by ``convert_dates`` when a single,
        parseable date was found; otherwise a tuple whose last item is
        ``'no_date'``, ``'more_dates'``, ``'wrong_date_format'`` or
        ``'unknown_error'``.
    """
    try:
        # Capture the referred dates.
        identified_dates = extract_dates(text)

        if not identified_dates:
            return (0, 'DATES', 'no_date')
        if len(identified_dates) > 1:
            return (0, 'DATES', 'more_dates')

        # Exactly one reference: tag its components.
        formatted_dates = convert_dates(identified_dates)
        if 'INVALID FORMAT' in formatted_dates[0]:
            return (0, 'DATES', 'wrong_date_format')
        return formatted_dates
    except Exception:
        # Any unexpected failure maps to a generic code. KeyboardInterrupt and
        # SystemExit are deliberately not caught, so the process can still be
        # stopped.
        return (0, 'DATES', 'unknown_error')