|
from yargy import ( |
|
Parser, |
|
rule, |
|
and_, or_ |
|
) |
|
from yargy.interpretation import fact |
|
from yargy.predicates import ( |
|
eq, gte, lte, length_eq, |
|
dictionary, normalized, |
|
) |
|
import pandas as pd |
|
|
|
|
|
# Attribute names of the fact: day, month and year of the first date of a
# range, followed by day, month and year of its last date.
_DATE_RANGE_FIELDS = [
    'start_day',
    'start_month',
    'start_year',
    'stop_day',
    'stop_month',
    'stop_year',
]

# Base fact type produced by yargy for the attribute names above.
DateRange = fact('DateRange', _DATE_RANGE_FIELDS)
|
class DateRange(DateRange):
    """Date-range fact extended with gap filling and date formatting.

    Any of the six attributes (``start_day``, ``start_month``,
    ``start_year``, ``stop_day``, ``stop_month``, ``stop_year``) may be
    ``None`` until ``normalized`` has been read.
    """

    # Class attribute, therefore shared by every instance: every start year
    # seen by ``normalized`` is appended here, and the last element is used
    # for a record that carries no start year of its own.
    years_collection = [1900]

    @property
    def normalized(self):
        """Fill the missing attributes in place and return ``self``.

        * A present ``start_year`` is appended to ``years_collection``;
          a missing one is taken from the end of ``years_collection``.
        * A missing ``start_day`` or ``start_month`` becomes ``0``.
        * Each missing ``stop_*`` attribute copies the matching ``start_*``
          attribute (after the start attributes have been filled).

        Note that reading this property mutates both the instance and the
        shared ``years_collection`` list.
        """
        if self.start_year is not None:
            self.years_collection.append(self.start_year)
        else:
            self.start_year = self.years_collection[-1]

        if self.start_day is None:
            self.start_day = 0

        if self.start_month is None:
            self.start_month = 0

        # The stop side defaults to the (already filled) start side.
        if self.stop_year is None:
            self.stop_year = self.start_year

        if self.stop_month is None:
            self.stop_month = self.start_month

        if self.stop_day is None:
            self.stop_day = self.start_day

        return self

    @property
    def get_start_date(self):
        """Return the start date as ``'<year>-<month>-<day>'``.

        Month and day are left-padded with zeros to two characters.
        """
        month = str(self.start_month).zfill(2)
        day = str(self.start_day).zfill(2)
        return f'{self.start_year}-{month}-{day}'

    @property
    def get_stop_date(self):
        """Return the stop date as ``'<year>-<month>-<day>'``.

        Month and day are left-padded with zeros to two characters.
        """
        month = str(self.stop_month).zfill(2)
        day = str(self.stop_day).zfill(2)
        return f'{self.stop_year}-{month}-{day}'
|
|
|
# Russian month name -> month number (1 for the first name, 12 for the last).
MONTHS = {
    name: number
    for number, name in enumerate(
        (
            'январь',
            'февраль',
            'март',
            'апрель',
            'май',
            'июнь',
            'июль',
            'август',
            'сентябрь',
            'октябрь',
            'ноябрь',
            'декабрь',
        ),
        start=1,
    )
}
|
|
|
|
|
# Roman numeral (upper-case Latin letters) -> month number 1..12.
MONTHS_LATIN = dict(zip(
    ('I', 'II', 'III', 'IV', 'V', 'VI',
     'VII', 'VIII', 'IX', 'X', 'XI', 'XII'),
    range(1, 13),
))
|
|
|
def _day_rule(attribute):
    """Match a token passing ``gte(1)`` and ``lte(31)``.

    The matched value is converted with ``int`` and stored in *attribute*.
    """
    return and_(gte(1), lte(31)).interpretation(attribute.custom(int))


# Day number for the first and for the last date of a range.
DAY_START = _day_rule(DateRange.start_day)
DAY_STOP = _day_rule(DateRange.stop_day)
|
|
|
def _month_by_name(attribute):
    """Match a word from ``MONTHS``; store its month number in *attribute*.

    The value is normalized before being looked up in ``MONTHS``.
    """
    return dictionary(MONTHS).interpretation(
        attribute.normalized().custom(MONTHS.__getitem__)
    )


def _month_by_latin_numeral(attribute):
    """Match a key of ``MONTHS_LATIN``; store its month number in *attribute*."""
    return dictionary(MONTHS_LATIN).interpretation(
        attribute.custom(MONTHS_LATIN.__getitem__)
    )


def _month_by_number(attribute):
    """Match a token passing ``gte(1)`` and ``lte(12)``; store it as ``int``."""
    return and_(gte(1), lte(12)).interpretation(attribute.custom(int))


# Month written as a word.
MONTH_NAME_START = _month_by_name(DateRange.start_month)
MONTH_NAME_STOP = _month_by_name(DateRange.stop_month)

# Month written as a Roman numeral.
MONTH_LATIN_NAME_START = _month_by_latin_numeral(DateRange.start_month)
MONTH_LATIN_NAME_STOP = _month_by_latin_numeral(DateRange.stop_month)

# Month written as a number.
MONTH_START = _month_by_number(DateRange.start_month)
MONTH_STOP = _month_by_number(DateRange.stop_month)
|
|
|
|
|
def _expand_short_year(value):
    """Turn a two-character year into a full one by adding 1900."""
    return 1900 + int(value)


def _full_year(attribute):
    """Match a token passing ``gte(1800)`` and ``lte(2100)``; store it as ``int``."""
    return and_(gte(1800), lte(2100)).interpretation(attribute.custom(int))


def _short_year(attribute):
    """Match a two-character token in 0..99; store it as ``1900 + value``."""
    return and_(length_eq(2), gte(0), lte(99)).interpretation(
        attribute.custom(_expand_short_year)
    )


# Year written in full.
YEAR_START = _full_year(DateRange.start_year)
YEAR_STOP = _full_year(DateRange.stop_year)

# Year written with two characters.
YEAR_SHORT_START = _short_year(DateRange.start_year)
YEAR_SHORT_STOP = _short_year(DateRange.stop_year)
|
|
|
# Abbreviated form: the letter 'г', optionally followed by a full stop.
_YEAR_ABBREVIATED = rule('г', eq('.').optional())

# Spelled-out form: any token whose normal form is 'год'.
_YEAR_SPELLED_OUT = rule(normalized('год'))

# Optional "year" marker that may follow a year number.
YEAR_WORD = or_(_YEAR_ABBREVIATED, _YEAR_SPELLED_OUT)
|
|
|
# Separator between the two dates of a range.
#
# The original listed the em dash twice, so one alternative was dead. The en
# dash is added in its place (presumably the intended third separator;
# confirm against the source texts). Non-ASCII dashes are written as escapes
# so that look-alike characters cannot be confused again.
PUNCT_DIVISION_DATES = or_(
    rule('-'),       # hyphen-minus
    rule('\u2013'),  # en dash
    rule('\u2014'),  # em dash
)
|
|
|
# Separator allowed between the parts of a single date: '.' or '/'.
PUNCT = or_(*(rule(mark) for mark in ('.', '/')))
|
|
|
# The factories below return a fresh alternation on every call, so each place
# that uses one gets its own object.

def _any_start_month():
    """Start month as a word, a number or a Roman numeral."""
    return or_(MONTH_NAME_START, MONTH_START, MONTH_LATIN_NAME_START)


def _any_stop_month():
    """Stop month as a word, a number or a Roman numeral."""
    return or_(MONTH_NAME_STOP, MONTH_STOP, MONTH_LATIN_NAME_STOP)


def _any_start_year():
    """Start year written in full or with two characters."""
    return or_(YEAR_START, YEAR_SHORT_START)


def _any_stop_year():
    """Stop year written in full or with two characters."""
    return or_(YEAR_STOP, YEAR_SHORT_STOP)


# day <dash> day [sep] month [sep] [year] [year word]
_DAYS_WITH_SHARED_MONTH = rule(
    DAY_START,
    PUNCT_DIVISION_DATES,
    DAY_STOP,
    PUNCT.optional(),
    _any_start_month(),
    PUNCT.optional(),
    _any_start_year().optional(),
    YEAR_WORD.optional(),
)

# day [sep] month <dash> day [sep] month [sep] [year] [year word]
_DAY_MONTHS_WITH_SHARED_YEAR = rule(
    DAY_START,
    PUNCT.optional(),
    _any_start_month(),
    PUNCT_DIVISION_DATES,
    DAY_STOP,
    PUNCT.optional(),
    _any_stop_month(),
    PUNCT.optional(),
    _any_start_year().optional(),
    YEAR_WORD.optional(),
)

# day [sep] month [sep] year <dash> day [sep] month [sep] year [year word]
_TWO_FULL_DATES = rule(
    DAY_START,
    PUNCT.optional(),
    _any_start_month(),
    PUNCT.optional(),
    _any_start_year(),
    PUNCT_DIVISION_DATES,
    DAY_STOP,
    PUNCT.optional(),
    _any_stop_month(),
    PUNCT.optional(),
    _any_stop_year(),
    YEAR_WORD.optional(),
)

# day sep roman-month '-' year <dash> day sep roman-month '-' year [year word]
_TWO_FULL_DATES_ROMAN = rule(
    DAY_START,
    PUNCT,
    MONTH_LATIN_NAME_START,
    '-',
    _any_start_year(),
    PUNCT_DIVISION_DATES,
    DAY_STOP,
    PUNCT,
    MONTH_LATIN_NAME_STOP,
    '-',
    _any_stop_year(),
    YEAR_WORD.optional(),
)

# day [sep] month [sep] [year] [year word]
# The month alternation is spelled out here because its order (number first)
# differs from the one used by the other alternatives.
_SINGLE_DATE = rule(
    DAY_START,
    PUNCT.optional(),
    or_(MONTH_START, MONTH_NAME_START, MONTH_LATIN_NAME_START),
    PUNCT.optional(),
    _any_start_year().optional(),
    YEAR_WORD.optional(),
)

# day sep roman-month '-' year [year word]
_SINGLE_DATE_ROMAN = rule(
    DAY_START,
    PUNCT,
    MONTH_LATIN_NAME_START,
    '-',
    _any_start_year(),
    YEAR_WORD.optional(),
)

# year [year word]
_YEAR_ONLY = rule(
    YEAR_START,
    YEAR_WORD.optional(),
)

# month-word year [year word]
_MONTH_AND_YEAR = rule(
    MONTH_NAME_START,
    _any_start_year(),
    YEAR_WORD.optional(),
)

# Top-level grammar: any of the shapes above, interpreted as a DateRange fact.
DATE_RANGE = or_(
    _DAYS_WITH_SHARED_MONTH,
    _DAY_MONTHS_WITH_SHARED_YEAR,
    _TWO_FULL_DATES,
    _TWO_FULL_DATES_ROMAN,
    _SINGLE_DATE,
    _SINGLE_DATE_ROMAN,
    _YEAR_ONLY,
    _MONTH_AND_YEAR,
).interpretation(
    DateRange
)
|
|
|
|
|
def _store_diary_entry(table, dates, paragraphs):
    """Append one collected entry to *table*, keeping all columns aligned.

    *dates* is a ``(start, stop)`` pair, or ``None`` for text collected
    before any dated paragraph was seen. Such undated text is stored with
    ``None`` dates, unless it is empty or whitespace-only, in which case it
    is dropped.
    """
    if not paragraphs:
        return
    body = ''.join(f'{paragraph}\n' for paragraph in paragraphs)
    if dates is None:
        if not body.strip():
            return
        dates = (None, None)
    table['date_start'].append(dates[0])
    table['date_stop'].append(dates[1])
    table['text'].append(body)


def date_extractor_for_diary(text):
    """Split diary text into entries, one per paragraph that opens with a date.

    *text* is split on ``'\\n'``. A paragraph starts a new entry when a
    ``DATE_RANGE`` match begins within its first three characters (offsets
    0, 1 or 2). Every paragraph, including the one carrying the date, is
    added to the current entry followed by ``'\\n'``.

    Returns a ``pandas.DataFrame`` with the columns ``date_start``,
    ``date_stop`` and ``text``. Text that precedes the first dated paragraph
    gets ``None`` in both date columns; text without any content yields an
    empty frame. Previously both situations left the columns with different
    lengths, which made the ``DataFrame`` constructor raise ``ValueError``.
    """
    # The grammar is the same for every paragraph, so build the parser once.
    parser = Parser(DATE_RANGE)

    table = {
        'date_start': [],
        'date_stop': [],
        'text': [],
    }
    current_dates = None  # (start, stop) of the entry being collected
    collected = []        # paragraphs of the entry being collected

    for paragraph in text.split('\n'):
        for match in parser.findall(paragraph):
            # NOTE(review): ``normalized`` records the year of every match it
            # is read on, including matches that do not open the paragraph.
            # Kept as in the original; confirm this is intended.
            record = match.fact.normalized
            if 0 <= record.spans[0].start < 3:
                # A new entry begins: close the previous one first.
                _store_diary_entry(table, current_dates, collected)
                current_dates = (record.get_start_date, record.get_stop_date)
                collected = []
                break
        collected.append(paragraph)

    _store_diary_entry(table, current_dates, collected)

    return pd.DataFrame(table)
|
|
|
def normalize_dates(start, stop):
    """Collapse a range whose two ends are equal.

    Returns *start* unchanged when ``start == stop``; otherwise returns the
    string ``'<start> - <stop>'``.
    """
    return start if start == stop else f'{start} - {stop}'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|