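# File-viewer residue kept for provenance (not part of the module):
# author: a-v-bely
# commit: "Fix morphology.py" (adf7190)
# page links: raw / history / blame
# size: 14.4 kB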
import json
# Letter inventories consulted by the pluralization helpers below.
unstressed_vocals = tuple('aieou')
stressed_vocals = tuple('áíéóú')
all_vocals = (*unstressed_vocals, *stressed_vocals)
# Final vowels after which pluralize_noun simply appends "s".
vocals_s = (*unstressed_vocals, 'á', 'é', 'ó')
all_consonants = tuple('bcdfghjklmnñpqrstvwxzy')
# Final consonants after which pluralize_noun appends "es" ("z" becomes "ces").
consonants_es = tuple('lrndzjsxh')
# Every remaining consonant; pluralize_noun appends a bare "s" after these.
consonants_s = tuple(letter for letter in all_consonants if letter not in consonants_es)

# Verb data is read once, at import time, from paths relative to the current
# working directory.
with open('language_data/inflexions.json', mode='r', encoding='utf-8') as f:
    VERB_INFLECTIONS = json.loads(f.read())

with open('language_data/irregular_verbs_list.json', mode='r', encoding='utf-8') as f:
    ALL_IRREGULAR_VERBS = json.loads(f.read())

with open('language_data/irregular_verbs.json', mode='r', encoding='utf-8') as f:
    IRREGULAR_MODELS = json.loads(f.read())
def multi_replace(tk, seq1=stressed_vocals, seq2=unstressed_vocals):
    """Replace each ``seq1[k]`` found in *tk* with ``seq2[k]``, in order.

    With the default arguments every acute-accented vowel of *tk* is
    swapped for its plain counterpart.

    :param tk: string to rewrite.
    :param seq1: substrings to look for.
    :param seq2: replacements, aligned position by position with *seq1*.
    :return: the rewritten string.
    """
    for position, source in enumerate(seq1):
        tk = tk.replace(source, seq2[position])
    return tk
def pluralize_noun(lemma):
    """Build the plural of the Spanish noun *lemma*.

    The ending is chosen from the final letters of the lemma, then the
    written accent is adjusted for nouns ending in "n"/"s":

    * a stressed last vowel loses its accent ("canción" -> "canciones");
    * an accent-less noun in "-n" with three or more vowels gets an accent
      on the third vowel from the end of the plural
      ("examen" -> "exámenes", "origen" -> "orígenes").

    :param lemma: singular form of the noun.
    :return: the plural form, or ``None`` when the lemma is empty or no rule
        matches.
    """
    if not lemma:
        return None

    current_vocals = [char for char in lemma if char in all_vocals]
    # Guarded views of lemma[-2], so one-letter lemmas cannot raise IndexError.
    has_penultimate = len(lemma) >= 2
    penult_is_vowel = has_penultimate and lemma[-2] in all_vocals
    penult_is_consonant = has_penultimate and lemma[-2] in all_consonants

    if lemma.endswith(vocals_s):
        plural = lemma + 's'
    elif lemma.endswith(('í', 'ú')):
        plural = lemma + ('es' if lemma == 'sí' else 's')
    elif (lemma.endswith(tuple('lrndzj')) and len(current_vocals) >= 3
            and current_vocals[-3] in stressed_vocals):
        # Accent already on the third vowel from the end: left unchanged.
        plural = lemma
    elif (lemma.endswith(('s', 'x')) and len(current_vocals) > 1
            and current_vocals[-1] in unstressed_vocals):
        # Several vowels and an unstressed last one before "s"/"x": unchanged.
        plural = lemma
    elif ((penult_is_vowel and lemma.endswith('y'))
            or lemma.endswith(consonants_es) or lemma.endswith('ch')):
        plural = lemma[:-1] + 'ces' if lemma.endswith('z') else lemma + 'es'
    elif ((lemma[-1] in all_consonants and penult_is_consonant)
            or (penult_is_vowel and lemma.endswith(consonants_s))):
        # Hard-coded exceptions are returned as-is, skipping accent handling.
        if lemma == 'club':
            return 'clubes'
        if lemma == 'álbum':
            return 'álbumes'
        plural = lemma if lemma.endswith(('st', 'zt')) else lemma + 's'
    else:
        return None

    if (lemma.endswith(('n', 's')) and current_vocals
            and current_vocals[-1] in stressed_vocals):
        # The added syllable makes the written accent unnecessary.
        plural = multi_replace(plural)
    elif (lemma.endswith('n') and len(current_vocals) >= 3
            and all(vocal in unstressed_vocals for vocal in current_vocals)):
        accented = {'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú'}
        # Accent only the third vowel from the end, by position, so that a
        # repeated vowel elsewhere in the word is left alone
        # ("gravamen" -> "gravámenes").
        vowel_positions = [pos for pos, char in enumerate(plural) if char in all_vocals]
        target = vowel_positions[-3]
        plural = plural[:target] + accented[plural[target]] + plural[target + 1:]
    return plural
def pluralize_adjective(lemma, target_tags):
    """Inflect the adjective *lemma* for the requested number and gender.

    Despite its name the function is called for singular targets as well, so
    a ``Number`` of ``'Sing'`` never yields a plural form.

    :param lemma: masculine singular (dictionary) form of the adjective.
    :param target_tags: mapping read for the ``'Number'`` (``'Sing'`` /
        ``'Plur'``) and ``'Gender'`` (``'Masc'`` / ``'Fem'``) keys.
    :return: the inflected form, or ``None`` when a vowel-final lemma is
        requested with a number/gender combination that is not handled.
    """
    number = target_tags.get('Number')
    gender = target_tags.get('Gender')
    singular = number == 'Sing'

    # Adjectives in "-e" do not vary in gender; only number matters.
    if lemma.endswith('e'):
        return lemma if singular else lemma + 's'

    if not lemma.endswith(all_vocals):  # consonant-final lemma
        # "-n"/"-s" adjectives carrying a written accent are treated as
        # taking a feminine in "-a" and losing the accent once a syllable is
        # added ("francés" -> "francesa", "franceses", "francesas").
        # NOTE(review): this heuristic is wrong for invariable adjectives
        # such as "común" or "cortés".
        accented_n_s = (lemma.endswith(('n', 's'))
                        and any(vocal in lemma for vocal in stressed_vocals))
        if singular:
            if accented_n_s and gender == 'Fem':
                return multi_replace(tk=(lemma + 'a'))
            return lemma
        if lemma.endswith('z'):
            return lemma[:-1] + 'ces'
        if accented_n_s:
            return multi_replace(tk=(lemma + ('es' if gender == 'Masc' else 'as')))
        return lemma + 'es'

    # From here on the lemma ends in a vowel other than "e".
    if gender == 'Masc':
        if singular:
            return lemma
        if number == 'Plur':
            return lemma + 's'
    elif gender == 'Fem' and number in ('Sing', 'Plur'):
        # Only "-o" switches to "-a"; other vowel endings keep their form.
        feminine = lemma[:-1] + 'a' if lemma.endswith('o') else lemma
        return feminine if singular else feminine + 's'
    return None
def pronouns_and_determinants(lemma, target_tags):
    """Inflect a pronoun or determiner lemma for number and gender.

    * vowel-final lemma: append "s" for ``Plur``, return it unchanged for
      ``Sing``, ``None`` otherwise;
    * lemma ending in "l": append "lo" / "los" / "la" / "las" according to
      number and gender, ``None`` for any other combination;
    * any other consonant-final lemma: append "es".

    :param lemma: base form to inflect.
    :param target_tags: mapping read for the ``'Number'`` and ``'Gender'`` keys.
    :return: the inflected form or ``None``.
    """
    if lemma.endswith(all_vocals):
        number = target_tags.get('Number')
        if number == 'Plur':
            return lemma + 's'
        if number == 'Sing':
            return lemma
        return None

    if not lemma.endswith('l'):
        return lemma + 'es'

    number = target_tags.get('Number')
    gender = target_tags.get('Gender')
    endings = (
        ('Sing', 'Masc', 'lo'),
        ('Plur', 'Masc', 'los'),
        ('Sing', 'Fem', 'la'),
        ('Plur', 'Fem', 'las'),
    )
    for wanted_number, wanted_gender, ending in endings:
        if number == wanted_number and gender == wanted_gender:
            return lemma + ending
    return None
def inflect_noun_adj_pron_det(lemma: str, target_pos: str, target_tags: dict) -> 'str | None':
    """Inflect a noun, adjective, pronoun or determiner lemma.

    Dispatches on *target_pos*:

    * ``NOUN`` / ``PROPN``: the lemma itself for ``Number == 'Sing'``, the
      result of ``pluralize_noun`` for ``'Plur'``;
    * ``ADJ``: ``pluralize_adjective``;
    * ``DET`` / ``PRON``: ``pronouns_and_determinants``.

    :param lemma: base form to inflect.
    :param target_pos: part-of-speech tag of the target word.
    :param target_tags: morphological features of the target word.
    :return: the inflected form, or ``None`` when the part of speech or tag
        combination is not handled or a helper fails with a lookup error.
    """
    try:
        if target_pos in ('NOUN', 'PROPN'):
            number = target_tags.get('Number')
            if number == 'Sing':
                return lemma
            if number == 'Plur':
                return pluralize_noun(lemma=lemma)
        elif target_pos == 'ADJ':
            return pluralize_adjective(lemma=lemma, target_tags=target_tags)
        elif target_pos in ('DET', 'PRON'):
            return pronouns_and_determinants(lemma=lemma, target_tags=target_tags)
    except (KeyError, IndexError):
        # The helpers index into the lemma and into lookup tables; a lemma
        # they cannot handle is reported as "no form" instead of crashing.
        return None
    return None
def add_inflection(lemma, stem, mood, conjugation, tense, person_number, inflections=VERB_INFLECTIONS):
    """Attach the regular ending for the requested form to a verb.

    :param lemma: infinitive; the conditional and future endings are added to it.
    :param stem: infinitive without its last two letters; used by all other forms.
    :param mood: ``'Cnd'``, ``'Imp'`` or another key of *inflections*.
    :param conjugation: conjugation class (``1``-``3``); ``0`` / ``'0'`` means
        the class is unknown.
    :param tense: tense key; ignored for ``'Cnd'`` and ``'Imp'``.
    :param person_number: person + number key such as ``'1Sing'``.
    :param inflections: nested ending tables, indexed as
        ``[mood][person_number]`` (``Cnd``),
        ``[mood][conjugation][person_number]`` (``Imp``),
        ``['Ind']['Fut'][person_number]`` (any future) or
        ``[mood][tense][conjugation][person_number]`` (everything else).
    :return: the inflected form, or ``None`` for an unknown conjugation class.
    :raises KeyError: when the requested combination is missing from *inflections*.
    """
    if conjugation in ('0', 0):
        return None
    conjugation = str(conjugation)
    if mood == 'Cnd':
        return lemma + inflections[mood][person_number]
    if mood == 'Imp':
        return stem + inflections[mood][conjugation][person_number]
    if tense == 'Fut':
        return lemma + inflections['Ind'][tense][person_number]

    inflection = stem + inflections[mood][tense][conjugation][person_number]
    # Spelling rule of "-ger"/"-gir" verbs: "g" becomes "j" before "o"
    # ("coger" -> "cojo"). First-conjugation verbs keep "go"
    # ("pagar" -> "pago"), so they must be excluded.
    # NOTE(review): second/third-conjugation irregular stems in "-g" would
    # also be rewritten here; confirm they never reach this branch.
    if conjugation != '1' and inflection.endswith('go'):
        inflection = inflection[:-2] + 'jo'
    return inflection
def irregular(lemma, stem, conjugation, mood, tense, person_number, irregular_models=IRREGULAR_MODELS):
    """Conjugate *lemma* according to the irregularity model that lists it.

    Model names have the shape ``"<index>--<inside>--<replacement>"``; the
    numeric index selects how the stem or the regular form is adjusted.

    :param lemma: infinitive of the verb.
    :param stem: infinitive without its two-letter ending.
    :param conjugation: conjugation class forwarded to ``add_inflection``.
    :param mood: ``'Imp'``, ``'Cnd'``, ``'Ind'`` or ``'Sub'``.
    :param tense: tense key; required for ``'Ind'`` and ``'Sub'``.
    :param person_number: person + number key such as ``'3Plur'``.
    :param irregular_models: model tables, indexed by mood (and by tense for
        ``'Ind'`` / ``'Sub'``).
    :return: the irregular form; ``0`` when the mood/tense combination is not
        supported; ``None`` when no model applies to this lemma and
        *person_number*.
    """
    conj = lemma[-2:]
    if mood in ('Imp', 'Cnd'):
        suppletive_models = irregular_models[mood]
    elif mood in ('Ind', 'Sub') and tense:
        suppletive_models = irregular_models[mood][tense]
    else:
        return 0

    def regular_form(current_stem):
        # Regular endings attached to the (possibly rewritten) stem.
        return add_inflection(lemma=current_stem + conj, stem=current_stem, mood=mood,
                              conjugation=conjugation, tense=tense, person_number=person_number)

    for model, verbs_and_tags in suppletive_models.items():
        verbs = verbs_and_tags['verbs']
        affected_tags = verbs_and_tags['affected_tags']
        if not (lemma in verbs and person_number in affected_tags):
            continue

        index, inside, *replacement = model.split('--')
        replacement = replacement[0] if len(replacement) == 1 else []
        kind = int(index)

        if kind in (1, 11):
            # Replace in the reversed stem so the occurrence nearest the end
            # is the one hit.
            # NOTE(review): `inside` itself is not reversed, so this matches
            # as intended only when it reads the same in both directions
            # (e.g. a single character).
            stem = stem[::-1].replace(inside, replacement[::-1], 1)[::-1]
            if lemma == 'jugar':
                stem = 'jueg'
            return regular_form(stem)

        if kind == 7:
            stem = stem[::-1].replace(inside, replacement[::-1], 1)[::-1]
            inflection = regular_form(stem)
            if (tense == 'Past' and mood == 'Ind') or (tense == 'Imp' and mood == 'Sub'):
                # Ending clean-ups after a stem in "-uj", applied in this order.
                clean_ups = (
                    ('ují', 'uje'),
                    ('ujió', 'ujo'),
                    ('uji', 'uj'),
                    ('ujié', 'ujé'),
                    ('jm', 'jim'),
                    ('js', 'jis'),
                )
                for old, new in clean_ups:
                    inflection = inflection.replace(old, new)
            return inflection

        if kind == 9:
            if ((tense == 'Pres' or tense == 'Past') and mood == 'Ind') or mood == 'Sub':
                stem = stem + 'y'
            return regular_form(stem).replace('uyi', 'uy').replace('uyi', 'uy')

        if kind == 8:
            return regular_form(stem).replace('ñi', 'ñ').replace('lli', 'll')

        if kind == 10:
            # Fully listed forms: looked up per verb and person/number.
            return verbs.get(lemma).get(person_number)

    return None
def conjugate_final_form(lemma, stem, conjugation, mood, tense, person_number):
    """Conjugate a finite verb form, preferring the irregular models.

    Only the part of *lemma* after its last underscore is conjugated. Verbs
    listed in ``ALL_IRREGULAR_VERBS`` go through ``irregular`` first:
    a ``0`` result is reported as ``None``, and a ``None`` result falls back
    to the regular endings from ``add_inflection``.

    :param lemma: infinitive, optionally prefixed with ``"<something>_"``.
    :param stem: infinitive stem.
    :param conjugation: conjugation class.
    :param mood: mood key.
    :param tense: tense key.
    :param person_number: person + number key.
    :return: the conjugated form or ``None``.
    """
    lemma = lemma.rpartition('_')[-1]

    if lemma in ALL_IRREGULAR_VERBS:
        special = irregular(lemma=lemma, stem=stem, conjugation=conjugation,
                            mood=mood, tense=tense, person_number=person_number)
        if special is not None:
            return None if special == 0 else special

    return add_inflection(lemma=lemma, stem=stem, conjugation=conjugation,
                          mood=mood, tense=tense, person_number=person_number)
def past_participle(lemma, conjugation, gender_number, irregular_model=IRREGULAR_MODELS):
    """Build the past participle of *lemma* in the requested gender/number.

    Verbs listed under ``['Participle']['10--super--exception']['verbs']``
    use the listed form with its last letter replaced by the gender/number
    ending; all other verbs get "-ad-" (first conjugation) or "-id-".

    :param lemma: infinitive of the verb.
    :param conjugation: conjugation class; ``1`` selects "-ad-".
    :param gender_number: ``'MascSing'``, ``'MascPlur'`` or ``'FemSing'``;
        any other value selects the ending "as".
    :param irregular_model: irregular verb tables.
    :return: the participle.
    """
    model = irregular_model['Participle']['10--super--exception']['verbs']
    if lemma in model:
        # Drop the final letter of the listed form; the ending is re-added below.
        stem = model.get(lemma)[:-1]
        suffix = ''
    else:
        stem = lemma[:-2]
        if conjugation == 1:
            suffix = 'ad'
        elif stem.endswith(('a', 'e', 'o')):
            # "i" after a/e/o is stressed and needs a written accent:
            # "leer" -> "leído", "caer" -> "caído", "oír" -> "oído".
            suffix = 'íd'
        else:
            suffix = 'id'

    if gender_number == 'MascSing':
        ending = 'o'
    elif gender_number == 'MascPlur':
        ending = 'os'
    elif gender_number == 'FemSing':
        ending = 'a'
    else:
        ending = 'as'

    participle = stem + suffix + ending
    return participle.replace('iid', 'id')
def conjugate_complex_tenses(verb_lemma, verb_conjugation, haber_mood, haber_tense, haber_person_number):
    """Build a compound tense: conjugated "haber" + masculine singular participle.

    :param verb_lemma: infinitive of the main verb.
    :param verb_conjugation: conjugation class of the main verb.
    :param haber_mood: mood of the auxiliary.
    :param haber_tense: tense of the auxiliary.
    :param haber_person_number: person + number key of the auxiliary.
    :return: ``"<auxiliary> <participle>"``, or ``None`` when the auxiliary
        cannot be conjugated for the requested form.
    """
    verb_past_participle = past_participle(lemma=verb_lemma, conjugation=verb_conjugation,
                                           gender_number='MascSing')
    aux_verb = conjugate_final_form(lemma='haber', stem='hab', conjugation=2,
                                    mood=haber_mood, tense=haber_tense,
                                    person_number=haber_person_number)
    # conjugate_final_form can return None; formatting it would leak the
    # literal text "None" into the result.
    if aux_verb is None:
        return None
    return f'{aux_verb} {verb_past_participle}'
def gerund_simple(lemma, stem, conjugation):
    """Build a gerund with the regular endings.

    First-conjugation verbs take "ando". Other verbs take "yendo" when the
    stem ends in a vowel or the verb is "ir", and "iendo" otherwise.

    :param lemma: infinitive of the verb.
    :param stem: stem the ending is attached to.
    :param conjugation: conjugation class; ``1`` selects "ando".
    :return: the gerund.
    """
    needs_y = stem.endswith(all_vocals) or lemma == 'ir'
    if conjugation == 1:
        return stem + 'ando'
    if needs_y:
        return stem + 'yendo'
    return stem + 'iendo'
def gerund(lemma, irregular_models=IRREGULAR_MODELS['Gerund']):
    """Build the gerund of *lemma*, honouring the irregular gerund models.

    Verbs absent from ``ALL_IRREGULAR_VERBS``, or not listed in any gerund
    model, get the regular gerund. Model names have the shape
    ``"<index>--<inside>--<replacement>"``:

    * index 10: the gerund is stored verbatim in the model;
    * index 8: regular gerund with "ñi" -> "ñ" and "lli" -> "ll";
    * index 9: regular gerund with "iyendo" -> "iendo";
    * any other index: *inside* is replaced by *replacement* in the stem
      first, then as for index 9.

    :param lemma: infinitive of the verb.
    :param irregular_models: mapping of model name to its entry.
    :return: the gerund.
    """
    stem = lemma[:-2]
    conjugation = {'ar': 1, 'er': 2, 'ir': 3, 'ír': 3}.get(lemma[-2:], 0)

    if lemma in ALL_IRREGULAR_VERBS:
        for model, entry in irregular_models.items():
            if lemma not in entry['verbs']:
                continue
            index, inside, *replacement = model.split('--')
            replacement = replacement[0] if len(replacement) == 1 else []
            kind = int(index)

            if kind == 10:
                return entry['verbs'][lemma]
            if kind == 8:
                regular = gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
                return regular.replace('ñi', 'ñ').replace('lli', 'll')
            if kind != 9:
                # Replace in the reversed stem so the occurrence nearest the
                # end is the one hit.
                stem = stem[::-1].replace(inside, replacement[::-1], 1)[::-1]
            adjusted = gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
            return adjusted.replace('iyendo', 'iendo')

    return gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
def verb_inflexions(lemma, target_pos, target_tags):
    """Produce the verb form of *lemma* described by *target_tags*.

    Only the part of *lemma* after its last underscore is inflected. The
    conjugation class comes from the infinitive ending ("ar" -> 1,
    "er" -> 2, "ir"/"ír" -> 3, anything else -> 0).

    :param lemma: infinitive, optionally prefixed with ``"<something>_"``.
    :param target_pos: ``'VERB'``, ``'AUX'``, ``'ADJ'`` or ``'phrase'``; any
        other value yields ``None``.
    :param target_tags: mapping read for ``'Person'``, ``'Number'``,
        ``'Gender'``, ``'VerbForm'``, ``'Mood'`` and ``'Tense'``.
    :return: the inflected form, or ``None`` for an unsupported part of
        speech or ``VerbForm``.
    """
    lemma = lemma.rpartition('_')[-1]
    stem, infinitive_ending = lemma[:-2], lemma[-2:]
    conjugation = {'ar': 1, 'er': 2, 'ir': 3, 'ír': 3}.get(infinitive_ending, 0)

    number = str(target_tags.get('Number'))
    person_number = str(target_tags.get('Person')) + number
    gender_number = str(target_tags.get('Gender')) + number
    verb_form = target_tags.get('VerbForm')
    mood = target_tags.get('Mood')
    tense = target_tags.get('Tense')

    if target_pos not in ('VERB', 'AUX', 'ADJ', 'phrase'):
        return None

    if verb_form == 'Inf':
        return lemma
    if verb_form == 'Ger':
        return gerund(lemma=lemma)
    if verb_form == 'Fin':
        return conjugate_final_form(lemma=lemma, stem=stem, conjugation=conjugation,
                                    mood=mood, tense=tense, person_number=person_number)
    if verb_form == 'Part':
        return past_participle(lemma=lemma, conjugation=conjugation, gender_number=gender_number)
    if verb_form == 'Compuesto':
        return conjugate_complex_tenses(verb_lemma=lemma, verb_conjugation=conjugation,
                                        haber_mood=mood, haber_tense=tense,
                                        haber_person_number=person_number)
    return None
def inflect(lemma, target_pos, target_tags):
    """Inflect *lemma* so that it matches the target part of speech and tags.

    Verbs, adjectives tagged ``VerbForm == 'Part'`` and phrases tagged
    ``VerbForm == 'Compuesto'`` are handled by ``verb_inflexions``; nouns,
    proper nouns, remaining adjectives, determiners and pronouns by
    ``inflect_noun_adj_pron_det``.

    :param lemma: base form to inflect.
    :param target_pos: part-of-speech tag of the target word.
    :param target_tags: morphological features of the target word.
    :return: the inflected form, or ``None`` for any other part of speech.
    """
    verb_form = target_tags.get('VerbForm')
    # NOTE(review): 'AUX' is not routed to verb_inflexions here although that
    # function accepts it; confirm this is intentional.
    verbal = (
        target_pos == 'VERB'
        or (target_pos, verb_form) in (('ADJ', 'Part'), ('phrase', 'Compuesto'))
    )
    if verbal:
        return verb_inflexions(lemma=lemma, target_pos=target_pos, target_tags=target_tags)
    if target_pos in ('NOUN', 'PROPN', 'ADJ', 'DET', 'PRON'):
        return inflect_noun_adj_pron_det(lemma=lemma, target_pos=target_pos, target_tags=target_tags)
    return None