# (Scraped page banner, not code: "Spaces: Sleeping")
import json | |
# Character classes and JSON-backed lookup tables shared by the inflection
# functions in this module.

# Plain vowels and their acute-accented counterparts. The two tuples are kept
# in the same order because multi_replace() pairs them position by position.
unstressed_vocals = ('a', 'i', 'e', 'o', 'u')
stressed_vocals = ('á', 'í', 'é', 'ó', 'ú')
all_vocals = unstressed_vocals + stressed_vocals

# Final vowels after which pluralize_noun() simply appends 's'.
vocals_s = unstressed_vocals + ('á', 'é', 'ó')

all_consonants = tuple('bcdfghjklmnñpqrstvwxzy')
# Final consonants after which pluralize_noun() appends 'es' ...
consonants_es = tuple('lrndzjsxh')
# ... and every remaining consonant, which takes a bare 's'.
consonants_s = tuple(letter for letter in all_consonants if letter not in consonants_es)

# Regular verb endings.
with open('language_data/inflexions.json', encoding='utf-8') as f:
    VERB_INFLECTIONS = json.load(f)

# Collection of lemmas that must be routed through the irregular models.
with open('language_data/irregular_verbs_list.json', encoding='utf-8') as f:
    ALL_IRREGULAR_VERBS = json.load(f)

# Irregularity models, keyed by mood/tense and then by a 'index--inside--replacement' id.
with open('language_data/irregular_verbs.json', encoding='utf-8') as f:
    IRREGULAR_MODELS = json.load(f)
def multi_replace(tk, seq1=stressed_vocals, seq2=unstressed_vocals):
    """Return *tk* with each ``seq1[i]`` replaced by ``seq2[i]``.

    Replacements are applied sequentially, in the order of *seq1*. With the
    default arguments this turns accented vowels into their plain form.

    Raises:
        IndexError: if *seq2* is shorter than *seq1*.
    """
    for position, old in enumerate(seq1):
        tk = tk.replace(old, seq2[position])
    return tk
def pluralize_noun(lemma):
    """Return the plural form of the Spanish noun *lemma*.

    The rules are tried in order; the first one that matches decides the
    suffix, after which the written accent is adjusted:

    1. ends in a, e, i, o, u, á, é, ó            -> + 's'
    2. ends in í / ú                             -> + 's' ('sí' -> 'síes')
    3. ends in l, r, n, d, z, j with a stressed
       third-from-last vowel                     -> unchanged
    4. ends in s / x with an unstressed last
       vowel (and more than one vowel)           -> unchanged
    5. vowel + 'y', or a consonant from
       ``consonants_es``                         -> + 'es' ('z' -> 'ces')
    6. two final consonants, or vowel + a
       consonant from ``consonants_s``           -> + 's' (with a few
       hard-coded exceptions)

    Args:
        lemma: singular noun, lower case.

    Returns:
        The plural as a string, or ``None`` when no rule applies (this
        includes the empty string and isolated letters that match no rule).
    """
    current_vocals = [char for char in lemma if char in all_vocals]
    # Several rules look at lemma[-2]; guard them so that empty or
    # one-character lemmas fall through instead of raising IndexError.
    has_two_chars = len(lemma) >= 2

    if lemma.endswith(vocals_s):
        plural = lemma + 's'
    elif lemma.endswith(('í', 'ú')):
        plural = lemma + 'es' if lemma == 'sí' else lemma + 's'
    elif (lemma.endswith(tuple('lrndzj')) and len(current_vocals) >= 3
          and current_vocals[-3] in stressed_vocals):
        plural = lemma
    elif (lemma.endswith(('s', 'x')) and len(current_vocals) > 1
          and current_vocals[-1] in unstressed_vocals):
        plural = lemma
    elif ((has_two_chars and lemma[-2] in all_vocals and lemma.endswith('y'))
          or lemma.endswith(consonants_es) or lemma.endswith('ch')):
        if lemma.endswith('z'):
            plural = lemma[:-1] + 'ces'
        else:
            plural = lemma + 'es'
    elif has_two_chars and (
            (lemma[-1] in all_consonants and lemma[-2] in all_consonants)
            or (lemma[-2] in all_vocals and lemma.endswith(consonants_s))):
        if lemma == 'club':
            return 'clubes'
        if lemma == 'álbum':
            return 'álbumes'
        if lemma.endswith(('st', 'zt')):
            plural = lemma
        else:
            plural = lemma + 's'
    else:
        return None

    if lemma.endswith(('n', 's')) and current_vocals and current_vocals[-1] in stressed_vocals:
        # The added syllable makes the written accent unnecessary
        # (e.g. 'canción' -> 'canciones'). `current_vocals` is checked first
        # because vowel-less lemmas would otherwise raise IndexError.
        plural = multi_replace(plural)
    elif (lemma.endswith('n') and len(current_vocals) >= 3
          and all(vocal in unstressed_vocals for vocal in current_vocals)):
        # The plural gained a syllable, so the originally stressed vowel is
        # now third from the end and needs a written accent
        # (e.g. 'examen' -> 'exámenes', 'origen' -> 'orígenes').
        accented = {'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú'}
        vowel_positions = [pos for pos, char in enumerate(plural) if char in all_vocals]
        target = vowel_positions[-3]
        # Replace by position: str.replace() would accent every occurrence
        # of that vowel in the word.
        plural = plural[:target] + accented[plural[target]] + plural[target + 1:]
    return plural
def pluralize_adjective(lemma, target_tags):
    """Inflect the adjective *lemma* for the gender/number in *target_tags*.

    Args:
        lemma: adjective lemma (the vowel branch assumes the masculine
            singular form, e.g. 'bueno').
        target_tags: mapping read with ``.get('Number')`` ('Sing'/'Plur') and
            ``.get('Gender')`` ('Masc'/'Fem').

    Returns:
        The inflected form, or ``None`` when the lemma ends in a vowel other
        than 'e' and the tags do not name a known gender/number pair.

    NOTE(review): lemmas ending in 'e' or in a consonant always get a plural
    form back, whatever ``Number`` says. That behaviour is kept as is; confirm
    whether callers rely on it.
    """
    if lemma.endswith('e'):
        return lemma + 's'

    if not lemma.endswith(all_vocals):  # ends with a consonant
        if lemma.endswith('z'):
            return lemma[:-1] + 'ces'
        if lemma.endswith(('n', 's')) and any(vocal in lemma for vocal in stressed_vocals):
            # The extra syllable makes the written accent unnecessary, so it
            # is stripped (e.g. 'alemán' -> 'alemanes' / 'alemanas').
            if target_tags.get('Gender') == 'Masc':
                return multi_replace(tk=(lemma + 'es'))
            return multi_replace(tk=(lemma + 'as'))
        return lemma + 'es'

    # From here on the lemma ends in a vowel other than 'e'.
    number = target_tags.get('Number')
    gender = target_tags.get('Gender')

    if number == 'Sing' and gender == 'Masc':
        return lemma
    if number == 'Plur' and gender == 'Masc':
        return lemma + 's'
    if number == 'Sing' and gender == 'Fem':
        if lemma.endswith('a'):
            return lemma
        if lemma.endswith('o'):
            # Swap the masculine ending instead of appending to it
            # ('bueno' -> 'buena', not 'buenoa').
            return lemma[:-1] + 'a'
        return lemma + 'a'
    if number == 'Plur' and gender == 'Fem':
        return lemma[:-1] + 'as'
    return None
def pronouns_and_determinants(lemma, target_tags):
    """Inflect a pronoun or determiner lemma for number and gender.

    * Vowel-final lemmas take 's' for ``Number == 'Plur'`` and stay unchanged
      for ``Number == 'Sing'``.
    * Lemmas ending in 'l' take 'lo' / 'los' / 'la' / 'las' depending on the
      ``Number`` and ``Gender`` tags.
    * Any other consonant-final lemma takes 'es'.

    Returns ``None`` when the tags do not select one of the forms above.
    """
    if lemma.endswith(all_vocals):
        number = target_tags.get('Number')
        if number == 'Plur':
            return lemma + 's'
        if number == 'Sing':
            return lemma
        return None

    if lemma.endswith('l'):
        number = target_tags.get('Number')
        gender = target_tags.get('Gender')
        if number == 'Sing' and gender == 'Masc':
            return lemma + 'lo'
        if number == 'Plur' and gender == 'Masc':
            return lemma + 'los'
        if number == 'Sing' and gender == 'Fem':
            return lemma + 'la'
        if number == 'Plur' and gender == 'Fem':
            return lemma + 'las'
        return None

    # Consonant-final lemma that does not end in 'l'.
    return lemma + 'es'
def inflect_noun_adj_pron_det(lemma: str, target_pos: str, target_tags: dict) -> 'str | None':
    """Dispatch nominal inflection according to *target_pos*.

    Args:
        lemma: the word to inflect.
        target_pos: 'NOUN', 'PROPN', 'ADJ', 'DET' or 'PRON'; anything else
            yields ``None``.
        target_tags: mapping of morphological features; 'Number' is read here,
            the rest is forwarded to the part-of-speech specific helper.

    Returns:
        The inflected form, or ``None`` when the part of speech / tags are not
        handled or the helper fails on the lemma.
    """
    try:
        if target_pos in ('NOUN', 'PROPN'):
            number = target_tags.get('Number')
            if number == 'Sing':
                return lemma
            if number == 'Plur':
                return pluralize_noun(lemma=lemma)
        elif target_pos == 'ADJ':
            return pluralize_adjective(lemma=lemma, target_tags=target_tags)
        elif target_pos in ('DET', 'PRON'):
            return pronouns_and_determinants(lemma=lemma, target_tags=target_tags)
    except (KeyError, IndexError):
        # The helpers index into lookup tables and into the lemma itself;
        # very short or unexpected lemmas are reported as "no inflection"
        # rather than crashing the caller.
        return None
    return None
def add_inflection(lemma, stem, mood, conjugation, tense, person_number, inflections=VERB_INFLECTIONS):
    """Attach the regular ending for the requested form to *stem* or *lemma*.

    Args:
        lemma: infinitive; conditional and future endings are appended to it.
        stem: infinitive without its two-letter ending; used for all other forms.
        mood: 'Cnd', 'Imp', or a mood key of *inflections* (e.g. 'Ind', 'Sub').
        conjugation: 1, 2 or 3 (int or str); 0 / '0' means "not a verb ending".
        tense: tense key of *inflections*; ignored for 'Cnd' and 'Imp'.
        person_number: key such as '1Sing'.
        inflections: nested mapping of endings.

    Returns:
        The inflected form, or ``None`` when *conjugation* is 0.

    Raises:
        KeyError: if the mood/tense/person combination is not in *inflections*.
    """
    if conjugation in ('0', 0):
        return None
    conjugation = str(conjugation)

    if mood == 'Cnd':
        return lemma + inflections[mood][person_number]
    if mood == 'Imp':
        return stem + inflections[mood][conjugation][person_number]
    if tense == 'Fut':
        # Future endings are always taken from the indicative table.
        return lemma + inflections['Ind'][tense][person_number]

    inflection = stem + inflections[mood][tense][conjugation][person_number]
    # Verbs in -ger/-gir spell the stem with 'j' before 'o' ('coger' -> 'cojo').
    # First-conjugation verbs keep the 'g' ('pagar' -> 'pago'), so the rewrite
    # must not be applied to them.
    if conjugation != '1' and inflection.endswith('go'):
        inflection = inflection[:-2] + 'jo'
    return inflection
def irregular(lemma, stem, conjugation, mood, tense, person_number, irregular_models=IRREGULAR_MODELS):
    """Conjugate *lemma* with the first irregularity model that covers it.

    Models live under ``irregular_models[mood]`` (for 'Imp' / 'Cnd') or
    ``irregular_models[mood][tense]`` (for 'Ind' / 'Sub'). Each model key has
    the shape ``'index--inside--replacement'`` and maps to a dict with the
    affected ``'verbs'`` and ``'affected_tags'``.

    Returns:
        * ``0`` when the mood/tense combination has no model table;
        * ``None`` when no model applies to this lemma and person/number;
        * otherwise the inflected form produced by the matching model.
    """
    conj = lemma[-2:]

    if mood in ('Imp', 'Cnd'):
        suppletive_models = irregular_models[mood]
    elif mood in ('Ind', 'Sub') and tense:
        suppletive_models = irregular_models[mood][tense]
    else:
        return 0

    def regular_form(current_stem):
        # Regular endings applied to a (possibly altered) stem.
        return add_inflection(lemma=current_stem + conj, stem=current_stem, mood=mood,
                              conjugation=conjugation, tense=tense, person_number=person_number)

    for model, verbs_and_tags in suppletive_models.items():
        verbs = verbs_and_tags['verbs']
        affected_tags = verbs_and_tags['affected_tags']
        if lemma not in verbs or person_number not in affected_tags:
            continue

        index, inside, *replacement = model.split('--')
        replacement = replacement[0] if len(replacement) == 1 else []
        model_id = int(index)

        if model_id in (1, 11):
            # Substitute one occurrence of `inside`, searching from the end of
            # the stem (the stem and the replacement are reversed for that).
            stem = stem[::-1].replace(inside, replacement[::-1], 1)[::-1]
            if lemma == 'jugar':
                stem = 'jueg'
            return regular_form(stem)

        if model_id == 7:
            stem = stem[::-1].replace(inside, replacement[::-1], 1)[::-1]
            inflection = regular_form(stem)
            if (tense == 'Past' and mood == 'Ind') or (tense == 'Imp' and mood == 'Sub'):
                # Spelling adjustments for the 'uj' stems.
                inflection = inflection.replace('ují', 'uje')
                inflection = inflection.replace('ujió', 'ujo')
                inflection = inflection.replace('uji', 'uj')
                inflection = inflection.replace('ujié', 'ujé')
                inflection = inflection.replace('jm', 'jim')
                inflection = inflection.replace('js', 'jis')
            return inflection

        if model_id == 9:
            # The stem gains a 'y' in the subjunctive and in the indicative
            # present/past.
            if mood == 'Sub' or (mood == 'Ind' and tense in ('Pres', 'Past')):
                stem = stem + 'y'
            inflection = regular_form(stem)
            return inflection.replace('uyi', 'uy').replace('uyi', 'uy')

        if model_id == 8:
            # The 'i' of the ending is dropped after 'ñ' / 'll'.
            inflection = regular_form(stem)
            return inflection.replace('ñi', 'ñ').replace('lli', 'll')

        if model_id == 10:
            # Fully suppletive forms are stored verbatim per person/number.
            return verbs.get(lemma).get(person_number)
def conjugate_final_form(lemma, stem, conjugation, mood, tense, person_number):
    """Return the finite form of *lemma*, using irregular models when listed.

    Only the part of *lemma* after the last underscore is conjugated. Lemmas
    found in ``ALL_IRREGULAR_VERBS`` go through ``irregular()`` first: a
    ``None`` result falls back to the regular endings, a ``0`` result means
    the form cannot be built and yields ``None``.
    """
    lemma = lemma.split('_')[-1]
    form_args = dict(lemma=lemma, stem=stem, conjugation=conjugation,
                     mood=mood, tense=tense, person_number=person_number)

    if lemma not in ALL_IRREGULAR_VERBS:
        return add_inflection(**form_args)

    inflection = irregular(**form_args)
    if inflection is None:
        # No irregular model covers this person/number: conjugate regularly.
        return add_inflection(**form_args)
    if inflection == 0:
        return None
    return inflection
def past_participle(lemma, conjugation, gender_number, irregular_model=IRREGULAR_MODELS):
    """Return the past participle of *lemma* agreeing with *gender_number*.

    Args:
        lemma: infinitive.
        conjugation: 1 selects the '-ad-' suffix; any other value '-id-'.
        gender_number: 'MascSing', 'MascPlur' or 'FemSing'; every other value
            gets the feminine plural ending.
        irregular_model: mapping holding the irregular participles under
            ``['Participle']['10--super--exception']['verbs']``.
    """
    exceptions = irregular_model['Participle']['10--super--exception']['verbs']

    if lemma in exceptions:
        # Stored irregular participle without its final letter; the
        # gender/number ending is appended below.
        base = exceptions.get(lemma)[:-1]
    else:
        base = lemma[:-2] + ('ad' if conjugation == 1 else 'id')

    endings = {'MascSing': 'o', 'MascPlur': 'os', 'FemSing': 'a'}
    participle = base + endings.get(gender_number, 'as')
    # A stem ending in 'i' followed by '-id-' would double the vowel.
    return participle.replace('iid', 'id')
def conjugate_complex_tenses(verb_lemma, verb_conjugation, haber_mood, haber_tense, haber_person_number):
    """Build a compound tense: a finite form of 'haber' plus the participle.

    Args:
        verb_lemma: infinitive of the main verb.
        verb_conjugation: conjugation class of the main verb (1, 2 or 3).
        haber_mood, haber_tense, haber_person_number: features used to
            conjugate the auxiliary 'haber'.

    Returns:
        ``'<auxiliary> <participle>'`` with the masculine singular participle,
        or ``None`` when the auxiliary cannot be conjugated for the requested
        features.
    """
    verb_past_participle = past_participle(lemma=verb_lemma, conjugation=verb_conjugation,
                                           gender_number='MascSing')
    aux_verb = conjugate_final_form(lemma='haber', stem='hab', conjugation=2,
                                    mood=haber_mood, tense=haber_tense,
                                    person_number=haber_person_number)
    if aux_verb is None:
        # Without this guard the f-string would emit the literal text 'None'.
        return None
    return f'{aux_verb} {verb_past_participle}'
def gerund_simple(lemma, stem, conjugation):
    """Return the regular gerund built on *stem*.

    First-conjugation verbs take '-ando'. Other verbs take '-yendo' when the
    stem ends in a vowel (or the verb is 'ir'), and '-iendo' otherwise.
    """
    if conjugation == 1:
        return stem + 'ando'
    if stem.endswith(all_vocals) or lemma == 'ir':
        return stem + 'yendo'
    return stem + 'iendo'
def gerund(lemma, irregular_models=IRREGULAR_MODELS['Gerund']):
    """Return the gerund of *lemma*, applying irregular models when needed.

    Lemmas not listed in ``ALL_IRREGULAR_VERBS`` get the regular gerund.
    Listed lemmas are looked up in *irregular_models*, whose keys have the
    shape ``'index--inside--replacement'``; if no model lists the lemma the
    regular gerund is returned as well.
    """
    stem = lemma[:-2]
    # Conjugation class from the infinitive ending; 0 when it is not -ar/-er/-ir.
    conjugation = {'ar': 1, 'er': 2, 'ir': 3, 'ír': 3}.get(lemma[-2:], 0)

    if lemma not in ALL_IRREGULAR_VERBS:
        return gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)

    for model, entry in irregular_models.items():
        if lemma not in entry['verbs']:
            continue

        index, inside, *replacement = model.split('--')
        replacement = replacement[0] if len(replacement) == 1 else []
        model_id = int(index)

        if model_id == 9:
            regular = gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
            return regular.replace('iyendo', 'iendo')
        if model_id == 8:
            # The 'i' of the ending is dropped after 'ñ' / 'll'.
            regular = gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
            return regular.replace('ñi', 'ñ').replace('lli', 'll')
        if model_id == 10:
            # Fully irregular gerunds are stored verbatim.
            return entry['verbs'][lemma]

        # Remaining models: substitute one occurrence of `inside`, searching
        # from the end of the stem, then build the regular gerund on it.
        stem = stem[::-1].replace(inside, replacement[::-1], 1)[::-1]
        regular = gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
        return regular.replace('iyendo', 'iendo')

    return gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
def verb_inflexions(lemma, target_pos, target_tags):
    """Inflect the verb *lemma* according to ``target_tags['VerbForm']``.

    Only the part of *lemma* after the last underscore is used. Supported
    verb forms are 'Inf', 'Ger', 'Fin', 'Part' and 'Compuesto'; the 'Mood',
    'Tense', 'Person', 'Number' and 'Gender' tags are forwarded to the
    relevant helper.

    Returns ``None`` when *target_pos* is not one of 'VERB', 'AUX', 'ADJ',
    'phrase', or when the verb form is not supported.
    """
    lemma = lemma.split('_')[-1]
    stem = lemma[:-2]
    # Conjugation class from the infinitive ending; 0 when it is not -ar/-er/-ir.
    conjugation = {'ar': 1, 'er': 2, 'ir': 3, 'ír': 3}.get(lemma[-2:], 0)

    number = str(target_tags.get('Number'))
    person_number = str(target_tags.get('Person')) + number
    gender_number = str(target_tags.get('Gender')) + number
    verb_form = target_tags.get('VerbForm')
    mood = target_tags.get('Mood')
    tense = target_tags.get('Tense')

    if target_pos not in ('VERB', 'AUX', 'ADJ', 'phrase'):
        return None

    if verb_form == 'Inf':
        return lemma
    if verb_form == 'Ger':
        return gerund(lemma=lemma)
    if verb_form == 'Fin':
        return conjugate_final_form(lemma=lemma, stem=stem, conjugation=conjugation,
                                    mood=mood, tense=tense, person_number=person_number)
    if verb_form == 'Part':
        return past_participle(lemma=lemma, conjugation=conjugation, gender_number=gender_number)
    if verb_form == 'Compuesto':
        return conjugate_complex_tenses(verb_lemma=lemma, verb_conjugation=conjugation,
                                        haber_mood=mood, haber_tense=tense,
                                        haber_person_number=person_number)
    return None
def inflect(lemma, target_pos, target_tags):
    """Entry point: inflect *lemma* for the given part of speech and tags.

    Verbal requests ('VERB', participles tagged 'ADJ', and compound tenses
    tagged 'phrase') are routed to ``verb_inflexions``; nouns, proper nouns,
    adjectives, determiners and pronouns to ``inflect_noun_adj_pron_det``.
    Every other part of speech yields ``None``.

    NOTE(review): 'AUX' is accepted by ``verb_inflexions`` but is not routed
    there from here - confirm whether that is intentional.
    """
    verb_form = target_tags.get('VerbForm')

    is_verbal = (
        target_pos == 'VERB'
        or (target_pos == 'ADJ' and verb_form == 'Part')
        or (target_pos == 'phrase' and verb_form == 'Compuesto')
    )
    if is_verbal:
        return verb_inflexions(lemma=lemma, target_pos=target_pos, target_tags=target_tags)

    if target_pos in ('NOUN', 'PROPN', 'ADJ', 'DET', 'PRON'):
        return inflect_noun_adj_pron_det(lemma=lemma, target_pos=target_pos, target_tags=target_tags)

    return None