Spaces:

a-v-bely
/

spanish-task-generator

Sleeping

spanish-task-generator / utilities_language_general / morphology.py

a-v-bely

Fix morphology.py

adf7190 about 1 year ago

14.4 kB

	import json

	# Character classes shared by the pluralisation and conjugation rules below.
	unstressed_vocals = ('a', 'i', 'e', 'o', 'u')
	# Same order as `unstressed_vocals`, so the two can be paired position by position.
	stressed_vocals = ('á', 'í', 'é', 'ó', 'ú')
	all_vocals = (*unstressed_vocals, *stressed_vocals)
	# Final vowels after which `pluralize_noun` simply appends "s".
	vocals_s = (*unstressed_vocals, 'á', 'é', 'ó')
	all_consonants = tuple('bcdfghjklmnñpqrstvwxzy')
	# Final consonants after which `pluralize_noun` appends "es".
	consonants_es = ('l', 'r', 'n', 'd', 'z', 'j', 's', 'x', 'h')
	# Every remaining consonant, kept in the order of `all_consonants`.
	consonants_s = tuple(filter(lambda cons: cons not in consonants_es, all_consonants))

	# Load the JSON resources once, at import time.
	# NOTE(review): the paths are relative, so they resolve against the current
	# working directory of the process - confirm this matches how the app is started.
	with open('language_data/inflexions.json', 'r', encoding='utf-8') as f:
	    # Regular verb endings, indexed by mood / tense / conjugation / person+number.
	    VERB_INFLECTIONS = json.load(f)
	with open('language_data/irregular_verbs_list.json', 'r', encoding='utf-8') as f:
	    # Collection of lemmas that must be checked against the irregular models.
	    ALL_IRREGULAR_VERBS = json.load(f)
	with open('language_data/irregular_verbs.json', 'r', encoding='utf-8') as f:
	    # Irregular conjugation models, keyed by "index--inside--replacement" strings.
	    IRREGULAR_MODELS = json.load(f)


	def multi_replace(tk, seq1=stressed_vocals, seq2=unstressed_vocals):
	    """Replace every occurrence of each item of ``seq1`` in ``tk`` with its partner in ``seq2``.

	    With the default arguments this strips the acute accents from the vowels
	    of ``tk``.

	    :param tk: string to transform.
	    :param seq1: substrings to look for.
	    :param seq2: replacements, paired with ``seq1`` by position.
	    :return: the transformed string.

	    The sequences are paired with ``zip``, so items beyond the shorter of the
	    two are ignored; replacements are applied one after another, in order.
	    """
	    for old, new in zip(seq1, seq2):
	        tk = tk.replace(old, new)
	    return tk


	def pluralize_noun(lemma):
	    """Return the plural of a Spanish noun lemma, or None when no rule applies.

	    The ending is chosen from the final letters of ``lemma`` ("s", "es",
	    "z" -> "ces", or unchanged), after which the written accent is adjusted:

	    * lemmas in -n/-s whose last vowel is accented lose every accent;
	    * unaccented lemmas in -n with at least three vowels get an accent on
	      the third vowel from the end of the plural.

	    :param lemma: singular form of the noun.
	    :return: the plural form, or None for an empty lemma or one that matches
	        no rule.
	    """
	    if not lemma:
	        # Nothing to pluralise; also keeps the index lookups below safe.
	        return None
	    current_vocals = [char for char in lemma if char in all_vocals]
	    # Second-to-last character; '' for one-letter lemmas (matches no character class).
	    penultimate = lemma[-2] if len(lemma) >= 2 else ''
	    if lemma.endswith(vocals_s):
	        plural = lemma + 's'
	    elif lemma.endswith(('í', 'ú')):
	        plural = lemma + ('es' if lemma == 'sí' else 's')
	    elif (lemma.endswith(tuple('lrndzj')) and len(current_vocals) >= 3
	          and current_vocals[-3] in stressed_vocals):
	        # Accent on the third vowel from the end: the form is left unchanged.
	        plural = lemma
	    elif (lemma.endswith(('s', 'x')) and len(current_vocals) > 1
	          and current_vocals[-1] in unstressed_vocals):
	        # Several vowels, last one unaccented, ending in -s/-x: unchanged.
	        plural = lemma
	    elif ((penultimate in all_vocals and lemma.endswith('y'))
	          or lemma.endswith(consonants_es) or lemma.endswith('ch')):
	        if lemma.endswith('z'):
	            plural = lemma[:-1] + 'ces'
	        else:
	            plural = lemma + 'es'
	    elif ((lemma[-1] in all_consonants and penultimate in all_consonants)
	          or (penultimate in all_vocals and lemma.endswith(consonants_s))):
	        if lemma == 'club':
	            return 'clubes'
	        elif lemma == 'álbum':
	            return 'álbumes'
	        elif lemma.endswith(('st', 'zt')):
	            plural = lemma
	        else:
	            plural = lemma + 's'
	    else:
	        return None
	    if lemma.endswith(('n', 's')) and current_vocals and current_vocals[-1] in stressed_vocals:
	        # The added syllable makes the written accent unnecessary.
	        plural = multi_replace(plural)
	    elif (lemma.endswith('n') and len(current_vocals) >= 3
	          and all(vocal in unstressed_vocals for vocal in current_vocals)):
	        accented = {'a': 'á', 'e': 'é', 'i': 'í', 'o': 'ó', 'u': 'ú'}
	        vowel_positions = [pos for pos, char in enumerate(plural) if char in all_vocals]
	        # Accent only this one vowel: str.replace would also change earlier
	        # identical vowels ("gravamen" must give "gravámenes").
	        target = vowel_positions[-3]
	        plural = plural[:target] + accented[plural[target]] + plural[target + 1:]
	    return plural


	def pluralize_adjective(lemma, target_tags):
	    """Inflect a Spanish adjective lemma for the requested gender and number.

	    :param lemma: base form of the adjective.
	    :param target_tags: mapping read through ``.get`` for the keys
	        ``'Number'`` ('Sing' / 'Plur') and ``'Gender'`` ('Masc' / 'Fem').
	    :return: the inflected form, or None when the lemma ends in a vowel other
	        than "e" and the tags match none of the four gender/number pairs.

	    NOTE(review): lemmas ending in "e" or in a consonant are always returned
	    in the plural, whatever ``'Number'`` says - confirm callers rely on that.
	    """
	    if lemma.endswith('e'):
	        return lemma + 's'
	    if not lemma.endswith(all_vocals):  # the lemma ends in a consonant
	        if lemma.endswith('z'):
	            return lemma[:-1] + 'ces'
	        if lemma.endswith(('n', 's')) and any(vocal in lemma for vocal in stressed_vocals):
	            # The added syllable makes the written accent unnecessary.
	            ending = 'es' if target_tags.get('Gender') == 'Masc' else 'as'
	            return multi_replace(tk=lemma + ending)
	        return lemma + 'es'
	    # From here on the lemma ends in a vowel other than "e".
	    number = target_tags.get('Number')
	    gender = target_tags.get('Gender')
	    if number == 'Sing' and gender == 'Masc':
	        return lemma
	    if number == 'Plur' and gender == 'Masc':
	        return lemma + 's'
	    if number == 'Sing' and gender == 'Fem':
	        if lemma.endswith('a'):
	            return lemma
	        if lemma.endswith('o'):
	            # Swap the masculine ending instead of appending to it ("bueno" -> "buena").
	            return lemma[:-1] + 'a'
	        return lemma + 'a'
	    if number == 'Plur' and gender == 'Fem':
	        return lemma[:-1] + 'as'
	    return None


	def pronouns_and_determinants(lemma, target_tags):
	    """Inflect a pronoun or determiner lemma for the requested gender and number.

	    :param lemma: base form.
	    :param target_tags: mapping read through ``.get`` for ``'Number'`` and ``'Gender'``.
	    :return: the inflected form, or None when the tags match no rule for the
	        lemma's ending.
	    """
	    if lemma.endswith(all_vocals):
	        # Vowel-final lemmas only vary in number.
	        number = target_tags.get('Number')
	        if number == 'Plur':
	            return lemma + 's'
	        if number == 'Sing':
	            return lemma
	        return None
	    if lemma.endswith('l'):
	        # Lemmas in -l take an "l" plus a gender/number ending.
	        number = target_tags.get('Number')
	        gender = target_tags.get('Gender')
	        if gender == 'Masc':
	            if number == 'Sing':
	                return lemma + 'lo'
	            if number == 'Plur':
	                return lemma + 'los'
	        elif gender == 'Fem':
	            if number == 'Sing':
	                return lemma + 'la'
	            if number == 'Plur':
	                return lemma + 'las'
	        return None
	    # Any other consonant ending.
	    return lemma + 'es'


	def inflect_noun_adj_pron_det(lemma: str, target_pos: str, target_tags: dict) -> 'str | None':
	    """Dispatch a nominal lemma to the inflection routine for its part of speech.

	    :param lemma: base form of the word.
	    :param target_pos: 'NOUN', 'PROPN', 'ADJ', 'DET' or 'PRON'; anything else
	        yields None.
	    :param target_tags: morphological tags, read through ``.get``.
	    :return: the inflected form, or None when the word cannot be inflected.
	    """
	    try:
	        if target_pos in ('NOUN', 'PROPN'):
	            number = target_tags.get('Number')
	            if number == 'Sing':
	                return lemma
	            if number == 'Plur':
	                return pluralize_noun(lemma=lemma)
	        elif target_pos == 'ADJ':
	            return pluralize_adjective(lemma=lemma, target_tags=target_tags)
	        elif target_pos in ('DET', 'PRON'):
	            return pronouns_and_determinants(lemma=lemma, target_tags=target_tags)
	    except (KeyError, IndexError):
	        # Best effort: a lemma the rule tables or index lookups cannot handle
	        # is reported as "no form" instead of crashing the caller.
	        return None
	    return None


	def add_inflection(lemma, stem, mood, conjugation, tense, person_number, inflections=VERB_INFLECTIONS):
	    """Attach the regular ending for the requested form to a verb.

	    :param lemma: infinitive; conditional and future endings are appended to it.
	    :param stem: infinitive without its two-letter ending; used by all other forms.
	    :param mood: 'Cnd', 'Imp', or a mood used as the first key of ``inflections``.
	    :param conjugation: conjugation class as int or str; 0 / '0' means "not a verb".
	    :param tense: tense key; 'Fut' always uses the indicative future endings.
	    :param person_number: person+number key such as '1Sing'.
	    :param inflections: nested table of endings.
	    :return: the inflected form, or None when ``conjugation`` is 0.
	    :raises KeyError: when the table has no entry for the requested form.
	    """
	    if conjugation in ('0', 0):
	        return None
	    conjugation = str(conjugation)
	    if mood == 'Cnd':
	        return lemma + inflections[mood][person_number]
	    if mood == 'Imp':
	        return stem + inflections[mood][conjugation][person_number]
	    if tense == 'Fut':
	        return lemma + inflections['Ind'][tense][person_number]
	    inflection = stem + inflections[mood][tense][conjugation][person_number]
	    # "g" is written "j" before "o" only in -er/-ir verbs ("coger" -> "cojo").
	    # First-conjugation verbs keep their "g" ("pagar" -> "pago").
	    if conjugation != '1' and inflection.endswith('go'):
	        inflection = inflection[:-2] + 'jo'
	    return inflection


	def irregular(lemma, stem, conjugation, mood, tense, person_number, irregular_models=IRREGULAR_MODELS):
	    """Build the form of an irregular verb from the matching irregularity model.

	    Model keys have the shape ``index--inside--replacement``; ``index``
	    selects the kind of adjustment applied to the stem or the finished form.

	    :return: the inflected form; ``0`` when the mood/tense combination is not
	        covered; None when no model with a handled index lists this lemma
	        and person/number.
	    """
	    conj = lemma[-2:]
	    if mood in ('Imp', 'Cnd'):
	        suppletive_models = irregular_models[mood]
	    elif mood in ('Ind', 'Sub') and tense:
	        suppletive_models = irregular_models[mood][tense]
	    else:
	        return 0

	    def regular_form(new_stem):
	        # Regular ending attached to an (optionally altered) stem.
	        return add_inflection(lemma=new_stem + conj, stem=new_stem, mood=mood, conjugation=conjugation,
	                              tense=tense, person_number=person_number)

	    for model, entry in suppletive_models.items():
	        verbs = entry['verbs']
	        affected_tags = entry['affected_tags']
	        if not (lemma in verbs and person_number in affected_tags):
	            continue
	        index, inside, *replacement = model.split('--')
	        replacement = replacement[0] if len(replacement) == 1 else []
	        model_id = int(index)
	        if model_id in (1, 11):
	            # Working on the reversed stem makes replace() hit the occurrence nearest the end.
	            stem = stem[::-1].replace(inside, replacement[::-1], 1)[::-1]
	            if lemma == 'jugar':
	                stem = 'jueg'
	            return regular_form(stem)
	        if model_id == 7:
	            stem = stem[::-1].replace(inside, replacement[::-1], 1)[::-1]
	            inflection = regular_form(stem)
	            if (tense == 'Past' and mood == 'Ind') or (tense == 'Imp' and mood == 'Sub'):
	                # Spelling repairs, applied strictly in this order.
	                for old, new in (('ují', 'uje'), ('ujió', 'ujo'), ('uji', 'uj'),
	                                 ('ujié', 'ujé'), ('jm', 'jim'), ('js', 'jis')):
	                    inflection = inflection.replace(old, new)
	            return inflection
	        if model_id == 9:
	            if ((tense == 'Pres' or tense == 'Past') and mood == 'Ind') or mood == 'Sub':
	                stem = stem + 'y'
	            inflection = regular_form(stem)
	            return inflection.replace('uyi', 'uy').replace('uyi', 'uy')
	        if model_id == 8:
	            inflection = regular_form(stem)
	            return inflection.replace('ñi', 'ñ').replace('lli', 'll')
	        if model_id == 10:
	            # Fully suppletive: the form is stored verbatim in the data.
	            return verbs.get(lemma).get(person_number)
	    return None


	def conjugate_final_form(lemma, stem, conjugation, mood, tense, person_number):
	    """Return the finite form of a verb, trying the irregular models first.

	    :param lemma: infinitive; for an underscore-joined lemma only the last part is used.
	    :return: the conjugated form, or None when it cannot be built.
	    """
	    lemma = lemma.split('_')[-1]
	    form_args = dict(lemma=lemma, stem=stem, conjugation=conjugation,
	                     mood=mood, tense=tense, person_number=person_number)
	    if lemma not in ALL_IRREGULAR_VERBS:
	        return add_inflection(**form_args)
	    inflection = irregular(**form_args)
	    if inflection is None:
	        # No irregular model covers this form: conjugate regularly.
	        return add_inflection(**form_args)
	    if inflection == 0:
	        # `irregular` reports an unsupported mood/tense with 0.
	        return None
	    return inflection


	def past_participle(lemma, conjugation, gender_number, irregular_model=IRREGULAR_MODELS):
	    """Return the past participle of ``lemma`` with the requested gender/number ending.

	    :param lemma: infinitive.
	    :param conjugation: conjugation class; ``1`` selects "-ad-", anything else "-id-".
	    :param gender_number: 'MascSing', 'MascPlur' or 'FemSing'; any other value
	        gets the "as" ending.
	    :param irregular_model: irregular models table; its listed participles
	        override the regular formation.
	    """
	    exceptions = irregular_model['Participle']['10--super--exception']['verbs']
	    if lemma in exceptions:
	        # Stored irregular participle without its final letter.
	        base = exceptions.get(lemma)[:-1]
	    else:
	        base = lemma[:-2] + ('ad' if conjugation == 1 else 'id')
	    if gender_number == 'MascSing':
	        ending = 'o'
	    elif gender_number == 'MascPlur':
	        ending = 'os'
	    elif gender_number == 'FemSing':
	        ending = 'a'
	    else:
	        ending = 'as'
	    return (base + ending).replace('iid', 'id')


	def conjugate_complex_tenses(verb_lemma, verb_conjugation, haber_mood, haber_tense, haber_person_number):
	    """Build a compound tense: a finite form of "haber" plus the past participle.

	    :param verb_lemma: infinitive of the main verb.
	    :param verb_conjugation: conjugation class of the main verb.
	    :param haber_mood: mood of the auxiliary.
	    :param haber_tense: tense of the auxiliary.
	    :param haber_person_number: person+number key of the auxiliary.
	    :return: "<auxiliary> <participle>", or None when the auxiliary cannot be
	        conjugated for the requested mood/tense.
	    """
	    verb_past_participle = past_participle(lemma=verb_lemma, conjugation=verb_conjugation, gender_number='MascSing')
	    aux_verb = conjugate_final_form(lemma='haber', stem='hab', conjugation=2,
	                                    mood=haber_mood, tense=haber_tense, person_number=haber_person_number)
	    if aux_verb is None:
	        # Do not let a missing auxiliary leak into the result as the text "None".
	        return None
	    return f'{aux_verb} {verb_past_participle}'


	def gerund_simple(lemma, stem, conjugation):
	    """Form a gerund by the regular rules.

	    :param lemma: infinitive (only compared with 'ir').
	    :param stem: stem the ending is attached to.
	    :param conjugation: conjugation class; ``1`` takes "-ando".
	    :return: ``stem`` plus "ando", "yendo" or "iendo".
	    """
	    if conjugation == 1:
	        return stem + 'ando'
	    # After a vowel the ending is spelled "-yendo" ("leer" -> "leyendo"). The
	    # "u" of "gu"/"qu" is silent, so those stems keep "-iendo"
	    # ("distinguir" -> "distinguiendo").
	    ends_in_sounded_vowel = stem.endswith(all_vocals) and not stem.endswith(('gu', 'qu'))
	    if ends_in_sounded_vowel or lemma == 'ir':
	        return stem + 'yendo'
	    return stem + 'iendo'


	def gerund(lemma, irregular_models=IRREGULAR_MODELS['Gerund']):
	    """Return the gerund of ``lemma``, applying an irregular model when one lists it.

	    :param lemma: infinitive.
	    :param irregular_models: gerund models keyed by "index--inside--replacement".
	    """
	    stem = lemma[:-2]
	    ending = lemma[-2:]
	    if ending == 'ar':
	        conjugation = 1
	    elif ending == 'er':
	        conjugation = 2
	    elif ending in ('ir', 'ír'):
	        conjugation = 3
	    else:
	        conjugation = 0
	    if lemma in ALL_IRREGULAR_VERBS:
	        for model, entry in irregular_models.items():
	            if lemma not in entry['verbs']:
	                continue
	            index, inside, *replacement = model.split('--')
	            replacement = replacement[0] if len(replacement) == 1 else []
	            model_id = int(index)
	            if model_id == 9:
	                regular = gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
	                return regular.replace('iyendo', 'iendo')
	            if model_id == 8:
	                regular = gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
	                return regular.replace('ñi', 'ñ').replace('lli', 'll')
	            if model_id == 10:
	                # Fully suppletive: the gerund is stored verbatim in the data.
	                return entry['verbs'][lemma]
	            # Any other model: alter the stem, then form the gerund regularly.
	            stem = stem[::-1].replace(inside, replacement[::-1], 1)[::-1]
	            regular = gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)
	            return regular.replace('iyendo', 'iendo')
	    # Regular verb, or an irregular verb with no gerund model.
	    return gerund_simple(lemma=lemma, stem=stem, conjugation=conjugation)


	def verb_inflexions(lemma, target_pos, target_tags):
	    """Produce the verbal form of ``lemma`` described by ``target_tags``.

	    :param lemma: infinitive; for an underscore-joined lemma only the last part is used.
	    :param target_pos: must be 'VERB', 'AUX', 'ADJ' or 'phrase', otherwise None is returned.
	    :param target_tags: mapping read through ``.get`` for 'Person', 'Number',
	        'Gender', 'VerbForm', 'Mood' and 'Tense'.
	    :return: the requested form, or None for an unsupported POS or verb form.
	    """
	    lemma = lemma.split('_')[-1]
	    stem, ending = lemma[:-2], lemma[-2:]
	    if ending == 'ar':
	        conjugation = 1
	    elif ending == 'er':
	        conjugation = 2
	    elif ending in ('ir', 'ír'):
	        conjugation = 3
	    else:
	        conjugation = 0
	    number = str(target_tags.get('Number'))
	    person_number = str(target_tags.get('Person')) + number
	    gender_number = str(target_tags.get('Gender')) + number
	    verb_form = target_tags.get('VerbForm')
	    mood = target_tags.get('Mood')
	    tense = target_tags.get('Tense')
	    if target_pos not in ('VERB', 'AUX', 'ADJ', 'phrase'):
	        return None
	    if verb_form == 'Inf':
	        return lemma
	    if verb_form == 'Ger':
	        return gerund(lemma=lemma)
	    if verb_form == 'Fin':
	        return conjugate_final_form(lemma=lemma, stem=stem, conjugation=conjugation,
	                                    mood=mood, tense=tense, person_number=person_number)
	    if verb_form == 'Part':
	        return past_participle(lemma=lemma, conjugation=conjugation, gender_number=gender_number)
	    if verb_form == 'Compuesto':
	        return conjugate_complex_tenses(verb_lemma=lemma, verb_conjugation=conjugation, haber_mood=mood,
	                                        haber_tense=tense, haber_person_number=person_number)
	    return None


	def inflect(lemma, target_pos, target_tags):
	    """Entry point: inflect ``lemma`` according to its part of speech and tags.

	    :param lemma: base form of the word.
	    :param target_pos: part-of-speech label.
	    :param target_tags: morphological tags, read through ``.get``.
	    :return: the inflected form, or None for an unsupported part of speech.
	    """
	    verb_form = target_tags.get('VerbForm')
	    # Participles tagged ADJ and compound-tense phrases go through the verb routines.
	    # NOTE(review): 'AUX' is not routed here although verb_inflexions accepts it - confirm intent.
	    is_verbal = (target_pos == 'VERB'
	                 or (target_pos, verb_form) in (('ADJ', 'Part'), ('phrase', 'Compuesto')))
	    if is_verbal:
	        return verb_inflexions(lemma=lemma, target_pos=target_pos, target_tags=target_tags)
	    if target_pos in ('NOUN', 'PROPN', 'ADJ', 'DET', 'PRON'):
	        return inflect_noun_adj_pron_det(lemma=lemma, target_pos=target_pos, target_tags=target_tags)
	    return None