---
datasets:
- alexjerpelea/AroTranslate-rup-ron-dataset
language:
- ro
license: cc-by-nc-4.0
tags:
- aromanian
- macedo-romanian
---

This is, to the author's knowledge, the first coherent Aromanian translator.

It is a [NLLB-200-600M](https://huggingface.co/facebook/nllb-200-distilled-600M) model fine-tuned for translating between Aromanian and Romanian, using this [dataset](https://huggingface.co/datasets/alexjerpelea/aromanian-romanian-MT-corpus).

Read more about AroTranslate at [this GitHub repository](https://github.com/lolismek/AroTranslate.git).

We present the following results:

| | ron -> rup | rup -> ron |
|:----|:-----|:-----|
| BLEU | 35.31 | 54.69 |
| ChrF2++ | 61.27 | 68.87 |

Note:
* As Aromanian does not have a standard writing system, please see code below for text normalization.
* For Romanian text, it is important to use diacritics for best translation results.

How to use the model:

```py
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, NllbTokenizer
import re

# load model and tokenizer:
model = AutoModelForSeq2SeqLM.from_pretrained('alexjerpelea/NLLB-aromanian-romanian-v1')
tokenizer = AutoTokenizer.from_pretrained('alexjerpelea/NLLB-aromanian-romanian-v1')

# translate function:
def translate(
    text, src_lang='ron_Latn', tgt_lang='rup_Latn',
    a=32, b=3, max_input_length=1024, num_beams=4, **kwargs
):
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(
        text, return_tensors='pt', padding=True, truncation=True,
        max_length=max_input_length
    )
    model.eval()
    result = model.generate(
        **inputs.to(model.device),
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
        num_beams=num_beams, **kwargs
    )
    return tokenizer.batch_decode(result, skip_special_tokens=True)

def clean_text(text, lang):
    if isinstance(text, float):
        return text

    # consecutive spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # old romanian î in the middle of the word
    text = re.sub(r'(?<=\w)î(?=\w)', 'â', text)

    if lang == 'ron':
        text = text.replace('Ş', 'Ș')
        text = text.replace('ş', 'ș')
        text = text.replace('Ţ', 'Ț')
        text = text.replace('ţ', 'ț')
    else:
        text = text.replace('ş', 'sh')
        text = text.replace('ș', 'sh')
        text = text.replace('ţ', 'ts')
        text = text.replace('ț', 'ts')
        text = text.replace('Ş', 'Sh')
        text = text.replace('Ș', 'Sh')
        text = text.replace('Ţ', 'Ts')
        text = text.replace('Ț', 'Ts')

        text = text.replace('ľ', 'lj')
        text = text.replace('Ľ', 'L')
        text = text.replace("l'", "lj")
        text = text.replace("l’", "lj")
        text = text.replace("L'", "Lj")
        text = text.replace("L’", "Lj")

        text = text.replace('ḑ', 'dz')
        text = text.replace('Ḑ', 'dz')
        text = text.replace('ḍ', 'dz')
        text = text.replace('Ḍ', 'Dz')

        # TODO: add n'
        text = text.replace('ń', 'nj')
        text = text.replace('Ń', 'Nj')
        text = text.replace('ñ', 'nj')
        text = text.replace('Ñ', 'Nj')

        text = text.replace('ă', 'ã')
        text = text.replace('Â', 'Ã')
        text = text.replace('â', 'ã')
        text = text.replace('Ă', 'Ã')
        text = text.replace('á', 'ã')
        text = text.replace('à', 'ã')
        text = text.replace('Á', 'Ã')
        text = text.replace('À', 'Ã')
        text = text.replace('Î', 'Ã')
        text = text.replace('î', 'ã')

        # weird foreign characters
        text = text.replace('ŭ', 'u')
        text = text.replace('ς', 'c')
        text = text.replace('é', 'e')
        text = text.replace('í', 'i')
        text = text.replace('ū', 'u')
        text = text.replace('ì', 'i')
        text = text.replace('ā', 'a')
        text = text.replace('ĭ', 'i')
        text = text.replace('γ', 'y')
        text = text.replace('ï', 'i')
        text = text.replace('ó', 'o')
        text = text.replace('θ', 'O')

    # for both languages:
    text = text.replace('—', '-')
    text = text.replace('–', '-')
    text = text.replace('…', '...')
    text = text.replace('*', '')
    text = text.replace('<', '')
    text = text.replace('>', '')
    text = text.replace('„', '"')
    text = text.replace('”', '"')
    text = text.replace('“', '"')
    text = text.replace('”', '"')
    text = text.replace('\xa0', '')
    text = text.replace('\ufeff', '')
    text = text.replace('\n', '')

    return text

# Aromanian to Romanian:
t = '''Trã atsea cãdzu pri mare cripare, shi tutã dzua stãtea ãnvirinat.'''
t = clean_text(t, 'rup')
print(translate(t, 'rup_Latn', 'ron_Latn'))

# Romanian to Aromanian:
t = '''Apoi se opri puțin, o sorbi din ochi, o sărută și - când începu să scâncească, îi cântă iar:'''
t = clean_text(t, 'ron')
print(translate(t, 'ron_Latn', 'rup_Latn'))
```

## License

Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License. When using this work, please mention its name as "AroTranslate" and the author.