import gradio as gr
import torch
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

# model_ru_qm_path = 'TSjB/mbart-large-52-ru-qm-v1'
# model_qm_ru_path = 'TSjB/mbart-large-52-qm-ru-v1'
MODEL_PATH = 'TSjB/NLLB-201-600M-QM-V1'

# 2. Models
#tokenizer_ru_qm = MBart50Tokenizer.from_pretrained(model_ru_qm_path)
#tokenizer_qm_ru = MBart50Tokenizer.from_pretrained(model_qm_ru_path)
#model_ru_qm = MBartForConditionalGeneration.from_pretrained(model_ru_qm_path)
#model_qm_ru = MBartForConditionalGeneration.from_pretrained(model_qm_ru_path)
tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)


# 3. Fix tokenizer
def fixTokenizer(tokenizer, new_lang='krc_Cyrl'):
    """
    Add a new language token to the tokenizer vocabulary
    (this should be done each time after its initialization).

    The tokenizer is modified in place: ``new_lang`` is written into its
    language-code lookup tables, the mask token is re-assigned to the last
    id, and the added-token encoder/decoder dictionaries are emptied.

    Args:
        tokenizer: tokenizer object to patch.
            NOTE(review): the attributes touched here (``lang_code_to_id``,
            ``fairseq_tokens_to_ids``, ``sp_model``, ...) are tokenizer
            internals - confirm they exist in the installed transformers
            version.
        new_lang: language code to register.

    Returns:
        None.
    """
    # Vocabulary size not counting `new_lang` if it is already an added token.
    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
    new_lang_id = old_len - 1
    tokenizer.lang_code_to_id[new_lang] = new_lang_id
    tokenizer.id_to_lang_code[new_lang_id] = new_lang

    # always move "mask" to the last position
    # The key has to be the mask token itself: with any other key (the code
    # previously used an empty string) the existing "<mask>" entry keeps its
    # old id instead of moving to the end.
    tokenizer.fairseq_tokens_to_ids["<mask>"] = (
        len(tokenizer.sp_model)
        + len(tokenizer.lang_code_to_id)
        + tokenizer.fairseq_offset
    )

    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {
        v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()
    }
    if new_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(new_lang)

    # clear the added token encoder; otherwise a new token may end up there by mistake
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}


fixTokenizer(tokenizer)
# 4. Change letters
#
# The rule tables below convert between ordinary Cyrillic spelling and the
# spelling used on the model side, where single Latin letters stand for
# Cyrillic letters/digraphs (q <-> къ, g <-> гъ, j <-> дж/ж, w <-> ў,
# n <-> нг, b <-> п/б).  Every rule is a plain substring replacement and the
# rules are applied strictly in the order listed, so later rules see the
# output of earlier ones.
#
# NOTE(review): because of that ordering some rules can no longer match (for
# example the "къамыж"/"къымыж" entries of toModel run after "ж" has already
# been rewritten to "j").  The original order is kept on purpose - confirm the
# intended behaviour before reordering anything.


def _apply_rules(text, rules):
    """Apply ``(old, new)`` substring replacements to *text* in order."""
    for old, new in rules:
        text = text.replace(old, new)
    return text


# Model spelling -> "qrc" dialect spelling.
_FROM_MODEL_QRC = (
    ("тюйюл", "тюл"), ("Тюйюл", "Тюл"),
    ("уку", "гылын qуш"), ("Уку", "Гылын qуш"),
    ("хораз", "гугурукку"), ("Хораз", "Гугурукку"),
    ("юзмез", "qум"), ("Юзмез", "Qум"),
    ("jиля", "jыла"), ("Jиля", "Jыла"),
    ("ярабий", "арабин"), ("арабий", "арабин"),
    ("Ярабий", "Арабин"), ("Арабий", "Арабин"),
    ("нтта", "нтда"), ("ртте", "ртде"),
    ("jамауат", "jамаgат"), ("jамаwат", "jамаgат"),
    ("Jамауат", "Jамаgат"), ("Jамаwат", "Jамаgат"),
    ("шуёх", "шох"), ("Шуёх", "Шох"),
    ("шёндю", "бусаgат"), ("Шёндю", "Бусаgат"),
    ("уgай", "оgай"), ("Уgай", "Оgай"),
    # ("терк", "тез"),
    ("саnа", "сенnе"), ("сеnе", "сенnе"),
    ("Саnа", "Сенnе"), ("Сеnе", "Сенnе"),
    ("маnа", "менnе"), ("меnе", "менnе"),
    ("Маnа", "Менnе"), ("Меnе", "Менnе"),
    ("аяq jол", "jахтана"), ("Аяq jол", "Jахтана"),
    ("сыbат", "сыфат"), ("Сыbат", "Сыфат"),
    # Single model letters back to Cyrillic.
    ("b", "б"),
    ("q", "къ"), ("Q", "Къ"),
    ("g", "гъ"), ("G", "Гъ"),
    ("j", "дж"), ("J", "Дж"),
    ("w", "ў"), ("W", "Ў"),
    ("n", "нг"), ("N", "Нг"),
)

# Rules shared by the "hlm" and "mqr" dialects (everything up to the point
# where the two original branches start to differ).
_FROM_MODEL_HLM_MQR_COMMON = (
    ("тюл", "тюйюл"), ("Тюл", "Тюйюл"),
    ("гылын qуш", "уку"), ("Гылын qуш", "Уку"),
    ("гугурукку", "хораз"), ("Гугурукку", "Хораз"),
    ("qум", "юзмез"), ("Qум", "Юзмез"),
    ("jыла", "jиля"), ("Jыла", "Jиля"),
    ("арабин", "ярабий"), ("арабий", "ярабий"),
    ("Арабин", "Ярабий"), ("Арабий", "Ярабий"),
    ("нтда", "нтта"), ("ртде", "ртте"),
    ("jамаgат", "jамаwат"), ("Jамаgат", "Jамаwат"),
    ("шох", "шуёх"), ("Шох", "Шуёх"),
    ("бусаgат", "шёндю"), ("Бусаgат", "Шёндю"),
    ("оgай", "уgай"), ("Оgай", "Уgай"),
    ("тез", "терк"),
    ("сенnе", "саnа"), ("сеnе", "саnа"),
    ("Сенnе", "Саnа"), ("Сеnе", "Саnа"),
    ("менnе", "маnа"), ("меnе", "маnа"),
    ("Менnе", "Маnа"), ("Меnе", "Маnа"),
    # The capitalised form now keeps its capital letter (it used to be
    # replaced by the lower-case phrase).
    ("jахтана", "аяq jол"), ("Jахтана", "Аяq jол"),
    ("хо", "хаw"), ("Хо", "Хаw"),
    ("сыbат", "сыфат"), ("Сыbат", "Сыфат"),
    ("b", "п"),
    ("q", "къ"), ("Q", "Къ"),
    ("g", "гъ"), ("G", "Гъ"),
)

# Dialect code -> ordered rule table used by fromModel().
_FROM_MODEL_RULES = {
    "qrc": _FROM_MODEL_QRC,
    "hlm": _FROM_MODEL_HLM_MQR_COMMON + (
        ("j", "ж"), ("J", "Ж"),
        ("w", "ў"), ("W", "Ў"),
        ("n", "нг"), ("N", "Нг"),
    ),
    "mqr": _FROM_MODEL_HLM_MQR_COMMON + (
        ("j", "з"), ("J", "З"),
        ("w", "ў"), ("W", "Ў"),
        ("n", "нг"), ("N", "Нг"),
        ("ч", "ц"), ("Ч", "Ц"),
        ("п", "ф"), ("П", "Ф"),
        # Originally written as str.replace("къ|гъ", "х"): str.replace treats
        # the pattern literally, so the alternation never matched.  Two
        # literal rules give the intended "къ or гъ -> х".
        ("къ", "х"), ("гъ", "х"),
    ),
}


def fromModel(str, dialect = "qrc"):
    """Convert model-side spelling to the spelling of the chosen dialect.

    Args:
        str: text in the model's spelling (the parameter name is kept for
            backward compatibility even though it shadows the builtin).
        dialect: one of "qrc", "hlm" or "mqr".

    Returns:
        The converted text.  For any other ``dialect`` value the text is
        returned unchanged.
    """
    rules = _FROM_MODEL_RULES.get(dialect)
    if rules is None:
        return str
    return _apply_rules(str, rules)


# Cyrillic spelling -> model spelling, applied before the regex step.
# Exact duplicates and identity replacements present in the original chain
# were dropped; they had no effect.
_TO_MODEL_RULES = (
    ("дж", "j"), ("Дж", "J"), ("ДЖ", "J"),
    ("ж", "j"), ("Ж", "J"),
    ("себеп", "себеb"), ("себеб", "себеb"),
    ("Себеп", "Себеb"), ("Себеб", "Себеb"),
    ("тюйюл", "тюл"), ("Тюйюл", "Тюл"),
    ("уку", "гылын qуш"), ("Уку", "Гылын qуш"),
    ("хораз", "гугурукку"), ("Хораз", "Гугурукку"),
    ("юзмез", "qум"), ("Юзмез", "Qум"),
    ("арап", "араb"), ("араб", "араb"),
    ("Арап", "Араb"), ("Араб", "Араb"),
    ("jиля", "jыла"), ("Jиля", "Jыла"),
    ("ярабий", "арабин"), ("арабий", "арабин"),
    ("Ярабий", "Арабин"), ("Арабий", "Арабин"),
    ("нтта", "нтда"), ("ртте", "ртде"),
    ("jамагъат", "jамаgат"), ("jамауат", "jамаgат"),
    ("Jамагъат", "Jамаgат"), ("Jамауат", "Jамаgат"),
    ("Jамаўат", "Jамаgат"),
    ("шуёх", "шох"), ("Шуёх", "Шох"),
    ("шёндю", "бусаgат"), ("бусагъат", "бусаgат"),
    ("Шёндю", "Бусаgат"), ("Бусагъат", "Бусаgат"),
    ("угъай", "оgай"), ("огъай", "оgай"),
    ("Угъай", "Оgай"), ("Огъай", "Оgай"),
    # ("терк", "тез"),
    ("санга", "сенnе"), ("сенге", "сенnе"), ("сеннге", "сенnе"),
    ("Санга", "Сенnе"), ("Сеннге", "Сенnе"), ("Сенге", "Сенnе"),
    ("манга", "менnе"), ("меннге", "менnе"), ("менге", "менnе"),
    ("Манга", "Менnе"), ("Меннге", "Менnе"), ("Менге", "Менnе"),
    ("аякъ jол", "jахтана"), ("Аякъ jол", "Jахтана"),
    # "zh" is a temporary placeholder that is turned back into "ж" at the end.
    ("къамж", "qамыzh"), ("къамыж", "qамыzh"),
    ("Къамж", "Qамыzh"), ("Къамыж", "Qамыzh"),
    ("къымыж", "qымыzh"), ("Къымыж", "Qымыzh"),
    ("хау", "хо"), ("хаў", "хо"), ("Хау", "Хо"), ("Хаў", "Хо"),
    ("уа", "wa"), ("ўа", "wa"), ("Уа", "Wa"), ("Ўа", "Wa"),
    ("п", "b"), ("б", "b"),
    ("къ", "q"), ("Къ", "Q"), ("КЪ", "Q"),
    ("гъ", "g"), ("Гъ", "G"), ("ГЪ", "G"),
    ("ц", "ч"), ("Ц", "Ч"),
    ("ф", "п"),
    ("сыпат", "сыфат"), ("Сыпат", "Сыфат"),
    ("Ф", "П"),
)

# "у"/"ў" between two vowels of the same case -> "w".
_W_BETWEEN_VOWELS = (
    "(?<=[аыоуэеиёюя])[уў](?=[аыоуэеиёюя])"
    "|(?<=[АЫОУЭЕИЁЮЯ])[уў](?=[АЫОУЭЕИЁЮЯ])"
)
# "у"/"ў" directly after a vowel -> "w".
_W_AFTER_VOWEL = "(?<=[аыоуэеиёюяАЫОУЭЕИЁЮЯ])[уў]"
# Disabled in the original as well:
# "у(?=[аыоуэеиёюя])|ў(?=[аыоуэеиёюя])|у(?=[АЫОУЭЕИЁЮЯ])|ў(?=[АЫОУЭЕИЁЮЯ])" -> "w"
# "У(?=[аыоуэеиёюя])|Ў(?=[аыоуэеиёюя])|У(?=[АЫОУЭЕИЁЮЯ])|Ў(?=[АЫОУЭЕИЁЮЯ])" -> "W"

# Rules applied after the regex step.
# NOTE(review): the upper-case "Нг"/"НГ" replacements insert a leading space
# (" N"); kept exactly as in the original - confirm this is intended.
_TO_MODEL_FINAL_RULES = (
    ("zh", "ж"),
    ("нг", "n"),
    ("Нг", " N"),
    ("НГ", " N"),
)


def toModel(str):
    """Convert ordinary Cyrillic spelling to the model-side spelling.

    Steps, in order: the literal rules of ``_TO_MODEL_RULES``, two regex
    passes that turn "у"/"ў" following a vowel into "w", then the literal
    rules of ``_TO_MODEL_FINAL_RULES``.

    The two regex passes used to be passed to ``str.replace``, which matches
    its argument literally, so they never fired; they are now real regular
    expression substitutions.

    Args:
        str: input text (the parameter name is kept for backward
            compatibility even though it shadows the builtin).

    Returns:
        The converted text.
    """
    import re  # local import: keeps this section self-contained

    text = _apply_rules(str, _TO_MODEL_RULES)
    text = re.sub(_W_BETWEEN_VOWELS, "w", text)
    text = re.sub(_W_AFTER_VOWEL, "w", text)
    return _apply_rules(text, _TO_MODEL_FINAL_RULES)
# 5. Translate
#def translatePy(text, model, tokenizer, src='ru_RU', trg='qm_XX', max_length='auto', num_beams=3, repetition_penalty=5.0, train_mode=False, n_out=None, **kwargs):
#    tokenizer.src_lang = src
#    tokenizer.tgt_lang = trg
#    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
#    if max_length == 'auto':
#        max_length = int(32 + 1.5 * encoded.input_ids.shape[1])
#    if train_mode:
#        model.train()
#    else:
#        model.eval()
#    generated_tokens = model.generate(
#        **encoded.to(model.device),
#        forced_bos_token_id=tokenizer.lang_code_to_id[trg],
#        max_length=max_length,
#        num_beams=num_beams,
#        repetition_penalty=repetition_penalty,
#        # early_stopping=True,
#        num_return_sequences=n_out or 1,
#        **kwargs
#    )
#    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
#    if isinstance(text, str) and n_out is None:
#        return out[0]
#    return out

def translatePy(text, src_lang='rus_Cyrl', tgt_lang='krc_Cyrl',
                a=32, b=3, max_input_length=1024, num_beams=3, **kwargs):
    """Turn a text or a list of texts into a list of translations.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: a string or a list of strings to translate.
        src_lang: language code assigned to ``tokenizer.src_lang``.
        tgt_lang: language code assigned to ``tokenizer.tgt_lang`` and used
            as the forced first generated token.
        a, b: the generation budget is
            ``max_new_tokens = int(a + b * input_length)``, where
            ``input_length`` is the second dimension of the encoded input ids.
        max_input_length: inputs are truncated to this many tokens.
        num_beams: beam count passed to ``model.generate``.
        **kwargs: forwarded unchanged to ``model.generate``.

    Returns:
        The list returned by ``tokenizer.batch_decode`` - a list even when
        ``text`` is a single string.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    model.eval()  # turn off training mode
    result = model.generate(
        **inputs.to(model.device),
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(result, skip_special_tokens=True)


def transl(text, til, change_letters = True):
    """Translate one piece of text for the Gradio interface.

    Args:
        text: the sentence typed by the user.
        til: source language as selected in the UI, either
            "Къарачай-Малкъар" or "Русский".
        change_letters: when true, the Karachay-Balkar side is passed through
            ``toModel`` (before translation) or ``fromModel`` (after it).

    Returns:
        The translated string; an empty string for empty input or an
        unrecognised ``til`` value.
    """
    if not text:
        return ''

    # translatePy always returns a list; a single input text yields exactly
    # one translation, so unwrap it.  Previously the list itself was passed
    # to fromModel (AttributeError on .replace) or returned to the UI.
    if til == "Къарачай-Малкъар":
        source = toModel(text) if change_letters else text
        return translatePy(source, src_lang = 'krc_Cyrl', tgt_lang='rus_Cyrl')[0]

    if til == "Русский":
        translated = translatePy(text, src_lang = 'rus_Cyrl', tgt_lang='krc_Cyrl')[0]
        return fromModel(translated) if change_letters else translated

    return ''


demo = gr.Interface(
    fn=transl,
    inputs=[
        gr.Textbox(lines=1, placeholder="Your sentence here...", label = "input"),
        gr.Radio(["Къарачай-Малкъар", "Русский"], label="Language", value = "Русский"),
        gr.Checkbox(label="Change letter", info="It's for inner using", value = True),
    ],
    outputs="text"
)

demo.launch()