---
license: mit
---

The model uses only the sign **ӏ** (the small Cyrillic letter palochka, U+04CF) for ejective consonants, so make sure your input texts follow the same convention! The model was trained by following David Dale's instructions for the Erzya language and using code from his repository.

```python
import torch
from transformers import BertTokenizer, AutoModel
import numpy as np
import pandas as pd
import razdel
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
```

Download the model from the Hugging Face repository:

```python
model_name = 'NM-development/labse-en-ru-ce-prototype'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
```

Assign the files with the texts you want to split into parallel sentences (if both are `None`, a small demo pair is used instead):

```python
file_ru = None
file_nm = None

if file_ru is None or file_nm is None:
    nm_text = 'Ламро. Сахьт. Къена. Адам. Зуда. Вокха. Тӏулг.'
    ru_text = 'Горец. Час. Старый. Человек. Жена. Высокий. Камень.'
else:
    with open(file_nm, 'r') as f1, open(file_ru, 'r') as f2:
        nm_text = f1.read()
        ru_text = f2.read()
```

In the following section, define the auxiliary functions for parallel sentence comparison:

```python
def embed(text):
    """Embed a text with the model and return a single L2-normalized vector."""
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    with torch.inference_mode():
        model_output = model(**encoded_input.to(model.device))
    embeddings = model_output.pooler_output
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()


def center_norm(v):
    """Center a matrix of row vectors and normalize each row to unit length."""
    v = v - v.mean(0)
    return v / (v**2).sum(1, keepdims=True) ** 0.5


def center_dot(x, y):
    """Dot products between rows of x and y after centering both around a common mean."""
    m = (x.sum(0) + y.sum(0)) / (x.shape[0] + y.shape[0])
    x = x - m
    y = y - m
    x = x / (x**2).sum(1, keepdims=True) ** 0.5
    y = y / (y**2).sum(1, keepdims=True) ** 0.5
    return np.dot(x, y.T)


def get_top_mean_by_row(x, k=5):
    """Mean of the k largest values in each row of x."""
    m, n = x.shape
    k = min(k, n)
    topk_indices = np.argpartition(x, -k, axis=1)[:, -k:]
    rows, _ = np.indices((m, k))
    return x[rows, topk_indices].mean(1)


def align3(sims):
    """Find a monotonic alignment of rows to columns that maximizes the total similarity."""
    # sims = np.dot(center_norm(orig_vecs), center_norm(sum_vecs).T) ** 3
    # sims = center_dot(orig_embeds, sum_embeds)  # ** 3
    rewards = np.zeros_like(sims)
    choices = np.zeros_like(sims).astype(int)  # 1: choose this pair, 2: decrease i, 3: decrease j
    # the algorithm may skip any number of pairs, as long as the alignment stays monotonic
    for i in range(sims.shape[0]):
        for j in range(sims.shape[1]):
            # option one: align the i-th sentence with the j-th one
            score_add = sims[i, j]
            if i > 0 and j > 0:
                # and this is how the preceding sentences get aligned in that case
                score_add += rewards[i-1, j-1]
                choices[i, j] = 1
            best = score_add
            if i > 0 and rewards[i-1, j] > best:
                best = rewards[i-1, j]
                choices[i, j] = 2
            if j > 0 and rewards[i, j-1] > best:
                best = rewards[i, j-1]
                choices[i, j] = 3
            rewards[i, j] = best
    # backtrack from the bottom-right corner to recover the chosen pairs
    alignment = []
    i = sims.shape[0] - 1
    j = sims.shape[1] - 1
    while i > 0 and j > 0:
        if choices[i, j] == 1:
            alignment.append([i, j])
            i -= 1
            j -= 1
        elif choices[i, j] == 2:
            i -= 1
        else:
            j -= 1
    return alignment[::-1]


def make_sents(text):
    """Split a text into paragraphs and then into stripped, non-empty sentences."""
    sents = [s.text.replace('\n', ' ').strip() for p in text.split('\n\n') for s in razdel.sentenize(p)]
    return [s for s in sents if s]
```

First, split your texts into sentences:

```python
sents_nm = make_sents(nm_text)
sents_ru = make_sents(ru_text)
```

Then embed all the chunks:

```python
emb_ru = np.stack([embed(s) for s in tqdm(sents_ru)])
emb_nm = np.stack([embed(s) for s in tqdm(sents_nm)])
```
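Since `embed` returns L2-normalized vectors, the dot product of two embeddings is their cosine similarity. As a quick sanity check (a minimal sketch, not part of the original pipeline, using a word pair from the demo texts above), you can verify that a Chechen word lands close to its Russian translation:

```python
# sanity check (illustrative only): a translation pair should score
# noticeably higher than an unrelated pair
v_nm = embed('Тӏулг.')   # 'stone' in Chechen
v_ru = embed('Камень.')  # 'stone' in Russian
print(float(np.dot(v_nm, v_ru)))  # cosine similarity; closer to 1.0 means more similar
```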
Now compare the sentences' semantic vectors and build a similarity heatmap together with the resulting alignment:

```python
# penalize pairs of sentences whose lengths differ a lot
pen = np.array([[min(len(x), len(y)) / max(len(x), len(y)) for x in sents_nm] for y in sents_ru])
sims = np.maximum(0, np.dot(emb_ru, emb_nm.T)) * pen

alpha = 0.2
penalty = 0.2
sims_rel = (sims.T - get_top_mean_by_row(sims) * alpha).T - get_top_mean_by_row(sims.T) * alpha - penalty

alignment = align3(sims_rel)
# average similarity of the aligned pairs, as a rough quality score
print(sum(sims[i, j] for i, j in alignment) / min(sims.shape))

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.imshow(sims_rel)
plt.subplot(1, 2, 2)
plt.scatter(*list(zip(*alignment)), s=5);
```

Finally, save the parallel corpus into a JSON file:

```python
nm_ru_parallel_corpus = pd.DataFrame({
    'nm_text': [sents_nm[j] for i, j in alignment],
    'ru_text': [sents_ru[i] for i, j in alignment],
})

corpus_filename = 'nm_ru_corpus.json'
with open(corpus_filename, 'w') as f:
    nm_ru_parallel_corpus.to_json(f, force_ascii=False, indent=4)
```
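To sanity-check the result, the saved corpus can be read back with pandas (a minimal sketch; `nm_ru_corpus.json` is the file written above):

```python
import pandas as pd

# the default column-oriented JSON written by DataFrame.to_json round-trips directly
corpus = pd.read_json('nm_ru_corpus.json')
print(corpus.head())
```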