metadata
license: mit
The model uses only the sign ӏ (the small Cyrillic palochka letter)
for explosive consonants!
import torch
from transformers import BertTokenizer, AutoModel
import numpy as np
import pandas as pd
import razdel
import matplotlib.pyplot as plt
from tqdm.auto import tqdm, trange
Download the model from the Hugging Face repository:
# Repository id of the checkpoint; the tokenizer and the encoder are both
# loaded from this same id.
model_name = 'NM-development/labse-en-ru-ce-prototype'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
Assign files with the texts you want to split into parallel sentences:
# Paths to the two plain-text inputs. Leave either one as None to run on the
# short built-in demo texts instead of reading files.
file_ru = None
file_nm = None
if file_ru is None or file_nm is None:
    nm_text = 'Ламро. Сахьт. Къена. Адам. Зуда. Вокха. Тӏулг.'
    ru_text = 'Горец. Час. Старый. Человек. Жена. Высокий. Камень.'
else:
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform locale, which garbles (or fails on) Cyrillic text on systems
    # whose default codec is not UTF-8.
    with open(file_nm, 'r', encoding='utf-8') as f1, \
            open(file_ru, 'r', encoding='utf-8') as f2:
        nm_text = f1.read()
        ru_text = f2.read()
In the following section, define auxiliary functions for parallel sentence comparison:
def embed(text):
    """Encode ``text`` with the module-level ``tokenizer`` and ``model``.

    The input is tokenized with padding and truncation to at most 128 tokens,
    run through the model without gradient tracking, and the model's
    ``pooler_output`` is L2-normalized.

    Returns:
        The first row of the normalized pooled output as a NumPy array
        (only row 0 is returned, whatever the batch size).
    """
    batch = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    with torch.inference_mode():
        # Move the inputs to wherever the model lives before the forward pass.
        outputs = model(**batch.to(model.device))
        pooled = outputs.pooler_output
        unit = torch.nn.functional.normalize(pooled)
    return unit[0].cpu().numpy()
def center_norm(v):
    """Center ``v`` column-wise, then scale every row to unit L2 length.

    Args:
        v: 2-D array; the mean over axis 0 is subtracted from each row.

    Returns:
        Array of the same shape whose rows have Euclidean norm 1. A row that
        equals the mean has zero norm and therefore yields NaNs.
    """
    centered = v - v.mean(0)
    row_lengths = (centered ** 2).sum(1, keepdims=True) ** 0.5
    return centered / row_lengths
def center_dot(x, y):
    """Cosine-similarity matrix of ``x`` and ``y`` after joint centering.

    Both inputs are shifted by the mean of all rows of ``x`` and ``y`` taken
    together, each row is scaled to unit L2 length, and the pairwise dot
    products are returned.

    Args:
        x: 2-D array with one vector per row.
        y: 2-D array with the same number of columns as ``x``.

    Returns:
        Array of shape ``(x.shape[0], y.shape[0])``.
    """
    total_rows = x.shape[0] + y.shape[0]
    shared_mean = (x.sum(0) + y.sum(0)) / total_rows

    def _unit_rows(mat):
        # Shift by the pooled mean, then normalize each row.
        shifted = mat - shared_mean
        return shifted / (shifted ** 2).sum(1, keepdims=True) ** 0.5

    x_unit = _unit_rows(x)
    y_unit = _unit_rows(y)
    return np.dot(x_unit, y_unit.T)
def get_top_mean_by_row(x, k=5):
    """Return, for each row of ``x``, the mean of its ``k`` largest values.

    Args:
        x: 2-D array.
        k: Number of top entries to average; capped at the number of columns.

    Returns:
        1-D array with one mean per row of ``x``.
    """
    _, n_cols = x.shape
    top = min(k, n_cols)
    # argpartition puts the `top` largest entries of each row in the last
    # `top` positions (in no particular order), which is enough for a mean.
    top_idx = np.argpartition(x, -top, axis=1)[:, -top:]
    return np.take_along_axis(x, top_idx, axis=1).mean(1)
def align3(sims):
    """Find the best monotone alignment for a score matrix.

    Dynamic programming over ``sims``: any number of rows or columns may be
    skipped, as long as the chosen pairs are strictly increasing in both
    indices. The total score of the chosen pairs is maximized, so pairs with
    a negative score are only taken when they do not lower the total.

    Args:
        sims: 2-D array; ``sims[i, j]`` is the reward for pairing row ``i``
            with column ``j`` (may be negative).

    Returns:
        List of ``[i, j]`` pairs in increasing order. Empty when ``sims`` has
        no rows or columns, or when no pair improves the total score.
    """
    n_rows, n_cols = sims.shape
    # rewards[i + 1, j + 1] is the best total for rows 0..i and columns 0..j.
    # Row 0 and column 0 form a zero border standing for "nothing aligned
    # yet", so the first row/column is handled like any other cell instead of
    # being forced to contain a pair.
    rewards = np.zeros((n_rows + 1, n_cols + 1), dtype=float)
    # 1: take the pair (i, j), 2: skip row i, 3: skip column j
    choices = np.zeros((n_rows, n_cols), dtype=int)

    for i in range(n_rows):
        for j in range(n_cols):
            # Option 1: align row i with column j on top of the best
            # alignment of everything before both of them.
            best = sims[i, j] + rewards[i, j]
            choice = 1
            # Option 2: leave row i unaligned.
            if rewards[i, j + 1] > best:
                best = rewards[i, j + 1]
                choice = 2
            # Option 3: leave column j unaligned.
            if rewards[i + 1, j] > best:
                best = rewards[i + 1, j]
                choice = 3
            rewards[i + 1, j + 1] = best
            choices[i, j] = choice

    # Walk back from the bottom-right cell. The bounds are inclusive of index
    # 0 so that pairs in the first row or column (notably the first sentence
    # pair) are emitted too.
    alignment = []
    i = n_rows - 1
    j = n_cols - 1
    while i >= 0 and j >= 0:
        if choices[i, j] == 1:
            alignment.append([i, j])
            i -= 1
            j -= 1
        elif choices[i, j] == 2:
            i -= 1
        else:
            j -= 1
    return alignment[::-1]
def make_sents(text):
    """Split ``text`` into a flat list of non-empty sentences.

    The text is first cut into paragraphs on blank lines (``'\\n\\n'``), each
    paragraph is segmented with ``razdel.sentenize``, and every sentence has
    its inner newlines replaced by spaces and surrounding whitespace removed.

    Returns:
        List of sentence strings in document order, with empty ones dropped.
    """
    sentences = []
    for paragraph in text.split('\n\n'):
        for span in razdel.sentenize(paragraph):
            cleaned = span.text.replace('\n', ' ').strip()
            if cleaned:
                sentences.append(cleaned)
    return sentences
First, split your texts into sentences:
# Sentence lists for both sides, produced from the raw texts by make_sents.
sents_nm = make_sents(nm_text)
sents_ru = make_sents(ru_text)
Then embed all the chunks:
# One embedding row per sentence, in sentence order; tqdm only adds a
# progress bar around the per-sentence embed() calls.
emb_ru = np.stack(list(map(embed, tqdm(sents_ru))))
emb_nm = np.stack(list(map(embed, tqdm(sents_nm))))
Now compare the sentences' semantic vectors and build a correlation heatmap:
# Length-ratio factor (shorter length / longer length, so at most 1) for
# every (ru sentence, nm sentence) pair; rows follow sents_ru and columns
# follow sents_nm.
pen = np.array([
    [min(len(x), len(y)) / max(len(x), len(y)) for x in sents_nm]
    for y in sents_ru
])
# Dot products of the sentence embeddings with negatives clipped to zero,
# scaled by the length-ratio factor.
sims = np.maximum(0, np.dot(emb_ru, emb_nm.T)) * pen
alpha = 0.2
penalty = 0.2
# Relative score: subtract a share (alpha) of each row's and each column's
# top-k mean similarity, plus a constant penalty, so that only pairs that
# stand out from their neighbours keep a positive score.
sims_rel = (sims.T - get_top_mean_by_row(sims) * alpha).T - get_top_mean_by_row(sims.T) * alpha - penalty
alignment = align3(sims_rel)
# Summed similarity of the aligned pairs divided by the smaller sentence count.
print(sum(sims[i, j] for i, j in alignment) / min(sims.shape))
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.imshow(sims_rel)
plt.subplot(1, 2, 2)
# zip(*[]) yields nothing, so scatter() would be called without x/y and
# raise TypeError; skip the scatter plot when no pairs were aligned.
if alignment:
    plt.scatter(*zip(*alignment), s=5)
Finally, save the parallel corpus into a JSON file:
# Each alignment entry is indexed as x[0] -> sents_ru and x[1] -> sents_nm.
nm_ru_parallel_corpus = pd.DataFrame({
    'nm_text': [sents_nm[x[1]] for x in alignment],
    'ru_text': [sents_ru[x[0]] for x in alignment],
})
corpus_filename = 'nm_ru_corpus.json'
# force_ascii=False writes the Cyrillic text verbatim, so the file must be
# opened as UTF-8 explicitly; the locale default codec (e.g. cp1252) would
# raise UnicodeEncodeError.
with open(corpus_filename, 'w', encoding='utf-8') as f:
    nm_ru_parallel_corpus.to_json(f, force_ascii=False, indent=4)