#!/usr/bin/env python3
# coding=utf-8
#
# Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""Word embeddings computation class."""

import json
import sys
import time
import urllib.request

import numpy as np


class WEmbeddings:
    """Class to keep multiple constructed word embedding computation models."""

    MODELS_MAP = {
        # Key: model name. Value: transformer model name, layer start, layer end.
        "bert-base-multilingual-uncased-last4": ("bert-base-multilingual-uncased", -4, None),
        "robeczech-base-last4": ("ufal/robeczech-base", -4, None),
        "xlm-roberta-base-last4": ("xlm-roberta-base", -4, None),
        "bert-large-portuguese-cased-last4": ("neuralmind/bert-large-portuguese-cased", -4, None),
        "bert-base-portuguese-cased-last4": ("neuralmind/bert-base-portuguese-cased", -4, None),
    }

    # Upper bound on the subwords put into one model input before the special
    # tokens are added; longer sentences are split into several parts.
    MAX_SUBWORDS_PER_SENTENCE = 510

    class _Model:
        """Construct a tokenizer and transformers model graph."""

        def __init__(self, transformers_model, layer_start, layer_end, loader_lock):
            """Remember the model configuration; nothing is loaded yet.

            Arguments:
              transformers_model: name passed to the transformers `from_pretrained` calls.
              layer_start, layer_end: slice bounds selecting the hidden-state
                layers that are averaged into the subword embeddings.
              loader_lock: lock held while the model is being loaded.
            """
            self._model_loaded = False
            self._transformers_model_name = transformers_model
            self._layer_start = layer_start
            self._layer_end = layer_end
            self._loader_lock = loader_lock

        def load(self):
            """Load the tokenizer and the model on first use.

            Calls made after a successful load return immediately. On
            success, `self.tokenizer` and `self.compute_embeddings` are set.
            """
            if self._model_loaded:
                return

            with self._loader_lock:
                import tensorflow as tf
                import transformers

                # Re-check under the lock: another thread may have finished
                # loading while this one was waiting.
                if self._model_loaded:
                    return

                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                    self._transformers_model_name, use_fast=True)

                self._transformers_model = transformers.TFAutoModel.from_pretrained(
                    self._transformers_model_name,
                    config=transformers.AutoConfig.from_pretrained(
                        self._transformers_model_name, output_hidden_states=True),
                    from_pt=True,
                )

                def compute_embeddings(subwords, segments):
                    # Padding positions hold -1: they are clamped to 0 for the
                    # ids, and the second element marks the non-padding
                    # positions (presumably consumed as the attention mask).
                    subword_embeddings_layers = self._transformers_model(
                        (tf.maximum(subwords, 0), tf.cast(tf.not_equal(subwords, -1), tf.int32))
                    ).hidden_states
                    # Average the selected hidden-state layers.
                    subword_embeddings = tf.math.reduce_mean(
                        subword_embeddings_layers[self._layer_start:self._layer_end], axis=0)

                    # Average subwords (word pieces) word embeddings for each token
                    def average_subwords(embeddings_and_segments):
                        subword_embeddings, segments = embeddings_and_segments
                        return tf.math.segment_mean(subword_embeddings, segments)

                    # The first position is skipped (presumably the leading
                    # special token); the last segment, which collects the
                    # positions given the padding segment id, is dropped.
                    word_embeddings = tf.map_fn(
                        average_subwords, (subword_embeddings[:, 1:], segments), dtype=tf.float32)[:, :-1]
                    return word_embeddings

                self.compute_embeddings = tf.function(compute_embeddings).get_concrete_function(
                    tf.TensorSpec(shape=[None, None], dtype=tf.int32),
                    tf.TensorSpec(shape=[None, None], dtype=tf.int32),
                )

                self._model_loaded = True

    def __init__(self, max_form_len=64, threads=None, preload_models=()):
        """Create (lazily loaded) models for every entry of MODELS_MAP.

        Arguments:
          max_form_len: every word is truncated to this many characters
            before tokenization.
          threads: if not None, the TensorFlow inter-op and intra-op thread
            limit.
          preload_models: collection of MODELS_MAP keys to load right away;
            if it contains "all", every model is loaded.
        """
        import tensorflow as tf
        import threading

        # Impose the limit on the number of threads, if given
        if threads is not None:
            tf.config.threading.set_inter_op_parallelism_threads(threads)
            tf.config.threading.set_intra_op_parallelism_threads(threads)

        self._max_form_len = max_form_len

        # A single lock is shared by all models.
        loader_lock = threading.Lock()
        self._models = {}
        for model_name, (transformers_model, layer_start, layer_end) in self.MODELS_MAP.items():
            self._models[model_name] = self._Model(transformers_model, layer_start, layer_end, loader_lock)

            if model_name in preload_models or "all" in preload_models:
                self._models[model_name].load()

    def compute_embeddings(self, model, sentences):
        """Computes word embeddings.

        Arguments:
          model: one of the keys of self.MODELS_MAP.
          sentences: 2D Python array with sentences with tokens (strings).
        Returns:
          embeddings as a Python list of 1D Numpy arrays
        Raises:
          KeyError: if `model` is not a known model name.
        """
        if model not in self._models:
            # Log and fail right away instead of continuing with a bad name.
            message = "No such WEmbeddings model {}".format(model)
            print(message, file=sys.stderr, flush=True)
            raise KeyError(message)

        embeddings = []
        if not sentences:
            return embeddings

        model = self._models[model]
        model.load()
        tokenizer = model.tokenizer

        time_tokenization = time.time()

        # Tokenize all words of all sentences in one call; every word except
        # the first one of a sentence is prefixed with a space.
        sentences_subwords = tokenizer(
            [(" " if i else "") + word[:self._max_form_len]
             for sentence in sentences for i, word in enumerate(sentence)],
            add_special_tokens=False,
        ).input_ids

        # subwords: one list of subword ids per model input (sentence part);
        # segments: for every subword the index of its word within the part;
        # parts: for every sentence the number of words in each of its parts.
        subwords, segments, parts = [], [], []
        offset = 0
        for sentence in sentences:
            segments.append([])
            subwords.append([])
            parts.append([0])
            sentence_subwords = sentences_subwords[offset:offset + len(sentence)]
            offset += len(sentence)
            for word_subwords in sentence_subwords:
                # Split sentences with too many subwords
                if len(subwords[-1]) + len(word_subwords) > self.MAX_SUBWORDS_PER_SENTENCE:
                    subwords[-1] = tokenizer.build_inputs_with_special_tokens(subwords[-1])
                    segments.append([])
                    subwords.append([])
                    parts[-1].append(0)
                segments[-1].extend([parts[-1][-1]] * len(word_subwords))
                subwords[-1].extend(word_subwords)
                parts[-1][-1] += 1
            subwords[-1] = tokenizer.build_inputs_with_special_tokens(subwords[-1])

        max_sentence_len = max(len(sentence) for sentence in sentences)
        max_subwords = max(len(sentence) for sentence in subwords)

        time_embeddings = time.time()

        # Subword ids are padded with -1.
        np_subwords = np.full([len(subwords), max_subwords], -1, np.int32)
        for i, subword in enumerate(subwords):
            np_subwords[i, :len(subword)] = subword

        # Segment ids are padded with max_sentence_len, which no real word
        # index can reach; the array is one column narrower than np_subwords.
        np_segments = np.full([len(segments), max_subwords - 1], max_sentence_len, np.int32)
        for i, segment in enumerate(segments):
            np_segments[i, :len(segment)] = segment

        embeddings_with_parts = model.compute_embeddings(np_subwords, np_segments).numpy()

        # Concatenate splitted sentences
        current_sentence_part = 0
        for sentence_parts in parts:
            embeddings.append(np.concatenate(
                [embeddings_with_parts[current_sentence_part + i, :sentence_part]
                 for i, sentence_part in enumerate(sentence_parts)],
                axis=0))
            current_sentence_part += len(sentence_parts)

        print("WEmbeddings in {:.1f}ms,".format(1000 * (time.time() - time_embeddings)),
              "tokenization in {:.1f}ms,".format(1000*(time_embeddings - time_tokenization)),
              "batch {},".format(len(sentences)),
              "max sentence len {},".format(max_sentence_len),
              "max subwords {}.".format(max_subwords),
              file=sys.stderr, flush=True)

        return embeddings

    class ClientNetwork:
        """Client requesting the embeddings from a server over HTTP."""

        def __init__(self, url):
            """Store the server address, which is inserted after "http://"."""
            self._url = url

        def compute_embeddings(self, model, sentences):
            """Request embeddings of `sentences` computed by `model`.

            The model name and the sentences are sent as a JSON body to the
            "/wembeddings" path; one serialized Numpy array per sentence is
            then read from the response (pickled data is refused).

            Returns:
              a Python list with one Numpy array per sentence.
            """
            with urllib.request.urlopen(
                "http://{}/wembeddings".format(self._url),
                data=json.dumps({"model": model, "sentences": sentences}, ensure_ascii=True).encode("ascii"),
            ) as response:
                return [np.lib.format.read_array(response, allow_pickle=False) for _ in sentences]


# NOTE(review): the original line structure was lost, so it cannot be told
# whether ClientNetwork was nested in WEmbeddings or defined at module level;
# both spellings are kept available.
ClientNetwork = WEmbeddings.ClientNetwork