#!/usr/bin/env python3
# coding=utf-8
#
# Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Word embeddings computation class."""
import json
import sys
import time
import urllib.request
import numpy as np
class WEmbeddings:
"""Class to keep multiple constructed word embedding computation models."""
MODELS_MAP = {
# Key: model name. Value: transformer model name, layer start, layer end.
"bert-base-multilingual-uncased-last4": ("bert-base-multilingual-uncased", -4, None),
"robeczech-base-last4": ("ufal/robeczech-base", -4, None),
"xlm-roberta-base-last4": ("xlm-roberta-base", -4, None),
"bert-large-portuguese-cased-last4":("neuralmind/bert-large-portuguese-cased", -4, None),
"bert-base-portuguese-cased-last4":("neuralmind/bert-base-portuguese-cased", -4, None),
}
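    # E.g. the layer range (-4, None) selects hidden_states[-4:], so the
    # resulting embeddings are the mean of the last four transformer layers.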
    # Leave room for the special tokens added around every sentence part
    # (510 = the usual 512-subword model limit minus [CLS] and [SEP]).
    MAX_SUBWORDS_PER_SENTENCE = 510
class _Model:
"""Construct a tokenizer and transformers model graph."""
def __init__(self, transformers_model, layer_start, layer_end, loader_lock):
self._model_loaded = False
self._transformers_model_name = transformers_model
self._layer_start = layer_start
self._layer_end = layer_end
self._loader_lock = loader_lock
def load(self):
if self._model_loaded: return
with self._loader_lock:
import tensorflow as tf
import transformers
if self._model_loaded: return
self.tokenizer = transformers.AutoTokenizer.from_pretrained(self._transformers_model_name, use_fast=True)
self._transformers_model = transformers.TFAutoModel.from_pretrained(
self._transformers_model_name,
config=transformers.AutoConfig.from_pretrained(self._transformers_model_name, output_hidden_states=True),
from_pt=True
)
def compute_embeddings(subwords, segments):
                    # Padding positions hold -1: clamp them to a valid id 0 for
                    # the embedding lookup and exclude them via the attention mask.
                    subword_embeddings_layers = self._transformers_model(
                        (tf.maximum(subwords, 0), tf.cast(tf.not_equal(subwords, -1), tf.int32))
                    ).hidden_states
                    # Average the selected range of hidden layers (e.g. the last four).
                    subword_embeddings = tf.math.reduce_mean(subword_embeddings_layers[self._layer_start:self._layer_end], axis=0)
# Average subwords (word pieces) word embeddings for each token
def average_subwords(embeddings_and_segments):
subword_embeddings, segments = embeddings_and_segments
return tf.math.segment_mean(subword_embeddings, segments)
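                    # E.g. segment ids [0, 0, 1] average the first two subword
                    # vectors into word 0 and keep the third as word 1, so the
                    # subwords of each word collapse into a single vector.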
                    # Drop the [CLS] position before averaging; the trailing
                    # [:, :-1] strips the extra segment collecting [SEP] and
                    # the padding subwords.
                    word_embeddings = tf.map_fn(average_subwords, (subword_embeddings[:, 1:], segments), dtype=tf.float32)[:, :-1]
                    return word_embeddings
self.compute_embeddings = tf.function(compute_embeddings).get_concrete_function(
tf.TensorSpec(shape=[None, None], dtype=tf.int32), tf.TensorSpec(shape=[None, None], dtype=tf.int32)
)
self._model_loaded = True
def __init__(self, max_form_len=64, threads=None, preload_models=[]):
import tensorflow as tf
import threading
# Impose the limit on the number of threads, if given
if threads is not None:
tf.config.threading.set_inter_op_parallelism_threads(threads)
tf.config.threading.set_intra_op_parallelism_threads(threads)
self._max_form_len = max_form_len
loader_lock = threading.Lock()
self._models = {}
for model_name, (transformers_model, layer_start, layer_end) in self.MODELS_MAP.items():
self._models[model_name] = self._Model(transformers_model, layer_start, layer_end, loader_lock)
if model_name in preload_models or "all" in preload_models:
self._models[model_name].load()
def compute_embeddings(self, model, sentences):
"""Computes word embeddings.
Arguments:
model: one of the keys of self.MODELS_MAP.
sentences: 2D Python array with sentences with tokens (strings).
Returns:
embeddings as a Python list of 1D Numpy arrays
"""
        if model not in self._models:
            raise ValueError("No such WEmbeddings model {}".format(model))
embeddings = []
if sentences:
model = self._models[model]
model.load()
            time_tokenization = time.time()

            # Tokenize every word into subwords; a space is prepended to all
            # but the first word of a sentence so that BPE-style tokenizers
            # see the word boundary, and overlong forms are truncated to
            # self._max_form_len characters.
            sentences_subwords = model.tokenizer(
                [(" " if i else "") + word[:self._max_form_len] for sentence in sentences for i, word in enumerate(sentence)],
                add_special_tokens=False
            ).input_ids
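            # sentences_subwords is a flat list with one list of subword ids
            # per word across all sentences; it is consumed sentence by
            # sentence in the loop below.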
subwords, segments, parts = [], [], []
for sentence in sentences:
segments.append([])
subwords.append([])
parts.append([0])
sentence_subwords, sentences_subwords = sentences_subwords[:len(sentence)], sentences_subwords[len(sentence):]
for word_subwords in sentence_subwords:
# Split sentences with too many subwords
if len(subwords[-1]) + len(word_subwords) > self.MAX_SUBWORDS_PER_SENTENCE:
subwords[-1] = model.tokenizer.build_inputs_with_special_tokens(subwords[-1])
segments.append([])
subwords.append([])
parts[-1].append(0)
segments[-1].extend([parts[-1][-1]] * len(word_subwords))
subwords[-1].extend(word_subwords)
parts[-1][-1] += 1
                # Wrap the last (possibly only) part of the sentence with the
                # special tokens as well.
                subwords[-1] = model.tokenizer.build_inputs_with_special_tokens(subwords[-1])
max_sentence_len = max(len(sentence) for sentence in sentences)
max_subwords = max(len(sentence) for sentence in subwords)
            time_embeddings = time.time()

            # Pad subwords with -1 (masked out inside the model) and segments
            # with max_sentence_len (an extra segment discarded afterwards).
            np_subwords = np.full([len(subwords), max_subwords], -1, np.int32)
            for i, subword in enumerate(subwords):
                np_subwords[i, :len(subword)] = subword

            np_segments = np.full([len(segments), max_subwords - 1], max_sentence_len, np.int32)
            for i, segment in enumerate(segments):
                np_segments[i, :len(segment)] = segment
embeddings_with_parts = model.compute_embeddings(np_subwords, np_segments).numpy()
            # Concatenate the split sentence parts back together, producing
            # one embeddings array per original input sentence.
current_sentence_part = 0
for sentence_parts in parts:
embeddings.append(np.concatenate(
[embeddings_with_parts[current_sentence_part + i, :sentence_part] for i, sentence_part in enumerate(sentence_parts)],
axis=0))
current_sentence_part += len(sentence_parts)
print("WEmbeddings in {:.1f}ms,".format(1000 * (time.time() - time_embeddings)),
"tokenization in {:.1f}ms,".format(1000*(time_embeddings - time_tokenization)),
"batch {},".format(len(sentences)),
"max sentence len {},".format(max_sentence_len),
"max subwords {}.".format(max_subwords),
file=sys.stderr, flush=True)
return embeddings
class ClientNetwork:
    """Client computing word embeddings by querying a wembeddings server."""
    def __init__(self, url):
        self._url = url

    def compute_embeddings(self, model, sentences):
        """Compute word embeddings remotely; same interface as WEmbeddings."""
        with urllib.request.urlopen(
"http://{}/wembeddings".format(self._url),
data=json.dumps({"model": model, "sentences": sentences}, ensure_ascii=True).encode("ascii"),
) as response:
            # The server replies with one NumPy array in npy format per
            # sentence, concatenated in the response body.
            embeddings = []
            for _ in sentences:
                embeddings.append(np.lib.format.read_array(response, allow_pickle=False))
return embeddings
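
# A minimal usage sketch: compute embeddings for two tokenized sentences with
# a locally constructed WEmbeddings instance. The model name must be one of
# the MODELS_MAP keys, and loading it downloads the transformer weights on
# first use; the server URL in the commented variant is illustrative only.
if __name__ == "__main__":
    wembeddings = WEmbeddings(threads=4)
    embeddings = wembeddings.compute_embeddings(
        "bert-base-multilingual-uncased-last4",
        [["John", "loves", "Mary", "."], ["Another", "sentence", "."]],
    )
    for sentence_embeddings in embeddings:
        # One 2D array per sentence: number of words x embedding dimension.
        print(sentence_embeddings.shape)

    # Against a running wembeddings server instead (URL is illustrative):
    # client = ClientNetwork("localhost:8000")
    # embeddings = client.compute_embeddings(
    #     "bert-base-multilingual-uncased-last4", [["Hello", "world"]])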