---
license: cc
datasets:
- vkovenko/cross_domain_uk_reviews
language:
- uk
metrics:
- f1
library_name: keras
pipeline_tag: text-classification
---

A deep LSTM with an attention module, trained for rating estimation of Ukrainian reviews. Code with example usage of the model can be found in the following repository: https://github.com/HikkaV/Ukrainian-Reviews-Estimation/.

The model uses a custom attention layer, and the tokenizer is wrapped in a custom class, so one can load the model and tokenizer with the following code:

```python
import tensorflow as tf
import huggingface_hub
from tokenizers import Tokenizer, models, pre_tokenizers, Regex


class Attention(tf.keras.layers.Layer):
    """Custom attention layer; must be registered when loading the model."""

    def __init__(self, units=128, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        self.W1 = self.add_weight(name='attention_weights_1',
                                  shape=(input_shape[-1], self.units),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.W2 = self.add_weight(name='attention_weights_2',
                                  shape=(1, self.units),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, x):
        # x: (batch, time, features) -> (batch, features, time)
        x = tf.transpose(x, perm=[0, 2, 1])
        # Softmax attention scores over time steps, then a weighted sum of the inputs.
        attention = tf.nn.softmax(tf.matmul(self.W2, tf.nn.tanh(tf.matmul(self.W1, x))))
        weighted_context = tf.reduce_sum(x * attention, axis=-1)
        return weighted_context, attention

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'units': self.units
        })
        return config


# Download the weights from the Hub and load them with the custom layer registered.
model = tf.keras.models.load_model(
    huggingface_hub.hf_hub_download(
        'vkovenko/deep_lstm_attention_ukr_reviews_rating_estimation',
        'deep_lstm_attention_w2v_huber.h5',
        local_dir='model'),
    compile=False,
    custom_objects={'Attention': Attention})


class BPETokenizer:
    """Thin wrapper around a BPE tokenizer loaded from vocab/merges files."""

    def __init__(self, vocab, merges):
        self.suffix = ''
        self.tokenizer = Tokenizer(models.BPE.from_file(vocab=vocab,
                                                        merges=merges,
                                                        end_of_word_suffix=self.suffix))
        # Split into words ([\w'-]+) and runs of punctuation, dropping whitespace.
        self.tokenizer.pre_tokenizer = pre_tokenizers.Split(
            Regex(r"[\w'-]+|[^\w\s'-]+"), 'removed', True)
        self.id_to_token = self.tokenizer.id_to_token
        self.encode_batch = self.tokenizer.encode_batch
        self.token_to_id = self.tokenizer.token_to_id
        self.encode = self.tokenizer.encode

    def tokens_to_ids(self, tokens):
        return list(map(self.token_to_id, tokens))

    def ids_to_tokens(self, ids):
        return list(map(self.id_to_token, ids))

    def decode(self, tokens, return_indices=False):
        # NOTE: the body of this method is truncated in the original card; the
        # loop below is a minimal reconstruction (an assumption) that simply
        # concatenates subword tokens while recording which indices were merged.
        decoded = []
        merged_indices = []
        i = 0
        while i < len(tokens):
            decoded.append(tokens[i])
            merged_indices.append(i)
            i += 1
        text = ''.join(decoded)
        if return_indices:
            return text, merged_indices
        return text
```
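For completeness, below is a minimal inference sketch under stated assumptions: the tokenizer filenames (`vocab.json`, `merges.txt`), the padding length of 128, and the interpretation of the output are not confirmed by this card, so check the linked repository for the exact pipeline.

```python
# Hypothetical end-to-end usage sketch; filenames, maxlen and post-processing
# are assumptions, see https://github.com/HikkaV/Ukrainian-Reviews-Estimation/
# for the exact pipeline.
vocab = huggingface_hub.hf_hub_download(
    'vkovenko/deep_lstm_attention_ukr_reviews_rating_estimation',
    'vocab.json', local_dir='model')
merges = huggingface_hub.hf_hub_download(
    'vkovenko/deep_lstm_attention_ukr_reviews_rating_estimation',
    'merges.txt', local_dir='model')
tokenizer = BPETokenizer(vocab=vocab, merges=merges)

review = "Чудовий товар, рекомендую!"  # "Great product, I recommend it!"
ids = tokenizer.encode(review).ids

# Pad/truncate to the model's expected sequence length (assumed here to be 128).
x = tf.keras.preprocessing.sequence.pad_sequences([ids], maxlen=128, padding='post')

# The weight filename suggests the model was trained with a Huber loss,
# i.e. it regresses a rating score rather than emitting class probabilities.
rating = model.predict(x)
print(rating)
```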