vkovenko
/

deep_lstm_attention_ukr_reviews_rating_estimation

Text Classification

Model card Files Files and versions Community

deep_lstm_attention_ukr_reviews_rating_estimation / README.md

vkovenko's picture

Update README.md

abcedeb about 1 year ago

|

raw history blame contribute delete

4.32 kB

	---
	license: cc
	datasets:
	- vkovenko/cross_domain_uk_reviews
	language:
	- uk
	metrics:
	- f1
	library_name: keras
	pipeline_tag: text-classification
	---
	A deep LSTM with an attention module, trained for rating estimation of Ukrainian reviews.

	Code with example usage of the model can be found in the following repository: https://github.com/HikkaV/Ukrainian-Reviews-Estimation/.

	The model uses a custom layer, and the tokenizer is wrapped in a custom class, so the model and tokenizer can be loaded with the following code:

	```python
	import tensorflow as tf
	from tokenizers import Tokenizer, models, pre_tokenizers, trainers, Regex
	import tokenizers
	from tokenizers import Tokenizer, models, decoders, processors
	from tokenizers import pre_tokenizers, trainers, Regex
	import huggingface_hub

	class Attention(tf.keras.layers.Layer):
	    """Attention pooling layer that reduces a sequence to one weighted context vector.

	    The layer owns two trainable matrices (`W1` and `W2`), scores every
	    position along the last axis of the transposed input, normalizes the
	    scores with a softmax and returns the score-weighted sum together with
	    the scores themselves.

	    Args:
	        units: Size of the hidden projection used to compute the scores.
	        **kwargs: Forwarded unchanged to `tf.keras.layers.Layer`.
	    """

	    def __init__(self, units=128, **kwargs):
	        super().__init__(**kwargs)
	        self.units = units

	    def build(self, input_shape):
	        """Create the two attention weight matrices from the input's last dimension."""
	        self.W1 = self.add_weight(
	            name='attention_weights_1',
	            shape=(input_shape[-1], self.units),
	            initializer='glorot_uniform',
	            trainable=True,
	        )
	        self.W2 = self.add_weight(
	            name='attention_weights_2',
	            shape=(1, self.units),
	            initializer='glorot_uniform',
	            trainable=True,
	        )
	        super().build(input_shape)

	    def call(self, x):
	        """Return `(weighted_context, attention)` for a rank-3 input.

	        The input is presumably (batch, steps, features) -- TODO confirm
	        against the model that feeds this layer.
	        """
	        # Swap the last two axes so the softmax and the final sum run over
	        # what was originally axis 1 of the input.
	        x = tf.transpose(x, perm=[0, 2, 1])
	        # NOTE(review): W1 is (features, units) and x is now
	        # (batch, features, steps), so this matmul only lines up when
	        # units == features. Left untouched because the published weights
	        # were trained with exactly this computation.
	        hidden = tf.nn.tanh(tf.matmul(self.W1, x))
	        attention = tf.nn.softmax(tf.matmul(self.W2, hidden))
	        weighted_context = tf.reduce_sum(x * attention, axis=-1)
	        return weighted_context, attention

	    def get_config(self):
	        """Return the layer config including `units`, so the layer can be re-created on load."""
	        config = super().get_config().copy()
	        config.update({'units': self.units})
	        return config

	# Download the model file from the Hugging Face Hub into ./model and load it,
	# registering the custom Attention layer so Keras can deserialize it.
	model = tf.keras.models.load_model(
	    huggingface_hub.hf_hub_download(
	        repo_id='vkovenko/deep_lstm_attention_ukr_reviews_rating_estimation',
	        filename='deep_lstm_attention_w2v_huber.h5',
	        local_dir='model',
	    ),
	    compile=False,
	    custom_objects={'Attention': Attention},
	)


	class BPETokenizer:
	    """Wrapper around a `tokenizers` BPE model that can merge sub-word pieces back into words.

	    Args:
	        vocab: Path to the BPE vocabulary file.
	        merges: Path to the BPE merges file.
	    """

	    def __init__(self, vocab, merges):
	        # Marker the BPE model appends to the last piece of every word.
	        self.suffix = '</w>'
	        self.tokenizer = Tokenizer(
	            models.BPE.from_file(
	                vocab=vocab,
	                merges=merges,
	                end_of_word_suffix=self.suffix,
	            )
	        )
	        # The pattern is an alternation: runs of word characters (with
	        # apostrophes and hyphens) OR runs of other non-space characters.
	        # An escaped pipe here would demand a literal "|" in the text.
	        self.tokenizer.pre_tokenizer = pre_tokenizers.Split(
	            Regex(r"[\w'-]+|[^\w\s'-]+"), 'removed', True
	        )
	        # Re-export the underlying tokenizer's methods for convenience.
	        self.id_to_token = self.tokenizer.id_to_token
	        self.encode_batch = self.tokenizer.encode_batch
	        self.token_to_id = self.tokenizer.token_to_id
	        self.encode = self.tokenizer.encode

	    def tokens_to_ids(self, tokens):
	        """Map each token to its id using the underlying tokenizer's lookup."""
	        return [self.token_to_id(token) for token in tokens]

	    def ids_to_tokens(self, ids):
	        """Map each id to its token using the underlying tokenizer's lookup."""
	        return [self.id_to_token(token_id) for token_id in ids]

	    def decode(self, tokens, return_indices=False):
	        """Merge consecutive sub-word pieces into whole words.

	        A word is every run of pieces up to and including the first piece
	        that ends with `self.suffix`. The suffix itself is kept in the
	        merged word.

	        Args:
	            tokens: Sequence of token strings.
	            return_indices: When true, also return, for every merged word,
	                the list of positions in `tokens` it was built from.

	        Returns:
	            The list of merged words, or `(words, indices)` when
	            `return_indices` is true.
	        """
	        decoded = []
	        merged_indices = []
	        pieces = []
	        piece_indices = []
	        for position, token in enumerate(tokens):
	            pieces.append(token)
	            piece_indices.append(position)
	            if token.endswith(self.suffix):
	                decoded.append(''.join(pieces))
	                merged_indices.append(piece_indices)
	                pieces = []
	                piece_indices = []

	        # A trailing run without the end-of-word suffix (e.g. a truncated
	        # sequence) is emitted as-is instead of indexing past the end.
	        if pieces:
	            decoded.append(''.join(pieces))
	            merged_indices.append(piece_indices)

	        if return_indices:
	            return decoded, merged_indices
	        return decoded
	# Download the vocabulary and merges files from the Hugging Face Hub into
	# ./model and build the tokenizer wrapper from them.
	tokenizer = BPETokenizer(
	    vocab=huggingface_hub.hf_hub_download(
	        repo_id='vkovenko/deep_lstm_attention_ukr_reviews_rating_estimation',
	        filename='tokenizer_30k.json',
	        local_dir='model',
	    ),
	    merges=huggingface_hub.hf_hub_download(
	        repo_id='vkovenko/deep_lstm_attention_ukr_reviews_rating_estimation',
	        filename='merges_tokenizer.txt',
	        local_dir='model',
	    ),
	)

	```