# text_classificators/src/neural_classifiers.py
import os
from typing import Optional, Union, Tuple, Dict, Any, Literal

import numpy as np

try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, optimizers, callbacks
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import (
        Input, Embedding, Dense, Dropout, GlobalMaxPooling1D,
        Conv1D, LSTM, GRU, Bidirectional, Attention, GlobalAveragePooling1D
    )
    TF_AVAILABLE = True
except ImportError:
    TF_AVAILABLE = False

try:
    import torch
    import torch.nn as nn
    from torch.nn.utils.rnn import pad_sequence
    from transformers import (
        AutoTokenizer, AutoModel, AutoConfig,
        BertForSequenceClassification, RobertaForSequenceClassification,
        DistilBertForSequenceClassification, Trainer, TrainingArguments
    )
    from transformers.tokenization_utils_base import BatchEncoding
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
if TF_AVAILABLE:
    class AttentionLayer(tf.keras.layers.Layer):
        """Additive attention that pools an RNN output sequence into a single vector."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

        def build(self, input_shape):
            # Per-feature projection and per-timestep bias used to score each timestep.
            self.W = self.add_weight(
                shape=(input_shape[-1], 1),
                initializer='random_normal',
                trainable=True,
                name='attention_weight'
            )
            self.b = self.add_weight(
                shape=(input_shape[1], 1),
                initializer='zeros',
                trainable=True,
                name='attention_bias'
            )
            super().build(input_shape)

        def call(self, inputs, **kwargs):
            # Score each timestep, normalise the scores with softmax and return the
            # attention-weighted sum over the time axis.
            e = tf.keras.activations.tanh(tf.matmul(inputs, self.W) + self.b)
            e = tf.squeeze(e, axis=-1)
            a = tf.nn.softmax(e, axis=1)
            a = tf.expand_dims(a, axis=-1)
            weighted_input = inputs * a
            return tf.reduce_sum(weighted_input, axis=1)
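# Usage sketch (names illustrative): AttentionLayer pools a (batch, timesteps, features)
# sequence into a (batch, features) vector, so it can replace a pooling layer after any
# RNN that returns sequences:
# seq = Bidirectional(LSTM(64, return_sequences=True))(embedded)
# pooled = AttentionLayer()(seq)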
def build_mlp(
    input_dim: int,
    num_classes: int,
    hidden_dims: list = [256, 128],
    dropout: float = 0.3,
    activation: str = 'relu'
) -> 'tf.keras.Model':
    """Feed-forward classifier over fixed-size feature vectors (e.g. TF-IDF)."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(input_dim,))
    x = inputs
    for dim in hidden_dims:
        x = Dense(dim, activation=activation)(x)
        x = Dropout(dropout)(x)
    # Binary problems get a single sigmoid unit to match binary_crossentropy in
    # compile_keras_model; multi-class problems get a softmax over num_classes.
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
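# Example (sketch): build_mlp works on fixed-size feature vectors such as TF-IDF;
# `X_tfidf` and `y` below are illustrative placeholders for a feature matrix and
# integer labels, not part of this module.
# mlp = build_mlp(input_dim=5000, num_classes=3, hidden_dims=[256, 128])
# mlp = compile_keras_model(mlp, learning_rate=1e-3, num_classes=3)
# mlp.fit(X_tfidf, y, epochs=5, batch_size=32, validation_split=0.1)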
def build_kim_cnn(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    filter_sizes: list = [3, 4, 5],
    num_filters: int = 100,
    dropout: float = 0.5,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Kim-style CNN: parallel convolutions of several widths, max-pooled and concatenated."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(max_len,))
    if pre_embed_matrix is not None:
        # Frozen pre-trained embeddings (e.g. word2vec / fastText).
        embedding = Embedding(
            vocab_size, embed_dim,
            weights=[pre_embed_matrix],
            trainable=False
        )(inputs)
    else:
        embedding = Embedding(vocab_size, embed_dim)(inputs)
    pooled_outputs = []
    for fs in filter_sizes:
        x = Conv1D(num_filters, fs, activation='relu')(embedding)
        x = GlobalMaxPooling1D()(x)
        pooled_outputs.append(x)
    merged = tf.concat(pooled_outputs, axis=1)
    x = Dropout(dropout)(merged)
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
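# Example (sketch, names such as `texts` are illustrative): the sequence models in this
# module (build_kim_cnn above; build_lstm, build_cnn_lstm, build_birnn_attention below)
# all take padded token-id matrices of shape (n_samples, max_len):
# ids = prepare_keras_inputs(texts, max_len=128)
# cnn = build_kim_cnn(max_len=128, vocab_size=20000, embed_dim=100, num_classes=2)
# cnn = compile_keras_model(cnn, learning_rate=1e-3, num_classes=2)
# Passing `pre_embed_matrix` of shape (vocab_size, embed_dim) switches the Embedding
# layer to frozen pre-trained vectors.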
def build_lstm(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    lstm_units: int = 128,
    dropout: float = 0.3,
    bidirectional: bool = False,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Single-layer (optionally bidirectional) LSTM classifier over token ids."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(max_len,))
    if pre_embed_matrix is not None:
        x = Embedding(vocab_size, embed_dim, weights=[pre_embed_matrix], trainable=False)(inputs)
    else:
        x = Embedding(vocab_size, embed_dim)(inputs)
    rnn_layer = LSTM(lstm_units, dropout=dropout, recurrent_dropout=dropout)
    if bidirectional:
        x = Bidirectional(rnn_layer)(x)
    else:
        x = rnn_layer(x)
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
def build_cnn_lstm(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    filter_size: int = 3,
    num_filters: int = 128,
    lstm_units: int = 64,
    dropout: float = 0.3,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Conv1D feature extractor followed by an LSTM over the convolved sequence."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(max_len,))
    if pre_embed_matrix is not None:
        x = Embedding(vocab_size, embed_dim, weights=[pre_embed_matrix], trainable=False)(inputs)
    else:
        x = Embedding(vocab_size, embed_dim)(inputs)
    x = Conv1D(num_filters, filter_size, activation='relu', padding='same')(x)
    x = LSTM(lstm_units, dropout=dropout)(x)
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
def build_birnn_attention(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    rnn_units: int = 64,
    dropout: float = 0.3,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Bidirectional LSTM whose outputs are pooled by the AttentionLayer defined above."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(max_len,))
    if pre_embed_matrix is not None:
        x = Embedding(vocab_size, embed_dim, weights=[pre_embed_matrix], trainable=False)(inputs)
    else:
        x = Embedding(vocab_size, embed_dim)(inputs)
    x = Bidirectional(LSTM(rnn_units, return_sequences=True, dropout=dropout))(x)
    x = AttentionLayer()(x)
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
_RUSSIAN_TRANSFORMERS = {
    "rubert": "DeepPavlov/rubert-base-cased",
    "ruroberta": "sberbank-ai/ruRoberta-large",
    "distilbert-multilingual": "distilbert-base-multilingual-cased"
}
def get_transformer_classifier(
    model_name: str = "rubert",
    num_classes: int = 2,
    problem_type: Literal["single_label", "multi_label"] = "single_label"
) -> Tuple[Any, Any]:
    """Load a pre-trained Russian/multilingual transformer and its tokenizer for classification."""
    if not TORCH_AVAILABLE:
        raise ImportError("PyTorch or transformers not available")
    if model_name not in _RUSSIAN_TRANSFORMERS:
        raise ValueError(f"Unknown model_name. Choose from: {list(_RUSSIAN_TRANSFORMERS.keys())}")
    model_id = _RUSSIAN_TRANSFORMERS[model_name]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if "roberta" in model_id.lower():
        model = RobertaForSequenceClassification.from_pretrained(
            model_id, num_labels=num_classes
        )
    elif "distilbert" in model_id.lower():
        model = DistilBertForSequenceClassification.from_pretrained(
            model_id, num_labels=num_classes
        )
    else:
        model = BertForSequenceClassification.from_pretrained(
            model_id, num_labels=num_classes
        )
    # problem_type controls which loss the model computes when labels are passed.
    if problem_type == "multi_label":
        model.config.problem_type = "multi_label_classification"
    else:
        model.config.problem_type = "single_label_classification"
    return model, tokenizer
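# Example (sketch): inference with the returned pair; the input strings are placeholders.
# model, tokenizer = get_transformer_classifier("rubert", num_classes=2)
# batch = tokenizer(["first example", "second example"], padding=True, truncation=True,
#                   max_length=128, return_tensors="pt")
# with torch.no_grad():
#     logits = model(**batch).logits
# preds = logits.argmax(dim=-1)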
def quantize_pytorch_model(model: 'torch.nn.Module', backend: str = "qnnpack") -> 'torch.nn.Module':
    """Post-training static quantization of a PyTorch model for the given backend."""
    if not TORCH_AVAILABLE:
        raise ImportError("PyTorch not available")
    model.eval()
    model.qconfig = torch.quantization.get_default_qconfig(backend)
    torch.quantization.prepare(model, inplace=True)
    # NOTE: a calibration forward pass over representative data belongs between
    # prepare() and convert(); converting immediately leaves the observers with
    # default activation ranges.
    torch.quantization.convert(model, inplace=True)
    return model
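# Note (sketch): for transformer encoders a lighter alternative that needs no
# calibration data is dynamic quantization of the Linear layers, using only the
# standard PyTorch API:
# quantized = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)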
def prune_keras_model(model: 'tf.keras.Model', sparsity: float = 0.5) -> 'tf.keras.Model':
    """Wrap a Keras model for magnitude pruning up to the given final sparsity."""
    try:
        import tensorflow_model_optimization as tfmot
    except ImportError:
        raise ImportError("Install tensorflow-model-optimization for pruning")
    pruning_params = {
        'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(
            initial_sparsity=0.0, final_sparsity=sparsity, begin_step=0, end_step=1000
        )
    }
    model_for_pruning = tfmot.sparsity.keras.prune_low_magnitude(model, **pruning_params)
    return model_for_pruning
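# Note (sketch): the wrapped model must be trained with the pruning callback and
# stripped before export; both helpers come from tensorflow-model-optimization
# (`import tensorflow_model_optimization as tfmot`); `X`, `y` are illustrative:
# pruned = prune_keras_model(model, sparsity=0.5)
# pruned = compile_keras_model(pruned, num_classes=2)
# pruned.fit(X, y, epochs=2, callbacks=[tfmot.sparsity.keras.UpdatePruningStep()])
# final_model = tfmot.sparsity.keras.strip_pruning(pruned)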
def prepare_keras_inputs(
    texts: list,
    tokenizer=None,
    max_len: int = 128,
    vocab: Optional[dict] = None
) -> np.ndarray:
    """Turn raw texts into a padded matrix of token ids for the Keras models."""
    if tokenizer is not None:
        # Hugging Face tokenizer path.
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors="np")
        return encodings['input_ids']
    else:
        # Keras Tokenizer path; pass a previously fitted word_index via `vocab`
        # to keep token ids consistent between training and inference.
        from tensorflow.keras.preprocessing.text import Tokenizer
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        tk = Tokenizer(oov_token="<OOV>")
        if vocab:
            tk.word_index = vocab
        else:
            tk.fit_on_texts(texts)
        sequences = tk.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_len)
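# Example (sketch): with a Hugging Face tokenizer the call is reproducible across
# training and inference because its vocabulary is fixed; `hf_tokenizer` and `texts`
# are illustrative names:
# hf_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
# ids = prepare_keras_inputs(texts, tokenizer=hf_tokenizer, max_len=128)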
def compile_keras_model(
    model: 'tf.keras.Model',
    learning_rate: float = 2e-5,
    num_classes: int = 2
):
    """Compile with a loss that matches the builders above: sparse categorical crossentropy
    for multi-class softmax outputs, binary crossentropy for the single sigmoid unit."""
    loss = 'sparse_categorical_crossentropy' if num_classes > 2 else 'binary_crossentropy'
    model.compile(
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=['accuracy']
    )
    return model
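if __name__ == "__main__":
    # Minimal smoke test (sketch): build and compile a tiny Kim-style CNN.
    # Hyperparameters here are illustrative, not tuned.
    if TF_AVAILABLE:
        demo = build_kim_cnn(max_len=64, vocab_size=5000, embed_dim=50, num_classes=3)
        demo = compile_keras_model(demo, learning_rate=1e-3, num_classes=3)
        demo.summary()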