# text_classificators/src/neural_classifiers.py
import os
from typing import Optional, Union, Tuple, Dict, Any, Literal

import numpy as np

try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, optimizers, callbacks
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import (
        Input, Embedding, Dense, Dropout, GlobalMaxPooling1D,
        Conv1D, LSTM, GRU, Bidirectional, Attention, GlobalAveragePooling1D
    )
    TF_AVAILABLE = True
except ImportError:
    TF_AVAILABLE = False

try:
    import torch
    import torch.nn as nn
    from torch.nn.utils.rnn import pad_sequence
    from transformers import (
        AutoTokenizer, AutoModel, AutoConfig,
        BertForSequenceClassification, RobertaForSequenceClassification,
        DistilBertForSequenceClassification, Trainer, TrainingArguments
    )
    from transformers.tokenization_utils_base import BatchEncoding
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
if TF_AVAILABLE:
    class AttentionLayer(tf.keras.layers.Layer):
        """Additive attention that pools an RNN output sequence into a single vector."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

        def build(self, input_shape):
            # Per-feature projection and per-timestep bias used to score each timestep.
            self.W = self.add_weight(
                shape=(input_shape[-1], 1),
                initializer='random_normal',
                trainable=True,
                name='attention_weight'
            )
            self.b = self.add_weight(
                shape=(input_shape[1], 1),
                initializer='zeros',
                trainable=True,
                name='attention_bias'
            )
            super().build(input_shape)

        def call(self, inputs, **kwargs):
            # Score each timestep, normalise the scores with softmax and return the
            # attention-weighted sum over the time axis.
            e = tf.keras.activations.tanh(tf.matmul(inputs, self.W) + self.b)
            e = tf.squeeze(e, axis=-1)
            a = tf.nn.softmax(e, axis=1)
            a = tf.expand_dims(a, axis=-1)
            weighted_input = inputs * a
            return tf.reduce_sum(weighted_input, axis=1)
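# Usage sketch (names illustrative): AttentionLayer pools a (batch, timesteps, features)
# sequence into a (batch, features) vector, so it can replace a pooling layer after any
# RNN that returns sequences:
# seq = Bidirectional(LSTM(64, return_sequences=True))(embedded)
# pooled = AttentionLayer()(seq)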
def build_mlp(
    input_dim: int,
    num_classes: int,
    hidden_dims: list = [256, 128],
    dropout: float = 0.3,
    activation: str = 'relu'
) -> 'tf.keras.Model':
    """Feed-forward classifier over fixed-size feature vectors (e.g. TF-IDF)."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(input_dim,))
    x = inputs
    for dim in hidden_dims:
        x = Dense(dim, activation=activation)(x)
        x = Dropout(dropout)(x)
    # Binary problems get a single sigmoid unit to match binary_crossentropy in
    # compile_keras_model; multi-class problems get a softmax over num_classes.
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
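# Example (sketch): build_mlp works on fixed-size feature vectors such as TF-IDF;
# `X_tfidf` and `y` below are illustrative placeholders for a feature matrix and
# integer labels, not part of this module.
# mlp = build_mlp(input_dim=5000, num_classes=3, hidden_dims=[256, 128])
# mlp = compile_keras_model(mlp, learning_rate=1e-3, num_classes=3)
# mlp.fit(X_tfidf, y, epochs=5, batch_size=32, validation_split=0.1)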
def build_kim_cnn(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    filter_sizes: list = [3, 4, 5],
    num_filters: int = 100,
    dropout: float = 0.5,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Kim-style CNN: parallel convolutions of several widths, max-pooled and concatenated."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(max_len,))
    if pre_embed_matrix is not None:
        # Frozen pre-trained embeddings (e.g. word2vec / fastText).
        embedding = Embedding(
            vocab_size, embed_dim,
            weights=[pre_embed_matrix],
            trainable=False
        )(inputs)
    else:
        embedding = Embedding(vocab_size, embed_dim)(inputs)
    pooled_outputs = []
    for fs in filter_sizes:
        x = Conv1D(num_filters, fs, activation='relu')(embedding)
        x = GlobalMaxPooling1D()(x)
        pooled_outputs.append(x)
    merged = tf.concat(pooled_outputs, axis=1)
    x = Dropout(dropout)(merged)
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
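# Example (sketch, names such as `texts` are illustrative): the sequence models in this
# module (build_kim_cnn above; build_lstm, build_cnn_lstm, build_birnn_attention below)
# all take padded token-id matrices of shape (n_samples, max_len):
# ids = prepare_keras_inputs(texts, max_len=128)
# cnn = build_kim_cnn(max_len=128, vocab_size=20000, embed_dim=100, num_classes=2)
# cnn = compile_keras_model(cnn, learning_rate=1e-3, num_classes=2)
# Passing `pre_embed_matrix` of shape (vocab_size, embed_dim) switches the Embedding
# layer to frozen pre-trained vectors.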
def build_lstm(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    lstm_units: int = 128,
    dropout: float = 0.3,
    bidirectional: bool = False,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Single-layer (optionally bidirectional) LSTM classifier over token ids."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(max_len,))
    if pre_embed_matrix is not None:
        x = Embedding(vocab_size, embed_dim, weights=[pre_embed_matrix], trainable=False)(inputs)
    else:
        x = Embedding(vocab_size, embed_dim)(inputs)
    rnn_layer = LSTM(lstm_units, dropout=dropout, recurrent_dropout=dropout)
    if bidirectional:
        x = Bidirectional(rnn_layer)(x)
    else:
        x = rnn_layer(x)
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
def build_cnn_lstm(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    filter_size: int = 3,
    num_filters: int = 128,
    lstm_units: int = 64,
    dropout: float = 0.3,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Conv1D feature extractor followed by an LSTM over the convolved sequence."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(max_len,))
    if pre_embed_matrix is not None:
        x = Embedding(vocab_size, embed_dim, weights=[pre_embed_matrix], trainable=False)(inputs)
    else:
        x = Embedding(vocab_size, embed_dim)(inputs)
    x = Conv1D(num_filters, filter_size, activation='relu', padding='same')(x)
    x = LSTM(lstm_units, dropout=dropout)(x)
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
def build_birnn_attention(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    rnn_units: int = 64,
    dropout: float = 0.3,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Bidirectional LSTM whose outputs are pooled by the AttentionLayer defined above."""
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(max_len,))
    if pre_embed_matrix is not None:
        x = Embedding(vocab_size, embed_dim, weights=[pre_embed_matrix], trainable=False)(inputs)
    else:
        x = Embedding(vocab_size, embed_dim)(inputs)
    x = Bidirectional(LSTM(rnn_units, return_sequences=True, dropout=dropout))(x)
    x = AttentionLayer()(x)
    output_units = num_classes if num_classes > 2 else 1
    outputs = Dense(output_units, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
_RUSSIAN_TRANSFORMERS = {
    "rubert": "DeepPavlov/rubert-base-cased",
    "ruroberta": "sberbank-ai/ruRoberta-large",
    "distilbert-multilingual": "distilbert-base-multilingual-cased"
}
def get_transformer_classifier(
    model_name: str = "rubert",
    num_classes: int = 2,
    problem_type: Literal["single_label", "multi_label"] = "single_label"
) -> Tuple[Any, Any]:
    """Load a pre-trained Russian/multilingual transformer and its tokenizer for classification."""
    if not TORCH_AVAILABLE:
        raise ImportError("PyTorch or transformers not available")
    if model_name not in _RUSSIAN_TRANSFORMERS:
        raise ValueError(f"Unknown model_name. Choose from: {list(_RUSSIAN_TRANSFORMERS.keys())}")
    model_id = _RUSSIAN_TRANSFORMERS[model_name]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if "roberta" in model_id.lower():
        model = RobertaForSequenceClassification.from_pretrained(
            model_id, num_labels=num_classes
        )
    elif "distilbert" in model_id.lower():
        model = DistilBertForSequenceClassification.from_pretrained(
            model_id, num_labels=num_classes
        )
    else:
        model = BertForSequenceClassification.from_pretrained(
            model_id, num_labels=num_classes
        )
    # problem_type controls which loss the model computes when labels are passed.
    if problem_type == "multi_label":
        model.config.problem_type = "multi_label_classification"
    else:
        model.config.problem_type = "single_label_classification"
    return model, tokenizer
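# Example (sketch): inference with the returned pair; the input strings are placeholders.
# model, tokenizer = get_transformer_classifier("rubert", num_classes=2)
# batch = tokenizer(["first example", "second example"], padding=True, truncation=True,
#                   max_length=128, return_tensors="pt")
# with torch.no_grad():
#     logits = model(**batch).logits
# preds = logits.argmax(dim=-1)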
def quantize_pytorch_model(model: 'torch.nn.Module', backend: str = "qnnpack") -> 'torch.nn.Module':
    """Post-training static quantization of a PyTorch model for the given backend."""
    if not TORCH_AVAILABLE:
        raise ImportError("PyTorch not available")
    model.eval()
    model.qconfig = torch.quantization.get_default_qconfig(backend)
    torch.quantization.prepare(model, inplace=True)
    # NOTE: a calibration forward pass over representative data belongs between
    # prepare() and convert(); converting immediately leaves the observers with
    # default activation ranges.
    torch.quantization.convert(model, inplace=True)
    return model
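# Note (sketch): for transformer encoders a lighter alternative that needs no
# calibration data is dynamic quantization of the Linear layers, using only the
# standard PyTorch API:
# quantized = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)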
def prune_keras_model(model: 'tf.keras.Model', sparsity: float = 0.5) -> 'tf.keras.Model':
    """Wrap a Keras model for magnitude pruning up to the given final sparsity."""
    try:
        import tensorflow_model_optimization as tfmot
    except ImportError:
        raise ImportError("Install tensorflow-model-optimization for pruning")
    pruning_params = {
        'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(
            initial_sparsity=0.0, final_sparsity=sparsity, begin_step=0, end_step=1000
        )
    }
    model_for_pruning = tfmot.sparsity.keras.prune_low_magnitude(model, **pruning_params)
    return model_for_pruning
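# Note (sketch): the wrapped model must be trained with the pruning callback and
# stripped before export; both helpers come from tensorflow-model-optimization
# (`import tensorflow_model_optimization as tfmot`); `X`, `y` are illustrative:
# pruned = prune_keras_model(model, sparsity=0.5)
# pruned = compile_keras_model(pruned, num_classes=2)
# pruned.fit(X, y, epochs=2, callbacks=[tfmot.sparsity.keras.UpdatePruningStep()])
# final_model = tfmot.sparsity.keras.strip_pruning(pruned)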
def prepare_keras_inputs(
    texts: list,
    tokenizer=None,
    max_len: int = 128,
    vocab: Optional[dict] = None
) -> np.ndarray:
    """Turn raw texts into a padded matrix of token ids for the Keras models."""
    if tokenizer is not None:
        # Hugging Face tokenizer path.
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors="np")
        return encodings['input_ids']
    else:
        # Keras Tokenizer path; pass a previously fitted word_index via `vocab`
        # to keep token ids consistent between training and inference.
        from tensorflow.keras.preprocessing.text import Tokenizer
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        tk = Tokenizer(oov_token="<OOV>")
        if vocab:
            tk.word_index = vocab
        else:
            tk.fit_on_texts(texts)
        sequences = tk.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_len)
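# Example (sketch): with a Hugging Face tokenizer the call is reproducible across
# training and inference because its vocabulary is fixed; `hf_tokenizer` and `texts`
# are illustrative names:
# hf_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
# ids = prepare_keras_inputs(texts, tokenizer=hf_tokenizer, max_len=128)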
def compile_keras_model(
    model: 'tf.keras.Model',
    learning_rate: float = 2e-5,
    num_classes: int = 2
):
    """Compile with a loss that matches the builders above: sparse categorical crossentropy
    for multi-class softmax outputs, binary crossentropy for the single sigmoid unit."""
    loss = 'sparse_categorical_crossentropy' if num_classes > 2 else 'binary_crossentropy'
    model.compile(
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=['accuracy']
    )
    return model
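if __name__ == "__main__":
    # Minimal smoke test (sketch): build and compile a tiny Kim-style CNN.
    # Hyperparameters here are illustrative, not tuned.
    if TF_AVAILABLE:
        demo = build_kim_cnn(max_len=64, vocab_size=5000, embed_dim=50, num_classes=3)
        demo = compile_keras_model(demo, learning_rate=1e-3, num_classes=3)
        demo.summary()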