egoriya committed
Commit fa005e0
1 Parent(s): 8bf71eb

Update README.md

Files changed (1)
  1. README.md +76 -3
README.md CHANGED
@@ -9,10 +9,10 @@ widget:
  example_title: "Dialog example 3"
---

-
This classification model is based on [cointegrated/rubert-tiny2](https://huggingface.co/cointegrated/rubert-tiny2).
The model should be used to produce relevance and specificity scores for the last message in the context of a dialog.

+
It is pretrained on a corpus of dialog data from social networks and fine-tuned on [tinkoff-ai/context_similarity](https://huggingface.co/tinkoff-ai/context_similarity).
The performance of the model on the validation split of [tinkoff-ai/context_similarity](https://huggingface.co/tinkoff-ai/context_similarity) (with the best thresholds for validation samples):

@@ -27,8 +27,81 @@ The model can be loaded as follows:

```python
# pip install transformers
+ import transformers
from transformers import AutoTokenizer, AutoModel
- tokenizer = AutoTokenizer.from_pretrained("tinkoff-ai/context_similarity")
- model = AutoModel.from_pretrained("tinkoff-ai/context_similarity")
+ import torch
+ from typing import List, Dict
+ from transformers import AutoModelForSequenceClassification
+ tokenizer = AutoTokenizer.from_pretrained("tinkoff-ai/response-quality-classifier-tiny")
+ # Load the checkpoint with a sequence-classification head so the output exposes `.logits`.
+ model = AutoModelForSequenceClassification.from_pretrained("tinkoff-ai/response-quality-classifier-tiny")
# model.cuda()
+ # Example dialog: three context turns and the response to be scored
+ # ("hi" / "hi!" / "how are you?" -> "I'm fine, and you?").
+ context_3 = 'привет'
+ context_2 = 'привет!'
+ context_1 = 'как дела?'
+ response = 'у меня все хорошо, а у тебя как?'
+
+ sample = {
+     'context_3': context_3,
+     'context_2': context_2,
+     'context_1': context_1,
+     'response': response
+ }
+
+ SEP_TOKEN = '[SEP]'
+ CLS_TOKEN = '[CLS]'
+ RESPONSE_TOKEN = '[RESPONSE_TOKEN]'
+ MAX_SEQ_LENGTH = 128
+ sorted_dialog_columns = ['context_3', 'context_2', 'context_1', 'response']
+
+ def tokenize_dialog_data(
+     tokenizer: transformers.PreTrainedTokenizer,
+     sample: Dict,
+     max_seq_length: int,
+     sorted_dialog_columns: List,
+ ):
+     """
+     Tokenize the dialog contexts and the response separately.
+     """
+     len_message_history = len(sorted_dialog_columns)
+     max_seq_length = min(max_seq_length, tokenizer.model_max_length)
+     # Leave room for one special token appended after each message.
+     max_each_message_length = max_seq_length // len_message_history - 1
+     messages = [sample[k] for k in sorted_dialog_columns]
+     result = {model_input_name: [] for model_input_name in tokenizer.model_input_names}
+     messages = [str(message) if message is not None else '' for message in messages]
+     tokens = tokenizer(
+         messages, padding=False, max_length=max_each_message_length, truncation=True, add_special_tokens=False
+     )
+     for model_input_name in tokens.keys():
+         result[model_input_name].extend(tokens[model_input_name])
+     return result
+
+ def merge_dialog_data(
+     tokenizer: transformers.PreTrainedTokenizer,
+     sample: Dict
+ ):
+     """
+     Concatenate the per-message encodings into a single model input of the form
+     [CLS] context_3 [SEP] context_2 [SEP] context_1 [RESPONSE_TOKEN] response.
+     """
+     cls_token = tokenizer(CLS_TOKEN, add_special_tokens=False)
+     sep_token = tokenizer(SEP_TOKEN, add_special_tokens=False)
+     response_token = tokenizer(RESPONSE_TOKEN, add_special_tokens=False)
+     model_input_names = tokenizer.model_input_names
+     result = {}
+     for model_input_name in model_input_names:
+         tokens = []
+         tokens.extend(cls_token[model_input_name])
+         for i, message in enumerate(sample[model_input_name]):
+             tokens.extend(message)
+             if i < len(sample[model_input_name]) - 2:
+                 tokens.extend(sep_token[model_input_name])
+             elif i == len(sample[model_input_name]) - 2:
+                 tokens.extend(response_token[model_input_name])
+         result[model_input_name] = torch.tensor([tokens])
+         # Inputs are moved to GPU here; this requires `model.cuda()` above to be uncommented.
+         if torch.cuda.is_available():
+             result[model_input_name] = result[model_input_name].cuda()
+     return result
+
+ tokenized_dialog = tokenize_dialog_data(tokenizer, sample, MAX_SEQ_LENGTH, sorted_dialog_columns)
+ tokens = merge_dialog_data(tokenizer, tokenized_dialog)
+ with torch.inference_mode():
+     logits = model(**tokens).logits
+     probas = torch.sigmoid(logits)[0].cpu().detach().numpy()
+
+ # Two scores for the response: its relevance and its specificity in the given context.
+ print(probas)
  ```
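
For reference, the two helpers in the updated snippet build a single input of the form `[CLS]context_3[SEP]context_2[SEP]context_1[RESPONSE_TOKEN]response` and read two sigmoid scores from the classifier head. The sketch below, which is not part of the commit, shows the same usage in compact form; it assumes `[RESPONSE_TOKEN]` is in this checkpoint's tokenizer vocabulary and that the two outputs are relevance and specificity in that order, neither of which is stated explicitly in this diff.

```python
# Compact sketch of the usage shown in the diff above (assumptions noted in the lead-in).
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('tinkoff-ai/response-quality-classifier-tiny')
model = AutoModelForSequenceClassification.from_pretrained('tinkoff-ai/response-quality-classifier-tiny')

# Same dialog as above, flattened into the layout built by merge_dialog_data.
text = '[CLS]привет[SEP]привет![SEP]как дела?[RESPONSE_TOKEN]у меня все хорошо, а у тебя как?'
inputs = tokenizer(text, max_length=128, add_special_tokens=False, truncation=True, return_tensors='pt')

with torch.inference_mode():
    probas = torch.sigmoid(model(**inputs).logits)[0].cpu().detach().numpy()

# Assumption: index 0 is relevance, index 1 is specificity.
relevance, specificity = probas
print(f'relevance: {relevance:.3f}, specificity: {specificity:.3f}')
```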