d.tsimerman committed on
Commit
cb1ecfc
1 Parent(s): 8317bc8
Files changed (2) hide show
  1. app.py +94 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import transformers
3
+ import torch
4
+ from typing import List, Dict
5
+
6
+
7
# --- Model selection and loading ------------------------------------------
# Let the user pick one of the Tinkoff cross-encoder checkpoints.
model_name = st.selectbox(
    'Выберите модель',
    ('tinkoff-ai/crossencoder-tiny', 'tinkoff-ai/crossencoder-medium', 'tinkoff-ai/crossencoder-large')
)

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)
if torch.cuda.is_available():
    model = model.cuda()

# --- Dialog input ---------------------------------------------------------
# BUG FIX: the original gave all four text_inputs the identical label
# 'Пользователь 1' and no explicit key; Streamlit derives a widget's key
# from its label, so identical labels raise DuplicateWidgetID at runtime.
# Each widget now gets a unique key, and the labels alternate to reflect a
# two-person chat (contexts and response alternate between speakers).
context_3 = st.text_input('Пользователь 1', 'Привет', key='context_3')
context_2 = st.text_input('Пользователь 2', 'Здарова', key='context_2')
context_1 = st.text_input('Пользователь 1', 'Как жизнь?', key='context_1')
response = st.text_input('Пользователь 2', 'Норм', key='response')
sample = {
    'context_3': context_3,
    'context_2': context_2,
    'context_1': context_1,
    'response': response
}


# Special tokens used to assemble the cross-encoder input sequence.
SEP_TOKEN = '[SEP]'
CLS_TOKEN = '[CLS]'
RESPONSE_TOKEN = '[RESPONSE_TOKEN]'
# Hard cap on the merged sequence length (also clamped to the tokenizer's own limit).
MAX_SEQ_LENGTH = 128
# Oldest-to-newest order in which dialog turns are concatenated.
sorted_dialog_columns = ['context_3', 'context_2', 'context_1', 'response']
34
+
35
+
36
def tokenize_dialog_data(
    tokenizer: transformers.PreTrainedTokenizer,
    sample: Dict,
    max_seq_length: int,
    sorted_dialog_columns: List,
):
    """Tokenize every dialog turn (contexts and response) independently.

    Each turn gets an equal share of the sequence budget; special tokens
    are withheld here because merge_dialog_data inserts them later.
    Returns a dict keyed by the tokenizer's model_input_names, each value a
    list with one token-id list per turn.
    """
    n_turns = len(sorted_dialog_columns)
    # Never exceed what the tokenizer itself supports; reserve one slot per
    # turn for the separator token added during merging.
    budget = min(max_seq_length, tokenizer.model_max_length)
    per_turn_limit = budget // n_turns - 1
    # None turns become empty strings so the tokenizer never sees None.
    turns = ['' if sample[col] is None else str(sample[col]) for col in sorted_dialog_columns]
    encoded = tokenizer(
        turns, padding=False, max_length=per_turn_limit, truncation=True, add_special_tokens=False
    )
    merged = {name: [] for name in tokenizer.model_input_names}
    for name in encoded.keys():
        merged[name].extend(encoded[name])
    return merged
57
+
58
+
59
def merge_dialog_data(
    tokenizer: transformers.PreTrainedTokenizer,
    sample: Dict
):
    """Concatenate per-turn token lists into one model-ready tensor per input.

    Layout: [CLS] turn [SEP] turn [SEP] ... last-context [RESPONSE_TOKEN] response
    — the final context is followed by RESPONSE_TOKEN instead of [SEP], and
    the response itself gets no trailing separator. Tensors are moved to GPU
    when one is available.
    """
    cls_ids = tokenizer(CLS_TOKEN, add_special_tokens=False)
    sep_ids = tokenizer(SEP_TOKEN, add_special_tokens=False)
    resp_ids = tokenizer(RESPONSE_TOKEN, add_special_tokens=False)
    use_cuda = torch.cuda.is_available()
    merged = {}
    for name in tokenizer.model_input_names:
        turns = sample[name]
        last = len(turns) - 1
        seq = list(cls_ids[name])
        for idx, turn in enumerate(turns):
            seq.extend(turn)
            if idx == last - 1:
                # Boundary just before the candidate response.
                seq.extend(resp_ids[name])
            elif idx < last - 1:
                seq.extend(sep_ids[name])
        tensor = torch.tensor([seq])
        merged[name] = tensor.cuda() if use_cuda else tensor
    return merged
81
+
82
# Tokenize the dialog, run the cross-encoder, and display the scores.
tokenized_dialog = tokenize_dialog_data(tokenizer, sample, MAX_SEQ_LENGTH, sorted_dialog_columns)
tokens = merge_dialog_data(tokenizer, tokenized_dialog)
logits = model(**tokens).logits
# Independent sigmoid per logit: each output is its own probability
# (relevance, specificity) rather than a softmax distribution.
# NOTE(review): assumes the checkpoint emits at least two logits — confirm
# against the model config.
probas = torch.sigmoid(logits)[0].cpu().detach().numpy()

st.metric(
    label='Вероятность того, что последний ответ релевантный',
    value=probas[0]
)
st.metric(
    label='Вероятность того, что последний ответ специфичный',
    # BUG FIX: the original displayed probas[0] for both metrics even though
    # the labels name two different scores; the specificity score is the
    # second model output.
    value=probas[1]
)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ streamlit
2
+ transformers
3
+ torch