# -*- coding: utf-8 -*-

import os
from functools import lru_cache
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


def setup_tokenizer():
    # Download the distilroberta-base tokenizer and save its vocab.json and
    # merges.txt to disk so get_tokenizer() below can load them.
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
    tokenizer.save_pretrained("tokenizer")


os.makedirs("tokenizer", exist_ok=True)  # replaces the os.system("mkdir -p ...") shell-out
setup_tokenizer()


# from https://github.com/digantamisra98/Mish/blob/b5f006660ac0b4c46e2c6958ad0301d7f9c59651/Mish/Torch/mish.py
@torch.jit.script
def mish(input):
    # Mish activation: x * tanh(softplus(x)).
    return input * torch.tanh(F.softplus(input))


class Mish(nn.Module):
    def forward(self, input):
        return mish(input)
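
# Quick sanity sketch (comment only): mish(0) = 0, since 0 * tanh(softplus(0))
# vanishes, and mish(x) ≈ x for large positive x because tanh(softplus(x)) → 1.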

class EmoModel(nn.Module):
    def __init__(self, base_model, n_classes=2, base_model_output_size=768, dropout=0.05):
        super().__init__()
        self.base_model = base_model

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(base_model_output_size, base_model_output_size),
            Mish(),
            nn.Dropout(dropout),
            # Originally n_classes was 6 (discrete emotions); for the
            # valence/arousal (VA) regression head it is 2.
            nn.Linear(base_model_output_size, n_classes)
        )

        # Initialize the head BERT-style: normal weights (std 0.02), zero biases.
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                layer.weight.data.normal_(mean=0.0, std=0.02)
                if layer.bias is not None:
                    layer.bias.data.zero_()

    def forward(self, input_, *args):
        X, attention_mask = input_
        hidden_states = self.base_model(X, attention_mask=attention_mask)
        # Classify on the hidden state of the first token (<s>, RoBERTa's
        # equivalent of [CLS]).
        return self.classifier(hidden_states[0][:, 0, :])
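
# Shape sketch: with batch size B and sequence length T, X and attention_mask
# are (B, T), hidden_states[0] is (B, T, 768), and the classifier maps the
# first-token vector to (B, 2): one output each for valence and arousal.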
        

#pretrained_path = "on_plurk_new_fix_data_arch_1_epoch_2_bs_16.pt"
pretrained_path = "arch1_unfreeze_all.pt"  # the latest weights!
assert Path(pretrained_path).is_file(), f"missing checkpoint: {pretrained_path}"

# Only the base encoder is used, so AutoModel loads the same weights that the
# deprecated AutoModelWithLMHead(...).base_model call did.
model = EmoModel(AutoModel.from_pretrained("distilroberta-base"))
model.load_state_dict(torch.load(pretrained_path, map_location=torch.device("cpu")))
model.eval()

@lru_cache(maxsize=1)
def get_tokenizer(max_tokens=512):
    voc_file = "tokenizer/vocab.json"
    merg_file = "tokenizer/merges.txt"

    # Regenerate the vocab/merges files if they are missing.
    if not os.path.isfile(voc_file) or not os.path.isfile(merg_file):
        setup_tokenizer()

    t = ByteLevelBPETokenizer(voc_file, merg_file)
    # Wrap every sequence as <s> ... </s>, matching RoBERTa's input format.
    t._tokenizer.post_processor = BertProcessing(
        ("</s>", t.token_to_id("</s>")),
        ("<s>", t.token_to_id("<s>")),
    )
    t.enable_truncation(max_tokens)
    t.enable_padding(length=max_tokens, pad_id=t.token_to_id("<pad>"))
    return t
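
# Illustrative usage (commented out; padding/truncation make every encoding a
# fixed 512 tokens long):
#   t = get_tokenizer()
#   enc = t.encode("hello world")
#   assert len(enc.ids) == 512 and len(enc.attention_mask) == 512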

def convert_text_to_tensor(text, tokenizer=None):
    if tokenizer is None:
        tokenizer = get_tokenizer()
    enc = tokenizer.encode(text)
    # unsqueeze(0) adds a batch dimension of 1: shapes become (1, max_tokens).
    X = torch.tensor(enc.ids).unsqueeze(0)
    Attn = torch.tensor(enc.attention_mask).unsqueeze(0)
    return (X, Attn)

def get_output(text, model, tokenizer=None, return_tensor=False):
    # Error handling for a bad `model` argument is deliberately omitted; a
    # try/except here would only obscure the real traceback.
    with torch.no_grad():
        model.eval()
        out = model(convert_text_to_tensor(text, tokenizer))
    if return_tensor:
        return out
    # Otherwise flatten the (1, 2) output into a (valence, arousal) float pair.
    tt = out[0]
    return float(tt[0]), float(tt[1])
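
# Illustrative usage: by default get_output returns a (valence, arousal) pair
# of plain floats; with return_tensor=True it returns the raw (1, 2) tensor.
#   v, a = get_output("what a great day", model)
#   raw = get_output("what a great day", model, return_tensor=True)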

import gradio as gr


def fn2(text, model=model, return_tensor=False):
    # Thin wrapper so Gradio only needs to supply the text input.
    return get_output(text, model, return_tensor=return_tensor)


interface = gr.Interface(
    fn=fn2,
    inputs="text",
    outputs=["number", "number"],
)

interface.launch()