# -*- coding: utf-8 -*-

import os
from pathlib import Path
from functools import lru_cache

import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForMaskedLM
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

import gradio as gr


def setup_tokenizer():
    # Download the distilroberta-base tokenizer and save its vocab/merges
    # files locally so ByteLevelBPETokenizer can load them below.
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
    tokenizer.save_pretrained("tokenizer")


os.makedirs("tokenizer", exist_ok=True)
setup_tokenizer()


# from https://github.com/digantamisra98/Mish/blob/b5f006660ac0b4c46e2c6958ad0301d7f9c59651/Mish/Torch/mish.py
@torch.jit.script
def mish(input):
    # Mish activation: x * tanh(softplus(x)), a smooth, non-monotonic
    # alternative to ReLU.
    return input * torch.tanh(F.softplus(input))


class Mish(nn.Module):
    def forward(self, input):
        return mish(input)
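
# Quick sanity check of the activation (values are approximate): mish is
# near-linear for large positive inputs and bounded below for negative ones.
#   mish(torch.tensor([-5.0, 0.0, 5.0]))  ->  roughly tensor([-0.0336, 0.0000, 4.9996])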

class EmoModel(nn.Module):
    def __init__(self, base_model, n_classes=2, base_model_output_size=768, dropout=0.05):
        super().__init__()
        self.base_model = base_model

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(base_model_output_size, base_model_output_size),
            Mish(),
            nn.Dropout(dropout),
            # The original head used n_classes=6; for VA (valence/arousal)
            # prediction we change it to 2.
            nn.Linear(base_model_output_size, n_classes)
        )

        # Initialize the head the way BERT-style models initialize weights.
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                layer.weight.data.normal_(mean=0.0, std=0.02)
                if layer.bias is not None:
                    layer.bias.data.zero_()

    def forward(self, input_, *args):
        X, attention_mask = input_
        hidden_states = self.base_model(X, attention_mask=attention_mask)
        # Pool by taking the hidden state of the first token (<s>, RoBERTa's
        # equivalent of [CLS]) and feed it to the classifier head.
        return self.classifier(hidden_states[0][:, 0, :])
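
# Shape walkthrough for a single padded input (max_tokens=512, hidden size 768):
#   X, attention_mask           -> (1, 512)
#   hidden_states[0]            -> (1, 512, 768)
#   hidden_states[0][:, 0, :]   -> (1, 768)
#   classifier output           -> (1, 2)    # the two VA scores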
        

pretrained_path = "on_plurk_new_fix_data_arch_1_epoch_2_bs_16.pt"
assert Path(pretrained_path).is_file(), f"Missing checkpoint: {pretrained_path}"

# AutoModelWithLMHead is deprecated; AutoModelForMaskedLM is its drop-in
# replacement for a masked-LM checkpoint such as distilroberta-base. Only
# the encoder (.base_model) is kept, matching the keys in the checkpoint.
model = EmoModel(AutoModelForMaskedLM.from_pretrained("distilroberta-base").base_model)
model.load_state_dict(torch.load(pretrained_path, map_location=torch.device("cpu")))
model.eval()

@lru_cache(maxsize=1)
def get_tokenizer(max_tokens=512):
    voc_file = "tokenizer/vocab.json"
    merg_file = "tokenizer/merges.txt"

    # Re-download the tokenizer files if they are missing.
    if not os.path.isfile(voc_file) or not os.path.isfile(merg_file):
        setup_tokenizer()

    t = ByteLevelBPETokenizer(voc_file, merg_file)
    # RoBERTa-style post-processing: wrap every sequence as <s> ... </s>.
    t._tokenizer.post_processor = BertProcessing(
        ("</s>", t.token_to_id("</s>")),
        ("<s>", t.token_to_id("<s>")),
    )
    t.enable_truncation(max_tokens)
    t.enable_padding(length=max_tokens, pad_id=t.token_to_id("<pad>"), pad_token="<pad>")
    return t
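
# Illustrative sketch of an encoding (the special-token ids are RoBERTa's:
# <s>=0, <pad>=1, </s>=2; the ids in between depend on the text):
#   enc = get_tokenizer().encode("hello world")
#   enc.ids            -> [0, ..., 2, 1, 1, ...]   # truncated/padded to 512
#   enc.attention_mask -> [1, ..., 1, 0, 0, ...]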

def convert_text_to_tensor(text, tokenizer=None):
    if tokenizer is None:
        tokenizer = get_tokenizer()
    enc = tokenizer.encode(text)
    # unsqueeze(0) adds a batch dimension of size 1.
    X = torch.tensor(enc.ids).unsqueeze(0)
    attn = torch.tensor(enc.attention_mask).unsqueeze(0)
    return (X, attn)
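
# e.g. convert_text_to_tensor("hi") returns two (1, 512) tensors: the token
# ids and the matching attention mask, ready for EmoModel.forward.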

def get_output(text, model, tokenizer=None, return_tensor=False):
    # Type-checking the "model" argument here would be possible, but a wrong
    # model fails loudly inside forward() anyway, so we keep this simple.
    with torch.no_grad():
        model.eval()
        out = model(convert_text_to_tensor(text, tokenizer))
    if return_tensor:
        return out
    # Otherwise squeeze the batch dimension and return two plain floats.
    tt = out[0]
    return float(tt[0]), float(tt[1])
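
# Usage sketch (the returned numbers are illustrative only):
#   valence, arousal = get_output("I love this!", model)
#   raw = get_output("I love this!", model, return_tensor=True)  # shape (1, 2)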

def fn2(text, model=model, return_tensor=False):
    return get_output(text, model, return_tensor=return_tensor)


interface = gr.Interface(
    fn=fn2,
    inputs="text",
    outputs=["number", "number"],
)

interface.launch()
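
# Note: launch() serves the demo locally by default; Gradio can also create
# a temporary public link via interface.launch(share=True) if needed.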