File size: 2,089 Bytes
5657400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from typing import Tuple

import torch
import streamlit as st
from transformers import AutoModelForTokenClassification, AutoTokenizer
from dante_tokenizer import DanteTokenizer
from dante_tokenizer.data.preprocessing import expand_contractions
from annotated_text import annotated_text


def get_pos_tag_model(model_name: str = "Emanuel/autonlp-pos-tag-bosque") -> Tuple[AutoModelForTokenClassification, AutoTokenizer]:
    """Load the token-classification model and its tokenizer.

    Both objects are created with ``from_pretrained`` using the same
    ``model_name`` identifier.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` pair.
    """
    # Tuple elements are evaluated left to right, so the model is still
    # loaded before the tokenizer.
    return (
        AutoModelForTokenClassification.from_pretrained(model_name),
        AutoTokenizer.from_pretrained(model_name),
    )

def get_tag_color(tag: str) -> str:
    """
    Return the color for a given part-of-speech tag from the Universal Dependencies tagset.
    See: https://universaldependencies.org/u/pos/

    Args:
        tag: Part-of-speech label, e.g. ``"NOUN"`` or ``"VERB"``.

    Returns:
        A ``#RRGGBB`` hex color string. Labels that are not in the table
        get a neutral gray instead of raising ``KeyError``, so one
        unexpected label cannot break rendering of the whole sentence.
    """
    fallback_color = "#CCCCCC"
    palette = {
        "ADJ": "#2E4C6D",
        "ADP": "#FBE7C6",
        "ADV": "#DADDFC",
        "AUX": "#FC997C",
        "CCONJ": "#544179",
        "DET": "#A0E7E5",
        "INTJ": "#32C1CD",
        "NOUN": "#17D7A0",
        # NUM is part of the UD tagset but was missing from the table.
        "NUM": "#8FBDD3",
        "PART": "#C85C5C",
        "PRON": "#F9975D",
        "PROPN": "#FBD148",
        "PUNCT": "#B2EA70",
        "SCONJ": "#AA14F0",
        "SYM": "#34BE82",
        "VERB": "#FFBF86",
        "X": "#2F86A6",
    }
    return palette.get(tag, fallback_color)

def main():
    """Render the page: read text, tag each token, show colored annotations.

    The text from the input box is split into tokens with ``DanteTokenizer``.
    Those tokens are passed to the model tokenizer as pre-split words so
    that every prediction can be mapped back to the token it belongs to,
    and each token is then displayed with its label and color through
    ``annotated_text``. Nothing is rendered when the input is empty or
    yields no tokens.
    """
    text = st.text_area("Digite seu texto de entrada!")
    dt = DanteTokenizer()
    model, tokenizer = get_pos_tag_model()

    if not text:
        return

    tokens = list(dt.tokenize(text))
    if not tokens:
        # Whitespace-only input: nothing to tag.
        return

    # Passing the pre-split tokens (instead of the raw string) lets
    # ``word_ids`` tell us which token every sub-word piece came from.
    # NOTE(review): ``word_ids`` is only available on "fast" tokenizers;
    # this assumes AutoTokenizer returns one for the model -- confirm.
    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
    )

    # Inference only, so gradient tracking is unnecessary.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Index the single batch item explicitly; argmax over the label axis.
    label_ids = logits[0].argmax(dim=-1).tolist()

    # Keep the prediction of the first sub-word piece of each token.
    # Special tokens have a word index of ``None`` and are skipped.
    label_by_word = {}
    for position, word_index in enumerate(inputs.word_ids(batch_index=0)):
        if word_index is None or word_index in label_by_word:
            continue
        label_by_word[word_index] = model.config.id2label[label_ids[position]]

    answer = []
    for index, token in enumerate(tokens):
        label = label_by_word.get(index)
        if label is None:
            # No prediction for this token (e.g. it was cut off by
            # truncation): show it as plain, un-annotated text.
            answer.append(token)
        else:
            answer.append((token, label, get_tag_color(label)))
    annotated_text(*answer)
        
# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()