Emanuel's picture
Initial commit
5657400
raw
history blame
2.09 kB
from typing import Tuple
import torch
import streamlit as st
from transformers import AutoModelForTokenClassification, AutoTokenizer
from dante_tokenizer import DanteTokenizer
from dante_tokenizer.data.preprocessing import expand_contractions
from annotated_text import annotated_text
def get_pos_tag_model(model_name: str = "Emanuel/autonlp-pos-tag-bosque") -> Tuple[AutoModelForTokenClassification, AutoTokenizer]:
    """Load the token-classification model together with its tokenizer.

    Both objects are created with ``from_pretrained`` from the same
    ``model_name``; the model is loaded first, then the tokenizer.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` pair.
    """
    return (
        AutoModelForTokenClassification.from_pretrained(model_name),
        AutoTokenizer.from_pretrained(model_name),
    )
# Display colors keyed by Universal Dependencies part-of-speech tag.
# Built once at import time instead of on every call.
_TAG_PALETTE = {
    "ADJ": "#2E4C6D",
    "ADP": "#FBE7C6",
    "ADV": "#DADDFC",
    "AUX": "#FC997C",
    "CCONJ": "#544179",
    "DET": "#A0E7E5",
    "INTJ": "#32C1CD",
    "NOUN": "#17D7A0",
    "NUM": "#FF8AAE",
    "PART": "#C85C5C",
    "PRON": "#F9975D",
    "PROPN": "#FBD148",
    "PUNCT": "#B2EA70",
    "SCONJ": "#AA14F0",
    "SYM": "#34BE82",
    "VERB": "#FFBF86",
    "X": "#2F86A6",
}

# Neutral color used for any label that is not in the palette, so an
# unexpected model label degrades the display instead of crashing the app.
_DEFAULT_TAG_COLOR = "#CCCCCC"


def get_tag_color(tag: str) -> str:
    """
    Return the color for a given part-of-speech tag from the Universal Dependencies tagset.
    See: https://universaldependencies.org/u/pos/

    Args:
        tag: Part-of-speech label, e.g. ``"NOUN"`` or ``"NUM"``.

    Returns:
        A ``#RRGGBB`` color string. Labels that are not in the palette map
        to a neutral gray rather than raising ``KeyError``.
    """
    return _TAG_PALETTE.get(tag, _DEFAULT_TAG_COLOR)
def main():
    """Render the Streamlit page: read text, tag each token, show colored tags.

    The input is split into word-level tokens with ``DanteTokenizer``. Those
    tokens are passed to the model tokenizer as pre-split words so that each
    sub-word piece can be mapped back to the word it came from. Every word is
    labelled with the prediction made for its first sub-word piece, which
    keeps displayed tokens and labels aligned even when a word is split into
    several pieces.
    """
    text = st.text_area("Digite seu texto de entrada!")
    dt = DanteTokenizer()
    model, tokenizer = get_pos_tag_model()

    # Nothing to tag for empty or whitespace-only input.
    if not text or not text.strip():
        return

    tokens = list(dt.tokenize(text))
    if not tokens:
        return

    # NOTE(review): the result of expand_contractions(text) was computed here
    # but never used; confirm whether the model should see expanded text.
    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        return_tensors="pt",
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits[0]
    label_ids = logits.argmax(dim=-1)

    # word_ids() yields, per sub-word position, the index of the source word
    # (None for special tokens). Presumably requires a "fast" tokenizer --
    # TODO confirm for the configured model.
    word_labels = {}
    for position, word_index in enumerate(inputs.word_ids(0)):
        if word_index is None or word_index in word_labels:
            continue
        word_labels[word_index] = model.config.id2label[int(label_ids[position])]

    answer = []
    for index, token in enumerate(tokens):
        label = word_labels.get(index)
        if label is None:
            # No prediction for this word (e.g. it was truncated away):
            # show it as plain, un-annotated text.
            answer.append(token)
        else:
            answer.append((token, label, get_tag_color(label)))
    annotated_text(*answer)
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()