|
import streamlit as st |
|
from PIL import Image |
|
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer |
|
|
|
|
|
def clear_text():
    """Move the submitted input into session state and empty the input box.

    Registered as the ``on_change`` callback of the text input whose key is
    ``"widget"``. The submitted value is stored under
    ``st.session_state.text`` because that is the key the main script
    initialises and reads back; storing it under any other key means the
    entered text is never analysed.
    """
    st.session_state.text = st.session_state.widget
    # Reset the widget so the box is empty for the next submission.
    st.session_state.widget = ""
|
|
|
|
|
def get_result_text_es_pt(list_entity, text, lang):
    """Rebuild *text* with the predicted capitalization and punctuation (es/pt).

    Tag handling: ``"l"`` keeps the word as is and ``"u"`` capitalizes it.
    Any other tag ending in ``l``/``u`` does the same and additionally
    attaches the first known punctuation mark found in the tag (``¿``/``¡``
    before the word, every other mark after it). Every word whose tag is not
    exactly ``"l"`` is wrapped in a bold green ``<span>``.

    Args:
        list_entity: Token predictions, one dict per token, with the keys
            ``"entity"`` (predicted tag), ``"word"`` (token string) and
            ``"start"``/``"end"`` (character offsets into *text*).
        text: The string the offsets in *list_entity* refer to.
        lang: ``"es"`` also recognises the opening marks ``¿`` and ``¡``;
            any other value only recognises ``? ! , . :``.

    Returns:
        The rebuilt words joined by single spaces, as an HTML fragment.
        Returns ``""`` when *list_entity* is empty.
    """
    import html  # stdlib; local import keeps this function self-contained

    if lang == "es":
        punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
    else:
        punc_tags = ['?', '!', ',', '.', ':']
    opening_marks = ("¿", "¡")

    result_words = []
    raw_word = ""  # unformatted text of the word currently being assembled

    for entity in list_entity:
        tag = entity["entity"]
        piece = text[entity["start"]:entity["end"]]

        # Assumes WordPiece-style tokens where continuation pieces start with
        # "##" -- TODO confirm against the model's tokenizer. A lone "#" is a
        # literal character, and the very first token can never continue a
        # previous word (there is nothing in result_words to extend yet).
        is_continuation = bool(result_words) and entity["word"].startswith("##")
        raw_word = raw_word + piece if is_continuation else piece

        # NOTE: str.capitalize() also lowercases the rest of the word.
        word = raw_word.capitalize() if tag.endswith("u") else raw_word

        # The caller renders the result as raw HTML, so user-supplied text
        # must not be able to inject markup.
        word = html.escape(word, quote=False)

        if tag not in ("l", "u") and tag.endswith(("l", "u")):
            punc_in = next((p for p in punc_tags if p in tag), "")
            if punc_in in opening_marks:
                word = punc_in + word
            else:
                word = word + punc_in

        if tag != "l":
            word = '<span style="font-weight:bold; color:rgb(142, 208, 129);">' + word + '</span>'

        if is_continuation:
            # The tag of the latest piece decides the formatting of the whole word.
            result_words[-1] = word
        else:
            result_words.append(word)

    return " ".join(result_words)
|
|
|
|
|
|
|
def get_result_text_ca(list_entity, text):
    """Rebuild *text* with the predicted capitalization and punctuation (ca).

    Tag handling: ``"l"`` keeps the word as is and ``"u"`` capitalizes it.
    Any other tag ending in ``l``/``u`` does the same and additionally
    appends the first of ``? ! , . :`` found in the tag. Every word whose tag
    is not exactly ``"l"`` is wrapped in a bold green ``<span>``.

    Args:
        list_entity: Token predictions, one dict per token, with the keys
            ``"entity"`` (predicted tag), ``"word"`` (token string) and
            ``"start"``/``"end"`` (character offsets into *text*).
        text: The string the offsets in *list_entity* refer to.

    Returns:
        The rebuilt words joined by single spaces, as an HTML fragment.
        Returns ``""`` when *list_entity* is empty.
    """
    import html  # stdlib; local import keeps this function self-contained

    punc_tags = ['?', '!', ',', '.', ':']

    result_words = []
    raw_word = ""  # unformatted text of the word currently being assembled

    for entity in list_entity:
        tag = entity["entity"]
        piece = text[entity["start"]:entity["end"]]

        # Tokens that begin a new word carry the "Ġ" prefix; anything else
        # continues the previous word. The first token has no previous word
        # to extend, so it always starts one even without the prefix
        # (treating it as a continuation used to raise IndexError).
        is_continuation = bool(result_words) and not entity["word"].startswith("Ġ")
        raw_word = raw_word + piece if is_continuation else piece

        # NOTE: str.capitalize() also lowercases the rest of the word.
        word = raw_word.capitalize() if tag.endswith("u") else raw_word

        # The caller renders the result as raw HTML, so user-supplied text
        # must not be able to inject markup.
        word = html.escape(word, quote=False)

        if tag not in ("l", "u") and tag.endswith(("l", "u")):
            # No opening marks in this tag set: punctuation always trails.
            word += next((p for p in punc_tags if p in tag), "")

        if tag != "l":
            word = '<span style="font-weight:bold; color:rgb(142, 208, 129);">' + word + '</span>'

        if is_continuation:
            # The tag of the latest piece decides the formatting of the whole word.
            result_words[-1] = word
        else:
            result_words.append(word)

    return " ".join(result_words)
|
|
|
|
|
if __name__ == "__main__":

    # Hugging Face model id for each language offered in the selector.
    MODEL_NAMES = {
        "Spanish": "VOCALINLP/spanish_capitalization_punctuation_restoration_sanivert",
        "Portuguese": "VOCALINLP/portuguese_capitalization_punctuation_restoration_sanivert",
        "Catalan": "VOCALINLP/catalan_capitalization_punctuation_restoration_sanivert",
    }

    @st.cache_resource
    def load_pipeline(model_name):
        """Build the token-classification pipeline for *model_name*.

        Cached with ``st.cache_resource`` so a model is loaded once instead
        of on every script rerun, and only for languages that are actually
        selected.
        """
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return pipeline("token-classification", model=model, tokenizer=tokenizer)

    if "text" not in st.session_state:
        st.session_state.text = ""

    st.title('Sanivert Punctuation And Capitalization Restoration')

    st.sidebar.image("vocali_logo.jpg")
    st.sidebar.subheader("Parque Científico de Murcia, Carretera de Madrid km 388. Complejo de Espinardo, 30100 Murcia")

    input_text = st.selectbox(
        label="Choose an language",
        options=["Spanish", "Portuguese", "Catalan"],
    )

    st.subheader("Enter the text to be analyzed.")
    # clear_text is the on_change callback for this widget.
    st.text_input('Enter text', key='widget', on_change=clear_text)
    text = st.session_state.text

    if text:
        result_pipe = load_pipeline(MODEL_NAMES[input_text])(text)
        if input_text == "Catalan":
            out = get_result_text_ca(result_pipe, text)
        else:
            lang = "es" if input_text == "Spanish" else "pt"
            out = get_result_text_es_pt(result_pipe, text, lang)
    else:
        # Nothing submitted yet: skip inference and render an empty result.
        out = ""

    st.markdown(out, unsafe_allow_html=True)