from pandas.io.formats.format import return_docstring import streamlit as st import pandas as pd from transformers import AutoTokenizer, AutoModelForMaskedLM from transformers import pipeline import os import json import random with open("config.json") as f: cfg = json.loads(f.read()) @st.cache(show_spinner=False, persist=True) def load_model(masked_text, model_name): model = AutoModelForMaskedLM.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer) MASK_TOKEN = tokenizer.mask_token masked_text = masked_text.replace("", MASK_TOKEN) result_sentence = nlp(masked_text) return result_sentence[0]["sequence"], result_sentence[0]["token_str"] def app(): st.markdown( "

RoBERTa Hindi

", unsafe_allow_html=True, ) st.markdown( "This demo uses multiple hindi transformer models for Masked Language Modelling (MLM)." ) models_list = list(cfg["models"].keys()) models = st.multiselect("Choose models", models_list, models_list) target_text_path = "./mlm_custom/mlm_targeted_text.csv" target_text_df = pd.read_csv(target_text_path) texts = target_text_df["text"] st.sidebar.title("Hindi MLM") pick_random = st.sidebar.checkbox("Pick any random text") results_df = pd.DataFrame(columns=["Model Name", "Filled Token", "Filled Text"]) model_names = [] filled_masked_texts = [] filled_tokens = [] if pick_random: random_text = texts[random.randint(0, texts.shape[0] - 1)] masked_text = st.text_area("Please type a masked sentence to fill", random_text) else: select_text = st.sidebar.selectbox("Select any of the following text", texts) masked_text = st.text_area("Please type a masked sentence to fill", select_text) # pd.set_option('max_colwidth',30) if st.button("Fill the Mask!"): with st.spinner("Filling the Mask..."): for selected_model in models: filled_sentence, filled_token = load_model( masked_text, cfg["models"][selected_model] ) model_names.append(selected_model) filled_tokens.append(filled_token) filled_masked_texts.append(filled_sentence) results_df["Model Name"] = model_names results_df["Filled Token"] = filled_tokens results_df["Filled Text"] = filled_masked_texts st.table(results_df)