import gradio as gr
import os
# https://huggingface.co/docs/hub/spaces-gpus
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch.nn.functional import softmax
# Save your HF API token from https://hf.co/settings/tokens as an env variable to avoid rate limiting.
auth_token = os.getenv("auth_token")
print("========================================================================")
print("Starting ... gradio_demo_nlp_autocomplete/app.py")
print("AUTH TOKEN:", auth_token)
# Load the tokenizer and masked-LM model from https://hf.co/models.
model_ref = "projecte-aina/roberta-base-ca-v2"
tokenizer = AutoTokenizer.from_pretrained(model_ref)
model = AutoModelForMaskedLM.from_pretrained(model_ref)
model.to("cpu")  # move once at startup instead of on every request
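# Assumption (not in the original app): on a GPU-enabled Space
# (see https://huggingface.co/docs/hub/spaces-gpus) the device could be
# picked dynamically instead of pinning to "cpu", e.g.:
#
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device)
#
# Everything below keeps the original CPU-only behaviour.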
def get_topk(text, tokenizer, model, k):
print("Get top K,", text)
# Tokenize
# ==========================================================================================
tokenizer_kwargs = dict(padding='longest', return_token_type_ids=False, return_tensors="pt")
inputs = tokenizer(text, **tokenizer_kwargs).to("cpu")
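    # inputs is a BatchEncoding holding input_ids and attention_mask tensors
    # (token_type_ids are suppressed above, since RoBERTa does not use them).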
input_ids = inputs.input_ids
# Get model outputs and probabilities
# ==========================================================================================
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
probs = softmax(logits, dim=2)
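    # probs has shape (batch_size, seq_len, vocab_size): a probability
    # distribution over the vocabulary at every token position.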
    # Index of <mask> (note: this only works when there is exactly one <mask>)
# ==========================================================================================
    row_idx, mask_idx = torch.where(input_ids == tokenizer.mask_token_id)
return probs[row_idx, mask_idx].topk(k), mask_idx
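# Illustrative usage (candidate words and probabilities are hypothetical;
# the shapes follow from the code above for a single-mask input):
#
#   (values, indices), mask_idx = get_topk("La meva mare es diu <mask>.", tokenizer, model, k=3)
#   values.shape   # (1, 3) -- probabilities of the 3 best candidates, one row per <mask>
#   indices.shape  # (1, 3) -- vocabulary ids; tokenizer.decode() maps them back to words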
def generate_output(text, k):
    """Return a {decoded token: probability} dict for the top-k <mask> candidates."""
    (values, indices), _ = get_topk(text, tokenizer, model, int(k))
    labels = {}
    for mask_vals, mask_indices in zip(values, indices):
        labels = {tokenizer.decode(ind): val.item()
                  for val, ind in zip(mask_vals, mask_indices)}
    return labels
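# Example call (output is hypothetical; actual words and scores depend on the model):
#
#   generate_output("La meva mare es diu <mask>.", 3)
#   # -> {" Maria": 0.21, " Anna": 0.14, " Montserrat": 0.09}
#
# The leading spaces come from RoBERTa's byte-level BPE tokenization.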
md_text ="""
# Masked Language Modeling Example
by [nurasaki](https://huggingface.co/spaces/nurasaki)
* Space : [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
* Model used: Catalan BERTa-v2 (roberta-base-ca-v2)
* Hugging Face link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)
<br>
## Model description
The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language.
It is based on the [RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-size corpus collected from publicly available corpora and crawlers.
<br>
## Usage
The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.
Choose one of the provided examples or enter your own masked text.
<br>
"""
examples = [
"La meva mare es diu <mask>.",
"La meva mare treballa de <mask>.",
"El meu fill es diu <mask>.",
"El teu pare treballa de <mask>.",
]
with gr.Blocks() as demo:
gr.Markdown(md_text)
with gr.Row():
with gr.Column():
text = gr.Textbox("La meva mare es diu <mask>.", label="Masked text")
k = gr.Number(value=10, label="Num. results")
btn = gr.Button("Generate")
with gr.Column():
out_label = gr.Label(label="Results")
btn.click(generate_output, inputs=[text, k], outputs=[out_label])
gr.Examples(examples, inputs=[text])
if __name__ == "__main__":
    demo.launch(favicon_path="favicon.png")