|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
import gradio as gr |
|
|
|
import torch |
|
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
# Module-level default. get_output() takes its own `first_generation`
# argument, so this value is not read by the code visible in this file.
first_generation = True

# Text prepended to every sentence before tokenisation (see get_output).
prefix = ""

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hugging Face Hub identifiers for the fine-tuned model and its dataset.
model_checkpoint = "hackathon-pln-es/es_text_neutralizer"
data_checkpoint = "hackathon-pln-es/neutral-es"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# NOTE(review): `dataset` is loaded at import time but is not referenced by
# any other code visible in this file - confirm before removing.
dataset = load_dataset(data_checkpoint, split="test")

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Stored on the model config; presumably acts as the default length limit
# for model.generate() - confirm against the installed transformers version.
model.config.max_length = 512
model.to(device)
|
|
|
# Long-form copy handed to gr.Interface(article=...). The text mixes Markdown
# paragraphs with raw HTML; keep the blank lines, they separate the two.
# List items are closed with </li> (the stray </a> end tags were invalid HTML).
article = """

<p style="text-align: justify;">

Given any input, our model will generate a gender neutral sentence, correcting any non-inclusive expressions or words. It's a straightforward and fast solution that creates a positive impact in the contemporary social panorama.

<p align="center">
<img src="https://upload.wikimedia.org/wikipedia/commons/2/29/Gender_equality_symbol_%28clipart%29.png" width="250"/>
</p>

One of the toughest challenges when building the app and the model was to find proper data for training the model. Therefore, the team opted to dedicate a considerable amount of time to build it from scratch. These data have been obtained from a series of guidelines and manuals issued by the Spanish Ministry of Health, Social Services and Equality in the matter of the usage of non-sexist language, stipulated in this linked [document](https://www.inmujeres.gob.es/servRecursos/formacion/GuiasLengNoSexista/docs/Guiaslenguajenosexista_.pdf). You can take a look at some details of the process in the linked dataset.

<h1 style="font-size:2vw">Future steps</h1>
<ul>
<li> First of all, we would love to engage people and maybe inspire them to work on similar projects, because we believe this kind of project can actually make a difference</li>
<li> Following that, a broader dataset would help reduce some overfitting and make a more robust model</li>
<li> Further training of different model backbones</li>
<li> Reduce the gender biases existing on some of the data for training Language Models by using the output of this model</li>
<li> Share ideas with the community for further improvement</li>
</ul>

<h1 style="font-size:2vw">Team Members</h1>

<ul>
<li> Fernando Velasco <a href="https://huggingface.co/fermaat">(fermaat)</a></li>
<li> Cibeles Redondo <a href="https://huggingface.co/CibelesR">(CibelesR)</a></li>
<li> Juan Julian Cea <a href="https://huggingface.co/Juanju">(Juanju)</a></li>
<li> Magdalena Kujalowicz <a href="https://huggingface.co/MacadellaCosta">(MacadellaCosta)</a></li>
<li> Javier Blasco <a href="https://huggingface.co/javiblasco">(javiblasco)</a></li>
</ul>

</p>
"""
|
|
|
# Short introduction handed to gr.Interface(description=...).
# The example phrases use <em> rather than Markdown *emphasis*: they sit on
# the line right after the opening <p> tag, i.e. inside a raw HTML block,
# where Markdown markers would be shown as literal asterisks.
description = """

<p style="text-align: justify;">
Spanish is a beautiful language and it has many ways of referring to people, neutralizing the genders and using some of the resources inside the language. One would say <em>Todas las personas asistentes</em> instead of <em>Todos los asistentes</em> and it would end in a more inclusive way for talking about people.

The purpose of this app is to transform Spanish gendered text into a neutral version, suitable for an unbiased environment.
</p>
"""
|
|
|
def postproc(input_sentence, preds):
    """Tidy the decoded model output and restore capitalisation from the input.

    Steps, in order:

    1. Contract the stand-alone words "de el" / "De el" into "del" / "Del"
       and collapse runs of spaces into a single space.
    2. Upper-case the first character if it is lower-case (the rest of the
       text is left untouched).
    3. Remove the space in front of " . " and " , ".
    4. For every capitalised word of ``input_sentence`` (except words equal
       to its first word), re-apply that capitalisation to the lower-cased
       occurrence found in ``preds``.
    5. Strip leading/trailing whitespace.

    Args:
        input_sentence: The text the user submitted; only used as the source
            of capitalised words. A non-string value skips step 4.
        preds: The decoded model prediction.

    Returns:
        The cleaned prediction. A ``preds`` that is not a string is returned
        unchanged (best-effort behaviour, nothing is raised).
    """
    import re  # function-scope import keeps this block self-contained

    if not isinstance(preds, str):
        return preds

    # Whole-word match only: a plain substring replace would also turn
    # "de ella" into "della" and "donde el" into "dondel".
    preds = re.sub(r'\b([Dd])e el\b', r'\1el', preds)
    preds = re.sub(r' {2,}', ' ', preds)

    # str.capitalize() would lower-case everything after the first letter,
    # destroying acronyms and proper nouns the model got right.
    if preds and preds[0].islower():
        preds = preds[0].upper() + preds[1:]
    preds = preds.replace(' . ', '. ').replace(' , ', ', ')

    words = input_sentence.split(' ') if isinstance(input_sentence, str) else []
    first_word = words[0] if words else ''
    prev_letter = ''
    for word in words:
        if not word:
            # Consecutive spaces in the input produce empty tokens.
            continue
        lowered = word.lower()
        if word[0].isupper() and word != first_word and lowered in preds:
            if prev_letter == '.':
                # Sentence-initial word: only touch it right after a period.
                preds = preds.replace('. ' + lowered + ' ', '. ' + word + ' ')
            else:
                # A word that carries its own final period is matched as is;
                # any other word must be followed by a space.
                tail = '' if word[-1] == '.' else ' '
                # The look-behind stops matches inside longer words
                # (e.g. "Ana" must not rewrite "semana ").
                pattern = r'(?<!\w)' + re.escape(lowered + tail)
                preds = re.sub(
                    pattern,
                    lambda _match, fixed=word + tail: fixed,
                    preds,
                )
        prev_letter = word[-1]

    return preds.strip()
|
|
|
|
|
|
|
|
|
def get_output(sentence, first_generation=True):
    """Run the seq2seq model on ``sentence`` and return the post-processed text.

    Args:
        sentence: Input text; the module-level ``prefix`` is prepended before
            tokenisation.
        first_generation: When true, ``generate`` is called with sampling
            disabled and otherwise default settings. When false, it is
            additionally given ``num_beams=2``, ``repetition_penalty=2.5``
            and ``early_stopping=True``.

    Returns:
        The first generated sequence, decoded without special tokens and
        passed through ``postproc`` together with the original sentence.
    """
    encoded = tokenizer([prefix + sentence], return_tensors="pt", padding=True)

    # Arguments shared by both generation modes.
    generate_kwargs = {
        "input_ids": encoded["input_ids"].to(device),
        "attention_mask": encoded["attention_mask"].to(device),
        "do_sample": False,
    }
    if not first_generation:
        generate_kwargs.update(
            num_beams=2,
            repetition_penalty=2.5,
            early_stopping=True,
        )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output_sequence = model.generate(**generate_kwargs)

    decoded = tokenizer.decode(
        output_sequence[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return postproc(sentence, preds=decoded)
|
|
|
# Sample inputs offered in the UI (passed as `examples=` below). The strings
# are kept exactly as authored, including the spaces before commas.
examples = [
    'De acuerdo con las informaciones anteriores , las alumnas se han quejado de la actitud de los profesores en los exámenes finales. Los representantes estudiantiles son los alumnos Juanju y Javi.',
    'Durante su ingreso no debe tomar agua que no le sea suministrada por los especialistas en su cirujía',
    'Debido a esto , el premio se asignará a los candidatos seleccionados en tres pagos',
]

# One text box in, one text box out, wired to get_output.
iface = gr.Interface(
    fn=get_output,
    inputs=gr.inputs.Textbox(label="Introduce some Spanish text here"),
    outputs=gr.outputs.Textbox(label="Neutral version of your text"),
    title="Spanish Text Neutralization app",
    description=description,
    article=article,
    examples=examples,
    theme='peach',
)

# Starts the web app as soon as the module is executed.
iface.launch()
|
|