from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
import gradio as gr |
import torch |
from datasets import load_dataset |
# Identifiers handed to `from_pretrained` / `load_dataset` below.
model_checkpoint = "hackathon-pln-es/es_text_neutralizer"
data_checkpoint = "hackathon-pln-es/neutral-es"

# Module-level default; note that `get_output` declares a parameter with the
# same name, which shadows this global inside that function.
first_generation = True

# Text prepended to every sentence before it is tokenized (currently nothing).
prefix = ''

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# NOTE(review): `dataset` is not referenced anywhere else in the visible
# source -- confirm whether the load is still needed at startup.
dataset = load_dataset(data_checkpoint, split='test')

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# Presumably the default cap on generated sequence length used by
# `model.generate` -- confirm against the transformers documentation.
model.config.max_length = 512
model.to(device)
# Long-form HTML shown with the demo; passed to `gr.Interface` as `article`.
# Kept as plain, well-formed HTML (balanced <p>/<li>/<a> tags, HTML anchors
# instead of Markdown links) so it renders the same whether or not the
# front end runs it through a Markdown processor.
article = """
<p style="text-align: justify;">
Given any input, our model will generate a gender neutral sentence, correcting any non-inclusive expressions or words. It's a straightforward and fast solution that creates a positive impact in the contemporary social panorama.
</p>
<p align="center">
<img src="https://upload.wikimedia.org/wikipedia/commons/2/29/Gender_equality_symbol_%28clipart%29.png" width="250"/>
</p>
<p style="text-align: justify;">
One of the toughest challenges when building the app and the model was to find proper data for training the model. Therefore, the team opted to dedicate a considerable amount of time to build it from scratch. These data have been obtained from a series of guidelines and manuals issued by the Spanish Ministry of Health, Social Services and Equality in the matter of the usage of non-sexist language, stipulated in this linked <a href="https://www.inmujeres.gob.es/servRecursos/formacion/GuiasLengNoSexista/docs/Guiaslenguajenosexista_.pdf">document</a>. You can take a look at some details of the process in the linked dataset.
</p>
<h1 style="font-size:2vw">Future steps</h1>
<ul>
<li>First of all, we would love to engage people and maybe inspire them to work on similar projects, because we believe this kind of project can actually make a difference</li>
<li>Following that, a broader dataset would help reduce some overfitting and make a more robust model</li>
<li>Further training of different model backbones</li>
<li>Reduce the gender biases existing on some of the data for training Language Models by using the output of this model</li>
<li>Share ideas with the community for further improvement</li>
</ul>
<h1 style="font-size:2vw">Team Members</h1>
<ul>
<li>Fernando Velasco <a href="https://huggingface.co/fermaat">(fermaat)</a></li>
<li>Cibeles Redondo <a href="https://huggingface.co/CibelesR">(CibelesR)</a></li>
<li>Juan Julian Cea <a href="https://huggingface.co/Juanju">(Juanju)</a></li>
<li>Magdalena Kujalowicz <a href="https://huggingface.co/MacadellaCosta">(MacadellaCosta)</a></li>
<li>Javier Blasco <a href="https://huggingface.co/javiblasco">(javiblasco)</a></li>
</ul>
"""
# Short HTML blurb passed to `gr.Interface` as `description`.
# Emphasis uses <em> rather than Markdown asterisks: text inside an HTML
# block is not Markdown-processed, so `*...*` would be shown literally.
description = """
<p style="text-align: justify;">
Spanish is a beautiful language and it has many ways of referring to people, neutralizing the genders and using some of the resources inside the language. One would say <em>Todas las personas asistentes</em> instead of <em>Todos los asistentes</em> and it would end in a more inclusive way for talking about people.
The purpose of this app is to transform Spanish gendered text into a neutral version, suitable for an unbiased environment.
</p>
"""
def postproc(input_sentence, preds):
    """Tidy up a generated sentence and restore capitals from the input.

    Steps, in order:

    1. Contract ``de el`` / ``De el`` into ``del`` / ``Del`` and collapse
       runs of spaces into a single space.
    2. If the text starts with a lowercase letter, apply ``str.capitalize``.
       That call also lowercases every other character, which is why step 4
       exists.
    3. Remove the space before `` . `` and `` , ``.
    4. For every capitalized word of ``input_sentence`` other than its first
       word, re-apply the original casing wherever the lowercased word
       appears in the text.
    5. Strip leading/trailing whitespace.

    Args:
        input_sentence: The text the user submitted; only used as the source
            of capitalized words. Skipped if it is not a string.
        preds: The decoded model output.

    Returns:
        The cleaned-up text. ``preds`` is returned unchanged when it is empty
        or not a string, so this function never raises on such input.
    """
    # Best-effort contract: nothing to clean, hand the value back untouched.
    if not isinstance(preds, str) or not preds:
        return preds

    preds = preds.replace('De el', 'Del').replace('de el', 'del')
    while '  ' in preds:
        preds = preds.replace('  ', ' ')

    if preds[0].islower():
        preds = preds.capitalize()
    preds = preds.replace(' . ', '. ').replace(' , ', ', ')

    if isinstance(input_sentence, str):
        words = input_sentence.split(' ')
        first_word = words[0]
        prev_letter = ''  # last character of the previous non-empty word
        for word in words:
            if not word:
                continue  # consecutive spaces in the input yield empty items
            lowered = word.lower()
            if word[0].isupper() and word != first_word and lowered in preds:
                if prev_letter == '.':
                    # Word opens a new sentence: only touch it after ". ".
                    preds = preds.replace('. ' + lowered + ' ', '. ' + word + ' ')
                elif word[-1] == '.':
                    # Word closes a sentence: the trailing dot delimits it.
                    preds = preds.replace(lowered, word)
                else:
                    # Require a following space to avoid matching a prefix
                    # of a longer word.
                    preds = preds.replace(lowered + ' ', word + ' ')
            prev_letter = word[-1]

    return preds.strip()
def get_output(sentence, first_generation=True):
    """Generate the neutral version of ``sentence`` and post-process it.

    Args:
        sentence: Input text; ``prefix`` is prepended before tokenization.
        first_generation: When true, ``model.generate`` is called with only
            ``do_sample=False``. Otherwise it is additionally given
            ``num_beams=2``, ``repetition_penalty=2.5`` and
            ``early_stopping=True``.

    Returns:
        The first generated sequence, decoded without special tokens and
        passed through ``postproc`` together with the original sentence.
    """
    encoded = tokenizer([prefix + sentence], return_tensors="pt", padding=True)

    # Arguments common to both generation modes.
    generation_kwargs = {
        "input_ids": encoded["input_ids"].to(device),
        "attention_mask": encoded["attention_mask"].to(device),
        "do_sample": False,
    }
    if not first_generation:
        generation_kwargs.update(
            num_beams=2,
            repetition_penalty=2.5,
            early_stopping=True,
        )

    with torch.no_grad():
        generated = model.generate(**generation_kwargs)

    decoded = tokenizer.decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return postproc(sentence, preds=decoded)
# Sample sentences passed to the interface as `examples`.
examples = [
    'De acuerdo con las informaciones anteriores , las alumnas se han quejado de la actitud de los profesores en los exámenes finales. Los representantes estudiantiles son los alumnos Juanju y Javi.',
    'Durante su ingreso no debe tomar agua que no le sea suministrada por los especialistas en su cirujía',
    'Debido a esto , el premio se asignará a los candidatos seleccionados en tres pagos',
]

# One text box in, one text box out, wired to `get_output`.
iface = gr.Interface(
    fn=get_output,
    inputs=gr.inputs.Textbox(label="Introduce some Spanish text here"),
    outputs=gr.outputs.Textbox(label="Neutral version of your text"),
    examples=examples,
    title="Spanish Text Neutralization app",
    description=description,
    article=article,
    theme='peach',
)
iface.launch()