File size: 7,501 Bytes
243307b
d787566
 
243307b
8eb0b06
228c21a
 
 
 
243307b
 
 
 
fa49f33
228c21a
243307b
 
228c21a
243307b
 
 
 
 
f1decb3
ad23043
f1decb3
ad23043
 
 
 
 
 
 
 
f53df5a
ad23043
178ea88
aef9a4c
178ea88
 
1308e75
 
178ea88
f53df5a
aef9a4c
 
178ea88
ee93e2c
 
0d4c48c
 
 
 
 
ee93e2c
 
ad23043
 
 
c9273e4
 
 
cdd22d7
c9273e4
 
 
 
f1decb3
c3eabb9
2ce2556
 
 
 
 
 
 
 
c3eabb9
2ce2556
 
ebe7859
2ce2556
 
 
ebe7859
 
 
 
2ce2556
 
 
 
f1decb3
 
ebe7859
243307b
 
8eb0b06
 
243307b
 
8eb0b06
243307b
 
 
 
 
 
8eb0b06
243307b
 
 
 
 
 
 
 
ebe7859
f1decb3
243307b
8eb0b06
f1decb3
 
ebe7859
243307b
f1decb3
 
ad23043
 
f1decb3
 
 
 
8de90cf
f1decb3
 
ebe7859
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr

import torch

from datasets import load_dataset



# Module-level flag. get_output() declares its own `first_generation`
# parameter, so this global is not read anywhere in the visible code.
first_generation = True
# String prepended to every input sentence before tokenization (empty here).
prefix = ''
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Identifiers passed to from_pretrained() / load_dataset() below.
model_checkpoint = "hackathon-pln-es/es_text_neutralizer"
data_checkpoint = "hackathon-pln-es/neutral-es"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): `dataset` is not referenced anywhere else in the visible
# code -- confirm whether this load is still needed at startup.
dataset = load_dataset(data_checkpoint, split='test')

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# TODO: jarl!! check this for avoiding short segments
# (presumably this raises the default generation length limit so outputs are
# not cut short -- confirm against the transformers generation docs)
model.config.max_length = 512
# Move the model to the device that get_output() sends its input tensors to.
model.to(device)

# Long-form HTML/Markdown text passed to gr.Interface as `article`.
# List items are closed with </li> so that every opened tag has a matching
# closing tag.
article = """

<p style="text-align: justify;"> 

Given any input, our model will generate a gender neutral sentence, correcting any non-inclusive expressions or words. It's a straightforward and fast solution that creates a positive impact in the contemporary social panorama.

<p align="center">
  <img src="https://upload.wikimedia.org/wikipedia/commons/2/29/Gender_equality_symbol_%28clipart%29.png" width="250"/>
</p>

One of the toughest challenges when building the app and the model was to find proper data for training the model. Therefore, the team opted to dedicate a considerable amount of time to build it from scratch. These data have been obtained from a series of guidelines and manuals issued by the Spanish Ministry of Health, Social Services and Equality in the matter of the usage of non-sexist language, stipulated in this linked [document](https://www.inmujeres.gob.es/servRecursos/formacion/GuiasLengNoSexista/docs/Guiaslenguajenosexista_.pdf). You can take a look at some details of the process in the linked dataset

<h1 style="font-size:2vw">Future steps</h1> 
<ul>
  <li> First of all, we would love to engage people and maybe inspire them to work on similar projects, because we believe this kind of project can actually make a difference</li>
  <li> Following that, a broader dataset would help reducing some overfitting and making a more robust model </li>
  <li> Increase the model diversity to all forms of Spanish. Data generation was quite time consuming and we needed to focus on some sources, but we must change this in the future</li>
  <li> Further training of even more model backbones and architectures </li>
  <li> Reduce the gender biases existing on some of the data for training Language Models by using the output of this model</li>
  <li> Share ideas with the community for further improvement </li>
</ul> 

<h1 style="font-size:2vw">Team Members</h1> 

<ul>
  <li> Fernando Velasco  <a href="https://huggingface.co/fermaat">(fermaat)</a></li>
  <li> Cibeles Redondo <a href="https://huggingface.co/CibelesR">(CibelesR)</a></li>
  <li> Juan Julian Cea <a href="https://huggingface.co/Juanju">(Juanju)</a></li>
  <li> Magdalena Kujalowicz <a href="https://huggingface.co/MacadellaCosta">(MacadellaCosta)</a></li>
  <li> Javier Blasco <a href="https://huggingface.co/javiblasco">(javiblasco)</a></li>
</ul> 

</p>
"""

# Short introduction passed to gr.Interface as `description`.
description = """

<p style="text-align: justify;"> 
Spanish is a beautiful language and it has many ways of referring to people, neutralizing the genders and using some of the resources inside the language. One would say <i>Todas las personas asistentes</i> instead of <i>Todos los asistentes</i> and it would end in a more inclusive way for talking about people. At the same time, this process must be coherent with the context (i.e: if we consider <i>las alumnas/los alumnos => el alumnado</i>, we would keep <i>Los alumnos Juanju y Fernando</i> instead of <i>El alumnado Juanju y Fernando</i>)

The purpose of this app is to transform Spanish gendered text into a neutral version, suitable for an unbiased environment
</p>
"""

def postproc(input_sentence, preds):
    """Clean up a generated sentence and restore capitalised words.

    Steps, in order:
      1. Contract ``de el`` / ``De el`` into ``del`` / ``Del`` and collapse
         double spaces.
      2. Upper-case the first character of the prediction, leaving the rest
         of the text untouched.
      3. Remove the space before `` . `` and `` , ``.
      4. For every word of ``input_sentence`` (except the first one) that
         starts with an upper-case letter and whose lower-cased form appears
         in the prediction, put the original casing back.
      5. Strip leading/trailing whitespace.

    Args:
        input_sentence: The text the user submitted; used only as the source
            of the original word casing.
        preds: The decoded model output to clean up.

    Returns:
        The post-processed prediction. This is best-effort: if ``preds`` is
        empty or not a string it is returned unchanged instead of raising.
    """
    # Explicit guard instead of a blanket ``except: pass``: nothing below can
    # fail once we know we are working on a non-empty string.
    if not isinstance(preds, str) or not preds:
        return preds

    preds = preds.replace('De el', 'Del').replace('de el', 'del').replace('  ', ' ')
    if preds[0].islower():
        # str.capitalize() would also lower-case the rest of the sentence and
        # destroy capitals the model already produced, so only touch char 0.
        preds = preds[0].upper() + preds[1:]
    preds = preds.replace(' . ', '. ').replace(' , ', ', ')

    # Restore capitalised words (names) taken from the original sentence.
    words = input_sentence.split(' ') if isinstance(input_sentence, str) else []
    first_word = words[0] if words else ''
    prev_letter = ''
    for word in words:
        if not word:
            # Consecutive spaces in the input produce empty tokens.
            continue
        lowered = word.lower()
        # The first word is capitalised only because it opens the sentence,
        # so it tells us nothing about proper nouns and is skipped.
        if word[0].isupper() and word != first_word and lowered in preds:
            if prev_letter == '.':
                # Word opens a new sentence in the input: only restore it
                # where it also follows a full stop in the prediction.
                preds = preds.replace('. ' + lowered + ' ', '. ' + word + ' ')
            elif word[-1] == '.':
                # The trailing full stop already delimits the token.
                preds = preds.replace(lowered, word)
            else:
                # Require a following space to avoid touching word prefixes.
                preds = preds.replace(lowered + ' ', word + ' ')
        prev_letter = word[-1]

    return preds.strip()  # drop leading/trailing whitespace


# sentences = ["El libro relata las aventuras y desventuras de un hidalgo de 50 años llamado Alonso Quijano, quien decide ser un caballero andante como aquellos que aparecen en sus libros de caballerías favoritos.Las hazañas de don Quijote están contenidas en dos tomos que narran tres salidas. Por un lado, la “Primera parte” denominada como El ingenioso Hidalgo Don Quijote de la Mancha está formada por 52 capítulos y en ella se encuentran la primera salida y la segunda salida."]
# sentences = ['De acuerdo con las informaciones anteriores , las alumnas se han quejado de la actitud de los profesores en los exámenes finales. Los representantes estudiantiles son los alumnos Juanju y Javi.']
def get_output(sentence, first_generation=True):
    """Generate the neutralised version of ``sentence``.

    The sentence (with the module-level ``prefix`` prepended) is tokenized,
    fed to ``model.generate`` without sampling, decoded, and finally passed
    through ``postproc`` together with the original sentence.

    Args:
        sentence: Input text to neutralise.
        first_generation: When true, ``generate`` is called with
            ``do_sample=False`` only. When false, ``num_beams=2``,
            ``repetition_penalty=2.5`` and ``early_stopping=True`` are
            passed as well.

    Returns:
        The post-processed decoded text of the first generated sequence.
    """
    encoded = tokenizer([prefix + sentence], return_tensors="pt", padding=True)

    # Arguments shared by both generation modes; sampling is always disabled.
    generation_kwargs = {
        "input_ids": encoded["input_ids"].to(device),
        "attention_mask": encoded["attention_mask"].to(device),
        "do_sample": False,
    }
    if not first_generation:
        # Extra settings used only for non-first generations.
        generation_kwargs["num_beams"] = 2
        generation_kwargs["repetition_penalty"] = 2.5
        generation_kwargs["early_stopping"] = True

    with torch.no_grad():
        output_sequence = model.generate(**generation_kwargs)

    decoded = tokenizer.decode(
        output_sequence[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return postproc(sentence, preds=decoded)

# Sample inputs handed to gr.Interface through its `examples` argument.
examples=['De acuerdo con las informaciones anteriores , las alumnas se han quejado de la actitud de los profesores en los exámenes finales. Los representantes estudiantiles son los alumnos Juanju y Javi.',
          'Durante su ingreso no debe tomar agua que no le sea suministrada por los especialistas en su cirujía',
          'Debido a esto ,  el premio se asignará  a los candidatos seleccionados en tres pagos']

# Gradio interface wiring get_output() to one input Textbox and one output
# Textbox, with the `description` and `article` texts defined above.
# NOTE(review): gr.inputs / gr.outputs and theme='peach' look like the legacy
# Gradio API -- confirm against the Gradio version this app is pinned to.
# NOTE(review): only one input component is declared, so get_output()
# presumably always runs with its default `first_generation` -- confirm.
iface = gr.Interface(fn=get_output, 
                     title="Spanish Text Neutralization app",
                     description=description,
                     article=article,
                     examples=examples,
                     # inputs="text",
                     inputs=gr.inputs.Textbox(label="Introduce some Spanish text here"),
                     # outputs="text",
                     theme='peach',
                     outputs=gr.outputs.Textbox(label="Neutral version of your text")
                    )

# Starts the app as soon as this module is executed.
iface.launch()