from transformers import AutoTokenizer, AutoModelForSeq2SeqLM import gradio as gr import torch from datasets import load_dataset first_generation = True prefix = '' device = 'cuda' if torch.cuda.is_available() else 'cpu' model_checkpoint = "hackathon-pln-es/es_text_neutralizer" data_checkpoint = "hackathon-pln-es/neutral-es" tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) dataset = load_dataset(data_checkpoint, split='test') model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) # TODO: jarl!! check this for avoiding short segments model.config.max_length = 512 model.to(device) article = """

Given any input, our model will generate a gender neutral sentence, correcting any non-inclusive expressions or words. It's a straightforward and fast solution that creates a positive impact in the contemporary social panorama.

One of the toughest challenges when building the app and the model was finding proper data for training the model. Therefore, the team opted to dedicate a considerable amount of time to building the dataset from scratch. These data have been obtained from a series of guidelines and manuals issued by the Spanish Ministry of Health, Social Services and Equality on the use of non-sexist language, as stipulated in this linked [document](https://www.inmujeres.gob.es/servRecursos/formacion/GuiasLengNoSexista/docs/Guiaslenguajenosexista_.pdf). You can take a look at some details of the process in the linked dataset.

Future steps

Team Members

""" description = """

Spanish is a beautiful language and it has many ways of referring to people, neutralizing the genders and using some of the resources inside the language. One would say "Todas las personas asistentes" instead of "Todos los asistentes", which is a more inclusive way of talking about people. At the same time, this process must be coherent with the context (e.g., if we consider "las alumnas/los alumnos" => "el alumnado", we would still keep "Los alumnos Juanju y Fernando" instead of "El alumnado Juanju y Fernando"). The purpose of this app is to transform Spanish gendered text into a neutral version, suitable for an unbiased environment.

""" def postproc(input_sentence, preds): try: preds = preds.replace('De el', 'Del').replace('de el', 'del').replace(' ', ' ') if preds[0].islower(): preds = preds.capitalize() preds = preds.replace(' . ', '. ').replace(' , ', ', ') # Nombres en mayusculas prev_letter = '' for word in input_sentence.split(' '): if word: if word[0].isupper(): if word.lower() in preds and word != input_sentence.split(' ')[0]: if prev_letter == '.': preds = preds.replace('. ' + word.lower() + ' ', '. ' + word + ' ') else: if word[-1] == '.': preds = preds.replace(word.lower(), word) else: preds = preds.replace(word.lower() + ' ', word + ' ') prev_letter = word[-1] preds = preds.strip() # quitar ultimo espacio except: pass return preds # sentences = ["El libro relata las aventuras y desventuras de un hidalgo de 50 años llamado Alonso Quijano, quien decide ser un caballero andante como aquellos que aparecen en sus libros de caballerías favoritos.Las hazañas de don Quijote están contenidas en dos tomos que narran tres salidas. Por un lado, la “Primera parte” denominada como El ingenioso Hidalgo Don Quijote de la Mancha está formada por 52 capítulos y en ella se encuentran la primera salida y la segunda salida."] # sentences = ['De acuerdo con las informaciones anteriores , las alumnas se han quejado de la actitud de los profesores en los exámenes finales. 
# Los representantes estudiantiles son los alumnos Juanju y Javi.']


def get_output(sentence, first_generation=True):
    """Neutralize one sentence with the seq2seq model and post-process it.

    Args:
        sentence: Spanish text to neutralize; the module-level ``prefix`` is
            prepended before tokenization.
        first_generation: When true, generate with the model's default
            settings and sampling disabled. Otherwise also pass
            ``num_beams=2``, ``repetition_penalty=2.5`` and
            ``early_stopping=True``.

    Returns:
        The first decoded sequence after it has gone through ``postproc``.
    """
    encoded = tokenizer([prefix + sentence], return_tensors="pt", padding=True)

    with torch.no_grad():
        generation_kwargs = {
            "input_ids": encoded["input_ids"].to(device),
            "attention_mask": encoded["attention_mask"].to(device),
            "do_sample": False,  # disable sampling to test if batching affects output
        }
        if not first_generation:
            generation_kwargs.update(
                num_beams=2,
                repetition_penalty=2.5,
                # length_penalty=1.0,
                early_stopping=True,
            )
        generated = model.generate(**generation_kwargs)

    decoded = tokenizer.decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return postproc(sentence, preds=decoded)


# Sample inputs offered by the interface.
examples = [
    'De acuerdo con las informaciones anteriores , las alumnas se han quejado de la actitud de los profesores en los exámenes finales. Los representantes estudiantiles son los alumnos Juanju y Javi.',
    'Durante su ingreso no debe tomar agua que no le sea suministrada por los especialistas en su cirujía',
    'Debido a esto , el premio se asignará a los candidatos seleccionados en tres pagos',
]

input_box = gr.inputs.Textbox(label="Introduce some Spanish text here")  # instead of inputs="text"
output_box = gr.outputs.Textbox(label="Neutral version of your text")  # instead of outputs="text"

iface = gr.Interface(
    fn=get_output,
    title="Spanish Text Neutralization app",
    description=description,
    article=article,
    examples=examples,
    inputs=input_box,
    theme='peach',
    outputs=output_box,
)
iface.launch()