File size: 2,494 Bytes
dcc31e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265e009
dcc31e0
265e009
 
 
 
 
 
 
 
 
5827b0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265e009
 
 
 
5827b0b
 
 
 
265e009
 
 
 
 
 
dcc31e0
265e009
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from huggingface_hub import from_pretrained_fastai
import gradio as gr
# from fastai.vision.all import *
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from transformers import pipeline
from transformers import Seq2SeqTrainer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from transformers import AutoTokenizer


# repo_id = "YOUR_USERNAME/YOUR_LEARNER_NAME"
# Hub repository id of the fine-tuned Spanish -> Quechua mBART model.
repo_id = "islasher/mbart-spanishToQuechua"


# We define a function in charge of carrying out the predictions


# Load the model and the tokenizer
nombre_modelo = 'islasher/mbart-spanishToQuechua'

#tokenizer = AutoTokenizer.from_pretrained(nombre_modelo)

# NOTE(review): tokenizer and model below are loaded from the *base*
# mBART checkpoint, not from the fine-tuned repo above; the actual
# inference path uses the `pipeline(...)` created further down, so these
# appear to be training-time leftovers — confirm before removing.
model_checkpoint = "facebook/mbart-large-50"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)



# NOTE(review): duplicate import — DataCollatorForSeq2Seq is already
# imported at the top of the file.
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer) # to prepare the data (dynamic padding of batches)



import numpy as np

import evaluate

# SacreBLEU metric used by compute_metrics() below.
# NOTE(review): data_collator and the metric are only needed for
# training/evaluation; the Gradio app below never uses them.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Normalize decoded text for SacreBLEU.

    Strips surrounding whitespace from every prediction, and wraps each
    stripped label in a single-element list because SacreBLEU expects one
    list of reference strings per prediction.
    """
    stripped_preds = list(map(str.strip, preds))
    wrapped_refs = [[ref.strip()] for ref in labels]
    return stripped_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Compute SacreBLEU and mean generation length for an eval batch.

    Expects ``eval_preds`` as ``(predictions, label_ids)`` token-id arrays
    (the shape the HF ``Seq2SeqTrainer`` passes). Returns a dict with
    ``"bleu"`` (SacreBLEU score) and ``"gen_len"`` (mean count of
    non-padding prediction tokens), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    # Trainer may hand over (logits, ...) tuples; keep the first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # -100 marks positions ignored by the loss; swap in the pad id so the
    # tokenizer can decode the label sequences.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Mean generated length, excluding padding tokens.
    lengths = [np.count_nonzero(p != tokenizer.pad_token_id) for p in preds]

    scores = {"bleu": bleu["score"], "gen_len": np.mean(lengths)}
    return {name: round(value, 4) for name, value in scores.items()}



# NOTE(review): duplicate import — `pipeline` is already imported at the
# top of the file.
from transformers import pipeline
# Inference entry point: downloads the fine-tuned Spanish -> Quechua model
# from the Hub and wraps it in a text2text-generation pipeline.
neutralizer = pipeline('text2text-generation', model='islasher/mbart-spanishToQuechua')



def predict(frase):
    """Translate a Spanish sentence to Quechua and return a plain string.

    The text2text-generation pipeline returns a list of dicts shaped like
    ``[{"generated_text": "..."}]``. Passing the pipeline directly as the
    Gradio ``fn`` made the text output display that raw list; unwrapping
    the first result here gives the UI the translated string it expects.
    """
    resultado = neutralizer(frase)
    return resultado[0]["generated_text"]


# Create the interface and launch it.
gr.Interface(fn=predict, inputs="text", outputs="text").launch(share=False)