Spaces:
Runtime error
Update app.py
Browse files
app.py
CHANGED
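In substance, this commit wraps the existing English question-answering pipeline in Spanish-to-English machine translation: two Helsinki-NLP MarianMT checkpoints (opus-mt-es-en and opus-mt-en-es) are loaded at startup, the Spanish query and the document text are translated to English before encoding, and the answer, context, and probability fields are translated back to Spanish before display. The examples line is also replaced with a `gr.Interface.load(...)` call (see the note after the last hunk).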
```diff
@@ -22,6 +22,22 @@ nltk.download('punkt')
 
 docs = None
 
+# Define the models:
+# Translation
+mname = "Helsinki-NLP/opus-mt-es-en"
+tokenizer_es_en = MarianTokenizer.from_pretrained(mname)
+model_es_en = MarianMTModel.from_pretrained(mname)
+model_es_en.to(device)
+
+mname = "Helsinki-NLP/opus-mt-en-es"
+tokenizer_en_es = MarianTokenizer.from_pretrained(mname)
+model_en_es = MarianMTModel.from_pretrained(mname)
+model_en_es.to(device)
+
+lt = LineTokenizer()
+
+# Question answering
+
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
 model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
 tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
```
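The MarianMT pair loaded above can be exercised on its own. A minimal sketch, assuming `torch` and `transformers` are installed, using the same checkpoint name as the hunk above:

```python
import torch
from transformers import MarianMTModel, MarianTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Spanish -> English model, as loaded in the commit above.
mname = "Helsinki-NLP/opus-mt-es-en"
tokenizer_es_en = MarianTokenizer.from_pretrained(mname)
model_es_en = MarianMTModel.from_pretrained(mname).to(device).eval()

# Tokenize a batch of Spanish sentences and generate the translation.
batch = tokenizer_es_en(["¿Cuándo suelen comenzar las adicciones?"],
                        return_tensors="pt", padding=True).to(device)
with torch.no_grad():
    out = model_es_en.generate(**batch)
print(tokenizer_es_en.decode(out[0], skip_special_tokens=True))
# e.g. "When do addictions usually begin?"
```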
```diff
@@ -41,6 +57,35 @@ def validate_dataset(dataset):
     else:
         return "⚠️Esperando documentos..."
 
+def traducir_parrafos(parrafos, tokenizer, model, tam_bloque=8):
+    parrafos_traducidos = []
+    for parrafo in parrafos:
+        frases = sent_tokenize(parrafo)
+        batches = math.ceil(len(frases) / tam_bloque)
+        traducido = []
+        for i in range(batches):
+
+            bloque_enviado = frases[i*tam_bloque:(i+1)*tam_bloque]
+            model_inputs = tokenizer(bloque_enviado, return_tensors="pt",
+                                     padding=True, truncation=True,
+                                     max_length=500).to(device)
+            with torch.no_grad():
+                bloque_traducido = model.generate(**model_inputs)
+            traducido += bloque_traducido
+        traducido = [tokenizer.decode(t, skip_special_tokens=True) for t in traducido]
+        parrafos_traducidos += [" ".join(traducido)]
+    return parrafos_traducidos
+
+def traducir_es_en(texto):
+    parrafos = lt.tokenize(texto)
+    par_tra = traducir_parrafos(parrafos, tokenizer_es_en, model_es_en)
+    return "\n".join(par_tra)
+
+def traducir_en_es(texto):
+    parrafos = lt.tokenize(texto)
+    par_tra = traducir_parrafos(parrafos, tokenizer_en_es, model_en_es)
+    return "\n".join(par_tra)
+
 def request_pathname(files):
     if files is None:
         return [[]]
```
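`traducir_parrafos` splits each paragraph into sentences (`sent_tokenize` and `LineTokenizer` come from NLTK, matching the `nltk.download('punkt')` call visible in the first hunk header), runs them through `model.generate` in batches of `tam_bloque`, and rejoins the decoded sentences. Assuming the objects defined in the hunks above are in scope, a hypothetical usage is one call per direction:

```python
texto = "Primer párrafo.\nSegundo párrafo con dos frases. Aquí la segunda."
print(traducir_es_en(texto))  # one translated line per input paragraph
print(traducir_en_es("And back to Spanish again."))
```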
```diff
@@ -101,8 +146,9 @@ def encode_docs(docs,maxlen = 64, stride = 32):
     return embeddings, spans, file_names
 
 def predict(query,data):
+    query = traducir_es_en(query)
     name_to_save = data.name.split("/")[-1].split(".")[0][:-8]
-    k=
+    k=2
     st = str([query,name_to_save])
     st_hashed = str(hashlib.sha256(st.encode()).hexdigest()) #just to speed up examples load
     hist = st + " " + st_hashed
```
```diff
@@ -114,7 +160,9 @@ def predict(query,data):
     list_outputs = []
     for i in range(k):
         temp = [df.iloc[n] for n in range(k)][i]
-        tupla = (temp.Respuesta,
+        tupla = (traducir_en_es(temp.Respuesta),
+                 traducir_en_es(temp.Contexto),
+                 traducir_en_es(temp.Probabilidades))
         # text = ''
         # text += 'Probabilidades: '+ temp.Probabilidades + '\n\n'
         # text += 'Respuesta: ' +temp.Respuesta + '\n\n'
```
```diff
@@ -139,6 +187,8 @@ def predict(query,data):
     text = text.replace("\r", " ")
     text = text.replace("\n", " ")
     text = text.replace(" . "," ")
+
+    text = traducir_es_en(text)
 
     doc_emb, doc_text, file_names = encode_docs((name_to_save,text),maxlen = 64, stride = 32)
 
```
```diff
@@ -194,7 +244,9 @@ def predict(query,data):
     list_outputs = []
     for i in range(k):
         temp = [df.iloc[n] for n in range(k)][i]
-        tupla = (temp.Respuesta,
+        tupla = (traducir_en_es(temp.Respuesta),
+                 traducir_en_es(temp.Contexto),
+                 traducir_en_es(temp.Probabilidades))
         # text = ''
         # text += 'Probabilidades: '+ temp.Probabilidades + '\n\n'
         # text += 'Respuesta: ' +temp.Respuesta + '\n\n'
```
```diff
@@ -248,7 +300,7 @@ with gr.Blocks() as demo:
     ask.click(fn=predict, inputs=[query,
                                   file], outputs=[answer, context, prob])
 
-    examples = ["¿Cuándo suelen comenzar las adicciones?","Entrevista Miguel Ruiz.txt"]
+    gr.Interface.load(examples = ["¿Cuándo suelen comenzar las adicciones?","Entrevista Miguel Ruiz.txt"])
 
 demo.queue(concurrency_count=20)
 demo.launch(show_error=True)
```
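A caveat on the last hunk: in Gradio 3.x, `gr.Interface.load` loads a model or Space from the Hugging Face Hub and does not accept an `examples` keyword, so this call is a plausible cause of the Space's "Runtime error" badge. Inside a `gr.Blocks` context, examples are normally attached with `gr.Examples`; a hedged sketch, reusing the `query` and `file` components wired into `ask.click` above:

```python
# Sketch only: assumes Gradio 3.x and that `query` and `file` are the
# input components defined earlier in the gr.Blocks context.
gr.Examples(
    examples=[["¿Cuándo suelen comenzar las adicciones?",
               "Entrevista Miguel Ruiz.txt"]],
    inputs=[query, file],
)
```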