paascorb committed on
Commit 56efa96
1 Parent(s): 035481a

Update app.py

Files changed (1)
  1. app.py +336 -107
app.py CHANGED
@@ -1,123 +1,192 @@
  import gradio as gr
  from pathlib import Path
  import os
-
- os.system('pip install tensorflow')
- os.system('pip install nltk')
-
- from transformers import pipeline
  from transformers import MarianMTModel, MarianTokenizer
  from nltk.tokenize import sent_tokenize
  from nltk.tokenize import LineTokenizer
  import math
  import torch
  import nltk
  nltk.download('punkt')

  docs = None

- if torch.cuda.is_available():
-     dev = "cuda"
- else:
-     dev = "cpu"
- device = torch.device(dev)

- # Define the models:
- mname = "Helsinki-NLP/opus-mt-es-en"
- tokenizer_es_en = MarianTokenizer.from_pretrained(mname)
- model_es_en = MarianMTModel.from_pretrained(mname)
- model_es_en.to(device)
-
- mname = "Helsinki-NLP/opus-mt-en-es"
- tokenizer_en_es = MarianTokenizer.from_pretrained(mname)
- model_en_es = MarianMTModel.from_pretrained(mname)
- model_en_es.to(device)
-
- lt = LineTokenizer()
-
- question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')
-
- def request_pathname(files):
-     if files is None:
-         return [[]]
-     return [[file.name, file.name.split('/')[-1]] for file in files]
-
- def traducir_parrafos(parrafos, tokenizer, model, tam_bloque=8):
-     parrafos_traducidos = []
-     for parrafo in parrafos:
-         frases = sent_tokenize(parrafo)
-         batches = math.ceil(len(frases) / tam_bloque)
-         traducido = []
-         for i in range(batches):
-
-             bloque_enviado = frases[i*tam_bloque:(i+1)*tam_bloque]
-             model_inputs = tokenizer(bloque_enviado, return_tensors="pt",
-                                      padding=True, truncation=True,
-                                      max_length=500).to(device)
-             with torch.no_grad():
-                 bloque_traducido = model.generate(**model_inputs)
-             traducido += bloque_traducido
-         traducido = [tokenizer.decode(t, skip_special_tokens=True) for t in traducido]
-         parrafos_traducidos += [" ".join(traducido)]
-     return parrafos_traducidos
-
- def traducir_es_en(texto):
-     parrafos = lt.tokenize(texto)
-     par_tra = traducir_parrafos(parrafos, tokenizer_es_en, model_es_en)
-     return "\n".join(par_tra)
-
- def traducir_en_es(texto):
-     parrafos = lt.tokenize(texto)
-     par_tra = traducir_parrafos(parrafos, tokenizer_en_es, model_en_es)
-     return "\n".join(par_tra)
-
- def validate_dataset(dataset):
-     global docs
-     docs = None  # clear it out if dataset is modified
-     docs_ready = dataset.iloc[-1, 0] != ""
-     if docs_ready:
-         return "✨Listo✨"
      else:
-         return "⚠️Esperando documentos..."
-
- def do_ask(question, button, dataset):
-     global docs
-     docs_ready = dataset.iloc[-1, 0] != ""
-     if button == "✨Listo✨" and docs_ready:
-         for _, row in dataset.iterrows():
-             path = row['filepath']
-             text = Path(f'{path}').read_text()
-             text_en = traducir_es_en(text)
-             QA_input = {
-                 'question': traducir_es_en(question),
-                 'context': text_en
-             }
-         return traducir_en_es(question_answerer(QA_input)['answer'])
-     else:
-         return ""
-
- # def do_ask(question, button, dataset, progress=gr.Progress()):
- #     global docs
- #     docs_ready = dataset.iloc[-1, 0] != ""
- #     if button == "✨Listo✨" and docs_ready:
- #         if docs is None:  # don't want to rebuild index if it's already built
- #             import paperqa
- #             docs = paperqa.Docs()
- #         # dataset is pandas dataframe
- #         for _, row in dataset.iterrows():
- #             key = None
- #             if ',' not in row['citation string']:
- #                 key = row['citation string']
- #             docs.add(row['filepath'], row['citation string'], key=key)
- #     else:
- #         return ""
- #     progress(0, "Construyendo índices...")
- #     docs._build_faiss_index()
- #     progress(0.25, "Encolando...")
- #     result = docs.query(question)
- #     progress(1.0, "¡Hecho!")
- #     return result.formatted_answer, result.context


  with gr.Blocks() as demo:
      gr.Markdown("""
@@ -160,8 +229,168 @@ with gr.Blocks() as demo:
      context = gr.Markdown(label="Contexto")
      # ask.click(fn=do_ask, inputs=[query, buildb,
      #           dataset], outputs=[answer, context])
-     ask.click(fn=do_ask, inputs=[query, buildb,
-               dataset], outputs=[answer])

  demo.queue(concurrency_count=20)
- demo.launch(show_error=True)

  import gradio as gr
  from pathlib import Path
  import os
+ from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
  from transformers import MarianMTModel, MarianTokenizer
  from nltk.tokenize import sent_tokenize
  from nltk.tokenize import LineTokenizer
  import math
  import torch
  import nltk
+ import numpy as np
+ import time
+ import hashlib
+ from tqdm import tqdm
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ import textract
+ from scipy.special import softmax
+ import pandas as pd
+ from datetime import datetime
  nltk.download('punkt')

  docs = None

+ tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
+ model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
+ tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
+ model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()
+
+ if device == 'cuda:0':
+     pipe = pipeline("question-answering", model_ans, tokenizer=tokenizer_ans, device=0)
+ else:
+     pipe = pipeline("question-answering", model_ans, tokenizer=tokenizer_ans)
+
+ def cls_pooling(model_output):
+     return model_output.last_hidden_state[:, 0]
+
+ def encode_query(query):
+     encoded_input = tokenizer(query, truncation=True, return_tensors='pt').to(device)
+
+     with torch.no_grad():
+         model_output = model(**encoded_input, return_dict=True)
+
+     embeddings = cls_pooling(model_output)
+
+     return embeddings.cpu()
+
+
+ def encode_docs(docs, maxlen=64, stride=32):
+     encoded_input = []
+     embeddings = []
+     spans = []
+     file_names = []
+     name, text = docs
+
+     text = text.split(" ")
+     if len(text) < maxlen:  # short documents fit into a single span
+         temp_text = " ".join(text)
+
+         encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation=True).to(device))
+         spans.append(temp_text)
+         file_names.append(name)

      else:
+         num_iters = int(len(text)/maxlen) + 1  # split longer documents into overlapping spans of maxlen words
+         for i in range(num_iters):
+             if i == 0:
+                 temp_text = " ".join(text[i*maxlen:(i+1)*maxlen+stride])
+             else:
+                 temp_text = " ".join(text[(i-1)*maxlen:(i)*maxlen][-stride:] + text[i*maxlen:(i+1)*maxlen])
+
+             encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation=True).to(device))
+             spans.append(temp_text)
+             file_names.append(name)
+
+     with torch.no_grad():
+         for encoded in tqdm(encoded_input):
+             model_output = model(**encoded, return_dict=True)
+             embeddings.append(cls_pooling(model_output))
+
+     embeddings = np.float32(torch.stack(embeddings).transpose(0, 1).cpu())
+
+     np.save("emb_{}.npy".format(name), dict(zip(list(range(len(embeddings))), embeddings)))
+     np.save("spans_{}.npy".format(name), dict(zip(list(range(len(spans))), spans)))
+     np.save("file_{}.npy".format(name), dict(zip(list(range(len(file_names))), file_names)))
+
+     return embeddings, spans, file_names
+
+ def predict(query, data):
+     name_to_save = data.name.split("/")[-1].split(".")[0][:-8]
+     k = 20
+     st = str([query, name_to_save])
+     st_hashed = str(hashlib.sha256(st.encode()).hexdigest())  # just to speed up examples load
+     hist = st + " " + st_hashed
+     now = datetime.now()
+     current_time = now.strftime("%H:%M:%S")
+
+     try:  # if the same question was already asked for this document, load the cached answers
+         df = pd.read_csv("{}.csv".format(hash(st)))
+         list_outputs = []
+         for i in range(k):
+             temp = [df.iloc[n] for n in range(k)][i]
+             text = ''
+             text += 'Probabilidades: ' + temp.Probabilidades + '\n\n'
+             text += 'Respuesta: ' + temp.Respuesta + '\n\n'
+             text += 'Contexto: ' + temp.Contexto + '\n\n'
+             list_outputs.append(text)
+         return list_outputs
+     except Exception as e:
+         print(e)
+         print(st)

+     if name_to_save+".txt" in os.listdir():  # if the document was already used, load its embeddings
+         doc_emb = np.load('emb_{}.npy'.format(name_to_save), allow_pickle='TRUE').item()
+         doc_text = np.load('spans_{}.npy'.format(name_to_save), allow_pickle='TRUE').item()
+         file_names_dicto = np.load('file_{}.npy'.format(name_to_save), allow_pickle='TRUE').item()
+
+         doc_emb = np.array(list(doc_emb.values())).reshape(-1, 768)
+         doc_text = list(doc_text.values())
+         file_names = list(file_names_dicto.values())
+
+     else:
+         text = textract.process("{}".format(data.name)).decode('utf8')
+         text = text.replace("\r", " ")
+         text = text.replace("\n", " ")
+         text = text.replace(" . ", " ")
+
+         doc_emb, doc_text, file_names = encode_docs((name_to_save, text), maxlen=64, stride=32)
+
+         doc_emb = doc_emb.reshape(-1, 768)
+         with open("{}.txt".format(name_to_save), "w", encoding="utf-8") as f:
+             f.write(text)
+
+     # once embeddings are calculated, run MIPS
+     start = time.time()
+     query_emb = encode_query(query)
+
+     scores = np.matmul(query_emb, doc_emb.transpose(1, 0))[0].tolist()
+     doc_score_pairs = list(zip(doc_text, scores, file_names))
+     doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
+
+     probs_sum = 0
+     probs = softmax(sorted(scores, reverse=True)[:k])
+     table = {"Contexto": [], "Respuesta": [], "Probabilidades": []}
+
+
+     # get answers for each pair of question (from user) and top best passages
+     for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
+         passage = passage.replace("\n", "")
+         # passage = passage.replace(" . ", " ")
+
+         if probs[i] > 0.1 or (i < 3 and probs[i] > 0.05):  # generate answers for the more likely passages, but at least for the top few
+             QA = {'question': query, 'context': passage}
+             ans = pipe(QA)
+             probabilities = "P(a|p): {}, P(a|p,q): {}, P(p|q): {}".format(round(ans["score"], 5),
+                                                                           round(ans["score"]*probs[i], 5),
+                                                                           round(probs[i], 5))
+             table["Contexto"].append(passage)
+             table["Respuesta"].append(str(ans["answer"]).upper())
+             table["Probabilidades"].append(probabilities)
+         else:
+             table["Contexto"].append(passage)
+             table["Respuesta"].append("no_answer_calculated")
+             table["Probabilidades"].append("P(p|q): {}".format(round(probs[i], 5)))
+
+
+     # format answers for a nice output and save them for the future (in case the same question is asked again on the same pdf)
+     df = pd.DataFrame(table)
+     print(df)
+     print("time: " + str(time.time()-start))
+
+     with open("HISTORY.txt", "a", encoding="utf-8") as f:
+         f.write(hist)
+         f.write(" " + str(current_time))
+         f.write("\n")
+         f.close()
+     df.to_csv("{}.csv".format(hash(st)), index=False)
+
+     list_outputs = []
+     for i in range(k):
+         text = ''
+         temp = [df.iloc[n] for n in range(k)][i]
+         text += 'Probabilidades: ' + temp.Probabilidades + '\n\n'
+         text += 'Respuesta: ' + temp.Respuesta + '\n\n'
+         text += 'Contexto: ' + temp.Contexto + '\n\n'
+
+         list_outputs.append(text)
+
+     return list_outputs

  with gr.Blocks() as demo:
      gr.Markdown("""

      context = gr.Markdown(label="Contexto")
      # ask.click(fn=do_ask, inputs=[query, buildb,
      #           dataset], outputs=[answer, context])
+     ask.click(fn=predict, inputs=[query,
+               gr.inputs.File()], outputs=[answer])

  demo.queue(concurrency_count=20)
+ demo.launch(show_error=True)
+
+ # iface = gr.Interface(fn=predict,
+ #                      inputs=[gr.inputs.Textbox(default="What is Open-domain question answering?"),
+ #                              gr.inputs.File(),
+ #                              ],
+ #                      outputs=[
+ #                               gr.outputs.Carousel(['text']),
+ #                               ],
+ #                      description=description,
+ #                      title=title,
+ #                      allow_flagging="manual", flagging_options=["correct", "wrong"],
+ #                      allow_screenshot=False)
+
+ # iface.launch(enable_queue=True, show_error=True)
+
+ # Define the models:
+ # Translation
+ # mname = "Helsinki-NLP/opus-mt-es-en"
+ # tokenizer_es_en = MarianTokenizer.from_pretrained(mname)
+ # model_es_en = MarianMTModel.from_pretrained(mname)
+ # model_es_en.to(device)
+
+ # mname = "Helsinki-NLP/opus-mt-en-es"
+ # tokenizer_en_es = MarianTokenizer.from_pretrained(mname)
+ # model_en_es = MarianMTModel.from_pretrained(mname)
+ # model_en_es.to(device)
+
+ # lt = LineTokenizer()
+
+ # Question answering
+ # question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')
+
+ # def request_pathname(files):
+ #     if files is None:
+ #         return [[]]
+ #     return [[file.name, file.name.split('/')[-1]] for file in files]
+
+ # def traducir_parrafos(parrafos, tokenizer, model, tam_bloque=8):
+ #     parrafos_traducidos = []
+ #     for parrafo in parrafos:
+ #         frases = sent_tokenize(parrafo)
+ #         batches = math.ceil(len(frases) / tam_bloque)
+ #         traducido = []
+ #         for i in range(batches):
+
+ #             bloque_enviado = frases[i*tam_bloque:(i+1)*tam_bloque]
+ #             model_inputs = tokenizer(bloque_enviado, return_tensors="pt",
+ #                                      padding=True, truncation=True,
+ #                                      max_length=500).to(device)
+ #             with torch.no_grad():
+ #                 bloque_traducido = model.generate(**model_inputs)
+ #             traducido += bloque_traducido
+ #         traducido = [tokenizer.decode(t, skip_special_tokens=True) for t in traducido]
+ #         parrafos_traducidos += [" ".join(traducido)]
+ #     return parrafos_traducidos
+
+ # def traducir_es_en(texto):
+ #     parrafos = lt.tokenize(texto)
+ #     par_tra = traducir_parrafos(parrafos, tokenizer_es_en, model_es_en)
+ #     return "\n".join(par_tra)
+
+ # def traducir_en_es(texto):
+ #     parrafos = lt.tokenize(texto)
+ #     par_tra = traducir_parrafos(parrafos, tokenizer_en_es, model_en_es)
+ #     return "\n".join(par_tra)
+
+ # def validate_dataset(dataset):
+ #     global docs
+ #     docs = None  # clear it out if dataset is modified
+ #     docs_ready = dataset.iloc[-1, 0] != ""
+ #     if docs_ready:
+ #         return "✨Listo✨"
+ #     else:
+ #         return "⚠️Esperando documentos..."
+
+ # def do_ask(question, button, dataset):
+ #     global docs
+ #     docs_ready = dataset.iloc[-1, 0] != ""
+ #     if button == "✨Listo✨" and docs_ready:
+ #         for _, row in dataset.iterrows():
+ #             path = row['filepath']
+ #             text = Path(f'{path}').read_text()
+ #             text_en = traducir_es_en(text)
+ #             QA_input = {
+ #                 'question': traducir_es_en(question),
+ #                 'context': text_en
+ #             }
+ #         return traducir_en_es(question_answerer(QA_input)['answer'])
+ #     else:
+ #         return ""
+
+ # # def do_ask(question, button, dataset, progress=gr.Progress()):
+ # #     global docs
+ # #     docs_ready = dataset.iloc[-1, 0] != ""
+ # #     if button == "✨Listo✨" and docs_ready:
+ # #         if docs is None:  # don't want to rebuild index if it's already built
+ # #             import paperqa
+ # #             docs = paperqa.Docs()
+ # #         # dataset is pandas dataframe
+ # #         for _, row in dataset.iterrows():
+ # #             key = None
+ # #             if ',' not in row['citation string']:
+ # #                 key = row['citation string']
+ # #             docs.add(row['filepath'], row['citation string'], key=key)
+ # #     else:
+ # #         return ""
+ # #     progress(0, "Construyendo índices...")
+ # #     docs._build_faiss_index()
+ # #     progress(0.25, "Encolando...")
+ # #     result = docs.query(question)
+ # #     progress(1.0, "¡Hecho!")
+ # #     return result.formatted_answer, result.context
+
+
+ # with gr.Blocks() as demo:
+ #     gr.Markdown("""
+ #     # Document Question and Answer adaptado al castellano por Pablo Ascorbe.
+
+ #     Este espacio ha sido clonado y adaptado de: https://huggingface.co/spaces/whitead/paper-qa
+
+ #     La idea es utilizar un modelo preentrenado de HuggingFace como "distilbert-base-cased-distilled-squad"
+ #     y responder las preguntas en inglés, para ello, será necesario hacer primero una traducción de los textos en castellano
+ #     a inglés y luego volver a traducir en sentido contrario.
+
+ #     ## Instrucciones:
+
+ #     Adjunte su documento, ya sea en formato .txt o .pdf, y pregunte lo que desee.
+
+ #     """)
+ #     uploaded_files = gr.File(
+ #         label="Sus documentos subidos (PDF o txt)", file_count="multiple", )
+ #     dataset = gr.Dataframe(
+ #         headers=["filepath", "citation string"],
+ #         datatype=["str", "str"],
+ #         col_count=(2, "fixed"),
+ #         interactive=True,
+ #         label="Documentos y citas"
+ #     )
+ #     buildb = gr.Textbox("⚠️Esperando documentos...",
+ #                         label="Estado", interactive=False, show_label=True)
+ #     dataset.change(validate_dataset, inputs=[
+ #                    dataset], outputs=[buildb])
+ #     uploaded_files.change(request_pathname, inputs=[
+ #                           uploaded_files], outputs=[dataset])
+ #     query = gr.Textbox(
+ #         placeholder="Introduzca su pregunta aquí...", label="Pregunta")
+ #     ask = gr.Button("Preguntar")
+ #     gr.Markdown("## Respuesta")
+ #     answer = gr.Markdown(label="Respuesta")
+ #     with gr.Accordion("Contexto", open=False):
+ #         gr.Markdown(
+ #             "### Contexto\n\nEl siguiente contexto ha sido utilizado para generar la respuesta:")
+ #         context = gr.Markdown(label="Contexto")
+ #     # ask.click(fn=do_ask, inputs=[query, buildb,
+ #     #           dataset], outputs=[answer, context])
+ #     ask.click(fn=do_ask, inputs=[query, buildb,
+ #               dataset], outputs=[answer])
+
+ # demo.queue(concurrency_count=20)
+ # demo.launch(show_error=True)