Spaces:

ThePixOne
/

open_domain_qa

Running

App Files Files Community

open_domain_qa / app.py

ThePixOne

Update app.py

041b9f4 about 3 years ago

raw

history blame

7 kB

	import gradio as gr
	import numpy as np
	import time
	import hashlib
	import torch
	from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
	from tqdm import tqdm
	import os
	device = "cuda:0" if torch.cuda.is_available() else "cpu"
	import textract
	from scipy.special import softmax
	import pandas as pd
	from datetime import datetime
	tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
	model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
	tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
	model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()
	if device == 'cuda:0':
	pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans,device = 0)
	else:
	pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans)

	def cls_pooling(model_output):
	return model_output.last_hidden_state[:,0]

	def encode_query(query):
	encoded_input = tokenizer(query, truncation=True, return_tensors='pt').to(device)

	with torch.no_grad():
	model_output = model(**encoded_input, return_dict=True)

	embeddings = cls_pooling(model_output)

	return embeddings.cpu()


	def encode_docs(docs,maxlen = 64, stride = 32):
	encoded_input = []
	embeddings = []
	spans = []
	file_names = []
	name, text = docs

	text = text.split(" ")
	if len(text) < maxlen:
	text = " ".join(text)

	encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation = True).to(device))
	spans.append(temp_text)
	file_names.append(name)

	else:
	num_iters = int(len(text)/maxlen)+1
	for i in range(num_iters):
	if i == 0:
	temp_text = " ".join(text[imaxlen:(i+1)maxlen+stride])
	else:
	temp_text = " ".join(text[(i-1)maxlen:(i)maxlen][-stride:] + text[imaxlen:(i+1)maxlen])

	encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation = True).to(device))
	spans.append(temp_text)
	file_names.append(name)

	with torch.no_grad():
	for encoded in tqdm(encoded_input):
	model_output = model(**encoded, return_dict=True)
	embeddings.append(cls_pooling(model_output))

	embeddings = np.float32(torch.stack(embeddings).transpose(0, 1).cpu())

	np.save("emb_{}.npy".format(name),dict(zip(list(range(len(embeddings))),embeddings)))
	np.save("spans_{}.npy".format(name),dict(zip(list(range(len(spans))),spans)))
	np.save("file_{}.npy".format(name),dict(zip(list(range(len(file_names))),file_names)))

	return embeddings, spans, file_names

	def predict(query,data):
	name_to_save = data.name.split("\\")[-1].split(".")[0][:-8]
	st = str([query,name_to_save])
	hist = st + " " + str(hashlib.sha256(st.encode()).hexdigest())
	now = datetime.now()
	current_time = now.strftime("%H:%M:%S")
	try:
	df = pd.read_csv("{}.csv".format(hash(st)))
	return df
	except Exception as e:
	print(e)
	print(st)

	if name_to_save+".txt" in os.listdir():
	doc_emb = np.load('emb_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
	doc_text = np.load('spans_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
	file_names_dicto = np.load('file_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()

	doc_emb = np.array(list(doc_emb.values())).reshape(-1,768)
	doc_text = list(doc_text.values())
	file_names = list(file_names_dicto.values())

	else:
	text = textract.process("{}".format(data.name)).decode('utf8')
	text = text.replace("\r", " ")
	text = text.replace("\n", " ")
	text = text.replace(" . "," ")

	doc_emb, doc_text, file_names = encode_docs((name_to_save,text),maxlen = 64, stride = 32)

	doc_emb = doc_emb.reshape(-1, 768)
	with open("{}.txt".format(name_to_save),"w",encoding="utf-8") as f:
	f.write(text)
	start = time.time()
	query_emb = encode_query(query)

	scores = np.matmul(query_emb, doc_emb.transpose(1,0))[0].tolist()
	doc_score_pairs = list(zip(doc_text, scores, file_names))
	doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
	k = 5
	probs_sum = 0
	probs = softmax(sorted(scores,reverse = True)[:k])
	table = {"Passage":[],"Answer":[],"Probabilities":[],"Source":[]}

	for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
	passage = passage.replace("\n","")
	passage = passage.replace(" . "," ")

	if probs[i] > 0.1 or (i < 3 and probs[i] > 0.05): #generate answers for more likely passages but no less than 2
	QA = {'question':query,'context':passage}
	ans = pipe(QA)
	probabilities = "P(a\|p): {}, P(a\|p,q): {}, P(p\|q): {}".format(round(ans["score"],5),
	round(ans["score"]*probs[i],5),
	round(probs[i],5))
	passage = passage.replace(str(ans["answer"]),str(ans["answer"]).upper())
	table["Passage"].append(passage)
	table["Passage"].append("---")
	table["Answer"].append(str(ans["answer"]).upper())
	table["Answer"].append("---")
	table["Probabilities"].append(probabilities)
	table["Probabilities"].append("---")
	table["Source"].append(names)
	table["Source"].append("---")
	else:
	table["Passage"].append(passage)
	table["Passage"].append("---")
	table["Answer"].append("no_answer_calculated")
	table["Answer"].append("---")
	table["Probabilities"].append("P(p\|q): {}".format(round(probs[i],5)))
	table["Probabilities"].append("---")
	table["Source"].append(names)
	table["Source"].append("---")
	df = pd.DataFrame(table)
	print("time: "+ str(time.time()-start))

	with open("HISTORY.txt","a", encoding = "utf-8") as f:
	f.write(hist)
	f.write(" " + str(current_time))
	f.write("\n")
	f.close()
	df.to_csv("{}.csv".format(hash(st)), index=False)

	return df

	iface = gr.Interface(examples = [
	["How high is the highest mountain?","China.pdf"],
	["Where does UK prime minister live?","London.pdf"]
	],

	fn =predict,
	inputs = [gr.inputs.Textbox(default="What is Open-domain question answering?"),
	gr.inputs.File(),
	],
	outputs = [
	gr.outputs.Dataframe(),
	],

	allow_flagging ="manual",flagging_options = ["correct","wrong"],
	allow_screenshot=False)

	iface.launch(share = True,enable_queue=True, show_error =True)