open_domain_qa

Paused

App Files Files Community

LectureExchange

ThePixOne committed on Nov 17, 2022

Commit

81aaa5b

0 Parent(s):

Duplicate from ThePixOne/open_domain_qa

Browse files

Co-authored-by: Piotr Antoniak <ThePixOne@users.noreply.huggingface.co>

Files changed (8) hide show

.gitattributes +27 -0
China.pdf +0 -0
HISTORY.txt +0 -0
London.pdf +0 -0
README.md +47 -0
README.txt +1 -0
app.py +212 -0
requirements.txt +6 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,27 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

China.pdf ADDED Viewed

Binary file (256 kB). View file

HISTORY.txt ADDED Viewed

File without changes

London.pdf ADDED Viewed

Binary file (212 kB). View file

README.md ADDED Viewed

	@@ -0,0 +1,47 @@

+---
+title: Question Answering from PDFs
+emoji: 📈
+colorFrom: green
+colorTo: gray
+sdk: gradio
+app_file: app.py
+pinned: false
+license: wtfpl
+duplicated_from: ThePixOne/open_domain_qa
+---
+# Configuration
+`title`: _string_
+Display title for the Space
+`emoji`: _string_
+Space emoji (emoji-only character allowed)
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`sdk`: _string_
+Can be either `gradio`, `streamlit`, or `static`
+`sdk_version` : _string_
+Only applicable for `streamlit` SDK.
+See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
+Path is relative to the root of the repository.
+`models`: _List[string]_
+HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
+Will be parsed automatically from your code if not specified here.
+`datasets`: _List[string]_
+HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
+Will be parsed automatically from your code if not specified here.
+`pinned`: _boolean_
+Whether the Space stays on top of your list.

README.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Do you have a long document and a bunch of questions that can be answered given the data in this file? Fear not, because the following demo can do it for you. Upload your PDF, ask a question, and wait for the magic to happen.

app.py ADDED Viewed

	@@ -0,0 +1,212 @@

+"""
+Gradio requires input to be fed in a very peculiar way and does not provide too much flexibility - don't expect from this demo too much. The backbone had to be adjusted to work on hugging face spaces. Go see https://github.com/PiotrAntoniak/QuestionAnswering for a prettier version utilizing streamlit.
+"""
+import gradio as gr
+# User-facing text for the web page; both values are handed to gr.Interface
+# further down as its `description` and `title` arguments.
+# NOTE(review): the disclaimer reads "I do no have idea" - likely a typo for
+# "I have no idea". Left untouched here because it is runtime text.
+description = """Do you have a long document and a bunch of questions that can be answered given the data in this file?
+Fear not for this demo is for you.
+Upload your pdf, ask your questions and wait for the magic to happen.
+DISCLAIMER: I do no have idea what happens to the pdfs that you upload and who has access to them so make sure there is nothing confidential there.
+"""
+title = "QA answering from a pdf."
+import numpy as np
+import time
+import hashlib
+import torch
+from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
+from tqdm import tqdm
+import os
+# Target device string for the models: "cuda:0" when torch reports CUDA is available, otherwise "cpu".
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+import textract
+from scipy.special import softmax
+import pandas as pd
+from datetime import datetime
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
+model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
+tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
+model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2").to(device).eval()
+if device == 'cuda:0':
+    pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans,device = 0)
+else:
+    pipe = pipeline("question-answering",model_ans,tokenizer =tokenizer_ans)
+def cls_pooling(model_output):
+    return model_output.last_hidden_state[:,0]
+def encode_query(query):
+    encoded_input = tokenizer(query, truncation=True, return_tensors='pt').to(device)
+    with torch.no_grad():
+        model_output = model(**encoded_input, return_dict=True)
+    embeddings = cls_pooling(model_output)
+    return embeddings.cpu()
+def encode_docs(docs,maxlen = 64, stride = 32):
+    encoded_input = []
+    embeddings = []
+    spans = []
+    file_names = []
+    name, text = docs
+    text = text.split(" ")
+    if len(text) < maxlen:
+        text = " ".join(text)
+        encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation = True).to(device))
+        spans.append(temp_text)
+        file_names.append(name)
+    else:
+        num_iters = int(len(text)/maxlen)+1
+        for i in range(num_iters):
+            if i == 0:
+                temp_text = " ".join(text[i*maxlen:(i+1)*maxlen+stride])
+            else:
+                temp_text = " ".join(text[(i-1)*maxlen:(i)*maxlen][-stride:] + text[i*maxlen:(i+1)*maxlen])
+            encoded_input.append(tokenizer(temp_text, return_tensors='pt', truncation = True).to(device))
+            spans.append(temp_text)
+            file_names.append(name)
+    with torch.no_grad():
+        for encoded in tqdm(encoded_input):
+            model_output = model(**encoded, return_dict=True)
+            embeddings.append(cls_pooling(model_output))
+    embeddings = np.float32(torch.stack(embeddings).transpose(0, 1).cpu())
+    np.save("emb_{}.npy".format(name),dict(zip(list(range(len(embeddings))),embeddings)))
+    np.save("spans_{}.npy".format(name),dict(zip(list(range(len(spans))),spans)))
+    np.save("file_{}.npy".format(name),dict(zip(list(range(len(file_names))),file_names)))
+    return embeddings, spans, file_names
+def predict(query,data):
+    name_to_save = data.name.split("/")[-1].split(".")[0][:-8]
+    k=20
+    st = str([query,name_to_save])
+    st_hashed = str(hashlib.sha256(st.encode()).hexdigest()) #just to speed up examples load
+    hist = st + " " + st_hashed
+    now = datetime.now()
+    current_time = now.strftime("%H:%M:%S")
+    try: #if the same question was already asked for this document, upload question and answer
+        df = pd.read_csv("{}.csv".format(hash(st)))
+        list_outputs = []
+        for i in range(k):
+            temp = [df.iloc[n] for n in range(k)][i]
+            text = ''
+            text += 'PROBABILITIES: '+ temp.Probabilities + '\n\n'
+            text += 'ANSWER: ' +temp.Answer + '\n\n'
+            text += 'PASSAGE: '+temp.Passage + '\n\n'
+            list_outputs.append(text)
+        return list_outputs
+    except Exception as e:
+        print(e)
+        print(st)
+    if name_to_save+".txt" in os.listdir(): #if the document was already used, load its embeddings
+        doc_emb = np.load('emb_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
+        doc_text = np.load('spans_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
+        file_names_dicto = np.load('file_{}.npy'.format(name_to_save),allow_pickle='TRUE').item()
+        doc_emb = np.array(list(doc_emb.values())).reshape(-1,768)
+        doc_text = list(doc_text.values())
+        file_names = list(file_names_dicto.values())
+    else:
+        text = textract.process("{}".format(data.name)).decode('utf8')
+        text = text.replace("\r", " ")
+        text = text.replace("\n", " ")
+        text = text.replace(" . "," ")
+        doc_emb, doc_text, file_names = encode_docs((name_to_save,text),maxlen = 64, stride = 32)
+        doc_emb = doc_emb.reshape(-1, 768)
+        with open("{}.txt".format(name_to_save),"w",encoding="utf-8") as f:
+            f.write(text)
+    #once embeddings are calculated, run MIPS
+    start = time.time()
+    query_emb = encode_query(query)
+    scores = np.matmul(query_emb, doc_emb.transpose(1,0))[0].tolist()
+    doc_score_pairs = list(zip(doc_text, scores, file_names))
+    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
+    probs_sum = 0
+    probs = softmax(sorted(scores,reverse = True)[:k])
+    table = {"Passage":[],"Answer":[],"Probabilities":[]}
+    #get answers for each pair of question (from user) and top best passages
+    for i, (passage, _, names) in enumerate(doc_score_pairs[:k]):
+        passage = passage.replace("\n","")
+        #passage = passage.replace(" . "," ")
+        if probs[i] > 0.1 or (i < 3 and probs[i] > 0.05): #generate answers for more likely passages but no less than 2
+            QA = {'question':query,'context':passage}
+            ans = pipe(QA)
+            probabilities = "P(a|p): {}, P(a|p,q): {}, P(p|q): {}".format(round(ans["score"],5),
+                                                                          round(ans["score"]*probs[i],5),
+                                                                          round(probs[i],5))
+            table["Passage"].append(passage)
+            table["Answer"].append(str(ans["answer"]).upper())
+            table["Probabilities"].append(probabilities)
+        else:
+            table["Passage"].append(passage)
+            table["Answer"].append("no_answer_calculated")
+            table["Probabilities"].append("P(p|q): {}".format(round(probs[i],5)))
+    #format answers for ~nice output and save it for future (if the same question is asked again using same pdf)
+    df = pd.DataFrame(table)
+    print(df)
+    print("time: "+ str(time.time()-start))
+    with open("HISTORY.txt","a", encoding = "utf-8") as f:
+        f.write(hist)
+        f.write(" " + str(current_time))
+        f.write("\n")
+        f.close()
+    df.to_csv("{}.csv".format(hash(st)), index=False)
+    list_outputs = []
+    for i in range(k):
+        text = ''
+        temp = [df.iloc[n] for n in range(k)][i]
+        text += 'PROBABILITIES: '+ temp.Probabilities + '\n\n'
+        text += 'ANSWER: ' +temp.Answer + '\n\n'
+        text += 'PASSAGE: '+temp.Passage + '\n\n'
+        list_outputs.append(text)
+    return list_outputs
+iface = gr.Interface(examples = [
+        ["How high is the highest mountain?","China.pdf"],
+        ["Where does UK prime minister live?","London.pdf"]
+    ],
+    fn =predict,
+    inputs = [gr.inputs.Textbox(default="What is Open-domain question answering?"),
+              gr.inputs.File(),
+    ],
+    outputs = [
+        gr.outputs.Carousel(['text']),
+            ],
+    description=description,
+    title = title,
+allow_flagging ="manual",flagging_options = ["correct","wrong"],
+                     allow_screenshot=False)
+iface.launch(enable_queue=True, show_error =True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+textract
+scipy
+pandas
+numpy
+transformers