import gradio as gr
import os
import numpy as np
os.system("pip install pdfminer.six rank_bm25 torch transformers termcolor")
from gradio.mix import Series
import re
from rank_bm25 import BM25Okapi
import string
import torch
from transformers import pipeline
import pdfminer
from pdfminer.high_level import extract_text
from termcolor import colored
def read_pdf(file):
    text = extract_text(file.name)
    # Split text into smaller, overlapping docs
    len_doc = 400
    overlap = 50
    docs = []
    i = 0
    while i < len(text):
        docs.append(text[i:i + len_doc])
        i = i + len_doc - overlap
    return docs
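# Illustration of the chunking above (a sketch, not part of the original code):
# with len_doc=400 and overlap=50, the chunks cover text[0:400], text[350:750],
# text[700:1100], ..., so each consecutive pair of chunks shares 50 characters
# of context and answers spanning a chunk boundary are less likely to be cut.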
# We use BM25 as the retriever, which does a first round of candidate filtering
# based on word-level matching.
def bm25_tokenizer(text):
    stop_w = ['a', 'the', 'am', 'is', 'are', 'who', 'how', 'where', 'when', 'why', 'what']
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if len(token) > 0 and token not in stop_w:
            tokenized_doc.append(token)
    return tokenized_doc
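# Quick illustration of the tokenizer (assumed input, not from the original):
# bm25_tokenizer("What is the net revenue?") lowercases, strips punctuation and
# drops the stop words, yielding ['net', 'revenue'].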
def retrieval(query, top_k_retriever, docs, bm25_):
    bm25_scores = bm25_.get_scores(bm25_tokenizer(query))
    top_n = np.argsort(bm25_scores)[::-1][:top_k_retriever]
    bm25_hits = [{'corpus_id': idx,
                  'score': bm25_scores[idx],
                  'docs': docs[idx]} for idx in top_n if bm25_scores[idx] > 0]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    return bm25_hits
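# Sketch of the return value (toy numbers, not from the original): for a query
# like "dividend", retrieval() yields hits such as
# [{'corpus_id': 12, 'score': 7.3, 'docs': '... dividend of 18 cents ...'}, ...]
# sorted by descending BM25 score, with zero-score chunks dropped.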
qa_model = pipeline("question-answering",
                    model="deepset/roberta-base-squad2")
def qa_ranker(query, docs_, top_k_ranker):
    ans = []
    for doc in docs_:
        answer = qa_model(question=query,
                          context=doc)
        answer['doc'] = doc
        ans.append(answer)
    return sorted(ans, key=lambda x: x['score'], reverse=True)[:top_k_ranker]
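# Each qa_model call returns a dict of the form
# {'score': ..., 'start': ..., 'end': ..., 'answer': ...}; qa_ranker attaches the
# source chunk under 'doc' and keeps only the top_k_ranker highest-scoring answers.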
def print_colored(text, start_idx, end_idx):
    # Highlight the answer span inside its chunk (termcolor emits ANSI colour codes)
    a = colored(text[:start_idx]) + \
        colored(text[start_idx:end_idx], 'red', 'on_yellow') + \
        colored(text[end_idx:])
    return a
def final_qa_pipeline(file, query):
    docs = read_pdf(file)
    tokenized_corpus = []
    for doc in docs:
        tokenized_corpus.append(bm25_tokenizer(doc))
    bm25 = BM25Okapi(tokenized_corpus)
    top_k_retriever, top_k_ranker = 10, 1
    lvl1 = retrieval(query, top_k_retriever, docs, bm25)
    if len(lvl1) > 0:
        fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker)
        return (fnl_rank[0]["answer"],
                np.round(fnl_rank[0]["score"], 3),
                print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end']))
        # for fnl_ in fnl_rank:
        #     print("\n")
        #     print_colored(fnl_['doc'], fnl_['start'], fnl_['end'])
        #     print(colored("Confidence score of ") + colored(str(fnl_['score'])[:4], attrs=['bold']))
    else:
        # The interface has three outputs, so return three values here as well
        return ("No match", 0, "")
examples = [
    [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "how much are the outstanding shares?"],
    [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "How high is shareholders equity?"],
    [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "what is competitors strategy?"],
    [os.path.abspath("dbs-annual-report-2020.pdf"), "how much dividend was paid to shareholders?"],
    [os.path.abspath("dbs-annual-report-2020.pdf"), "what are the key risks?"],
]
iface = gr.Interface(
    fn=final_qa_pipeline,
    inputs=[gr.inputs.File(label="input pdf file"), gr.inputs.Textbox(label="Question:")],
    outputs=[gr.outputs.Textbox(label="Answer"), gr.outputs.Textbox(label="Score"), gr.outputs.HTML(label="Reference text")],
    examples=examples,
)
iface.launch()