Spaces:
Runtime error
Runtime error
File size: 5,113 Bytes
442d312 472a794 15c3f5d d96dbbc 472a794 442d312 e5374a1 442d312 31f12cb bcbee82 442d312 bc84182 442d312 ed1b00c 442d312 981b258 442d312 4912f11 442d312 d1391ee 442d312 a3e1a90 442d312 a3e1a90 442d312 ed1b00c 442d312 0e90d70 bc84182 b5b0fd2 854c812 5e68e2f bc84182 854c812 1644b7d c010f14 5e68e2f c010f14 5e68e2f 0e90d70 442d312 89b0019 ed1b00c a3e1a90 442d312 bc84182 854c812 ad9a1e9 5e68e2f 442d312 5e68e2f 442d312 2482299 bc84182 de8c106 5da1616 2482299 442d312 7ef436d 7e61444 5e68e2f 58cbf7b 31f12cb 574b79f 8af8a15 442d312 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import gradio as gr
import os
import numpy as np
# Install, at start-up, the third-party packages imported further down;
# keep this line above those imports.
# NOTE(review): the exit status of the command is ignored.
os.system("pip install pdfminer.six rank_bm25 torch transformers")
from gradio.mix import Series
#import re
from rank_bm25 import BM25Okapi
import string
import torch
from transformers import pipeline
import pdfminer
from pdfminer.high_level import extract_text
#from termcolor import colored
# Length, in characters, of each text chunk produced by read_pdf.
len_doc = 400
# Number of characters shared by consecutive chunks: read_pdf advances the
# chunk start by len_doc - overlap.
overlap = 50
def read_pdf(file):
    """Extract the text of a PDF and cut it into overlapping chunks.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF to read.

    Returns:
        List of consecutive text slices, each at most ``len_doc`` characters
        long. Every slice starts ``len_doc - overlap`` characters after the
        previous one, so neighbouring slices share ``overlap`` characters.
        An empty text yields an empty list.
    """
    full_text = extract_text(file.name)
    # Distance between the starting offsets of two consecutive chunks.
    stride = len_doc - overlap
    return [
        full_text[start:start + len_doc]
        for start in range(0, len(full_text), stride)
    ]
# BM25 serves as the retriever: it performs the first round of candidate filtering using word-level matching
def bm25_tokenizer(text):
    """Split ``text`` into lower-cased word tokens suitable for BM25.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every piece. Pieces that end up empty and
    a small set of stop words are dropped.

    Args:
        text: String to tokenise.

    Returns:
        List of tokens in their original order.
    """
    stop_words = {'a', 'the', 'am', 'is', 'are', 'who', 'how', 'where', 'when', 'why', 'what'}
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [word for word in stripped if word and word not in stop_words]
def retrieval(query, top_k_retriver, docs, bm25_):
    """Return the best BM25 matches for ``query`` among ``docs``.

    Args:
        query: Question text; tokenised with ``bm25_tokenizer`` before scoring.
        top_k_retriver: Maximum number of top-scoring chunks to consider.
        docs: Text chunks; scores are matched to chunks by position.
        bm25_: Object exposing ``get_scores(tokens)``.

    Returns:
        List of dicts with the keys ``'corpus_id'``, ``'score'`` and
        ``'docs'``, ordered by descending score. Chunks whose score is not
        strictly positive are left out.
    """
    scores = bm25_.get_scores(bm25_tokenizer(query))
    # Indices of all chunks, highest score first.
    best_first = np.argsort(scores)[::-1]

    hits = []
    for position in best_first[:top_k_retriver]:
        chunk_score = scores[position]
        if chunk_score > 0:
            hits.append({'corpus_id': position,
                         'score': chunk_score,
                         'docs': docs[position]})

    hits.sort(key=lambda hit: hit['score'], reverse=True)
    return hits
# Question-answering pipeline, built once at import time and called by qa_ranker.
qa_model = pipeline("question-answering",
model = "deepset/minilm-uncased-squad2")
# Alternative checkpoint (disabled): "deepset/roberta-base-squad2"
def qa_ranker(query, docs_, top_k_ranker):
    """Run the QA model on every passage and keep the highest-scoring answers.

    Args:
        query: Question passed to ``qa_model``.
        docs_: Iterable of passages, each used as the model context.
        top_k_ranker: Maximum number of answers to return.

    Returns:
        Up to ``top_k_ranker`` answers as returned by ``qa_model``, each with
        an added ``'doc'`` key holding its passage, ordered by descending
        ``'score'``.
    """
    def answer_for(passage):
        # Keep the passage next to the answer so callers can display it.
        result = qa_model(question=query, context=passage)
        result['doc'] = passage
        return result

    ranked = sorted(map(answer_for, docs_),
                    key=lambda result: result['score'],
                    reverse=True)
    return ranked[:top_k_ranker]
def cstr(s, color='black'):
    """Wrap ``s`` in a ``<text>`` element whose inline style sets ``color``."""
    template = "<text style=color:{}>{}</text>"
    return template.format(color, s)
def cstr_bold(s, color='black'):
    """Wrap ``s`` in a coloured ``<text>`` element and render it in bold."""
    template = "<text style=color:{}><b>{}</b></text>"
    return template.format(color, s)
def cstr_break(s, color='black'):
    """Wrap ``s`` in a coloured ``<text>`` element, preceded by a line break."""
    template = "<text style=color:{}><br>{}</text>"
    return template.format(color, s)
def print_colored(text, start_idx, end_idx, confidence):
    """Build an HTML snippet showing ``text`` with the answer span highlighted.

    Despite its name, this function prints nothing; it returns the markup.
    The passage comes from an uploaded document, so every fragment is
    HTML-escaped before being placed in the markup. Otherwise ``<`` or ``&``
    in the document could break the rendering or inject markup.

    Args:
        text: Passage containing the answer.
        start_idx: Index in ``text`` where the answer starts.
        end_idx: Index in ``text`` just past the end of the answer.
        confidence: Already formatted confidence string (for example
            ``"87.5%"``), shown on its own line below the passage.

    Returns:
        HTML string: the passage in black with the answer in bold blue,
        followed by a grey ``Confidence: ...`` line.
    """
    import html  # local import keeps this block self-contained

    def escape(fragment):
        # The fragments are only used as element content, never inside
        # attribute values, so quotes can stay as they are.
        return html.escape(fragment, quote=False)

    conf_str = 'Confidence: ' + confidence
    # The indices refer to the raw text, so slice first and escape afterwards.
    pieces = [
        escape(text[:start_idx]),
        cstr_bold(escape(text[start_idx:end_idx]), color='blue'),
        escape(text[end_idx:]),
        cstr_break(escape(conf_str), color='grey'),
    ]
    return cstr(' '.join(pieces), color='black')
def final_qa_pipeline(file, query):
    """Answer ``query`` from a PDF and return the two best answers as HTML.

    The PDF is split into chunks by ``read_pdf``. BM25 preselects up to 30
    chunks, the QA model ranks them via ``qa_ranker``, and the two best
    answers are rendered with ``print_colored``.

    Args:
        file: Uploaded file object (its ``name`` attribute is the PDF path),
            or ``None`` when nothing was uploaded.
        query: Question text.

    Returns:
        Tuple ``(top1, top2)`` of HTML strings. ``top2`` is ``"None"`` when
        only one answer is available. Both are ``"No match"`` when there is
        no file, the PDF yields no text, or no chunk matches the query.
    """
    no_match = ("No match", "No match")

    # Nothing uploaded: there is no document to search.
    if file is None:
        return no_match

    docs = read_pdf(file)
    # A PDF without extractable text gives an empty corpus. BM25Okapi averages
    # document length over the corpus and cannot be built from zero documents.
    if not docs:
        return no_match

    bm25 = BM25Okapi([bm25_tokenizer(doc) for doc in docs])
    top_k_retriver, top_k_ranker = 30, 3
    lvl1 = retrieval(query, top_k_retriver, docs, bm25)
    if not lvl1:
        return no_match

    fnl_rank = qa_ranker(query, [hit["docs"] for hit in lvl1], top_k_ranker)

    def render(answer, decimals):
        # Confidence is the model score expressed as a percentage.
        confidence = str(np.round(100 * answer["score"], decimals)) + "%"
        return print_colored(answer['doc'], answer['start'], answer['end'], confidence)

    # NOTE(review): the top answer is rounded to 1 decimal and the runner-up
    # to 2, exactly as before; confirm whether the difference is intended.
    top1 = render(fnl_rank[0], 1)
    top2 = render(fnl_rank[1], 2) if len(fnl_rank) > 1 else "None"
    return (top1, top2)
# Example (PDF, question) rows offered in the UI. The file names are turned
# into absolute paths relative to the current working directory.
examples = [
    [os.path.abspath(pdf_name), question]
    for pdf_name, question in (
        ("dbs-annual-report-2020.pdf", "how much dividend was paid to shareholders ?"),
        ("dbs-annual-report-2020.pdf", "what are the key risks ?"),
        ("dbs-annual-report-2020.pdf", "what is the sustainability focus ?"),
        ("NASDAQ_AAPL_2020.pdf", "how much are the outstanding shares ?"),
        ("NASDAQ_AAPL_2020.pdf", "How high is shareholders equity ?"),
        ("NASDAQ_AAPL_2020.pdf", "what is competitors strategy ?"),
        ("NASDAQ_AAPL_2020.pdf", "who is the chief executive officer ?"),
    )
]
# Gradio UI: a PDF upload and a question box feed final_qa_pipeline, whose two
# HTML results are shown as the top-1 and top-2 answers.
iface = gr.Interface(
    fn=final_qa_pipeline,
    inputs=[gr.inputs.File(label="input pdf file"), gr.inputs.Textbox(label="Question:")],
    outputs=[gr.outputs.HTML(label="Top 1 answer"), gr.outputs.HTML(label="Top 2 answer")],
    examples=examples,
    theme="grass",
    title="Question Answering on annual reports",
    description="Navigate long annual reports by using Machine learning to answer your questions. \nSimply upload any annual report pdf you are interested in and ask model a question OR load an example from below.",
)
# Start the web app. The stray trailing "|" that followed this call was not
# valid Python and has been dropped.
iface.launch()