import os
import string

import gradio as gr
import numpy as np

# Install runtime dependencies on startup (Hugging Face Spaces-style bootstrap);
# the packages below are imported only after this call.
os.system("pip install pdfminer.six rank_bm25 torch transformers")

import torch
from rank_bm25 import BM25Okapi
from transformers import pipeline
from pdfminer.high_level import extract_text

# Chunking and retrieval parameters.
len_doc = 500              # characters per chunk
overlap = 15               # characters shared between consecutive chunks
param_top_k_retriver = 15  # candidate chunks kept by the BM25 retriever
param_top_k_ranker = 3     # answers kept by the QA re-ranker

def read_pdf(file):
    """Extract text from a PDF and split it into overlapping fixed-size chunks."""
    text = extract_text(file.name)
    docs = []
    i = 0
    while i < len(text):
        docs.append(text[i:i + len_doc])
        i = i + len_doc - overlap
    return docs


# BM25 serves as the retriever: a first round of candidate filtering based on
# word-level matching between the query and each chunk.

def bm25_tokenizer(text):
    """Lowercase, strip punctuation, and drop common stop words before BM25 indexing."""
    stop_w = ['a', 'the', 'am', 'is', 'are', 'who', 'how', 'where', 'when', 'why', 'what']
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if len(token) > 0 and token not in stop_w:
            tokenized_doc.append(token)
    return tokenized_doc

def retrieval(query, top_k_retriver, docs, bm25_):
    """Score every chunk against the query with BM25 and return the top positive hits."""
    bm25_scores = bm25_.get_scores(bm25_tokenizer(query))
    top_n = np.argsort(bm25_scores)[::-1][:top_k_retriver]
    bm25_hits = [{'corpus_id': idx,
                  'score': bm25_scores[idx],
                  'docs': docs[idx]} for idx in top_n if bm25_scores[idx] > 0]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    return bm25_hits

def qa_ranker(query, docs_, top_k_ranker, qa_model):
    """Run the extractive QA model over each candidate chunk and keep the top answers."""
    ans = []
    for doc in docs_:
        answer = qa_model(question=query, context=doc)
        answer['doc'] = doc
        ans.append(answer)
    return sorted(ans, key=lambda x: x['score'], reverse=True)[:top_k_ranker]

# HTML helpers for rendering answers (span is used instead of the non-standard <text> tag).
def cstr(s, color='black'):
    return "<span style='color:{}'>{}</span>".format(color, s)

def cstr_bold(s, color='black'):
    return "<span style='color:{}'><b>{}</b></span>".format(color, s)

def cstr_break(s, color='black'):
    return "<span style='color:{}'><br>{}</span>".format(color, s)

def print_colored(text, start_idx, end_idx, confidence):
    """Render a chunk as HTML with the answer span bolded in blue and the confidence appended."""
    conf_str = '- Confidence: ' + confidence
    return cstr(' '.join([text[:start_idx],
                          cstr_bold(text[start_idx:end_idx], color='blue'),
                          text[end_idx:],
                          cstr_break(conf_str, color='grey')]), color='black')
       
def final_qa_pipeline(file, query, model_nm):
    """End-to-end pipeline: chunk the PDF, retrieve with BM25, re-rank with a QA model."""
    docs = read_pdf(file)
    tokenized_corpus = [bm25_tokenizer(doc) for doc in docs]
    bm25 = BM25Okapi(tokenized_corpus)

    top_k_retriver, top_k_ranker = param_top_k_retriver, param_top_k_ranker
    lvl1 = retrieval(query, top_k_retriver, docs, bm25)

    # Load the selected extractive QA model (e.g. deepset/minilm-uncased-squad2).
    qa_model = pipeline("question-answering", model="deepset/" + str(model_nm))

    if len(lvl1) > 0:
        fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker, qa_model)
        top1 = print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end'],
                             str(np.round(100 * fnl_rank[0]["score"], 1)) + "%")
        if len(lvl1) > 1:
            top2 = print_colored(fnl_rank[1]['doc'], fnl_rank[1]['start'], fnl_rank[1]['end'],
                                 str(np.round(100 * fnl_rank[1]["score"], 1)) + "%")
        else:
            top2 = "None"
        return (top1, top2)
    else:
        return ("No match", "No match")
        
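# Demo examples as (PDF path, question, model name) triples; the annual-report
# PDFs are assumed to be bundled alongside this app.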
examples = [
    [os.path.abspath("dbs-annual-report-2020.pdf"), "how many times has DBS won Best bank in the world ?","minilm-uncased-squad2"],
    [os.path.abspath("dbs-annual-report-2020.pdf"), "how much dividend was paid to shareholders ?","minilm-uncased-squad2"],
    [os.path.abspath("dbs-annual-report-2020.pdf"), "what is the sustainability focus ?","minilm-uncased-squad2"],
    [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "how much are the outstanding shares ?","minilm-uncased-squad2"],
    [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "what is competitors strategy ?","minilm-uncased-squad2"],
    [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "who is the chief executive officer ?","minilm-uncased-squad2"],
    [os.path.abspath("NASDAQ_MSFT_2020.pdf"), "How much is the guided revenue for next quarter?","minilm-uncased-squad2"],
]

# Legacy Gradio (pre-3.x) Interface API; newer Gradio versions expose these
# components directly as gr.File, gr.Textbox, gr.Dropdown, and gr.HTML.
iface = gr.Interface(
    fn=final_qa_pipeline,
    inputs=[gr.inputs.File(label="input pdf file"),
            gr.inputs.Textbox(label="Question:"),
            gr.inputs.Dropdown(choices=["minilm-uncased-squad2", "roberta-base-squad2"], label="Model")],
    outputs=[gr.outputs.HTML(label="Top 1 answer"), gr.outputs.HTML(label="Top 2 answer")],
    examples=examples,
    theme="grass",
    title="Question Answering on annual reports",
    description="Navigate long annual reports by using machine learning to answer your questions. \nSimply upload any annual report PDF you are interested in and ask the model a question OR load an example from below."
)
iface.launch(enable_queue=True)
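
# Programmatic usage (a minimal sketch; "report.pdf" is a hypothetical file, and
# SimpleNamespace stands in for Gradio's file object, which exposes a .name attribute
# that read_pdf() reads):
#
#   from types import SimpleNamespace
#   top1, top2 = final_qa_pipeline(
#       SimpleNamespace(name="report.pdf"),
#       "who is the chief executive officer?",
#       "minilm-uncased-squad2",
#   )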