import gradio as gr
import os
import numpy as np
os.system("pip install pdfminer.six rank_bm25 torch transformers termcolor")

from gradio.mix import Series
import re
from rank_bm25 import BM25Okapi
import string 
import torch
from transformers import pipeline
import pdfminer
from pdfminer.high_level import extract_text
from termcolor import colored

def read_pdf(file):
  text = extract_text(file.name)
  # Split text into smaller docs
  len_doc = 400
  overlap = 50 
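  # Sliding window over the text: with len_doc=400 and overlap=50 the chunks
  # start at offsets 0, 350, 700, ... so neighbouring chunks share 50 characters.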
  docs = []
  
  i = 0
  while i < len(text):
      docs.append(text[i:i+len_doc])
      i = i + len_doc - overlap
  return docs

# We use BM25 as the retriever, which performs a first round of candidate
# filtering based on word-level matching.

def bm25_tokenizer(text):
    stop_w = ['a', 'the', 'am', 'is' , 'are', 'who', 'how', 'where', 'when', 'why', 'what']
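    # e.g. "What is the revenue?" -> ['revenue'] after lower-casing,
    # punctuation stripping and stop-word removal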
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0 and token not in stop_w:
            tokenized_doc.append(token)
    return tokenized_doc

def retrieval(query, top_k_retriver, docs, bm25_):

    bm25_scores = bm25_.get_scores(bm25_tokenizer(query))
    top_n = np.argsort(bm25_scores)[::-1][:top_k_retriver]
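    # Keep only chunks with a positive BM25 score; each hit stores the chunk
    # index ('corpus_id'), its score and the chunk text ('docs').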
    bm25_hits = [{'corpus_id': idx, 
                  'score': bm25_scores[idx], 
                  'docs':docs[idx]} for idx in top_n if bm25_scores[idx] > 0]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    
    return bm25_hits

qa_model = pipeline("question-answering", 
                    model = "deepset/roberta-base-squad2")
                    
def qa_ranker(query, docs_, top_k_ranker):
    ans = []
    for doc in docs_:
        answer = qa_model(question = query, 
                            context = doc)
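        # The question-answering pipeline returns a dict with 'answer', 'score'
        # and character offsets 'start'/'end' into the context; keep the chunk too.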
        answer['doc'] = doc
        ans.append(answer)
    return sorted(ans, key=lambda x: x['score'], reverse=True)[:top_k_ranker]

def print_colored(text, start_idx, end_idx):
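    # Highlight the answer span (text[start_idx:end_idx]) in the retrieved chunk
    # using termcolor's ANSI colour codes.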
    a = colored(text[:start_idx]) + \
        colored(text[start_idx:end_idx], 'red', 'on_yellow') + \
        colored(text[end_idx:])
    return a
       
def final_qa_pipeline(file, query):
    docs = read_pdf(file)
    tokenized_corpus = []
    for doc in docs:
        tokenized_corpus.append(bm25_tokenizer(doc))
    
    bm25 = BM25Okapi(tokenized_corpus)
    
    top_k_retriver, top_k_ranker = 10,1
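    # Stage 1: BM25 retrieves up to top_k_retriver candidate chunks;
    # Stage 2: the extractive QA model re-ranks them and the best span is returned.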
    lvl1 = retrieval(query, top_k_retriver, docs, bm25)

    if len(lvl1) > 0:
        fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker)
        return (fnl_rank[0]["answer"], np.round(fnl_rank[0]["score"],3), print_colored(fnl_rank[0]['doc'], fnl_rank[0]['start'], fnl_rank[0]['end']))
        #for fnl_ in fnl_rank:
        #    print("\n")
        #    print_colored(fnl_['doc'], fnl_['start'], fnl_['end'])
        #    print(colored("Confidence score of ") + colored(str(fnl_['score'])[:4], attrs=['bold']))
    else:
        return ("No match", 0)
        
examples = [
    [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "how much are the outstanding shares ?"],
    [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "How high is shareholders equity ?"],
    [os.path.abspath("NASDAQ_AAPL_2020.pdf"), "what is competitors strategy ?"],
    [os.path.abspath("dbs-annual-report-2020.pdf"), "how much dividend was paid to shareholders ?"],
    [os.path.abspath("dbs-annual-report-2020.pdf"), "what are the key risks ?"],
]

iface = gr.Interface(
   fn = final_qa_pipeline,
   inputs = [gr.inputs.File(label="input pdf file"), gr.inputs.Textbox(label="Question:")],
   outputs = [gr.outputs.Textbox(label="Answer"), gr.outputs.Textbox(label="Score"), gr.outputs.HTML(label="Reference text")],
   examples=examples,
   )
iface.launch()
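
# A minimal sketch for querying the pipeline without the web UI (assumes a local
# "report.pdf"; the _File class is a hypothetical stand-in for Gradio's file
# wrapper, which only needs a .name attribute holding the path):
#
# class _File:
#     def __init__(self, name):
#         self.name = name
#
# answer, score, reference = final_qa_pipeline(_File("report.pdf"), "what are the key risks ?")
# print(answer, score)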