samarthagarwal23's picture
Update app.py
4912f11
raw
history blame
3.26 kB
import gradio as gr
import os
import numpy as np
os.system("pip install pdfminer.six rank_bm25 torch transformers")
from gradio.mix import Series
import re
from rank_bm25 import BM25Okapi
import string
import torch
from transformers import pipeline
import pdfminer
from pdfminer.high_level import extract_text
#from termcolor import colored
def read_pdf(file):
text = extract_text(file.name)
# Split text into smaller docs
len_doc = 400
overlap = 50
docs = []
i = 0
while i < len(text):
docs.append(text[i:i+len_doc])
i = i + len_doc - overlap
return docs
# We use BM25 as retriver which will do 1st round of candidate filtering based on word based matching
def bm25_tokenizer(text):
stop_w = ['a', 'the', 'am', 'is' , 'are', 'who', 'how', 'where', 'when', 'why', 'what']
tokenized_doc = []
for token in text.lower().split():
token = token.strip(string.punctuation)
if len(token) > 0 and token not in stop_w:
tokenized_doc.append(token)
return tokenized_doc
def retrieval(query, top_k_retriver, docs, bm25_):
bm25_scores = bm25_.get_scores(bm25_tokenizer(query))
top_n = np.argsort(bm25_scores)[::-1][:top_k_retriver]
bm25_hits = [{'corpus_id': idx,
'score': bm25_scores[idx],
'docs':docs[idx]} for idx in top_n if bm25_scores[idx] > 0]
bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
return bm25_hits
qa_model = pipeline("question-answering",
model = "deepset/roberta-base-squad2")
def qa_ranker(query, docs_, top_k_ranker):
ans = []
for doc in docs_:
answer = qa_model(question = query,
context = doc)
answer['doc'] = doc
ans.append(answer)
return sorted(ans, key=lambda x: x['score'], reverse=True)[:top_k_ranker]
def final_qa_pipeline(file, query):
docs = read_pdf(file)
tokenized_corpus = []
for doc in docs:
tokenized_corpus.append(bm25_tokenizer(doc))
bm25 = BM25Okapi(tokenized_corpus)
top_k_retriver, top_k_ranker = 10,1
lvl1 = retrieval(query, top_k_retriver, docs, bm25)
if len(lvl1) > 0:
fnl_rank = qa_ranker(query, [l["docs"] for l in lvl1], top_k_ranker)
return (fnl_rank[0]["answer"], np.round(fnl_rank[0]["score"],3), fnl_rank[0]["doc"])
#for fnl_ in fnl_rank:
# print("\n")
# print_colored(fnl_['doc'], fnl_['start'], fnl_['end'])
# print(colored("Confidence score of ") + colored(str(fnl_['score'])[:4], attrs=['bold']))
else:
return ("No match", 0)
examples = [
[os.path.abspath("NASDAQ_AAPL_2020.pdf"), "how much are the outstanding shares?"],
[os.path.abspath("NASDAQ_MSFT_2020.pdf"), "what was the income before taxes?"],
[os.path.abspath("dbs-annual-report-2020.pdf"), "how much dividend was paid to shareholders??"],
]
iface = gr.Interface(
fn = final_qa_pipeline,
inputs = [gr.inputs.File(label="input pdf file"), gr.inputs.Textbox(label="Question:")],
outputs = [gr.outputs.HTML(label="Answer"), gr.outputs.HTML(label="Score"), gr.outputs.HTML(label="doc")],
examples=examples,
)
iface.launch()