Spaces:
Runtime error
Runtime error
samarthagarwal23
commited on
Commit
•
442d312
1
Parent(s):
df1cdb5
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from gradio.mix import Series
|
3 |
+
import re
|
4 |
+
from rank_bm25 import BM25Okapi
|
5 |
+
import string
|
6 |
+
from transformers import pipeline
|
7 |
+
import pdfminer
|
8 |
+
from pdfminer.high_level import extract_text
|
9 |
+
from termcolor import colored
|
10 |
+
|
11 |
+
def read_pdf(file, len_doc=400, overlap=50):
    """Extract the text of a PDF and split it into overlapping chunks.

    Parameters
    ----------
    file : str | file-like
        Anything accepted by ``pdfminer.high_level.extract_text``.
    len_doc : int, optional
        Length of each chunk in characters (default 400). Generalized from
        the original hard-coded constant; default preserves old behavior.
    overlap : int, optional
        Number of characters shared between consecutive chunks (default 50).
        Must be smaller than ``len_doc`` so the loop makes progress.

    Returns
    -------
    list[str]
        Character chunks covering the whole document; empty list for an
        empty document.
    """
    text = extract_text(file)
    step = len_doc - overlap  # positive step guarantees termination
    docs = []
    i = 0
    while i < len(text):
        docs.append(text[i:i + len_doc])
        i += step
    return docs
|
23 |
+
|
24 |
+
# We use BM25 as the retriever, which performs the first round of candidate
# filtering based on word-level matching.
|
25 |
+
|
26 |
+
# Fix: the original referenced `_stop_words.ENGLISH_STOP_WORDS` without ever
# importing it (scikit-learn's private module), which raised NameError on the
# first call. A self-contained stop-word set avoids the undeclared dependency.
_ENGLISH_STOP_WORDS = frozenset({
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "from",
    "has", "have", "he", "her", "his", "i", "if", "in", "is", "it", "its",
    "no", "not", "of", "on", "or", "our", "she", "so", "that", "the",
    "their", "them", "they", "this", "to", "was", "we", "were", "what",
    "when", "which", "who", "will", "with", "you", "your",
})


def bm25_tokenizer(text):
    """Lowercase, strip punctuation, and drop English stop words.

    Parameters
    ----------
    text : str
        Raw text of a query or document chunk.

    Returns
    -------
    list[str]
        Tokens suitable for BM25 indexing/scoring; empty list when every
        token is punctuation or a stop word.
    """
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if token and token not in _ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc
|
34 |
+
|
35 |
+
# NOTE(review): the original code built a global BM25 index here by iterating
# a module-level `docs` variable that was never defined, so the app crashed
# with NameError at import time (hence the Space's "Runtime error"). The
# index is now built inside `retrieval`, which receives the documents as an
# argument. These globals remain only for backward compatibility with any
# code that reads them.
tokenized_corpus = []
bm25 = None
|
40 |
+
|
41 |
+
def retrieval(query, top_k_retriver, docs):
    """First-stage candidate retrieval using BM25 word matching.

    Fixes two startup-breaking defects in the original: it depended on a
    global `bm25` index built from an undefined module-level `docs`, and it
    used `np.argsort` although numpy was never imported. The index is now
    built from the `docs` argument and ranking uses plain `sorted`.

    Parameters
    ----------
    query : str
        The user's question.
    top_k_retriver : int
        Maximum number of candidate chunks to return.
    docs : list[str]
        The document chunks to search (as produced by `read_pdf`).

    Returns
    -------
    list[dict]
        Hits sorted by descending BM25 score, each with keys
        'corpus_id', 'score', and 'docs'; hits with score <= 0 are dropped.
    """
    index = BM25Okapi([bm25_tokenizer(doc) for doc in docs])
    scores = index.get_scores(bm25_tokenizer(query))
    # Indices of the highest-scoring chunks, best first.
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    bm25_hits = [{'corpus_id': idx,
                  'score': scores[idx],
                  'docs': docs[idx]}
                 for idx in ranked[:top_k_retriver] if scores[idx] > 0]
    return bm25_hits
|
51 |
+
|
52 |
+
# Second-stage reader: extractive question-answering model.
# Fix: the original model id "huggingface/deepset/roberta-base-squad2" does
# not exist on the Hub (the organization prefix is just "deepset"), so
# pipeline() failed at startup.
qa_model = pipeline("question-answering",
                    model="deepset/roberta-base-squad2")
|
54 |
+
|
55 |
+
def qa_ranker(query, docs_, top_k_ranker):
    """Score each candidate passage with the QA model and keep the best.

    Parameters
    ----------
    query : str
        The user's question.
    docs_ : list[str]
        Candidate passages from the retrieval stage.
    top_k_ranker : int
        Number of top-scoring answers to return.

    Returns
    -------
    list[dict]
        QA-model results (with the source passage attached under 'doc'),
        sorted by descending confidence score, truncated to top_k_ranker.
    """
    scored = []
    for passage in docs_:
        result = qa_model(question=query, context=passage)
        result['doc'] = passage
        scored.append(result)
    scored.sort(key=lambda r: r['score'], reverse=True)
    return scored[:top_k_ranker]
|
63 |
+
|
64 |
+
def final_qa_pipeline(file, query):
    """End-to-end pipeline: retrieve candidate passages, then extract an answer.

    Removed the block of commented-out debug printing that the original
    carried inside the success branch, and restructured with a guard clause.

    Parameters
    ----------
    file : str | file-like
        PDF input, forwarded to `read_pdf`.
    query : str
        The user's question.

    Returns
    -------
    tuple
        (answer, score) from the best-ranked QA result, or ("No match", 0)
        when BM25 retrieval produces no candidate passages.
    """
    docs = read_pdf(file)
    top_k_retriver, top_k_ranker = 10, 1
    candidates = retrieval(query, top_k_retriver, docs)

    if not candidates:
        return ("No match", 0)

    fnl_rank = qa_ranker(query, [c["docs"] for c in candidates], top_k_ranker)
    return (fnl_rank[0]["answer"], fnl_rank[0]["score"])
|
78 |
+
|
79 |
+
# Gradio UI wiring. Two fixes from the original:
#  * `fn` pointed at an undefined name `pdf_to_text`; the actual entry
#    point defined in this file is `final_qa_pipeline`.
#  * `inputs` had a stray opening quote before the list literal, which was
#    a SyntaxError.
iface = gr.Interface(
    fn=final_qa_pipeline,
    inputs=[gr.inputs.File(label="input pdf file"),
            gr.inputs.Textbox(label="Question:")],
    outputs=[gr.outputs.HTML(label="Answer"),
             gr.outputs.HTML(label="Score")],
)
iface.launch()
|