Spaces:
Running
Running
PanigrahiNirma
committed on
Commit
•
08b6600
1
Parent(s):
b25cf43
Update app.py
Browse files
app.py
CHANGED
@@ -6,10 +6,12 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
6 |
import numpy as np
|
7 |
import nltk
|
8 |
from nltk.tokenize import sent_tokenize
|
|
|
9 |
|
10 |
nltk.download('punkt')
|
11 |
|
12 |
-
|
|
|
13 |
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
|
14 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
15 |
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
|
@@ -23,13 +25,14 @@ def read_pdf(file):
|
|
23 |
except Exception as e:
|
24 |
return str(e)
|
25 |
|
26 |
-
def
|
27 |
try:
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
33 |
return " ".join(relevant_texts)
|
34 |
except Exception as e:
|
35 |
return str(e)
|
@@ -41,12 +44,10 @@ def answer_question(pdf, question, num_words):
|
|
41 |
return text
|
42 |
|
43 |
sentences = sent_tokenize(text)
|
44 |
-
relevant_text =
|
45 |
|
46 |
response = qa_pipeline(question=question, context=relevant_text)
|
47 |
answer = response['answer']
|
48 |
-
start = response['start']
|
49 |
-
end = response['end']
|
50 |
|
51 |
words = answer.split()
|
52 |
if len(words) > num_words:
|
@@ -77,7 +78,7 @@ iface = gr.Interface(
|
|
77 |
gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Words")
|
78 |
],
|
79 |
outputs=gr.Textbox(label="Answer"),
|
80 |
-
title="PDF Q&A | Made by
|
81 |
)
|
82 |
|
83 |
if __name__ == "__main__":
|
|
|
6 |
import numpy as np
|
7 |
import nltk
|
8 |
from nltk.tokenize import sent_tokenize
|
9 |
+
from rank_bm25 import BM25Okapi # For BM25 retrieval
|
10 |
|
11 |
nltk.download('punkt')
|
12 |
|
13 |
+
# Use a strong RoBERTa model
|
14 |
+
model_name = "deepset/roberta-large-squad2" # More powerful than base
|
15 |
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
|
16 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
17 |
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
|
|
|
25 |
except Exception as e:
|
26 |
return str(e)
|
27 |
|
28 |
+
def retrieve_relevant_text_bm25(question, sentences, top_n=3):
    """Return the sentences that best match ``question`` under BM25 scoring.

    The question and every sentence are tokenized by plain whitespace
    splitting, the sentences are scored against the question with
    ``BM25Okapi``, and the best-scoring sentences are joined with single
    spaces, highest score first.

    Args:
        question: The question text used as the BM25 query.
        sentences: Sequence of sentence strings to rank.
        top_n: Maximum number of sentences to include in the result.

    Returns:
        The selected sentences joined by spaces. An empty string is returned
        when ``sentences`` is empty or ``top_n`` is not positive. If scoring
        raises, the exception message is returned as the string instead of
        propagating the exception.
    """
    try:
        # BM25Okapi divides by the corpus size when it computes the average
        # document length, so an empty corpus raises ZeroDivisionError. Without
        # this guard the text "division by zero" would be returned and treated
        # by the caller as document context.
        if not sentences:
            return ""
        # A negative top_n would slice from the wrong end of the ranking
        # (dropping the worst matches instead of keeping the best); treat any
        # non-positive limit as "select nothing".
        if top_n is not None and top_n <= 0:
            return ""

        tokenized_corpus = [sent.split() for sent in sentences]
        bm25 = BM25Okapi(tokenized_corpus)
        doc_scores = bm25.get_scores(question.split())

        # argsort is ascending; reverse it so the best-scoring sentences come first.
        top_n_indices = np.argsort(doc_scores)[::-1][:top_n]
        return " ".join(sentences[i] for i in top_n_indices)
    except Exception as e:
        # NOTE(review): the error text is returned in place of the relevant
        # text, so callers cannot tell a failure from real content. Kept for
        # backward compatibility with existing callers.
        return str(e)
|
|
|
44 |
return text
|
45 |
|
46 |
sentences = sent_tokenize(text)
|
47 |
+
relevant_text = retrieve_relevant_text_bm25(question, sentences) # Use BM25
|
48 |
|
49 |
response = qa_pipeline(question=question, context=relevant_text)
|
50 |
answer = response['answer']
|
|
|
|
|
51 |
|
52 |
words = answer.split()
|
53 |
if len(words) > num_words:
|
|
|
78 |
gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Words")
|
79 |
],
|
80 |
outputs=gr.Textbox(label="Answer"),
|
81 |
+
title="PDF Q&A with RoBERTa | Made by NP"
|
82 |
)
|
83 |
|
84 |
if __name__ == "__main__":
|