Spaces:
Running
Running
PanigrahiNirma
committed on
Commit
•
e2bc8f2
1
Parent(s):
08b6600
Update app.py
Browse files
app.py
CHANGED
@@ -6,12 +6,11 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|
6 |
import numpy as np
|
7 |
import nltk
|
8 |
from nltk.tokenize import sent_tokenize
|
9 |
-
from rank_bm25 import BM25Okapi
|
10 |
|
11 |
nltk.download('punkt')
|
12 |
|
13 |
-
|
14 |
-
model_name = "deepset/roberta-large-squad2" # More powerful than base
|
15 |
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
|
16 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
17 |
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
|
@@ -31,7 +30,7 @@ def retrieve_relevant_text_bm25(question, sentences, top_n=3):
|
|
31 |
bm25 = BM25Okapi(tokenized_corpus)
|
32 |
tokenized_query = question.split()
|
33 |
doc_scores = bm25.get_scores(tokenized_query)
|
34 |
-
top_n_indices = np.argsort(doc_scores)[::-1][:top_n]
|
35 |
relevant_texts = [sentences[i] for i in top_n_indices]
|
36 |
return " ".join(relevant_texts)
|
37 |
except Exception as e:
|
@@ -44,42 +43,41 @@ def answer_question(pdf, question, num_words):
|
|
44 |
return text
|
45 |
|
46 |
sentences = sent_tokenize(text)
|
47 |
-
relevant_text = retrieve_relevant_text_bm25(question, sentences)
|
48 |
|
49 |
response = qa_pipeline(question=question, context=relevant_text)
|
50 |
answer = response['answer']
|
51 |
|
52 |
words = answer.split()
|
53 |
-
if len(words)
|
54 |
-
answer = " ".join(words[:num_words])
|
55 |
-
elif
|
56 |
remaining_words = num_words - len(words)
|
57 |
-
|
58 |
for sentence in sent_tokenize(relevant_text):
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
else:
|
65 |
break
|
66 |
-
answer
|
67 |
-
|
68 |
-
|
|
|
69 |
|
70 |
except Exception as e:
|
71 |
return str(e)
|
72 |
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
gr.File(type="filepath", label="Upload PDF")
|
77 |
-
gr.Textbox(lines=2, placeholder="Ask a question")
|
78 |
-
gr.Slider(minimum=1, maximum=
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
)
|
83 |
|
84 |
if __name__ == "__main__":
|
85 |
iface.launch()
|
|
|
6 |
import numpy as np
|
7 |
import nltk
|
8 |
from nltk.tokenize import sent_tokenize
|
9 |
+
from rank_bm25 import BM25Okapi
|
10 |
|
11 |
nltk.download('punkt')
|
12 |
|
13 |
+
model_name = "deepset/roberta-large-squad2"
|
|
|
14 |
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
|
15 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
16 |
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
|
|
|
30 |
bm25 = BM25Okapi(tokenized_corpus)
|
31 |
tokenized_query = question.split()
|
32 |
doc_scores = bm25.get_scores(tokenized_query)
|
33 |
+
top_n_indices = np.argsort(doc_scores)[::-1][:top_n]
|
34 |
relevant_texts = [sentences[i] for i in top_n_indices]
|
35 |
return " ".join(relevant_texts)
|
36 |
except Exception as e:
|
|
|
43 |
return text
|
44 |
|
45 |
sentences = sent_tokenize(text)
|
46 |
+
relevant_text = retrieve_relevant_text_bm25(question, sentences)
|
47 |
|
48 |
response = qa_pipeline(question=question, context=relevant_text)
|
49 |
answer = response['answer']
|
50 |
|
51 |
words = answer.split()
|
52 |
+
if len(words) >= num_words:
|
53 |
+
answer = " ".join(words[:num_words]) # Strict truncation
|
54 |
+
elif relevant_text: #only add context if there is relevant text
|
55 |
remaining_words = num_words - len(words)
|
56 |
+
added_words = 0
|
57 |
for sentence in sent_tokenize(relevant_text):
|
58 |
+
sentence_words = sentence.split()
|
59 |
+
words_to_add = min(remaining_words - added_words, len(sentence_words))
|
60 |
+
words.extend(sentence_words[:words_to_add])
|
61 |
+
added_words += words_to_add
|
62 |
+
if added_words == remaining_words:
|
|
|
63 |
break
|
64 |
+
answer = " ".join(words)
|
65 |
+
if len(answer.split()) > num_words: #truncate again if needed
|
66 |
+
answer = " ".join(answer.split()[:num_words])
|
67 |
+
return answer.strip()
|
68 |
|
69 |
except Exception as e:
|
70 |
return str(e)
|
71 |
|
72 |
+
with gr.Blocks() as iface: #use blocks for custom layout
|
73 |
+
gr.Markdown("PDF Q&A with RoBERTa | Made by NP")
|
74 |
+
with gr.Row():
|
75 |
+
pdf_input = gr.File(type="filepath", label="Upload PDF")
|
76 |
+
question_input = gr.Textbox(lines=2, placeholder="Ask a question")
|
77 |
+
num_words_slider = gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Number of Words")
|
78 |
+
answer_output = gr.Textbox(label="Answer", lines=5) # increased lines for better display
|
79 |
+
btn = gr.Button("Submit")
|
80 |
+
btn.click(fn=answer_question, inputs=[pdf_input, question_input, num_words_slider], outputs=answer_output)
|
|
|
81 |
|
82 |
if __name__ == "__main__":
|
83 |
iface.launch()
|