Spaces:
Running
Running
PanigrahiNirma
committed on
Commit
•
616d967
1
Parent(s):
7523ea5
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
|
3 |
from pdfminer.high_level import extract_text
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
@@ -10,10 +10,15 @@ from rank_bm25 import BM25Okapi
|
|
10 |
|
11 |
nltk.download('punkt')
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
def read_pdf(file):
|
19 |
try:
|
@@ -48,34 +53,50 @@ def answer_question(pdf, question, num_words):
|
|
48 |
response = qa_pipeline(question=question, context=relevant_text)
|
49 |
answer = response['answer']
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
elif relevant_text: #only add context if there is relevant text
|
55 |
-
remaining_words = num_words - len(
|
56 |
added_words = 0
|
57 |
for sentence in sent_tokenize(relevant_text):
|
58 |
sentence_words = sentence.split()
|
59 |
words_to_add = min(remaining_words - added_words, len(sentence_words))
|
60 |
-
|
|
|
|
|
61 |
added_words += words_to_add
|
62 |
if added_words == remaining_words:
|
63 |
break
|
64 |
-
answer
|
65 |
-
if len(answer.split()) > num_words: #truncate again if needed
|
66 |
answer = " ".join(answer.split()[:num_words])
|
67 |
return answer.strip()
|
68 |
|
69 |
except Exception as e:
|
70 |
return str(e)
|
71 |
|
72 |
-
with gr.Blocks() as iface:
|
73 |
-
gr.Markdown("PDF Q&A with RoBERTa
|
74 |
with gr.Row():
|
75 |
pdf_input = gr.File(type="filepath", label="Upload PDF")
|
76 |
question_input = gr.Textbox(lines=2, placeholder="Ask a question")
|
77 |
num_words_slider = gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Number of Words")
|
78 |
-
answer_output = gr.Textbox(label="Answer", lines=5)
|
79 |
btn = gr.Button("Submit")
|
80 |
btn.click(fn=answer_question, inputs=[pdf_input, question_input, num_words_slider], outputs=answer_output)
|
81 |
|
|
|
1 |
import gradio as gr
|
2 |
+
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
|
3 |
from pdfminer.high_level import extract_text
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
10 |
|
11 |
nltk.download('punkt')
|
12 |
|
13 |
+
# QA model
|
14 |
+
qa_model_name = "deepset/roberta-large-squad2"
|
15 |
+
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
|
16 |
+
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
|
17 |
+
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)
|
18 |
+
|
19 |
+
# Summarization model
|
20 |
+
summarization_model_name = "facebook/bart-large-cnn"
|
21 |
+
summarizer = pipeline("summarization", model=summarization_model_name)
|
22 |
|
23 |
def read_pdf(file):
|
24 |
try:
|
|
|
53 |
response = qa_pipeline(question=question, context=relevant_text)
|
54 |
answer = response['answer']
|
55 |
|
56 |
+
answer = answer.strip()
|
57 |
+
answer = " ".join(answer.split())
|
58 |
+
|
59 |
+
if len(answer.split()) > num_words:
|
60 |
+
try:
|
61 |
+
summarized_answer = summarizer(answer, max_length=num_words+10, min_length=1) # increased max length for better summaries
|
62 |
+
answer = summarized_answer[0]['summary_text']
|
63 |
+
answer = answer.strip()
|
64 |
+
answer = " ".join(answer.split())
|
65 |
+
if len(answer.split()) > num_words:
|
66 |
+
answer = " ".join(answer.split()[:num_words]) #truncate if summary is still too long
|
67 |
+
except RuntimeError as e:
|
68 |
+
if "Input length of input_ids is" in str(e) and "but `max_length` is set to" in str(e): # more robust check for context window error
|
69 |
+
answer = " ".join(answer.split()[:num_words])
|
70 |
+
else:
|
71 |
+
return f"Summarization Error: {e}"
|
72 |
+
except Exception as e:
|
73 |
+
return f"Summarization Error: {e}"
|
74 |
elif relevant_text: #only add context if there is relevant text
|
75 |
+
remaining_words = num_words - len(answer.split())
|
76 |
added_words = 0
|
77 |
for sentence in sent_tokenize(relevant_text):
|
78 |
sentence_words = sentence.split()
|
79 |
words_to_add = min(remaining_words - added_words, len(sentence_words))
|
80 |
+
answer_words = answer.split()
|
81 |
+
answer_words.extend(sentence_words[:words_to_add])
|
82 |
+
answer = " ".join(answer_words)
|
83 |
added_words += words_to_add
|
84 |
if added_words == remaining_words:
|
85 |
break
|
86 |
+
if len(answer.split()) > num_words:
|
|
|
87 |
answer = " ".join(answer.split()[:num_words])
|
88 |
return answer.strip()
|
89 |
|
90 |
except Exception as e:
|
91 |
return str(e)
|
92 |
|
93 |
+
with gr.Blocks() as iface:
|
94 |
+
gr.Markdown("PDF Q&A with RoBERTa")
|
95 |
with gr.Row():
|
96 |
pdf_input = gr.File(type="filepath", label="Upload PDF")
|
97 |
question_input = gr.Textbox(lines=2, placeholder="Ask a question")
|
98 |
num_words_slider = gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Number of Words")
|
99 |
+
answer_output = gr.Textbox(label="Answer", lines=5)
|
100 |
btn = gr.Button("Submit")
|
101 |
btn.click(fn=answer_question, inputs=[pdf_input, question_input, num_words_slider], outputs=answer_output)
|
102 |
|