PanigrahiNirma committed on
Commit
616d967
1 Parent(s): 7523ea5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -15
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
3
  from pdfminer.high_level import extract_text
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
@@ -10,10 +10,15 @@ from rank_bm25 import BM25Okapi
10
 
11
  nltk.download('punkt')
12
 
13
- model_name = "deepset/roberta-large-squad2"
14
- model = AutoModelForQuestionAnswering.from_pretrained(model_name)
15
- tokenizer = AutoTokenizer.from_pretrained(model_name)
16
- qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
 
 
 
 
 
17
 
18
  def read_pdf(file):
19
  try:
@@ -48,34 +53,50 @@ def answer_question(pdf, question, num_words):
48
  response = qa_pipeline(question=question, context=relevant_text)
49
  answer = response['answer']
50
 
51
- words = answer.split()
52
- if len(words) >= num_words:
53
- answer = " ".join(words[:num_words]) # Strict truncation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  elif relevant_text: #only add context if there is relevant text
55
- remaining_words = num_words - len(words)
56
  added_words = 0
57
  for sentence in sent_tokenize(relevant_text):
58
  sentence_words = sentence.split()
59
  words_to_add = min(remaining_words - added_words, len(sentence_words))
60
- words.extend(sentence_words[:words_to_add])
 
 
61
  added_words += words_to_add
62
  if added_words == remaining_words:
63
  break
64
- answer = " ".join(words)
65
- if len(answer.split()) > num_words: #truncate again if needed
66
  answer = " ".join(answer.split()[:num_words])
67
  return answer.strip()
68
 
69
  except Exception as e:
70
  return str(e)
71
 
72
- with gr.Blocks() as iface: #use blocks for custom layout
73
- gr.Markdown("PDF Q&A with RoBERTa | Made by NP")
74
  with gr.Row():
75
  pdf_input = gr.File(type="filepath", label="Upload PDF")
76
  question_input = gr.Textbox(lines=2, placeholder="Ask a question")
77
  num_words_slider = gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Number of Words")
78
- answer_output = gr.Textbox(label="Answer", lines=5) # increased lines for better display
79
  btn = gr.Button("Submit")
80
  btn.click(fn=answer_question, inputs=[pdf_input, question_input, num_words_slider], outputs=answer_output)
81
 
 
1
  import gradio as gr
2
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
3
  from pdfminer.high_level import extract_text
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
 
10
 
11
  nltk.download('punkt')
12
 
13
+ # QA model
14
+ qa_model_name = "deepset/roberta-large-squad2"
15
+ qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
16
+ qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
17
+ qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)
18
+
19
+ # Summarization model
20
+ summarization_model_name = "facebook/bart-large-cnn"
21
+ summarizer = pipeline("summarization", model=summarization_model_name)
22
 
23
  def read_pdf(file):
24
  try:
 
53
  response = qa_pipeline(question=question, context=relevant_text)
54
  answer = response['answer']
55
 
56
+ answer = answer.strip()
57
+ answer = " ".join(answer.split())
58
+
59
+ if len(answer.split()) > num_words:
60
+ try:
61
+ summarized_answer = summarizer(answer, max_length=num_words+10, min_length=1) # increased max length for better summaries
62
+ answer = summarized_answer[0]['summary_text']
63
+ answer = answer.strip()
64
+ answer = " ".join(answer.split())
65
+ if len(answer.split()) > num_words:
66
+ answer = " ".join(answer.split()[:num_words]) #truncate if summary is still too long
67
+ except RuntimeError as e:
68
+ if "Input length of input_ids is" in str(e) and "but `max_length` is set to" in str(e): # more robust check for context window error
69
+ answer = " ".join(answer.split()[:num_words])
70
+ else:
71
+ return f"Summarization Error: {e}"
72
+ except Exception as e:
73
+ return f"Summarization Error: {e}"
74
  elif relevant_text: #only add context if there is relevant text
75
+ remaining_words = num_words - len(answer.split())
76
  added_words = 0
77
  for sentence in sent_tokenize(relevant_text):
78
  sentence_words = sentence.split()
79
  words_to_add = min(remaining_words - added_words, len(sentence_words))
80
+ answer_words = answer.split()
81
+ answer_words.extend(sentence_words[:words_to_add])
82
+ answer = " ".join(answer_words)
83
  added_words += words_to_add
84
  if added_words == remaining_words:
85
  break
86
+ if len(answer.split()) > num_words:
 
87
  answer = " ".join(answer.split()[:num_words])
88
  return answer.strip()
89
 
90
  except Exception as e:
91
  return str(e)
92
 
93
+ with gr.Blocks() as iface:
94
+ gr.Markdown("PDF Q&A with RoBERTa")
95
  with gr.Row():
96
  pdf_input = gr.File(type="filepath", label="Upload PDF")
97
  question_input = gr.Textbox(lines=2, placeholder="Ask a question")
98
  num_words_slider = gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Number of Words")
99
+ answer_output = gr.Textbox(label="Answer", lines=5)
100
  btn = gr.Button("Submit")
101
  btn.click(fn=answer_question, inputs=[pdf_input, question_input, num_words_slider], outputs=answer_output)
102