PanigrahiNirma committed on
Commit
e2bc8f2
1 Parent(s): 08b6600

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -28
app.py CHANGED
@@ -6,12 +6,11 @@ from sklearn.metrics.pairwise import cosine_similarity
6
  import numpy as np
7
  import nltk
8
  from nltk.tokenize import sent_tokenize
9
- from rank_bm25 import BM25Okapi # For BM25 retrieval
10
 
11
  nltk.download('punkt')
12
 
13
- # Use a strong RoBERTa model
14
- model_name = "deepset/roberta-large-squad2" # More powerful than base
15
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)
16
  tokenizer = AutoTokenizer.from_pretrained(model_name)
17
  qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
@@ -31,7 +30,7 @@ def retrieve_relevant_text_bm25(question, sentences, top_n=3):
31
  bm25 = BM25Okapi(tokenized_corpus)
32
  tokenized_query = question.split()
33
  doc_scores = bm25.get_scores(tokenized_query)
34
- top_n_indices = np.argsort(doc_scores)[::-1][:top_n] # Get indices of top N
35
  relevant_texts = [sentences[i] for i in top_n_indices]
36
  return " ".join(relevant_texts)
37
  except Exception as e:
@@ -44,42 +43,41 @@ def answer_question(pdf, question, num_words):
44
  return text
45
 
46
  sentences = sent_tokenize(text)
47
- relevant_text = retrieve_relevant_text_bm25(question, sentences) # Use BM25
48
 
49
  response = qa_pipeline(question=question, context=relevant_text)
50
  answer = response['answer']
51
 
52
  words = answer.split()
53
- if len(words) > num_words:
54
- answer = " ".join(words[:num_words])
55
- elif len(words) < num_words:
56
  remaining_words = num_words - len(words)
57
- added_sentences = []
58
  for sentence in sent_tokenize(relevant_text):
59
- if remaining_words > 0:
60
- sentence_words = sentence.split()
61
- to_add = min(remaining_words, len(sentence_words))
62
- added_sentences.append(" ".join(sentence_words[:to_add]))
63
- remaining_words -= to_add
64
- else:
65
  break
66
- answer += " " + " ".join(added_sentences)
67
- answer = answer.strip()
68
- return answer
 
69
 
70
  except Exception as e:
71
  return str(e)
72
 
73
- iface = gr.Interface(
74
- fn=answer_question,
75
- inputs=[
76
- gr.File(type="filepath", label="Upload PDF"),
77
- gr.Textbox(lines=2, placeholder="Ask a question"),
78
- gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Words")
79
- ],
80
- outputs=gr.Textbox(label="Answer"),
81
- title="PDF Q&A with RoBERTa | Made by NP"
82
- )
83
 
84
  if __name__ == "__main__":
85
  iface.launch()
 
6
  import numpy as np
7
  import nltk
8
  from nltk.tokenize import sent_tokenize
9
+ from rank_bm25 import BM25Okapi
10
 
11
  nltk.download('punkt')
12
 
13
+ model_name = "deepset/roberta-large-squad2"
 
14
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)
15
  tokenizer = AutoTokenizer.from_pretrained(model_name)
16
  qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
 
30
  bm25 = BM25Okapi(tokenized_corpus)
31
  tokenized_query = question.split()
32
  doc_scores = bm25.get_scores(tokenized_query)
33
+ top_n_indices = np.argsort(doc_scores)[::-1][:top_n]
34
  relevant_texts = [sentences[i] for i in top_n_indices]
35
  return " ".join(relevant_texts)
36
  except Exception as e:
 
43
  return text
44
 
45
  sentences = sent_tokenize(text)
46
+ relevant_text = retrieve_relevant_text_bm25(question, sentences)
47
 
48
  response = qa_pipeline(question=question, context=relevant_text)
49
  answer = response['answer']
50
 
51
  words = answer.split()
52
+ if len(words) >= num_words:
53
+ answer = " ".join(words[:num_words]) # Strict truncation
54
+ elif relevant_text: #only add context if there is relevant text
55
  remaining_words = num_words - len(words)
56
+ added_words = 0
57
  for sentence in sent_tokenize(relevant_text):
58
+ sentence_words = sentence.split()
59
+ words_to_add = min(remaining_words - added_words, len(sentence_words))
60
+ words.extend(sentence_words[:words_to_add])
61
+ added_words += words_to_add
62
+ if added_words == remaining_words:
 
63
  break
64
+ answer = " ".join(words)
65
+ if len(answer.split()) > num_words: #truncate again if needed
66
+ answer = " ".join(answer.split()[:num_words])
67
+ return answer.strip()
68
 
69
  except Exception as e:
70
  return str(e)
71
 
72
+ with gr.Blocks() as iface: #use blocks for custom layout
73
+ gr.Markdown("PDF Q&A with RoBERTa | Made by NP")
74
+ with gr.Row():
75
+ pdf_input = gr.File(type="filepath", label="Upload PDF")
76
+ question_input = gr.Textbox(lines=2, placeholder="Ask a question")
77
+ num_words_slider = gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Number of Words")
78
+ answer_output = gr.Textbox(label="Answer", lines=5) # increased lines for better display
79
+ btn = gr.Button("Submit")
80
+ btn.click(fn=answer_question, inputs=[pdf_input, question_input, num_words_slider], outputs=answer_output)
 
81
 
82
  if __name__ == "__main__":
83
  iface.launch()