PanigrahiNirma committed on
Commit
3bd289d
·
verified ·
1 Parent(s): 1b4d7ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -25
app.py CHANGED
@@ -4,23 +4,27 @@ from pdfminer.high_level import extract_text
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  import numpy as np
 
 
7
 
8
- # Load BERT for QA
9
- bert_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
10
- bert_model = AutoModelForQuestionAnswering.from_pretrained(bert_model_name)
11
- bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
12
- qa_pipeline = pipeline("question-answering", model=bert_model, tokenizer=bert_tokenizer)
 
 
13
 
14
  def read_pdf(file):
15
  try:
16
  text = extract_text(file)
17
  if not text:
18
- raise ValueError("PDF extraction failed. The PDF might be scanned or have an unsupported format.")
19
  return text
20
  except Exception as e:
21
  return str(e)
22
 
23
- def retrieve_relevant_text(question, context, top_n=5):
24
  try:
25
  vectorizer = TfidfVectorizer().fit_transform([question] + context)
26
  vectors = vectorizer.toarray()
@@ -33,12 +37,12 @@ def retrieve_relevant_text(question, context, top_n=5):
33
 
34
  def answer_question(pdf, question, num_words):
35
  try:
36
- context = read_pdf(pdf).split("\n")
37
- if isinstance(context, str):
38
- return context # Return error message if read_pdf failed
39
- relevant_text = retrieve_relevant_text(question, context)
40
- if isinstance(relevant_text, str):
41
- return relevant_text # Return error message if retrieve_relevant_text failed
42
 
43
  response = qa_pipeline(question=question, context=relevant_text)
44
  answer = response['answer']
@@ -48,21 +52,21 @@ def answer_question(pdf, question, num_words):
48
  answer = " ".join(words[:num_words])
49
  elif len(words) < num_words:
50
  remaining_words = num_words - len(words)
51
- sentences = relevant_text.split(". ")
52
- added_words = 0
53
- for sentence in sentences:
54
- sentence_words = sentence.split()
55
- if added_words < remaining_words:
56
- words.extend(sentence_words)
57
- added_words += len(sentence_words)
58
- answer = " ".join(words[:num_words])
59
- return answer
60
- except IndexError as e:
61
- return f"Error: Could not retrieve enough context. {e}" #Handle IndexErrors
 
62
  except Exception as e:
63
  return str(e)
64
 
65
- # Define Gradio interface
66
  iface = gr.Interface(
67
  fn=answer_question,
68
  inputs=[
 
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  import numpy as np
7
+ import nltk
8
+ from nltk.tokenize import sent_tokenize
9
 
10
+ nltk.download('punkt') # Download necessary NLTK data
11
+
12
+ # Use a potentially better QA model
13
+ model_name = "deepset/roberta-base-squad2" # More robust than the previous one
14
+ model = AutoModelForQuestionAnswering.from_pretrained(model_name)
15
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
16
+ qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
17
 
18
  def read_pdf(file):
19
  try:
20
  text = extract_text(file)
21
  if not text:
22
+ raise ValueError("PDF extraction failed.")
23
  return text
24
  except Exception as e:
25
  return str(e)
26
 
27
+ def retrieve_relevant_text(question, context, top_n=3): #reduced the top n
28
  try:
29
  vectorizer = TfidfVectorizer().fit_transform([question] + context)
30
  vectors = vectorizer.toarray()
 
37
 
38
  def answer_question(pdf, question, num_words):
39
  try:
40
+ text = read_pdf(pdf)
41
+ if isinstance(text, str):
42
+ return text
43
+
44
+ sentences = sent_tokenize(text) #tokenize the text into sentences
45
+ relevant_text = retrieve_relevant_text(question, sentences)
46
 
47
  response = qa_pipeline(question=question, context=relevant_text)
48
  answer = response['answer']
 
52
  answer = " ".join(words[:num_words])
53
  elif len(words) < num_words:
54
  remaining_words = num_words - len(words)
55
+ added_sentences = []
56
+ for sentence in sent_tokenize(relevant_text):
57
+ if remaining_words > 0:
58
+ sentence_words = sentence.split()
59
+ to_add = min(remaining_words, len(sentence_words))
60
+ added_sentences.append(" ".join(sentence_words[:to_add]))
61
+ remaining_words -= to_add
62
+ else:
63
+ break
64
+ answer += " " + " ".join(added_sentences)
65
+
66
+ return answer.strip() #strip white spaces
67
  except Exception as e:
68
  return str(e)
69
 
 
70
  iface = gr.Interface(
71
  fn=answer_question,
72
  inputs=[