PanigrahiNirma committed on
Commit
08b6600
1 Parent(s): b25cf43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -11
app.py CHANGED
@@ -6,10 +6,12 @@ from sklearn.metrics.pairwise import cosine_similarity
6
  import numpy as np
7
  import nltk
8
  from nltk.tokenize import sent_tokenize
 
9
 
10
  nltk.download('punkt')
11
 
12
- model_name = "deepset/roberta-base-squad2"
 
13
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)
14
  tokenizer = AutoTokenizer.from_pretrained(model_name)
15
  qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
@@ -23,13 +25,14 @@ def read_pdf(file):
23
  except Exception as e:
24
  return str(e)
25
 
26
- def retrieve_relevant_text(question, context, top_n=3):
27
  try:
28
- vectorizer = TfidfVectorizer().fit_transform([question] + context)
29
- vectors = vectorizer.toarray()
30
- cosine_matrix = cosine_similarity(vectors)
31
- similar_ix = np.argsort(cosine_matrix[0])[::-1][1:top_n+1]
32
- relevant_texts = [context[ix] for ix in similar_ix]
 
33
  return " ".join(relevant_texts)
34
  except Exception as e:
35
  return str(e)
@@ -41,12 +44,10 @@ def answer_question(pdf, question, num_words):
41
  return text
42
 
43
  sentences = sent_tokenize(text)
44
- relevant_text = retrieve_relevant_text(question, sentences)
45
 
46
  response = qa_pipeline(question=question, context=relevant_text)
47
  answer = response['answer']
48
- start = response['start']
49
- end = response['end']
50
 
51
  words = answer.split()
52
  if len(words) > num_words:
@@ -77,7 +78,7 @@ iface = gr.Interface(
77
  gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Words")
78
  ],
79
  outputs=gr.Textbox(label="Answer"),
80
- title="PDF Q&A | Made by PanigrahiNirma"
81
  )
82
 
83
  if __name__ == "__main__":
 
6
  import numpy as np
7
  import nltk
8
  from nltk.tokenize import sent_tokenize
9
+ from rank_bm25 import BM25Okapi # For BM25 retrieval
10
 
11
  nltk.download('punkt')
12
 
13
+ # Use a strong RoBERTa model
14
+ model_name = "deepset/roberta-large-squad2" # More powerful than base
15
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)
16
  tokenizer = AutoTokenizer.from_pretrained(model_name)
17
  qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
 
25
  except Exception as e:
26
  return str(e)
27
 
28
+ def retrieve_relevant_text_bm25(question, sentences, top_n=3):
29
  try:
30
+ tokenized_corpus = [sent.split() for sent in sentences]
31
+ bm25 = BM25Okapi(tokenized_corpus)
32
+ tokenized_query = question.split()
33
+ doc_scores = bm25.get_scores(tokenized_query)
34
+ top_n_indices = np.argsort(doc_scores)[::-1][:top_n] # Get indices of top N
35
+ relevant_texts = [sentences[i] for i in top_n_indices]
36
  return " ".join(relevant_texts)
37
  except Exception as e:
38
  return str(e)
 
44
  return text
45
 
46
  sentences = sent_tokenize(text)
47
+ relevant_text = retrieve_relevant_text_bm25(question, sentences) # Use BM25
48
 
49
  response = qa_pipeline(question=question, context=relevant_text)
50
  answer = response['answer']
 
 
51
 
52
  words = answer.split()
53
  if len(words) > num_words:
 
78
  gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Words")
79
  ],
80
  outputs=gr.Textbox(label="Answer"),
81
+ title="PDF Q&A with RoBERTa | Made by NP"
82
  )
83
 
84
  if __name__ == "__main__":