PanigrahiNirma commited on
Commit
4944874
1 Parent(s): 877126a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -21
app.py CHANGED
@@ -1,42 +1,63 @@
1
  import gradio as gr
2
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
- from transformers.pipelines import pipeline
4
  from pdfminer.high_level import extract_text
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  import numpy as np
8
 
9
- # Load T5 model and tokenizer
10
- model_name = "t5-large"
11
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
12
- tokenizer = AutoTokenizer.from_pretrained(model_name)
13
- qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
 
 
 
 
 
14
 
15
  def read_pdf(file):
16
- return extract_text(file)
 
 
 
 
 
 
17
 
18
  def retrieve_relevant_text(question, context, top_n=5):
19
- vectorizer = TfidfVectorizer().fit_transform([question] + context)
20
- vectors = vectorizer.toarray()
21
- cosine_matrix = cosine_similarity(vectors)
22
- similar_ix = np.argsort(cosine_matrix[0])[::-1][1:top_n+1]
23
- relevant_texts = [context[ix] for ix in similar_ix]
24
- return " ".join(relevant_texts)
 
 
 
25
 
26
  def answer_question(pdf, question):
27
- context = read_pdf(pdf).split("\n")
28
- relevant_text = retrieve_relevant_text(question, context)
29
- input_text = f"question: {question} context: {relevant_text}"
30
- response = qa_pipeline(input_text, max_length=512, do_sample=False)
31
- return response[0]['generated_text']
 
 
 
 
 
 
 
32
 
33
  # Define Gradio interface
34
  iface = gr.Interface(
35
  fn=answer_question,
36
  inputs=[gr.inputs.File(type="file", label="Upload PDF"), gr.inputs.Textbox(lines=2, placeholder="Ask a question")],
37
  outputs=gr.outputs.Textbox(label="Answer"),
38
- title="PDF Q&A with T5"
39
  )
40
 
41
  if __name__ == "__main__":
42
- iface.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
3
+ from transformers import BigBirdTokenizer, BigBirdForSequenceClassification
4
  from pdfminer.high_level import extract_text
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  import numpy as np
8
 
9
# Load an extractive-QA model (BERT large fine-tuned on SQuAD) once at
# module import so Gradio requests reuse the same pipeline.
# NOTE(review): the previous revision also loaded BigBird
# (google/bigbird-roberta-base) here, but nothing in this file ever used
# bb_model / bb_tokenizer — the dead load was dropped to cut startup time
# and memory.
bert_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
bert_model = AutoModelForQuestionAnswering.from_pretrained(bert_model_name)
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
qa_pipeline = pipeline("question-answering", model=bert_model, tokenizer=bert_tokenizer)
19
 
20
def read_pdf(file):
    """Extract the plain text of an uploaded PDF.

    Parameters
    ----------
    file : path or file-like object accepted by pdfminer's ``extract_text``.

    Returns
    -------
    str
        The extracted document text.

    Raises
    ------
    ValueError
        If no text could be extracted (e.g. a scanned/image-only PDF).
    """
    # Let extraction errors propagate instead of returning str(e): the old
    # ``except Exception: return str(e)`` handed the error message to the
    # caller, which immediately .split() it and treated it as document text.
    # The UI-level handler in answer_question turns exceptions into messages.
    text = extract_text(file)
    if not text:
        raise ValueError("PDF extraction failed. The PDF might be scanned or have an unsupported format.")
    return text
28
 
29
def retrieve_relevant_text(question, context, top_n=5):
    """Return the ``top_n`` context lines most similar to ``question``.

    Lines are ranked by cosine similarity between TF-IDF vectors of the
    question and each context line.

    Parameters
    ----------
    question : str
        The user's question.
    context : list[str]
        Lines of document text to search.
    top_n : int, optional
        Maximum number of lines to keep (default 5).

    Returns
    -------
    str
        The selected lines joined by single spaces; "" if ``context`` is empty.
    """
    if not context:
        return ""
    # Row 0 is the question; rows 1..n are the context lines.
    tfidf = TfidfVectorizer().fit_transform([question] + context)
    # Compare the question row against the context rows only, so the
    # resulting indices align with ``context`` directly. The old code took
    # argsort indices from the combined matrix (where 0 = the question) and
    # used them to index ``context`` — an off-by-one that skipped the true
    # best line and could raise IndexError at ix == len(context).
    sims = cosine_similarity(tfidf[0:1], tfidf[1:])[0]
    best = np.argsort(sims)[::-1][:top_n]
    # Errors (e.g. empty vocabulary) now propagate to the UI-level handler
    # instead of being returned as a string indistinguishable from output.
    return " ".join(context[ix] for ix in best)
39
 
40
def answer_question(pdf, question):
    """Gradio handler: answer ``question`` from the uploaded ``pdf``.

    Returns the model's answer string, or a human-readable error message
    (Gradio simply displays whatever string is returned).
    """
    try:
        # Split the document into lines for TF-IDF retrieval.
        context = read_pdf(pdf).split("\n")
        relevant_text = retrieve_relevant_text(question, context)
        # The old ``isinstance(relevant_text, str)`` guard was always True
        # (valid output is also a str), so the QA pipeline was never reached
        # and the raw retrieved text was returned as the "answer". Likewise
        # ``isinstance(context, str)`` was always False after .split().
        if not relevant_text:
            return "No relevant text found in the PDF for this question."
        # The pipeline takes question/context keyword arguments directly;
        # the previously built f-string ``input_text`` was never used.
        response = qa_pipeline(question=question, context=relevant_text)
        return response['answer']
    except Exception as e:
        # UI boundary: surface the error message instead of crashing Gradio.
        return str(e)
53
 
54
# Define the Gradio interface.
# NOTE(review): the gr.inputs / gr.outputs namespaces are deprecated and
# were removed in Gradio 3/4; use the top-level components instead.
# ``type="filepath"`` hands answer_question a path string, which
# pdfminer's extract_text accepts.
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.File(type="filepath", label="Upload PDF"),
        gr.Textbox(lines=2, placeholder="Ask a question"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="PDF Q&A with Mixed Models",
)

if __name__ == "__main__":
    iface.launch()