PanigrahiNirma committed on
Commit
9a5a690
1 Parent(s): 8a789bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -7
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
  from transformers.pipelines import pipeline
4
- import PyPDF2
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  import numpy as np
@@ -13,12 +13,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
13
  qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
14
 
15
  def read_pdf(file):
16
- reader = PyPDF2.PdfFileReader(file)
17
- text = ""
18
- for page_num in range(reader.numPages):
19
- page = reader.getPage(page_num)
20
- text += page.extract_text()
21
- return text
22
 
23
  def retrieve_relevant_text(question, context, top_n=5):
24
  vectorizer = TfidfVectorizer().fit_transform([question] + context)
 
1
  import gradio as gr
2
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
  from transformers.pipelines import pipeline
4
+ from pdfminer.high_level import extract_text
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  import numpy as np
 
13
  qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
14
 
15
  def read_pdf(file):
16
+ return extract_text(file)
 
 
 
 
 
17
 
18
  def retrieve_relevant_text(question, context, top_n=5):
19
  vectorizer = TfidfVectorizer().fit_transform([question] + context)