# G.Hemanth Sai
# qgen
# 32d9382
import PyPDF2
import fitz
class PdfReader:
    """Read text and embedded images from a PDF file on disk.

    Text extraction goes through PyPDF2; image extraction goes through
    PyMuPDF (imported as ``fitz``). The file is reopened on every call, so
    no handle is kept between calls.

    NOTE(review): ``PdfFileReader``, ``numPages``, ``getPage`` and
    ``extractText`` are the legacy PyPDF2 names. Newer PyPDF2 releases are
    believed to have removed them -- confirm against the pinned version
    before upgrading.
    """

    def __init__(self, filename):
        """Store the path of the PDF to read.

        Args:
            filename: Path of the PDF file. It is only opened when one of
                the other methods is called.
        """
        self.filename = filename

    def total_pages(self):
        """Return the number of pages reported by PyPDF2 for the file."""
        with open(self.filename, 'rb') as f:
            return PyPDF2.PdfFileReader(f).numPages

    def read(self):
        """Return the extracted text of every page, concatenated in order."""
        with open(self.filename, 'rb') as f:
            pdf_reader = PyPDF2.PdfFileReader(f)
            return self._extract_text(pdf_reader, range(pdf_reader.numPages))

    def read_pages(self, start_page, end_page):
        """Return the concatenated text of a half-open range of pages.

        Args:
            start_page: Zero-based index of the first page to read.
            end_page: Zero-based index one past the last page to read.

        Returns:
            The extracted text of pages ``start_page`` .. ``end_page - 1``
            joined without a separator; an empty string when the range is
            empty.
        """
        with open(self.filename, 'rb') as f:
            pdf_reader = PyPDF2.PdfFileReader(f)
            return self._extract_text(pdf_reader, range(start_page, end_page))

    @staticmethod
    def _extract_text(pdf_reader, page_numbers):
        """Join the extracted text of the given pages of an open reader."""
        return ''.join(
            pdf_reader.getPage(page_number).extractText()
            for page_number in page_numbers
        )

    def extract_images(self):
        """Save every image referenced by the PDF as ``<xref>.png``.

        Files are written with a relative name, i.e. into the current
        working directory. The file name depends only on the image's xref,
        so an image referenced from several pages is written once.
        """
        # The context manager closes the document even if a save fails.
        with fitz.open(self.filename) as doc:
            saved_xrefs = set()
            for page_index in range(len(doc)):
                for img in doc.get_page_images(page_index):
                    xref = img[0]
                    if xref in saved_xrefs:
                        # Same xref -> same output file; skip the rewrite.
                        continue
                    saved_xrefs.add(xref)
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n >= 5:
                        # Not GRAY or RGB: convert to RGB before saving.
                        # Rebinding also drops the original pixmap.
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    pix.save(f"{xref}.png")
class ExtractedText(PdfReader):
    """PdfReader that can also write extracted page text to a text file."""

    def __init__(self, filename, output_filename):
        """Store the input PDF path and the output text path.

        Args:
            filename: Path of the PDF file to read.
            output_filename: Path of the text file that ``save`` writes.
        """
        super().__init__(filename)
        self.output_filename = output_filename

    def save(self, start_page, end_page):
        """Extract a half-open range of pages and write the text to disk.

        Args:
            start_page: Zero-based index of the first page to extract.
            end_page: Zero-based index one past the last page to extract.

        The output file is opened in ``'w'`` mode with UTF-8 encoding, so
        any existing content is replaced.
        """
        # Reuse the inherited reader instead of duplicating its loop.
        text = self.read_pages(start_page, end_page)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(text)