# Readers that load PDF, DOCX and TXT files as text plus metadata.
from pathlib import Path
import pypdf  
import docx2txt 

class DocumentReader:
    """Load PDF, DOCX and TXT files into a uniform list of document dicts.

    Every reader returns a list of ``{"text": ..., "metadata": {...}}``
    dictionaries. ``metadata`` always holds ``"file_name"`` (the final path
    component); PDF entries additionally hold ``"page_label"``.
    """

    @staticmethod
    def read_pdf(data_path):
        """Extract the text of every page of a PDF file.

        Args:
            data_path: Location of the PDF, as a ``str`` or ``pathlib.Path``.

        Returns:
            A list with one dict per page, in page order. Each dict holds the
            page's extracted text and ``{"page_label", "file_name"}`` metadata.
        """
        data_path = Path(data_path)
        with open(data_path, "rb") as fp:
            pdf = pypdf.PdfReader(fp)
            # Fetch the labels a single time instead of indexing the
            # ``page_labels`` property once per page inside the loop.
            page_labels = pdf.page_labels
            # Text is extracted while the file is still open.
            return [
                {
                    "text": page.extract_text(),
                    "metadata": {
                        "page_label": page_label,
                        "file_name": data_path.name,
                    },
                }
                for page, page_label in zip(pdf.pages, page_labels)
            ]

    @staticmethod
    def read_docx(data_path):
        """Extract the text of a DOCX file as a single document.

        Args:
            data_path: Location of the DOCX, as a ``str`` or ``pathlib.Path``.

        Returns:
            A one-element list holding the text returned by
            ``docx2txt.process`` and ``{"file_name"}`` metadata.
        """
        data_path = Path(data_path)
        metadata = {"file_name": data_path.name}
        text = docx2txt.process(data_path)
        return [{"text": text, "metadata": metadata}]

    @staticmethod
    def read_txt(data_path):
        """Read a plain-text file as a single document.

        The file is decoded as UTF-8. If it is not valid UTF-8, it is read
        again with the platform's default encoding.

        Args:
            data_path: Location of the file, as a ``str`` or ``pathlib.Path``.

        Returns:
            A one-element list holding the file's text and ``{"file_name"}``
            metadata.
        """
        data_path = Path(data_path)
        try:
            with open(data_path, "r", encoding="utf-8") as fp:
                text = fp.read()
        except UnicodeDecodeError:
            # Not UTF-8: retry with the locale default so files that were
            # readable before an explicit encoding was set stay readable.
            with open(data_path, "r") as fp:
                text = fp.read()
        return [{"text": text, "metadata": {"file_name": data_path.name}}]

    @staticmethod
    def read_document(file_path):
        """Read a document, choosing the reader from the file extension.

        The extension is matched case-insensitively, so ``report.PDF`` is
        handled the same way as ``report.pdf``.

        Args:
            file_path: Location of the document, as a ``str`` or
                ``pathlib.Path``.

        Returns:
            The list of document dicts produced by the matching reader.

        Raises:
            ValueError: If the extension is not ``.pdf``, ``.docx`` or
                ``.txt``.
        """
        data_path = Path(file_path)
        suffix = data_path.suffix.lower()
        if suffix == ".pdf":
            return DocumentReader.read_pdf(data_path)
        if suffix == ".docx":
            return DocumentReader.read_docx(data_path)
        if suffix == ".txt":
            return DocumentReader.read_txt(data_path)
        raise ValueError("Unsupported file format")

if __name__ == "__main__":
    # Demo run: load one sample PDF through the extension-based dispatcher
    # and dump the resulting list of text/metadata records to stdout.
    DATA_PATH = "71763-gale-encyclopedia-of-medicine.-vol.-1.-2nd-ed.pdf"
    documents = DocumentReader.read_document(DATA_PATH)
    print(documents)