mratanusarkar committed on
Commit
4304db6
·
1 Parent(s): 9761deb

add: docs & docstrings for base + pymupdf4llm

Browse files
docs/document_loader/text_loader/base_text_loader.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## Load text from PDF files
2
+
3
+ ::: medrag_multi_modal.document_loader.text_loader.base_text_loader
docs/document_loader/text_loader/pymupdf4llm_text_loader.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ## Load text from PDF files
2
+
3
+ ::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader
medrag_multi_modal/document_loader/text_loader/base_text_loader.py CHANGED
@@ -10,6 +10,20 @@ from firerequests import FireRequests
10
 
11
 
12
  class BaseTextLoader(ABC):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def __init__(self, url: str, document_name: str, document_file_path: str):
14
  self.url = url
15
  self.document_name = document_name
@@ -23,6 +37,17 @@ class BaseTextLoader(ABC):
23
  def get_page_indices(
24
  self, start_page: Optional[int] = None, end_page: Optional[int] = None
25
  ) -> tuple[int, int]:
 
 
 
 
 
 
 
 
 
 
 
26
  if start_page:
27
  if start_page > self.page_count:
28
  raise ValueError(
@@ -41,6 +66,18 @@ class BaseTextLoader(ABC):
41
 
42
  @abstractmethod
43
  async def _process_page(self, page_idx: int) -> Dict[str, str]:
 
 
 
 
 
 
 
 
 
 
 
 
44
  pass
45
 
46
  async def load_data(
@@ -49,6 +86,39 @@ class BaseTextLoader(ABC):
49
  end_page: Optional[int] = None,
50
  weave_dataset_name: Optional[str] = None,
51
  ) -> List[Dict[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  start_page, end_page = self.get_page_indices(start_page, end_page)
53
  pages = []
54
  processed_pages_counter: int = 1
 
10
 
11
 
12
  class BaseTextLoader(ABC):
13
+ """
14
+ An abstract base class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
15
+
16
+ This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
17
+ Subclasses should implement the specific PDF reading, text extraction, and markdown conversion methods.
18
+
19
+ The processed pages are finally stored in a list of dictionaries (one per page), which can be optionally published to a Weave dataset.
20
+
21
+ Args:
22
+ url (str): The URL of the PDF file to download if not present locally.
23
+ document_name (str): The name of the document for metadata purposes.
24
+ document_file_path (str): The local file path where the PDF is stored or will be downloaded.
25
+ """
26
+
27
  def __init__(self, url: str, document_name: str, document_file_path: str):
28
  self.url = url
29
  self.document_name = document_name
 
37
  def get_page_indices(
38
  self, start_page: Optional[int] = None, end_page: Optional[int] = None
39
  ) -> tuple[int, int]:
40
+ """
41
+ Get the start and end page indices for processing.
42
+
43
+ Args:
44
+ start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
45
+ end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
46
+
47
+ Returns:
48
+ tuple[int, int]: A tuple containing the start and end page indices.
49
+ """
50
+
51
  if start_page:
52
  if start_page > self.page_count:
53
  raise ValueError(
 
66
 
67
  @abstractmethod
68
  async def _process_page(self, page_idx: int) -> Dict[str, str]:
69
+ """
70
+ Abstract method to process a single page of the PDF.
71
+
72
+ Override this method in the subclass to provide the actual implementation and
73
+ processing logic for each page of the PDF using various PDF processing libraries.
74
+
75
+ Args:
76
+ page_idx (int): The index of the page to process.
77
+
78
+ Returns:
79
+ Dict[str, str]: A dictionary containing the processed page data.
80
+ """
81
  pass
82
 
83
  async def load_data(
 
86
  end_page: Optional[int] = None,
87
  weave_dataset_name: Optional[str] = None,
88
  ) -> List[Dict[str, str]]:
89
+ """
90
+ Asynchronously loads text from a PDF file specified by a URL or local file path.
91
+ The overridden abstract processing method then converts the text into markdown format,
92
+ and optionally publishes it to a Weave dataset.
93
+
94
+ This function downloads a PDF from a given URL if it does not already exist locally,
95
+ reads the specified range of pages, converts each page's content to markdown, and
96
+ returns a list of dictionaries containing the text and metadata of each page.
97
+
98
+ It uses `PyPDF2` to calculate the number of pages in the PDF and the
99
+ overridden `_process_page` method provides the actual implementation to process
100
+ each page, extract the text from the PDF, and convert it to markdown.
101
+ It processes pages concurrently using `asyncio` for efficiency.
102
+
103
+ If a weave_dataset_name is provided, the processed pages are published to a Weave dataset.
104
+
105
+ Args:
106
+ start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
107
+ end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
108
+ weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
109
+
110
+ Returns:
111
+ List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
112
+ Each dictionary will have the following keys and values:
113
+ - "text": (str) the processed page data in markdown format.
114
+ - "page_idx": (int) the index of the page.
115
+ - "document_name": (str) the name of the document.
116
+ - "file_path": (str) the local file path where the PDF is stored.
117
+ - "file_url": (str) the URL of the PDF file.
118
+
119
+ Raises:
120
+ ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
121
+ """
122
  start_page, end_page = self.get_page_indices(start_page, end_page)
123
  pages = []
124
  processed_pages_counter: int = 1
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py CHANGED
@@ -6,7 +6,67 @@ from .base_text_loader import BaseTextLoader
6
 
7
 
8
  class PyMuPDF4LLMTextLoader(BaseTextLoader):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  async def _process_page(self, page_idx: int) -> Dict[str, str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  text = pymupdf4llm.to_markdown(
11
  doc=self.document_file_path, pages=[page_idx], show_progress=False
12
  )
 
6
 
7
 
8
  class PyMuPDF4LLMTextLoader(BaseTextLoader):
9
+ """
10
+ A concrete implementation of the BaseTextLoader for loading text from a PDF file,
11
+ processing it into markdown using `pymupdf4llm`, and optionally publishing it to a Weave dataset.
12
+
13
+ This class extends the BaseTextLoader and implements the abstract methods to load and process pages from a PDF file.
14
+
15
+ This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
16
+ It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
17
+ of Page objects, which can be optionally published to a Weave dataset.
18
+
19
+ !!! example "Example Usage"
20
+ ```python
21
+ import asyncio
22
+
23
+ import weave
24
+
25
+ from medrag_multi_modal.document_loader.text_loader import (
26
+ PyMuPDF4LLMTextLoader
27
+ )
28
+
29
+ weave.init(project_name="ml-colabs/medrag-multi-modal")
30
+ url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
31
+ loader = PyMuPDF4LLMTextLoader(
32
+ url=url,
33
+ document_name="Gray's Anatomy",
34
+ document_file_path="grays_anatomy.pdf",
35
+ )
36
+ asyncio.run(
37
+ loader.load_data(
38
+ start_page=31,
39
+ end_page=36,
40
+ weave_dataset_name="grays-anatomy-text",
41
+ )
42
+ )
43
+ ```
44
+
45
+ Args:
46
+ url (str): The URL of the PDF file to download if not present locally.
47
+ document_name (str): The name of the document for metadata purposes.
48
+ document_file_path (str): The local file path where the PDF is stored or will be downloaded.
49
+ """
50
+
51
  async def _process_page(self, page_idx: int) -> Dict[str, str]:
52
+ """
53
+
54
+ Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
55
+
56
+ Returns a dictionary with the processed page data.
57
+ The dictionary will have the following keys and values:
58
+ - "text": (str) the processed page data in markdown format.
59
+ - "page_idx": (int) the index of the page.
60
+ - "document_name": (str) the name of the document.
61
+ - "file_path": (str) the local file path where the PDF is stored.
62
+ - "file_url": (str) the URL of the PDF file.
63
+
64
+ Args:
65
+ page_idx (int): The index of the page to process.
66
+
67
+ Returns:
68
+ Dict[str, str]: A dictionary containing the processed page data.
69
+ """
70
  text = pymupdf4llm.to_markdown(
71
  doc=self.document_file_path, pages=[page_idx], show_progress=False
72
  )
mkdocs.yml CHANGED
@@ -63,7 +63,9 @@ nav:
63
  - Installation: 'installation/install.md'
64
  - Development: 'installation/development.md'
65
  - Document Loader:
66
- - Text Loader: 'document_loader/load_text.md'
 
 
67
  - Text and Image Loader: 'document_loader/load_text_image.md'
68
  - Image Loader: 'document_loader/load_image.md'
69
  - Retrieval:
 
63
  - Installation: 'installation/install.md'
64
  - Development: 'installation/development.md'
65
  - Document Loader:
66
+ - Text Loader:
67
+ - Base: 'document_loader/text_loader/base_text_loader.md'
68
+ - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
69
  - Text and Image Loader: 'document_loader/load_text_image.md'
70
  - Image Loader: 'document_loader/load_image.md'
71
  - Retrieval: