Spaces:

geekyrakshit
/

medrag

Running

App Files Files Community

mratanusarkar committed on Oct 16, 2024

Commit

4304db6

1 Parent(s): 9761deb

add: docs & docstrings for base + pymupdf4llm

Browse files

Files changed (5) hide show

docs/document_loader/text_loader/base_text_loader.md +3 -0
docs/document_loader/text_loader/pymupdf4llm_text_loader.md +3 -0
medrag_multi_modal/document_loader/text_loader/base_text_loader.py +70 -0
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +60 -0
mkdocs.yml +3 -1

docs/document_loader/text_loader/base_text_loader.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ ## Load text from PDF files
2	+
3	+ ::: medrag_multi_modal.document_loader.text_loader.base_text_loader

docs/document_loader/text_loader/pymupdf4llm_text_loader.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ ## Load text from PDF files
2	+
3	+ ::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader

medrag_multi_modal/document_loader/text_loader/base_text_loader.py CHANGED Viewed

@@ -10,6 +10,20 @@ from firerequests import FireRequests
 class BaseTextLoader(ABC):
     def __init__(self, url: str, document_name: str, document_file_path: str):
         self.url = url
         self.document_name = document_name
@@ -23,6 +37,17 @@ class BaseTextLoader(ABC):
     def get_page_indices(
         self, start_page: Optional[int] = None, end_page: Optional[int] = None
     ) -> tuple[int, int]:
         if start_page:
             if start_page > self.page_count:
                 raise ValueError(
@@ -41,6 +66,18 @@ class BaseTextLoader(ABC):
     @abstractmethod
     async def _process_page(self, page_idx: int) -> Dict[str, str]:
         pass
     async def load_data(
@@ -49,6 +86,39 @@ class BaseTextLoader(ABC):
         end_page: Optional[int] = None,
         weave_dataset_name: Optional[str] = None,
     ) -> List[Dict[str, str]]:
         start_page, end_page = self.get_page_indices(start_page, end_page)
         pages = []
         processed_pages_counter: int = 1

 class BaseTextLoader(ABC):
+    """
+    An abstract base class for loading text from a PDF file, processing it into markdown, and optionally publishing it to a Weave dataset.
+    This class handles the downloading of a PDF file from a given URL if it does not already exist locally.
+    Subclasses should implement the specific PDF reading, text extraction, and markdown conversion methods.
+    The processed pages are finally stored in a list of page dictionaries, which can be optionally published to a Weave dataset.
+    Args:
+        url (str): The URL of the PDF file to download if not present locally.
+        document_name (str): The name of the document for metadata purposes.
+        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+    """
     def __init__(self, url: str, document_name: str, document_file_path: str):
         self.url = url
         self.document_name = document_name
     def get_page_indices(
         self, start_page: Optional[int] = None, end_page: Optional[int] = None
     ) -> tuple[int, int]:
+        """
+        Get the start and end page indices for processing.
+        Args:
+            start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
+            end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
+        Returns:
+            tuple[int, int]: A tuple containing the start and end page indices.
+        """
         if start_page:
             if start_page > self.page_count:
                 raise ValueError(
     @abstractmethod
     async def _process_page(self, page_idx: int) -> Dict[str, str]:
+        """
+        Abstract method to process a single page of the PDF.
+        Override this method in the subclass to provide the actual implementation and
+        processing logic for each page of the PDF using various PDF processing libraries.
+        Args:
+            page_idx (int): The index of the page to process.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data.
+        """
         pass
     async def load_data(
         end_page: Optional[int] = None,
         weave_dataset_name: Optional[str] = None,
     ) -> List[Dict[str, str]]:
+        """
+        Asynchronously loads text from a PDF file specified by a URL or local file path.
+        The overridden `_process_page` method then processes the text into markdown format,
+        and optionally publishes it to a Weave dataset.
+        This function downloads a PDF from a given URL if it does not already exist locally,
+        reads the specified range of pages, converts each page's content to markdown, and
+        returns a list of page dictionaries containing the text and metadata.
+        It uses `PyPDF2` to calculate the number of pages in the PDF and the
+        overridden `_process_page` method provides the actual implementation to process
+        each page, extract the text from the PDF, and convert it to markdown.
+        It processes pages concurrently using `asyncio` for efficiency.
+        If a weave_dataset_name is provided, the processed pages are published to a Weave dataset.
+        Args:
+            start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
+            end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
+            weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
+        Returns:
+            List[Dict[str, str]]: A list of dictionaries, each containing the text and metadata for a processed page.
+            Each dictionary will have the following keys and values:
+                - "text": (str) the processed page data in markdown format.
+                - "page_idx": (int) the index of the page.
+                - "document_name": (str) the name of the document.
+                - "file_path": (str) the local file path where the PDF is stored.
+                - "file_url": (str) the URL of the PDF file.
+        Raises:
+            ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
+        """
         start_page, end_page = self.get_page_indices(start_page, end_page)
         pages = []
         processed_pages_counter: int = 1

medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py CHANGED Viewed

@@ -6,7 +6,67 @@ from .base_text_loader import BaseTextLoader
 class PyMuPDF4LLMTextLoader(BaseTextLoader):
     async def _process_page(self, page_idx: int) -> Dict[str, str]:
         text = pymupdf4llm.to_markdown(
             doc=self.document_file_path, pages=[page_idx], show_progress=False
         )

 class PyMuPDF4LLMTextLoader(BaseTextLoader):
+    """
+    A concrete implementation of the BaseTextLoader for loading text from a PDF file,
+    processing it into markdown using `pymupdf4llm`, and optionally publishing it to a Weave dataset.
+    This class extends the BaseTextLoader and implements the abstract methods to load and process pages from a PDF file.
+    This class will handle the downloading of a PDF file from a given URL if it does not already exist locally.
+    It uses PyPDF2 to read the PDF and pymupdf4llm to convert pages to markdown. The processed pages are stored in a list
+    of page dictionaries, which can be optionally published to a Weave dataset.
+    !!! example "Example Usage"
+        ```python
+        import asyncio
+        import weave
+        from medrag_multi_modal.document_loader.text_loader import (
+            PyMuPDF4LLMTextLoader
+        )
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+        loader = PyMuPDF4LLMTextLoader(
+            url=url,
+            document_name="Gray's Anatomy",
+            document_file_path="grays_anatomy.pdf",
+        )
+        asyncio.run(
+            loader.load_data(
+                start_page=31,
+                end_page=36,
+                weave_dataset_name="grays-anatomy-text",
+            )
+        )
+        ```
+    Args:
+        url (str): The URL of the PDF file to download if not present locally.
+        document_name (str): The name of the document for metadata purposes.
+        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+    """
     async def _process_page(self, page_idx: int) -> Dict[str, str]:
+        """
+        Process a single page of the PDF and convert it to markdown using `pymupdf4llm`.
+        Returns a dictionary with the processed page data.
+        The dictionary will have the following keys and values:
+            - "text": (str) the processed page data in markdown format.
+            - "page_idx": (int) the index of the page.
+            - "document_name": (str) the name of the document.
+            - "file_path": (str) the local file path where the PDF is stored.
+            - "file_url": (str) the URL of the PDF file.
+        Args:
+            page_idx (int): The index of the page to process.
+        Returns:
+            Dict[str, str]: A dictionary containing the processed page data.
+        """
         text = pymupdf4llm.to_markdown(
             doc=self.document_file_path, pages=[page_idx], show_progress=False
         )

mkdocs.yml CHANGED Viewed

@@ -63,7 +63,9 @@ nav:
     - Installation: 'installation/install.md'
     - Development: 'installation/development.md'
   - Document Loader:
-    - Text Loader: 'document_loader/load_text.md'
     - Text and Image Loader: 'document_loader/load_text_image.md'
     - Image Loader: 'document_loader/load_image.md'
   - Retrieval:

     - Installation: 'installation/install.md'
     - Development: 'installation/development.md'
   - Document Loader:
+    - Text Loader:
+      - Base: 'document_loader/text_loader/base_text_loader.md'
+      - PyMuPDF4LLM: 'document_loader/text_loader/pymupdf4llm_text_loader.md'
     - Text and Image Loader: 'document_loader/load_text_image.md'
     - Image Loader: 'document_loader/load_image.md'
   - Retrieval: