Spaces:

geekyrakshit
/

medrag

Running

App Files Files Community

geekyrakshit commited on Oct 9, 2024

Commit

b9d8094

1 Parent(s): 053a082

add: load_text_from_pdf

Browse files

Files changed (6) hide show

.gitignore +8 -0
medrag_multi_modal/__init__.py +0 -0
medrag_multi_modal/document_loader/__init__.py +3 -0
medrag_multi_modal/document_loader/load_text.py +122 -0
pyproject.toml +62 -0
uv.lock +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+.venv/
+.env
+cursor_prompt.txt
+**egg-info/
+**pycache**
+.ruff_cache/
+test.py
+**.pdf

medrag_multi_modal/__init__.py ADDED Viewed

File without changes

medrag_multi_modal/document_loader/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .load_text import load_text_from_pdf
2	+
3	+ __all__ = ["load_text_from_pdf"]

medrag_multi_modal/document_loader/load_text.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import asyncio
+import os
+from typing import Optional
+import pymupdf4llm
+import PyPDF2
+import rich
+import weave
+from firerequests import FireRequests
+from pydantic import BaseModel
+class Page(BaseModel):
+    text: str
+    page_idx: int
+    document_name: str
+    file_path: str
+    file_url: str
+async def load_text_from_pdf(
+    url: str,
+    document_name: str,
+    document_file_path: str,
+    start_page: Optional[int] = None,
+    end_page: Optional[int] = None,
+    weave_dataset_name: Optional[str] = None,
+) -> list[Page]:
+    """
+    Asynchronously loads text from a PDF file specified by a URL or local file path,
+    processes the text into markdown format, and optionally publishes it to a Weave dataset.
+    This function downloads a PDF from a given URL if it does not already exist locally,
+    reads the specified range of pages, converts each page's content to markdown, and
+    returns a list of Page objects containing the text and metadata. It uses PyPDF2 to read
+    the PDF and pymupdf4llm to convert pages to markdown. It processes pages concurrently using
+    `asyncio` for efficiency. If a weave_dataset_name is provided, the processed pages are published
+    to a Weave dataset.
+    !!! example "Example usage"
+        ```python
+        import asyncio
+        import weave
+        from medrag_multi_modal.document_loader import load_text_from_pdf
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+        asyncio.run(
+            load_text_from_pdf(
+                url=url,
+                document_name="Gray's Anatomy",
+                start_page=9,
+                end_page=15,
+                document_file_path="grays_anatomy.pdf",
+            )
+        )
+        ```
+    Args:
+        url (str): The URL of the PDF file to download if not present locally.
+        document_name (str): The name of the document for metadata purposes.
+        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+        start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
+        end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
+        weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.
+    Returns:
+        list[Page]: A list of Page objects, each containing the text and metadata for a processed page.
+    Raises:
+        ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
+    """
+    if not os.path.exists(document_file_path):
+        FireRequests().download(url, filename=document_file_path)
+    with open(document_file_path, "rb") as file:
+        pdf_reader = PyPDF2.PdfReader(file)
+        page_count = len(pdf_reader.pages)
+    print(f"Page count: {page_count}")
+    if start_page:
+        if start_page > page_count:
+            raise ValueError(
+                f"Start page {start_page} is greater than the total page count {page_count}"
+            )
+    else:
+        start_page = 0
+    if end_page:
+        if end_page > page_count:
+            raise ValueError(
+                f"End page {end_page} is greater than the total page count {page_count}"
+            )
+    else:
+        end_page = page_count - 1
+    pages: list[Page] = []
+    processed_pages_counter: int = 1
+    total_pages = end_page - start_page
+    async def process_page(page_idx):
+        nonlocal processed_pages_counter
+        text = pymupdf4llm.to_markdown(
+            doc=document_file_path, pages=[page_idx], show_progress=False
+        )
+        pages.append(
+            Page(
+                text=text,
+                page_idx=page_idx,
+                document_name=document_name,
+                file_path=document_file_path,
+                file_url=url,
+            )
+        )
+        rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
+        processed_pages_counter += 1
+    tasks = [process_page(page_idx) for page_idx in range(start_page, end_page)]
+    for task in asyncio.as_completed(tasks):
+        await task
+    if weave_dataset_name:
+        weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
+    return pages

pyproject.toml ADDED Viewed

	@@ -0,0 +1,62 @@

+[project]
+name = "medrag-multi-modal"
+version = "0.0.1"
+description = ""
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "Byaldi>=0.0.5",
+    "firerequests>=0.0.7",
+    "python-dotenv>=1.0.1",
+    "pymupdf4llm>=0.0.17",
+    "torch>=2.4.1",
+    "weave>=0.51.12",
+    "pip>=24.2",
+    "uv>=0.4.20",
+    "pytest>=8.3.3",
+    "PyPDF2>=3.0.1",
+    "isort>=5.13.2",
+    "black>=24.10.0",
+    "ruff>=0.6.9",
+    "mkdocs>=1.6.1",
+    "mkdocstrings>=0.26.1",
+    "mkdocstrings-python>=1.11.1",
+    "mkdocs-material>=9.5.39",
+    "mkdocs-minify-plugin>=0.8.0",
+    "mkdocs-glightbox>=0.4.0",
+    "mkdocs-jupyter>=0.25.0",
+    "jupyter>=1.1.1",
+]
+[project.optional-dependencies]
+core = [
+    "Byaldi>=0.0.5",
+    "firerequests>=0.0.7",
+    "python-dotenv>=1.0.1",
+    "pymupdf4llm>=0.0.17",
+    "torch>=2.4.1",
+    "weave>=0.51.12",
+]
+dev = [
+    "pytest>=8.3.3",
+    "PyPDF2>=3.0.1",
+    "isort>=5.13.2",
+    "black>=24.10.0",
+    "ruff>=0.6.9",
+]
+docs = [
+    "mkdocs>=1.6.1",
+    "mkdocstrings>=0.26.1",
+    "mkdocstrings-python>=1.11.1",
+    "mkdocs-material>=9.5.39",
+    "mkdocs-minify-plugin>=0.8.0",
+    "mkdocs-glightbox>=0.4.0",
+    "mkdocs-jupyter>=0.25.0",
+    "jupyter>=1.1.1",
+]
+[tool.pytest.ini_options]
+pythonpath = "."

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff