geekyrakshit committed on
Commit
b9d8094
·
1 Parent(s): 053a082

add: load_text_from_pdf

Browse files
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ .env
3
+ cursor_prompt.txt
4
+ **egg-info/
5
+ **pycache**
6
+ .ruff_cache/
7
+ test.py
8
+ **.pdf
medrag_multi_modal/__init__.py ADDED
File without changes
medrag_multi_modal/document_loader/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .load_text import load_text_from_pdf
2
+
3
+ __all__ = ["load_text_from_pdf"]
medrag_multi_modal/document_loader/load_text.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from typing import Optional
4
+
5
+ import pymupdf4llm
6
+ import PyPDF2
7
+ import rich
8
+ import weave
9
+ from firerequests import FireRequests
10
+ from pydantic import BaseModel
11
+
12
+
13
class Page(BaseModel):
    """Text of a single PDF page together with metadata describing its origin."""

    # Text content extracted for this page.
    text: str
    # Index of the page within the source document.
    page_idx: int
    # Human-readable name of the source document.
    document_name: str
    # Local path of the PDF file the page was read from.
    file_path: str
    # URL associated with the PDF file.
    file_url: str
19
+
20
+
21
async def load_text_from_pdf(
    url: str,
    document_name: str,
    document_file_path: str,
    start_page: Optional[int] = None,
    end_page: Optional[int] = None,
    weave_dataset_name: Optional[str] = None,
) -> list[Page]:
    """
    Load a range of pages from a PDF as markdown text and optionally publish them to Weave.

    The PDF is downloaded from `url` only if `document_file_path` does not already exist.
    PyPDF2 is used to count the pages and pymupdf4llm converts each selected page to
    markdown. The returned list is always ordered by page index. If `weave_dataset_name`
    is provided, the processed pages are published as a Weave dataset.

    Note that `pymupdf4llm.to_markdown` is invoked synchronously inside each coroutine,
    so the conversions themselves do not overlap; awaiting this function still blocks
    the event loop while each page is converted.

    !!! example "Example usage"
        ```python
        import asyncio

        import weave

        from medrag_multi_modal.document_loader import load_text_from_pdf

        weave.init(project_name="ml-colabs/medrag-multi-modal")
        url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
        asyncio.run(
            load_text_from_pdf(
                url=url,
                document_name="Gray's Anatomy",
                start_page=9,
                end_page=15,
                document_file_path="grays_anatomy.pdf",
            )
        )
        ```

    Args:
        url (str): The URL of the PDF file to download if not present locally.
        document_name (str): The name of the document for metadata purposes.
        document_file_path (str): The local file path where the PDF is stored or will be downloaded.
        start_page (Optional[int]): The first page index (0-based, inclusive) to process.
            Defaults to the first page.
        end_page (Optional[int]): The page index (0-based, exclusive) at which to stop.
            Defaults to the page count, so that the last page is included.
        weave_dataset_name (Optional[str]): The name of the Weave dataset to publish the pages to, if provided.

    Returns:
        list[Page]: A list of Page objects ordered by page index, each containing the
            text and metadata for a processed page.

    Raises:
        ValueError: If start_page or end_page is negative or greater than the document's page count.
    """
    if not os.path.exists(document_file_path):
        FireRequests().download(url, filename=document_file_path)
    with open(document_file_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        page_count = len(pdf_reader.pages)
    print(f"Page count: {page_count}")

    # Compare against None rather than relying on truthiness so that an explicit 0
    # is honoured instead of being mistaken for "not provided".
    if start_page is None:
        start_page = 0
    elif start_page < 0:
        raise ValueError(f"Start page {start_page} must not be negative")
    elif start_page > page_count:
        raise ValueError(
            f"Start page {start_page} is greater than the total page count {page_count}"
        )

    if end_page is None:
        # end_page is exclusive (it feeds range() below), so the default must be
        # page_count; page_count - 1 would silently drop the last page.
        end_page = page_count
    elif end_page < 0:
        raise ValueError(f"End page {end_page} must not be negative")
    elif end_page > page_count:
        raise ValueError(
            f"End page {end_page} is greater than the total page count {page_count}"
        )

    processed_pages_counter: int = 1
    total_pages = end_page - start_page

    async def process_page(page_idx: int) -> Page:
        """Convert one page to markdown and wrap it in a Page."""
        nonlocal processed_pages_counter
        text = pymupdf4llm.to_markdown(
            doc=document_file_path, pages=[page_idx], show_progress=False
        )
        page = Page(
            text=text,
            page_idx=page_idx,
            document_name=document_name,
            file_path=document_file_path,
            file_url=url,
        )
        rich.print(f"Processed pages {processed_pages_counter}/{total_pages}")
        processed_pages_counter += 1
        return page

    # gather() returns results in the order the awaitables were passed, so the
    # output is ordered by page index; as_completed() gives no such guarantee.
    pages: list[Page] = list(
        await asyncio.gather(
            *(process_page(page_idx) for page_idx in range(start_page, end_page))
        )
    )

    if weave_dataset_name:
        # Publish plain dict rows rather than pydantic objects.
        # NOTE(review): Weave datasets are understood to expect dict rows - confirm.
        weave.publish(
            weave.Dataset(
                name=weave_dataset_name,
                rows=[page.model_dump() for page in pages],
            )
        )
    return pages
pyproject.toml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "medrag-multi-modal"
3
+ version = "0.0.1"
4
+ description = ""
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "Byaldi>=0.0.5",
9
+ "firerequests>=0.0.7",
10
+ "python-dotenv>=1.0.1",
11
+ "pymupdf4llm>=0.0.17",
12
+ "torch>=2.4.1",
13
+ "weave>=0.51.12",
14
+ "pip>=24.2",
15
+ "uv>=0.4.20",
16
+ "pytest>=8.3.3",
17
+ "PyPDF2>=3.0.1",
18
+ "isort>=5.13.2",
19
+ "black>=24.10.0",
20
+ "ruff>=0.6.9",
21
+ "mkdocs>=1.6.1",
22
+ "mkdocstrings>=0.26.1",
23
+ "mkdocstrings-python>=1.11.1",
24
+ "mkdocs-material>=9.5.39",
25
+ "mkdocs-minify-plugin>=0.8.0",
26
+ "mkdocs-glightbox>=0.4.0",
27
+ "mkdocs-jupyter>=0.25.0",
28
+ "jupyter>=1.1.1",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ core = [
33
+ "Byaldi>=0.0.5",
34
+ "firerequests>=0.0.7",
35
+ "python-dotenv>=1.0.1",
36
+ "pymupdf4llm>=0.0.17",
37
+ "torch>=2.4.1",
38
+ "weave>=0.51.12",
39
+ ]
40
+
41
+ dev = [
42
+ "pytest>=8.3.3",
43
+ "PyPDF2>=3.0.1",
44
+ "isort>=5.13.2",
45
+ "black>=24.10.0",
46
+ "ruff>=0.6.9",
47
+ ]
48
+
49
+ docs = [
50
+ "mkdocs>=1.6.1",
51
+ "mkdocstrings>=0.26.1",
52
+ "mkdocstrings-python>=1.11.1",
53
+ "mkdocs-material>=9.5.39",
54
+ "mkdocs-minify-plugin>=0.8.0",
55
+ "mkdocs-glightbox>=0.4.0",
56
+ "mkdocs-jupyter>=0.25.0",
57
+ "jupyter>=1.1.1",
58
+ ]
59
+
60
+
61
+ [tool.pytest.ini_options]
62
+ pythonpath = "."
uv.lock ADDED
The diff for this file is too large to render. See raw diff