mratanusarkar committed on
Commit
9761deb
·
1 Parent(s): bb79bf4

add: base text loader and pymupdf4llm loader

Browse files

Break the existing load_text.py into a proper text loader submodule.

medrag_multi_modal/document_loader/text_loader/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .pymupdf4llm_text_loader import PyMuPDF4LLMTextLoader
2
+
3
+ __all__ = ["PyMuPDF4LLMTextLoader"]
medrag_multi_modal/document_loader/text_loader/base_text_loader.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, List, Optional
5
+
6
+ import PyPDF2
7
+ import rich
8
+ import weave
9
+ from firerequests import FireRequests
10
+
11
+
12
class BaseTextLoader(ABC):
    """Abstract base class for loading the text of a PDF document page by page.

    On construction the PDF is downloaded from `url` when `document_file_path`
    does not exist locally, and the number of pages is read with PyPDF2.
    Subclasses implement `_process_page` to extract the data of a single page.

    Args:
        url: URL the PDF is downloaded from when the local file is missing.
        document_name: Name of the document, stored as `self.document_name`.
        document_file_path: Local path of the PDF file.
    """

    def __init__(self, url: str, document_name: str, document_file_path: str):
        self.url = url
        self.document_name = document_name
        self.document_file_path = document_file_path
        # Only download when there is no local copy yet.
        if not os.path.exists(self.document_file_path):
            FireRequests().download(url, filename=self.document_file_path)
        with open(self.document_file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            self.page_count = len(pdf_reader.pages)

    def get_page_indices(
        self, start_page: Optional[int] = None, end_page: Optional[int] = None
    ) -> tuple[int, int]:
        """Resolve the half-open page range ``[start_page, end_page)`` to process.

        Args:
            start_page: Index of the first page to process. Defaults to 0.
            end_page: Index one past the last page to process (exclusive).
                Defaults to `self.page_count`, i.e. up to and including the
                last page of the document.

        Returns:
            The resolved `(start_page, end_page)` tuple.

        Raises:
            ValueError: If `start_page` or `end_page` is greater than
                `self.page_count`.
        """
        # Compare against None rather than relying on truthiness so that an
        # explicit 0 is honoured instead of being replaced by the default.
        if start_page is None:
            start_page = 0
        elif start_page > self.page_count:
            raise ValueError(
                f"Start page {start_page} is greater than the total page count {self.page_count}"
            )
        if end_page is None:
            # `load_data` iterates `range(start_page, end_page)`, which excludes
            # `end_page`; defaulting to `page_count - 1` silently dropped the
            # last page of the document.
            end_page = self.page_count
        elif end_page > self.page_count:
            raise ValueError(
                f"End page {end_page} is greater than the total page count {self.page_count}"
            )
        return start_page, end_page

    @abstractmethod
    async def _process_page(self, page_idx: int) -> Dict[str, str]:
        """Extract the data of the page at index `page_idx`.

        Args:
            page_idx: Index of the page to process.

        Returns:
            A dictionary describing the processed page.
        """

    async def load_data(
        self,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        weave_dataset_name: Optional[str] = None,
    ) -> List[Dict[str, str]]:
        """Process the pages in ``[start_page, end_page)`` and return their data.

        Args:
            start_page: Index of the first page to process. Defaults to 0.
            end_page: Index one past the last page to process (exclusive).
                Defaults to the total page count.
            weave_dataset_name: When given, the processed pages are published
                as a `weave.Dataset` with this name.

        Returns:
            One dictionary per processed page, ordered by page index.

        Raises:
            ValueError: If `start_page` or `end_page` is greater than the
                total page count.
        """
        start_page, end_page = self.get_page_indices(start_page, end_page)
        processed_pages_counter: int = 1
        total_pages = end_page - start_page

        async def process_page(page_idx: int) -> Dict[str, str]:
            nonlocal processed_pages_counter
            page_data = await self._process_page(page_idx)
            rich.print(
                f"Processed page idx: {page_idx}, progress: {processed_pages_counter}/{total_pages}"
            )
            processed_pages_counter += 1
            return page_data

        # `gather` returns results in the order the awaitables were passed, so
        # the pages come back sorted by index regardless of completion order.
        pages = list(
            await asyncio.gather(
                *(process_page(page_idx) for page_idx in range(start_page, end_page))
            )
        )

        if weave_dataset_name:
            weave.publish(weave.Dataset(name=weave_dataset_name, rows=pages))
        return pages
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict
2
+
3
+ import pymupdf4llm
4
+
5
+ from .base_text_loader import BaseTextLoader
6
+
7
+
8
class PyMuPDF4LLMTextLoader(BaseTextLoader):
    """Text loader that extracts each page through `pymupdf4llm.to_markdown`."""

    async def _process_page(self, page_idx: int) -> Dict[str, str]:
        """Run `pymupdf4llm.to_markdown` on one page and bundle it with metadata.

        Args:
            page_idx: Index of the page to process.

        Returns:
            A dictionary with the keys `text`, `page_idx`, `document_name`,
            `file_path` and `file_url`.
        """
        # Restrict the conversion to the single requested page.
        requested_pages = [page_idx]
        page_text = pymupdf4llm.to_markdown(
            doc=self.document_file_path,
            pages=requested_pages,
            show_progress=False,
        )

        page_record = {"text": page_text, "page_idx": page_idx}
        page_record["document_name"] = self.document_name
        page_record["file_path"] = self.document_file_path
        page_record["file_url"] = self.url
        return page_record