geekyrakshit committed
Commit ace03e3 · 1 Parent(s): 49d583d

add: SemanticChunker

medrag_multi_modal/{semantic_chunker.py → semantic_chunking.py} RENAMED
@@ -17,6 +17,42 @@ TOKENIZER_OR_TOKEN_COUNTER = Union[
 
 
 class SemanticChunker:
+    """
+    SemanticChunker is a class that chunks documents into smaller segments and
+    publishes them as datasets.
+
+    This class uses the `semchunk` library to break down large documents into
+    smaller, manageable chunks based on a specified tokenizer or token counter.
+    This is particularly useful for processing large text datasets where
+    smaller segments are needed for analysis or other operations.
+
+    !!! example "Example Usage"
+        ```python
+        import weave
+        from dotenv import load_dotenv
+
+        from medrag_multi_modal.semantic_chunking import SemanticChunker
+
+        load_dotenv()
+        weave.init(project_name="ml-colabs/medrag-multi-modal")
+        chunker = SemanticChunker(chunk_size=256)
+        chunker.chunk_and_publish(
+            document_dataset_name="grays-anatomy-text:v13",
+            chunk_dataset_name="grays-anatomy-chunks",
+        )
+        ```
+
+    Args:
+        tokenizer_or_token_counter (TOKENIZER_OR_TOKEN_COUNTER): The tokenizer or
+            token counter to be used for chunking.
+        chunk_size (Optional[int]): The size of each chunk. If not specified, the
+            default chunk size from `semchunk` will be used.
+        max_token_chars (Optional[int]): The maximum number of characters per token.
+            If not specified, the default value from `semchunk` will be used.
+        memoize (bool): Whether to memoize the chunking process for efficiency.
+            Default is True.
+    """
+
     def __init__(
         self,
         tokenizer_or_token_counter: TOKENIZER_OR_TOKEN_COUNTER = "o200k_base",
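
The new docstring's `Args` mirror the parameters of `semchunk`'s `chunkerify` factory (`chunk_size`, `max_token_chars`, `memoize`), which suggests the class delegates chunking to it. Below is a minimal sketch of that underlying call, assuming `semchunk` is installed and that `SemanticChunker` forwards its constructor arguments to `semchunk.chunkerify`; the class internals are outside this hunk, and the sample `text` is illustrative only.

```python
# Minimal sketch, NOT the class internals: assumes SemanticChunker forwards
# its constructor arguments to semchunk.chunkerify (this diff does not show
# the method bodies).
import semchunk

# Build a chunker from the o200k_base tiktoken encoding, mirroring the
# default tokenizer_or_token_counter in __init__ above, with chunk_size=256
# as in the docstring's example.
chunker = semchunk.chunkerify("o200k_base", chunk_size=256)

# Illustrative input; the real class reads documents from a published
# dataset such as "grays-anatomy-text:v13".
text = (
    "The human skeleton is the internal framework of the body. "
    "It serves six major functions: support, movement, protection, "
    "production of blood cells, storage of minerals, and regulation "
    "of endocrine factors."
)

# Returns a list of strings, each at most ~256 tokens under o200k_base.
chunks = chunker(text)
print(chunks)
```

Splitting on token counts rather than raw characters keeps each chunk within a model's context budget, which is presumably why the wrapper exposes a tokenizer name as its first argument.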