Muddasri committed on
Commit
6c2979e
·
1 Parent(s): 1c7cc69

Changed ingestion logic: raised per-technique chunk sizes and added "paragraph" and "page" chunking techniques

Browse files
Files changed (2) hide show
  1. ingest.py +4 -4
  2. retriever/processor.py +23 -1
ingest.py CHANGED
@@ -25,28 +25,28 @@ CHUNKING_TECHNIQUES = [
25
  {
26
  "name": "sentence",
27
  "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
28
- "chunk_size": 1000,
29
  "chunk_overlap": 100,
30
  "kwargs": {},
31
  },
32
  {
33
  "name": "paragraph",
34
  "description": "Paragraph-level chunking - uses natural paragraph breaks",
35
- "chunk_size": 1000,
36
  "chunk_overlap": 100,
37
  "kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
38
  },
39
  {
40
  "name": "semantic",
41
  "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
42
- "chunk_size": 1000,
43
  "chunk_overlap": 100,
44
  "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
45
  },
46
  {
47
  "name": "recursive",
48
  "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
49
- "chunk_size": 1000,
50
  "chunk_overlap": 100,
51
  "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
52
  },
 
25
  {
26
  "name": "sentence",
27
  "description": "Sentence-level chunking - respects sentence boundaries (NLTK)",
28
+ "chunk_size": 2400,
29
  "chunk_overlap": 100,
30
  "kwargs": {},
31
  },
32
  {
33
  "name": "paragraph",
34
  "description": "Paragraph-level chunking - uses natural paragraph breaks",
35
+ "chunk_size": 4000,
36
  "chunk_overlap": 100,
37
  "kwargs": {"separator": "\n\n"}, # Split on paragraph breaks
38
  },
39
  {
40
  "name": "semantic",
41
  "description": "Semantic chunking - splits where topic/meaning shifts (embedding similarity)",
42
+ "chunk_size": 2000,
43
  "chunk_overlap": 100,
44
  "kwargs": {"breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 70},
45
  },
46
  {
47
  "name": "recursive",
48
  "description": "Recursive chunking - hierarchical splitting (paragraphs → sentences → words → chars)",
49
+ "chunk_size": 2000,
50
  "chunk_overlap": 100,
51
  "kwargs": {"separators": ["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""], "keep_separator": True},
52
  },
retriever/processor.py CHANGED
@@ -32,8 +32,10 @@ class ChunkProcessor:
32
  - "fixed": Character-based, may split mid-sentence
33
  - "recursive": Recursive character splitting with hierarchical separators
34
  - "character": Character-based splitting on paragraph boundaries
 
35
  - "sentence": Sliding window over NLTK sentences
36
  - "semantic": Embedding-based semantic chunking
 
37
  """
38
  if technique == "fixed":
39
  return CharacterTextSplitter(
@@ -62,6 +64,16 @@ class ChunkProcessor:
62
  is_separator_regex=False
63
  )
64
 
 
 
 
 
 
 
 
 
 
 
65
  elif technique == "sentence":
66
  # sentence-level chunking using NLTK
67
  return NLTKTextSplitter(
@@ -78,8 +90,18 @@ class ChunkProcessor:
78
  breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)
79
  )
80
 
 
 
 
 
 
 
 
 
 
 
81
  else:
82
- raise ValueError(f"Technique '{technique}' is not supported. Choose from: fixed, recursive, character, sentence, semantic")
83
 
84
  # ------------------------------------------------------------------
85
  # Processing
 
32
  - "fixed": Character-based, may split mid-sentence
33
  - "recursive": Recursive character splitting with hierarchical separators
34
  - "character": Character-based splitting on paragraph boundaries
35
+ - "paragraph": Paragraph-level splitting on \\n\\n boundaries
36
  - "sentence": Sliding window over NLTK sentences
37
  - "semantic": Embedding-based semantic chunking
38
+ - "page": Page-level splitting on page markers
39
  """
40
  if technique == "fixed":
41
  return CharacterTextSplitter(
 
64
  is_separator_regex=False
65
  )
66
 
67
+ elif technique == "paragraph":
68
+ # Paragraph-level chunking using paragraph breaks
69
+ return CharacterTextSplitter(
70
+ separator=kwargs.get('separator', "\n\n"),
71
+ chunk_size=chunk_size,
72
+ chunk_overlap=chunk_overlap,
73
+ length_function=len,
74
+ is_separator_regex=False
75
+ )
76
+
77
  elif technique == "sentence":
78
  # sentence-level chunking using NLTK
79
  return NLTKTextSplitter(
 
90
  breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)
91
  )
92
 
93
+ elif technique == "page":
94
+ # Page-level chunking using page markers
95
+ return CharacterTextSplitter(
96
+ separator=kwargs.get('separator', "--- Page"),
97
+ chunk_size=chunk_size,
98
+ chunk_overlap=chunk_overlap,
99
+ length_function=len,
100
+ is_separator_regex=False
101
+ )
102
+
103
  else:
104
+ raise ValueError(f"Technique '{technique}' is not supported. Choose from: fixed, recursive, character, paragraph, sentence, semantic, page")
105
 
106
  # ------------------------------------------------------------------
107
  # Processing