Spaces:
Paused
Paused
File size: 2,432 Bytes
4a0c158 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
from enum import Enum
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter,NLTKTextSplitter,SpacyTextSplitter
# Separator candidates handed to RecursiveCharacterTextSplitter, listed from
# the coarsest boundary to the finest. The non-ASCII entries cover text that
# does not put spaces between words.
separators = [
    "\n\n",    # blank line
    "\n",      # line break
    " ",       # space
    ".",       # ASCII full stop
    ",",       # ASCII comma
    "\u200b",  # zero-width space
    "\uff0c",  # fullwidth comma
    "\u3001",  # ideographic comma
    "\uff0e",  # fullwidth full stop
    "\u3002",  # ideographic full stop
    "",        # empty string, kept as the final entry
]
class ChunkingStrategy(Enum):
    """Closed set of text-splitting strategies a caller can select.

    Each member's value is its lowercase string identifier, so a member can
    be looked up from that string with ``ChunkingStrategy("<value>")``.
    """

    # Split on a prioritized list of separator strings.
    RECURSIVE_CHARACTER_CHAR_SPLITTER = "recursive_character_char_splitter"
    # Split with the NLTK-backed splitter.
    NLTK_TEXT_SPLITTER = "nltk_text_splitter"
    # Split with the spaCy-backed splitter.
    SPACY_TEXT_SPLITTER = "spacy_text_splitter"
class TextLoaderAndSplitterWrapper:
    """Load a ``.pdf`` or ``.txt`` file and chunk it with a chosen splitter.

    The splitter is built once in ``__init__`` from a ``ChunkingStrategy``
    member. ``load_documents`` picks a loader from the file extension and
    splits the loaded content with that splitter.
    """

    def __init__(self, strategy: ChunkingStrategy, file_path: str):
        """Build the splitter for *strategy* and remember *file_path*.

        Args:
            strategy: Which splitter implementation to construct.
            file_path: Path of the file that ``load_documents`` will read.

        Raises:
            ValueError: If *strategy* is not one of the handled members.
        """
        # Defaults
        self.splitter = None
        self.documents = []

        # Determine which splitter to use from the strategy parameter.
        # Only the splitter for the requested strategy is constructed.
        if strategy == ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER:
            self.splitter = RecursiveCharacterTextSplitter(separators=separators)
        elif strategy == ChunkingStrategy.NLTK_TEXT_SPLITTER:
            self.splitter = NLTKTextSplitter()
        elif strategy == ChunkingStrategy.SPACY_TEXT_SPLITTER:
            self.splitter = SpacyTextSplitter()
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

        # The file is not read here; loading happens in load_documents().
        self.file_path = file_path

    def load_documents(self):
        """Load the file at ``self.file_path`` and split it into chunks.

        The loader is chosen from the file extension, matched
        case-insensitively: ``.pdf`` uses ``PyPDFLoader`` and ``.txt`` uses
        ``TextLoader``.

        Returns:
            The value returned by the loader's ``load_and_split`` call. The
            same value is stored on ``self.documents``.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
        """
        # Compare against a lowercased copy so "REPORT.PDF" is accepted, but
        # hand the loader the path exactly as the caller supplied it.
        lowered_path = self.file_path.lower()
        if lowered_path.endswith(".pdf"):
            loader = PyPDFLoader(self.file_path)
        elif lowered_path.endswith(".txt"):
            loader = TextLoader(self.file_path)
        else:
            raise ValueError(f"Unknown file type: {self.file_path}")

        # Always pass the configured splitter explicitly.
        self.documents = loader.load_and_split(text_splitter=self.splitter)
        return self.documents

    def split(self, text: str):
        """Split raw *text* into chunks with the configured splitter.

        Returns:
            Whatever the splitter's ``split_text`` returns for *text*.
        """
        # The splitter API exposes split_text(); it has no split() method.
        return self.splitter.split_text(text)

    def join(self, chunks: list, separator: str = "\n\n"):
        """Concatenate *chunks* into one string, placing *separator* between them.

        Args:
            chunks: Strings, or objects carrying their text in a
                ``page_content`` attribute.
            separator: Text inserted between consecutive chunks.

        Returns:
            The joined string; an empty string when *chunks* is empty.

        NOTE(review): this does not promise to reproduce the pre-split text;
        it only concatenates the given chunks with *separator*.
        """
        # The splitter offers no public join(), so the join is done here.
        return separator.join(
            getattr(chunk, "page_content", chunk) for chunk in chunks
        )

    def __str__(self):
        """Return a short description naming the configured splitter."""
        return f"TextLoaderAndSplitterWrapper(splitter={self.splitter})"

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return str(self)