HuggingDocsLLM / backend_utils /text_processor.py
Jofthomas's picture
Jofthomas HF staff
initial commit
88768cb
import logging
from abc import ABC, abstractmethod

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
class TextProcessor(ABC):
    """Abstract interface for turning raw text into embedded chunks.

    Concrete subclasses must override both ``split_text`` and
    ``create_embeddings``; the class cannot be instantiated otherwise.
    """

    @abstractmethod
    def split_text(self, text):
        """Break ``text`` into chunks; implemented by subclasses."""

    @abstractmethod
    def create_embeddings(self, chunks):
        """Build embeddings for ``chunks``; implemented by subclasses."""
class DefaultTextProcessor(TextProcessor):
    """Text processor backed by LangChain's recursive splitter and a FAISS store.

    ``split_text`` delegates to ``RecursiveCharacterTextSplitter`` and
    ``create_embeddings`` builds a ``FAISS`` store from the chunks using
    ``OpenAIEmbeddings``.
    """

    def __init__(self, chunk_size, chunk_overlap):
        """Store the splitter configuration.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``; lengths
                are measured with ``len``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        self.chunk_overlap = chunk_overlap
        self.chunk_size = chunk_size

    def split_text(self, text):
        """Split ``text`` into chunks using the configured size and overlap.

        Args:
            text: The string to split.

        Returns:
            The list of chunks produced by the splitter, or an empty list
            when ``text`` is empty or ``None``.
        """
        # Nothing to split: skip building a splitter and avoid handing
        # None to the underlying library.
        if not text:
            return []

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            # NOTE(review): separators are tried in list order, so " " takes
            # precedence over "," and "\n" -- confirm this ordering is
            # intended before relying on newline-aligned chunks.
            separators=[" ", ",", "\n"],
            length_function=len,
        )
        return splitter.split_text(text)

    def create_embeddings(self, chunks):
        """Embed ``chunks`` and index them in a FAISS store.

        Args:
            chunks: Text chunks to embed.

        Returns:
            The object returned by ``FAISS.from_texts``, or ``None`` when
            ``chunks`` is empty or building the store fails.
        """
        if not chunks:
            return None

        embeddings = OpenAIEmbeddings()
        try:
            return FAISS.from_texts(chunks, embeddings)
        except Exception:
            # Deliberate best-effort boundary: callers receive None on
            # failure, but the error and its traceback go through logging
            # rather than being printed to stdout.
            logging.getLogger(__name__).exception("Error creating embeddings")
            return None