# QueryQuack: backend/text_chunking.py

import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

from backend.model_utils import ensure_model_exists


def chunk_and_embed(text, metadata=None):
    """
    Split text into chunks and create embeddings using LangChain.

    Args:
        text: Text to chunk and embed.
        metadata: Optional document-level metadata merged into each chunk's metadata.

    Returns:
        chunks: List of text chunks.
        embeddings: One embedding vector per chunk.
        chunk_metadata: One metadata dict per chunk.
    """
    if not text or not isinstance(text, str):
        st.warning("No valid text to process")
        return [], [], []

    try:
        # Create text splitter
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
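        # CharacterTextSplitter splits on the separator, then merges pieces back up
        # to chunk_size (measured by length_function, i.e. characters here). A single
        # "\n"-free run longer than 1000 characters is kept whole, so chunks can
        # exceed chunk_size. The 200-character overlap repeats trailing context at
        # the start of the next chunk so context is preserved across boundaries.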
        # Split text into chunks
        chunks = text_splitter.split_text(text)
        if not chunks:
            st.warning("No chunks created")
            return [], [], []

        # Ensure the embedding model is available locally
        model_path = ensure_model_exists("all-MiniLM-L6-v2")
        if not model_path:
            st.error("Failed to load embedding model")
            return chunks, [], []

        # Create the LangChain embeddings wrapper around the local model (CPU-only)
        embeddings_model = HuggingFaceEmbeddings(
            model_name=model_path,
            model_kwargs={"device": "cpu"},
        )
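        # Note: all-MiniLM-L6-v2 is a sentence-transformers model that produces
        # 384-dimensional vectors; the dimension is a property of the model,
        # not of this wrapper.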
        # Generate per-chunk metadata
        chunk_metadata = []
        for i, chunk in enumerate(chunks):
            chunk_meta = {
                "text": chunk,
                "chunk_index": i,
            }
            # Merge document metadata if provided, without overwriting chunk fields
            if metadata and isinstance(metadata, dict):
                for key, value in metadata.items():
                    if key not in ("text", "chunk_index"):
                        chunk_meta[key] = value
            chunk_metadata.append(chunk_meta)
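        # Illustrative shape of one chunk_metadata entry (values hypothetical):
        #   {"text": "First chunk...", "chunk_index": 0, "source": "example.pdf"}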
        # Create raw embeddings for storage
        raw_embeddings = embeddings_model.embed_documents(chunks)

        return chunks, raw_embeddings, chunk_metadata
    except Exception as e:
        st.error(f"Error chunking and embedding text: {e}")
        return [], [], []
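

if __name__ == "__main__":
    # Minimal usage sketch. The sample text and "source" value are hypothetical;
    # st.warning/st.error render nothing when this file runs outside `streamlit run`.
    sample_text = "First paragraph about ducks.\nSecond paragraph about SQL.\n"
    chunks, embeddings, chunk_metadata = chunk_and_embed(
        sample_text, metadata={"source": "example.pdf"}
    )
    print(f"chunks: {len(chunks)}")
    if embeddings:
        print(f"embedding dimension: {len(embeddings[0])}")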