import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from backend.model_utils import ensure_model_exists


def chunk_and_embed(text, metadata=None):
    """
    Split text into chunks and create embeddings using LangChain.

    Args:
        text: Text to chunk and embed
        metadata: Document metadata

    Returns:
        chunks: Text chunks
        embeddings: Embeddings for each chunk
        chunk_metadata: Metadata for each chunk
    """
    if not text or not isinstance(text, str):
        st.warning("No valid text to process")
        return [], [], []

    try:
        # Create text splitter
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )

        # Split text into chunks
        chunks = text_splitter.split_text(text)
        if not chunks:
            st.warning("No chunks created")
            return [], [], []

        # Ensure the model exists
        model_path = ensure_model_exists("all-MiniLM-L6-v2")
        if not model_path:
            st.error("Failed to load embedding model")
            return chunks, [], []

        # Create embeddings
        embeddings_model = HuggingFaceEmbeddings(
            model_name=model_path,
            model_kwargs={"device": "cpu"}
        )

        # Generate chunk metadata
        chunk_metadata = []
        for i, chunk in enumerate(chunks):
            chunk_meta = {
                "text": chunk,
                "chunk_index": i
            }
            # Add document metadata if provided
            if metadata and isinstance(metadata, dict):
                for key, value in metadata.items():
                    if key not in ("text", "chunk_index"):
                        chunk_meta[key] = value
            chunk_metadata.append(chunk_meta)

        # Create raw embeddings for storage
        raw_embeddings = embeddings_model.embed_documents(chunks)

        return chunks, raw_embeddings, chunk_metadata

    except Exception as e:
        st.error(f"Error chunking and embedding text: {e}")
        return [], [], []
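

# --- Usage sketch (illustrative only, not part of the original module) ---
# A minimal example of calling chunk_and_embed. The sample text and metadata
# below are hypothetical, and this assumes backend.model_utils.ensure_model_exists
# can resolve "all-MiniLM-L6-v2" to a usable local model path. Run with
# `streamlit run` if you want the st.write output rendered in a page.
if __name__ == "__main__":
    sample_text = (
        "Retrieval pipelines split documents into overlapping chunks.\n"
        "Each chunk is embedded and stored for later similarity search."
    )
    sample_metadata = {"source": "example.pdf", "page": 1}  # hypothetical metadata

    chunks, embeddings, chunk_metadata = chunk_and_embed(sample_text, sample_metadata)
    st.write(f"Created {len(chunks)} chunks and {len(embeddings)} embeddings")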