Spaces:
Runtime error
Runtime error
## Return embedding vectors for a given text.
## Uses sentence-based embeddings.
from langchain_text_splitters import CharacterTextSplitter | |
from langchain_core.documents import Document | |
from langchain_chroma import Chroma | |
##from sentence_transformers import SentenceTransformer | |
##model_name = SentenceTransformer("paraphrase-MiniLM-L6-v2") | |
##SentenceTransformer("paraphrase-MiniLM-L6-v2") | |
from sentence_transformers import SentenceTransformer | |
import os | |
# Embedding backend and persistent Chroma collection shared by this module.
# langchain_core is already a dependency of this file (see the Document import).
from langchain_core.embeddings import Embeddings

# Ensure the HF cache is set to a writable location.
# NOTE: sentence_transformers has already been imported by the time this line
# runs, so the environment variable on its own may be read too late. The same
# directory is therefore also passed explicitly as ``cache_folder`` below.
_HF_CACHE_DIR = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = _HF_CACHE_DIR

model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

# NOTE: despite its name, ``model_name`` holds the loaded SentenceTransformer
# instance (as it did originally); the name is kept so existing references to
# it keep working.
model_name = SentenceTransformer(
    "sentence-transformers/paraphrase-MiniLM-L6-v2",
    cache_folder=_HF_CACHE_DIR,
    **model_kwargs,
)


class _SentenceTransformerEmbeddings(Embeddings):
    """Adapt a loaded SentenceTransformer to the LangChain Embeddings interface.

    The original code referenced ``HuggingFaceEmbeddings`` without importing it
    and handed it a model instance instead of a model-name string. This adapter
    wraps the already-loaded model directly, so no extra package is required.
    """

    def __init__(self, model, encode_kwargs=None):
        self._model = model
        # Copy so later mutation of the caller's dict does not affect encoding.
        self._encode_kwargs = dict(encode_kwargs or {})

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding (list of floats) per input text."""
        vectors = self._model.encode(list(texts), **self._encode_kwargs)
        # encode() returns a numpy array by default; Chroma expects plain lists.
        return vectors.tolist()

    def embed_query(self, text: str) -> list[float]:
        """Return the embedding for a single query string."""
        return self.embed_documents([text])[0]


hf = _SentenceTransformerEmbeddings(model_name, encode_kwargs)

vector_store = Chroma(
    collection_name="collection",
    embedding_function=hf,
    persist_directory="chroma_langchain_db",
)
def set_embedding(text: str, doc_id: str, user_id: str) -> None:
    """Split *text* into chunks and store them in the module's vector store.

    The text is split with a tiktoken-based ``CharacterTextSplitter``
    (``cl100k_base`` encoding, chunk size 300, overlap 40). Each chunk becomes
    a ``Document`` tagged with ``doc_id`` and ``user_id`` metadata and is added
    to ``vector_store`` in a single batch.

    Args:
        text: Raw text to index.
        doc_id: Identifier of the source document; stored in chunk metadata.
        user_id: Identifier of the owning user; stored in chunk metadata.

    Returns:
        None. If the splitter produces no chunks (e.g. empty text), nothing is
        stored.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=300, chunk_overlap=40
    )
    texts = text_splitter.split_text(text)
    if not texts:
        # Nothing to index; the original crashed here on texts[0].
        return

    # The id format (user_id + doc_id + chunk index) is kept unchanged so that
    # re-indexing a document still targets the ids already persisted.
    documents = [
        Document(
            page_content=chunk,
            metadata={"doc_id": doc_id, "user_id": user_id},
            id=user_id + doc_id + str(i),
        )
        for i, chunk in enumerate(texts)
    ]

    # One batched call instead of one call per chunk; the documents stay local
    # rather than being written into the module's globals().
    vector_store.add_documents(documents)

    for i, document in enumerate(documents):
        print(f"Added document {i} with id {document.id}")
def get_chunks(query: str, user_id: str, doc_id: str) -> list[str]:
    """Return the text of the stored chunks most similar to *query*.

    Runs a similarity search (``k=5``) against ``vector_store``, restricted by
    metadata to the given user and, when ``doc_id`` is non-empty, to that
    document as well.

    Args:
        query: Free-text query to match against stored chunks.
        user_id: Only chunks whose ``user_id`` metadata equals this are searched.
        doc_id: Only chunks whose ``doc_id`` metadata equals this are searched.
            If empty/None, the search spans all of the user's documents (the
            behaviour before this parameter was honoured).

    Returns:
        A list of up to five chunk texts (``page_content``), in the order
        returned by the vector store.
    """
    user_filter = {"user_id": user_id}
    if doc_id:
        # Chroma requires an explicit $and to combine several metadata
        # conditions; a dict with two plain keys is rejected.
        metadata_filter = {"$and": [user_filter, {"doc_id": doc_id}]}
    else:
        metadata_filter = user_filter

    results = vector_store.similarity_search(
        query,
        k=5,
        filter=metadata_filter,
    )
    return [res.page_content for res in results]