PDF_Query_System / embeddings.py
ajoy0071998's picture
Upload 5 files
6d9f20a verified
##return embedding vector for a given text
##uses sentence-based embeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma
# Embedding model configuration: a sentence-transformers MiniLM checkpoint,
# loaded on the CPU, producing normalized embedding vectors.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

# Shared embedding function used by the vector store below.
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Module-level Chroma collection used by the functions in this file; it is
# configured with the "chroma_langchain_db" persist directory.
vector_store = Chroma(
    collection_name="collection",
    embedding_function=hf,
    persist_directory="chroma_langchain_db",
)
def set_embedding(text: str, doc_id: str, user_id: str) -> None:
    """Split *text* into chunks and store them in the module vector store.

    The text is split with a tiktoken-based ``CharacterTextSplitter``
    (``cl100k_base`` encoding, chunk size 300, overlap 40). Each chunk is
    wrapped in a ``Document`` carrying ``doc_id`` and ``user_id`` metadata
    and an id of the form ``user_id + doc_id + str(chunk_index)``, then
    added to ``vector_store``. One progress line is printed per chunk.

    Args:
        text: Raw text of the document to index.
        doc_id: Identifier of the document the text belongs to.
        user_id: Identifier of the user who owns the document.

    Returns:
        None. If the splitter produces no chunks, nothing is stored.
    """
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=300, chunk_overlap=40
    )
    texts = text_splitter.split_text(text)
    if not texts:
        # Nothing to index; the previous debug print of texts[0] raised
        # IndexError here.
        return

    # NOTE(review): the id is a plain concatenation, so different
    # (user_id, doc_id) pairs can in principle yield the same id. The format
    # is kept unchanged so ids match what was stored by earlier versions.
    documents = [
        Document(
            page_content=chunk,
            metadata={"doc_id": doc_id, "user_id": user_id},
            id=user_id + doc_id + str(index),
        )
        for index, chunk in enumerate(texts)
    ]

    # Add in bounded batches rather than one call per chunk: fewer calls into
    # the store while keeping each individual request a modest size.
    batch_size = 256
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        vector_store.add_documents(batch)
        for index, document in enumerate(batch, start=start):
            print(f"Added document {index} with id {document.id}")
def get_chunks(query: str, user_id: str, doc_id: str) -> list[str]:
    """Return the text of the stored chunks most similar to *query*.

    Runs a similarity search (``k=5``) against the module ``vector_store``,
    restricted by metadata to the given user and document.

    Args:
        query: Natural-language query to search with.
        user_id: Only chunks whose ``user_id`` metadata equals this value
            are considered.
        doc_id: Only chunks whose ``doc_id`` metadata equals this value are
            considered. If empty/``None``, the search is restricted by
            ``user_id`` alone (the previous behaviour).

    Returns:
        A list with the ``page_content`` of each matching chunk, in the
        order returned by the vector store.
    """
    if doc_id:
        # Previously doc_id was accepted but ignored, so chunks from any of
        # the user's documents could be returned. Chroma needs an explicit
        # "$and" operator to combine several metadata conditions.
        metadata_filter = {"$and": [{"user_id": user_id}, {"doc_id": doc_id}]}
    else:
        metadata_filter = {"user_id": user_id}

    results = vector_store.similarity_search(
        query,
        k=5,
        filter=metadata_filter,
    )
    return [res.page_content for res in results]