## Return embedding vectors for a given text.
## Uses sentence-based embeddings (SentenceTransformers via HuggingFaceEmbeddings).
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

import os

# Ensure the Hugging Face cache points to a writable location.
# (Newer transformers versions prefer HF_HOME over TRANSFORMERS_CACHE.)
os.environ["TRANSFORMERS_CACHE"] = "/app/cache"

# HuggingFaceEmbeddings expects the model name as a string; it loads the
# SentenceTransformer internally, so no separate SentenceTransformer
# instantiation is needed.
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

vector_store = Chroma(
    collection_name="collection",
    embedding_function=hf,
    persist_directory="chroma_langchain_db",
)
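
# Illustrative sanity check (assumptions: paraphrase-MiniLM-L6-v2 produces
# 384-dimensional embeddings, and normalize_embeddings=True yields unit-length
# vectors). Uncomment to verify the embedding setup before indexing:
#   vec = hf.embed_query("hello world")
#   print(len(vec))  # expected: 384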


def set_embedding(text: str, doc_id: str, user_id: str):
    # Split the text into ~300-token chunks with 40-token overlap.
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=300, chunk_overlap=40
    )
    texts = text_splitter.split_text(text)  # list of strings

    for i, chunk in enumerate(texts):
        # Deterministic id so re-indexing the same chunk of the same
        # document does not create duplicates.
        vector_id = f"{user_id}{doc_id}{i}"
        document = Document(
            page_content=chunk,
            metadata={"doc_id": doc_id, "user_id": user_id},
            id=vector_id,
        )
        vector_store.add_documents([document])
        print(f"Added document {i} with id {vector_id}")

def get_chunks(query: str, user_id: str, doc_id: str):
    # NOTE: doc_id is accepted but not used in the filter, so the search
    # spans all of the user's documents, not just one.
    results = vector_store.similarity_search(
        query,
        k=5,
        filter={"user_id": user_id},
    )
    return [res.page_content for res in results]
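

# --- Usage sketch ---
# A minimal, hypothetical example of how set_embedding and get_chunks fit
# together; the sample text, query, and ids below are illustrative only.
if __name__ == "__main__":
    sample_text = (
        "Chroma is a vector database. It stores embeddings and their metadata "
        "and supports similarity search over them."
    )
    # Index the text under a hypothetical user/document pair.
    set_embedding(sample_text, doc_id="doc1", user_id="user1")

    # Retrieve the chunks most similar to a query for that user.
    chunks = get_chunks("What does Chroma store?", user_id="user1", doc_id="doc1")
    for chunk in chunks:
        print(chunk)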