import glob
import os

import chromadb
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, models

from utils.constants import CHROMA_DIR, COLLECTION, DOCS_DIR, EMB_MODEL_NAME


def get_embedder():
    # Base transformer plus a mean-pooling layer -> sentence-level embeddings.
    word = models.Transformer(EMB_MODEL_NAME)
    pool = models.Pooling(word.get_word_embedding_dimension())
    return SentenceTransformer(modules=[word, pool])


def get_chroma():
    # Persistent client; the collection's HNSW index uses cosine distance.
    client = chromadb.PersistentClient(path=CHROMA_DIR)
    return client, client.get_or_create_collection(COLLECTION, metadata={"hnsw:space": "cosine"})


def embed(model, texts):
    # Return plain Python lists, as Chroma expects for embeddings.
    return model.encode(texts, convert_to_numpy=True).tolist()


def seed_index(col, model, folder):
    # Chunk every .txt file in `folder` and add the chunks to the collection.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    ids, docs, meta = [], [], []
    for path in glob.glob(os.path.join(folder, "*.txt")):
        title = os.path.basename(path).replace(".txt", "")
        with open(path) as f:
            text = f.read()
        for i, chunk in enumerate(splitter.split_text(text)):
            ids.append(f"{title}-{i}")
            docs.append(chunk)
            meta.append({"title": title, "source": path})
    embeddings = embed(model, docs)
    try:
        col.add(ids=ids, documents=docs, metadatas=meta, embeddings=embeddings)
    except Exception:
        # IDs already exist (e.g. re-seeding): drop the stale chunks and re-add.
        col.delete(ids=ids)
        col.add(ids=ids, documents=docs, metadatas=meta, embeddings=embeddings)
    return len(docs)


def retrieve(col, model, query, k):
    # Embed the query and return the top-k chunks with their titles and source paths.
    query_embedding = embed(model, [query])[0]
    res = col.query(query_embeddings=[query_embedding], n_results=k, include=["documents", "metadatas"])
    out = []
    if res.get("ids"):
        for doc, md in zip(res["documents"][0], res["metadatas"][0]):
            out.append({"text": doc, "title": md["title"], "source": md["source"]})
    return out
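

# Usage sketch (illustrative, not part of the original module): seed the index from
# DOCS_DIR, then run a sample query. Assumes DOCS_DIR points at a folder of .txt
# files, the Chroma store at CHROMA_DIR is writable, and "example query" is a
# placeholder string.
if __name__ == "__main__":
    model = get_embedder()
    _, collection = get_chroma()
    n_chunks = seed_index(collection, model, DOCS_DIR)
    print(f"Indexed {n_chunks} chunks from {DOCS_DIR}")
    for hit in retrieve(collection, model, "example query", k=3):
        print(f"- {hit['title']} ({hit['source']})")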