import sys
import time
from pathlib import Path

from langchain_chroma import Chroma
from langchain_core.documents import Document

# sys.path.append('C://Users//Admin//Desktop//PDPO//NLL_LLM//util')
sys.path.append('/home/user/app/util')
from Embeddings import TextEmb3LargeEmbedding


class EmbeddingFunction:
    """Adapter exposing the embedding model through the interface LangChain expects."""

    def __init__(self, embeddingmodel):
        self.embeddingmodel = embeddingmodel

    def embed_query(self, query):
        return list(self.embeddingmodel.get_embedding(query))

    def embed_documents(self, documents):
        return [self.embeddingmodel.get_embedding(document) for document in documents]


def get_or_create_vector_base(collection_name: str, embedding, documents=None) -> Chroma:
    """
    Check whether the vector store has already been built; if not, initialize it first.
    Instead of batch-initializing the store via embed_documents, documents are added one
    by one in a for loop with a sleep between calls, so that the OpenAI API rate limit
    is not hit and initialization does not fail.
    """
    persist_directory = "/home/user/app/store/" + collection_name
    persist_path = Path(persist_directory)
    if not persist_path.exists() and not documents:
        raise ValueError("vector store does not exist and documents is empty")
    elif persist_path.exists():
        print("vector store already exists")
        vector_store = Chroma(
            collection_name=collection_name,
            embedding_function=embedding,
            persist_directory=persist_directory
        )
    else:
        print("start creating vector store")
        vector_store = Chroma(
            collection_name=collection_name,
            embedding_function=embedding,
            persist_directory=persist_directory
        )
        # Add documents one at a time, pausing between calls to stay under the
        # embedding API's rate limit.
        for document in documents:
            vector_store.add_documents(documents=[document])
            time.sleep(1)
    return vector_store


if __name__ == "__main__":
    import pandas as pd

    requirements_data = pd.read_csv("/root/PTR-LLM/tasks/pcf/reference/NLL_DATA_NEW_Test.csv")

    # Group rows by requirement text, collecting each requirement's privacy
    # objectives and safeguards into sets to deduplicate them.
    requirements_dict_v2 = {}
    for index, row in requirements_data.iterrows():
        requirement = row['Requirement'].split("- ")[1]
        requirement = requirement + ": " + row['Details']
        requirement = requirement.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        if requirement not in requirements_dict_v2:
            requirements_dict_v2[requirement] = {
                'PO': set(),
                'safeguard': set()
            }
        requirements_dict_v2[requirement]['PO'].add(
            row['PCF-Privacy Objective'].lower().rstrip()
            if isinstance(row['PCF-Privacy Objective'], str) else None
        )
        requirements_dict_v2[requirement]['safeguard'].add(row['Safeguard'].lower().rstrip())

    # Turn each unique requirement into a Document, keeping the objectives and
    # safeguards (None entries filtered out) as string metadata.
    index = 0
    documents = []
    for key, value in requirements_dict_v2.items():
        page_content = key
        metadata = {
            "index": index,
            "version": 2,
            "PO": str([po for po in value['PO'] if po]),
            "safeguard": str([safeguard for safeguard in value['safeguard']])
        }
        index += 1
        document = Document(
            page_content=page_content,
            metadata=metadata
        )
        documents.append(document)

    embeddingmodel = TextEmb3LargeEmbedding(max_qpm=58)
    embedding = EmbeddingFunction(embeddingmodel)
    requirement_v2_vector_store = get_or_create_vector_base('requirement_v2_database', embedding, documents)
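    # --- Retrieval sketch (illustrative addition, not part of the original script) ---
    # Shows how the store built above could be queried; the query string and k value
    # are hypothetical. Chroma.similarity_search embeds the query with the wrapped
    # embedding function and returns the k most similar Documents.
    hits = requirement_v2_vector_store.similarity_search("data retention obligations", k=3)
    for hit in hits:
        print(hit.metadata["index"], hit.page_content[:80])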