"""Vector-store setup and occupation similarity search built on Chroma."""

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from sentence_transformers import SentenceTransformer, util
from langchain.docstore.document import Document
import numpy as np
from config import *
import os
import uuid

# NOTE(review): an empty CURL_CA_BUNDLE is a common workaround that makes
# `requests` skip TLS certificate verification -- confirm this is intended
# and acceptable for the environment this runs in.
os.environ['CURL_CA_BUNDLE'] = ""

# Embedding model used by the persistent store; queries are prefixed with
# the project-configured QUERY_INSTRUCTION.
embedding_int = HuggingFaceBgeEmbeddings(
    model_name=MODEL_NAME,
    encode_kwargs=ENCODE_KWARGS,
    query_instruction=QUERY_INSTRUCTION,
)

# Same model and encode settings, but with a query instruction aimed at
# plain semantic similarity; used for the per-call occupation search below.
embedding_sim = HuggingFaceBgeEmbeddings(
    model_name=MODEL_NAME,
    encode_kwargs=ENCODE_KWARGS,
    query_instruction='Retrieve semantically similar text.',
)

# Persistent Chroma store at PERSIST_DIRECTORY and a retriever over it that
# returns the TOP_K best matches.
db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_int)
retriever = db.as_retriever(search_kwargs={"k": TOP_K})


def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
    """Return the occupations in *berufe* most similar to a query text.

    One ``Document`` is built per row of *berufe*. Its ``page_content`` is
    the row's short name, full name and description joined by spaces; this
    text is what gets embedded and searched. The documents are indexed in a
    throw-away Chroma collection that is deleted again before returning.

    Args:
        target_occupation_query: Query text handed to the retriever.
        berufe: Table of occupations exposing ``iterrows()`` (presumably a
            pandas DataFrame -- confirm against callers). Each row must
            provide the keys ``'id'``, ``'short name'``, ``'full name'``,
            ``'description'`` and ``'entry requirements'``.
        top_k: Number of documents the retriever is asked to return.
        similarity_func: Value stored as the collection's ``"hnsw:space"``
            metadata, i.e. the distance function Chroma uses for the index.

    Returns:
        The list of ``Document`` objects returned by the retriever, or an
        empty list when *berufe* has no rows.
    """
    docs = [
        Document(
            page_content=(
                beruf['short name']
                + ' '
                + beruf['full name']
                + ' '
                + beruf['description']
            ),
            metadata={
                "id": beruf["id"],
                "name": beruf['short name'],
                "description": beruf["description"],
                "entry_requirements": beruf["entry requirements"],
            },
        )
        for _, beruf in berufe.iterrows()
    ]

    # Nothing to index: skip building a collection (an empty insert would
    # otherwise fail inside Chroma) and report no matches.
    if not docs:
        return []

    # Use a unique collection name per call. Without one, every call lands in
    # the same default in-memory collection, so documents from earlier calls
    # accumulate and the first call's "hnsw:space" setting sticks.
    db_temp = Chroma.from_documents(
        documents=docs,
        embedding=embedding_sim,
        collection_name=f"tmp-occupations-{uuid.uuid4().hex}",
        collection_metadata={"hnsw:space": similarity_func},
    )
    try:
        # The retriever searches for the top_k documents most similar to the query.
        retriever_temp = db_temp.as_retriever(search_kwargs={"k": top_k})
        top_similar_occupations = retriever_temp.get_relevant_documents(
            target_occupation_query
        )
    finally:
        # The returned documents are already materialised, so the temporary
        # collection can be dropped to free its memory.
        db_temp.delete_collection()

    return top_similar_occupations