Course_rec / app /embedding_setup.py
Tao Wu
initial
3430cbb
raw
history blame
No virus
1.94 kB
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from sentence_transformers import SentenceTransformer, util
from langchain.docstore.document import Document
import numpy as np
from config import *
import os
# Set CURL_CA_BUNDLE to an empty string before any embedding model is built.
# NOTE(review): presumably a workaround for certificate errors while the model
# is downloaded; this likely weakens TLS verification -- confirm it is needed.
os.environ["CURL_CA_BUNDLE"] = ""


def _build_bge_embeddings(instruction):
    """Return a BGE embedding wrapper that uses *instruction* for queries."""
    return HuggingFaceBgeEmbeddings(
        query_instruction=instruction,
        encode_kwargs=ENCODE_KWARGS,
        model_name=MODEL_NAME,
    )


# Embeddings for the persisted store, using the configured query instruction.
embedding_int = _build_bge_embeddings(QUERY_INSTRUCTION)

# Embeddings with a fixed "semantic similarity" query instruction; used by
# find_similar_occupation for its temporary index.
embedding_sim = _build_bge_embeddings("Retrieve semantically similar text.")

# Chroma store opened from PERSIST_DIRECTORY and a retriever that returns the
# TOP_K best matches from it.
db = Chroma(embedding_function=embedding_int, persist_directory=PERSIST_DIRECTORY)
retriever = db.as_retriever(search_kwargs={"k": TOP_K})
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
    """Return the occupations most similar to a free-text query.

    One ``Document`` is created per row of ``berufe``. Its ``page_content``
    (short name, full name and description joined by spaces) is the text that
    gets embedded and searched; the other fields are carried as metadata.
    The documents are indexed in a temporary Chroma collection that is unique
    to this call and removed again before returning, so repeated calls neither
    see each other's documents nor inherit an earlier distance metric.

    Args:
        target_occupation_query: Query text describing the target occupation.
        berufe: Table of occupations exposing ``iterrows()`` (presumably a
            pandas DataFrame) whose rows provide the keys ``id``,
            ``short name``, ``full name``, ``description`` and
            ``entry requirements``.
        top_k: Maximum number of documents to return.
        similarity_func: Value stored under the collection's ``hnsw:space``
            metadata key (Chroma documents ``"l2"``, ``"ip"`` and
            ``"cosine"``).

    Returns:
        A list with up to ``top_k`` matching ``Document`` objects, or an empty
        list when ``berufe`` has no rows.
    """
    import uuid  # local import: only needed to name the temporary collection

    docs = [
        Document(
            page_content=" ".join(
                (beruf["short name"], beruf["full name"], beruf["description"])
            ),
            metadata={
                "id": beruf["id"],
                "name": beruf["short name"],
                "description": beruf["description"],
                "entry_requirements": beruf["entry requirements"],
            },
        )
        for _, beruf in berufe.iterrows()
    ]
    if not docs:
        # Nothing to index; avoid handing an empty batch to the vector store.
        return []

    # A unique collection name keeps this index isolated: without it every
    # call would share the default in-memory collection, accumulating
    # documents and ignoring a changed "hnsw:space" after the first call.
    db_temp = Chroma.from_documents(
        documents=docs,
        embedding=embedding_sim,
        collection_name=f"occupations_{uuid.uuid4().hex}",
        collection_metadata={"hnsw:space": similarity_func},
    )
    try:
        # Fetch the top_k documents closest to the query.
        return db_temp.similarity_search(target_occupation_query, k=top_k)
    finally:
        # Drop the temporary collection so it does not linger in memory.
        db_temp.delete_collection()