|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
|
|
|
# Single on-disk location used for everything the Hugging Face stack caches.
CACHE_DIR = "/tmp/hf_cache"

# Create the directory up front; exist_ok makes repeated imports harmless.
os.makedirs(CACHE_DIR, exist_ok=True)

# Point every Hugging Face cache-related environment variable at CACHE_DIR.
# NOTE(review): this runs before the sentence_transformers import further
# down, presumably so the libraries see these values when they are loaded --
# keep that ordering.
os.environ.update(
    {
        "HF_HOME": CACHE_DIR,
        "TRANSFORMERS_CACHE": CACHE_DIR,
        "HF_DATASETS_CACHE": CACHE_DIR,
        "HF_MODULES_CACHE": CACHE_DIR,
    }
)

# The leading check-mark had been mis-decoded into "β" plus a line break,
# which split this literal across two lines; it is restored as one string.
print(f"✅ Using Hugging Face cache at {CACHE_DIR}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level model instance, created once at import time and reused by
# generate_embeddings() so it is not re-instantiated on every call.
# cache_folder is set to the same directory as the HF_* variables above.
_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder=CACHE_DIR,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_embeddings(chunks: list) -> list:
    """Generate embeddings for a list of text chunks.

    Args:
        chunks (list): List of text chunks.

    Returns:
        list: List of embedding vectors (plain Python lists). An empty
            list or tuple of chunks yields an empty list.
    """
    # Nothing to encode: answer immediately instead of invoking the model.
    # The isinstance guard limits the shortcut to empty lists/tuples, so any
    # other falsy input (e.g. an empty string) still takes the original path.
    if isinstance(chunks, (list, tuple)) and not chunks:
        return []

    # convert_to_numpy=True asks encode() for a NumPy result; tolist() then
    # converts it into nested plain-Python lists.
    embeddings = _model.encode(chunks, convert_to_numpy=True)
    return embeddings.tolist()
|
|
|