from jinja2 import Template
from transformers import AutoModel, AutoTokenizer

from .logging import logging_info


def initEmbedding(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a model with ``AutoModel.from_pretrained``.

    Args:
        model_name: Model identifier passed straight to ``from_pretrained``.
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``AutoModel.from_pretrained``.

    Returns:
        Whatever ``AutoModel.from_pretrained`` returns for ``model_name``.
    """
    return AutoModel.from_pretrained(model_name, **model_wargs)


def initTokenizer(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a tokenizer with ``AutoTokenizer.from_pretrained``.

    Args:
        model_name: Tokenizer identifier passed straight to
            ``from_pretrained``.
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for ``model_name``.
    """
    return AutoTokenizer.from_pretrained(model_name, **model_wargs)


def detectEncoding(b: bytes):
    """Guess the text encoding of ``b`` with ``chardet`` and log the result.

    Args:
        b: Raw bytes to inspect.

    Returns:
        The value stored under the ``"encoding"`` key of the dict returned by
        ``chardet.detect``. ``convertToUTF8`` treats a falsy value as
        "no encoding detected".
    """
    # Imported inside the function so chardet is only loaded when detection
    # is actually requested.
    import chardet

    # Run detection once and reuse the result for both the log line and the
    # return value; detection scans the whole payload.
    detection = chardet.detect(b)
    logging_info(f"chardet.detect(b): {detection}")
    return detection["encoding"]


def convertToUTF8(b: bytes):
    """Decode ``b`` to ``str`` using its detected encoding.

    Falls back to UTF-8 when ``detectEncoding`` returns a falsy value.
    Note that the result is a decoded ``str``, not UTF-8 encoded bytes.

    Args:
        b: Raw bytes to decode.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If ``b`` is not valid in the chosen encoding.
        LookupError: If the detected encoding name is unknown to Python.
    """
    # Detect once: a second call would repeat the scan and the log line.
    encoding = detectEncoding(b)
    return b.decode(encoding or "utf-8")