from jinja2 import Template | |
from transformers import AutoModel, AutoTokenizer | |
from .logging import logging_info | |
def initEmbedding(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a pretrained model through ``AutoModel.from_pretrained``.

    Args:
        model_name: Identifier passed as the first argument to
            ``AutoModel.from_pretrained``.
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``AutoModel.from_pretrained``.

    Returns:
        Whatever ``AutoModel.from_pretrained`` returns for the given arguments.
    """
    embedding_model = AutoModel.from_pretrained(model_name, **model_wargs)
    return embedding_model
def initTokenizer(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a pretrained tokenizer through ``AutoTokenizer.from_pretrained``.

    Args:
        model_name: Identifier passed as the first argument to
            ``AutoTokenizer.from_pretrained``.
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for the given
        arguments.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, **model_wargs)
    return tokenizer
def detectEncoding(b: bytes):
    """Guess the text encoding of *b* using ``chardet``.

    The full detection result is logged through ``logging_info`` before the
    encoding name is returned.

    Args:
        b: Raw bytes whose encoding should be guessed.

    Returns:
        The value stored under the ``"encoding"`` key of the dict returned by
        ``chardet.detect``. It may be falsy (the caller in this file treats
        that as "no encoding detected").
    """
    # Imported lazily so chardet is only required when detection is used.
    import chardet

    # Run detection once and reuse the result for both the log line and the
    # return value; detection scans the whole buffer, so repeating it doubles
    # the cost for no benefit.
    detection = chardet.detect(b)
    logging_info(f"chardet.detect(b): {detection}")
    return detection["encoding"]
def convertToUTF8(b: bytes):
    """Decode *b* into a ``str`` using its detected encoding.

    Despite the name, the result is a decoded Python string, not UTF-8
    encoded bytes.

    Args:
        b: Raw bytes to decode.

    Returns:
        ``b`` decoded with the encoding reported by ``detectEncoding``, or
        decoded as UTF-8 when no encoding was detected.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
        LookupError: If the detected encoding name is unknown to Python.
    """
    # Detect once: every detectEncoding call re-scans the buffer and emits a
    # log line, so calling it twice duplicated both the work and the logging.
    encoding = detectEncoding(b)
    if encoding:
        return b.decode(encoding)
    # No encoding could be detected; fall back to UTF-8.
    return b.decode("utf-8")