File size: 675 Bytes
6f179e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
from jinja2 import Template
from transformers import AutoModel, AutoTokenizer
from .logging import logging_info
def initEmbedding(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a pretrained model via ``AutoModel.from_pretrained``.

    Args:
        model_name: Identifier passed as the first argument to
            ``from_pretrained``. Defaults to "BAAI/bge-small-zh-v1.5".
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``from_pretrained``.

    Returns:
        The object returned by ``AutoModel.from_pretrained``.
    """
    embedding_model = AutoModel.from_pretrained(model_name, **model_wargs)
    return embedding_model
def initTokenizer(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a pretrained tokenizer via ``AutoTokenizer.from_pretrained``.

    Args:
        model_name: Identifier passed as the first argument to
            ``from_pretrained``. Defaults to "BAAI/bge-small-zh-v1.5".
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``from_pretrained``.

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, **model_wargs)
    return tokenizer
def detectEncoding(b: bytes):
    """Guess the character encoding of ``b`` with chardet and log the result.

    Args:
        b: Raw bytes to inspect.

    Returns:
        The value stored under the ``"encoding"`` key of
        ``chardet.detect(b)``.
        NOTE(review): this can presumably be ``None`` when chardet has no
        guess (the caller in this file checks truthiness) -- confirm
        against the chardet documentation.
    """
    import chardet  # kept function-local, as in the original

    # Run detection once and reuse it for both the log line and the return
    # value; previously the same bytes were scanned twice.
    detection = chardet.detect(b)
    logging_info(f"chardet.detect(b): {detection}")
    return detection["encoding"]
def convertToUTF8(b: bytes) -> str:
    """Decode ``b`` to ``str`` using its detected encoding, else UTF-8.

    Args:
        b: Raw bytes to decode.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If ``b`` is not valid in the chosen encoding
            (propagated from ``bytes.decode``).
    """
    # Detect once: each detectEncoding call re-scans the buffer and emits a
    # log line, so calling it for both the check and the decode doubled
    # the work and the logging.
    encoding = detectEncoding(b)
    # Any falsy result (e.g. None or "") falls back to UTF-8, matching the
    # original truthiness check.
    return b.decode(encoding or "utf-8")
|