File size: 675 Bytes
6f179e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from jinja2 import Template
from transformers import AutoModel, AutoTokenizer

from .logging import logging_info


def initEmbedding(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a pretrained model via ``AutoModel.from_pretrained``.

    Args:
        model_name: Identifier handed to ``from_pretrained``; defaults to
            ``"BAAI/bge-small-zh-v1.5"``.
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``AutoModel.from_pretrained``.

    Returns:
        Whatever ``AutoModel.from_pretrained`` returns for the given arguments.
    """
    embedding_model = AutoModel.from_pretrained(model_name, **model_wargs)
    return embedding_model


def initTokenizer(model_name="BAAI/bge-small-zh-v1.5", **model_wargs):
    """Load a pretrained tokenizer via ``AutoTokenizer.from_pretrained``.

    Args:
        model_name: Identifier handed to ``from_pretrained``; defaults to
            ``"BAAI/bge-small-zh-v1.5"``.
        **model_wargs: Extra keyword arguments forwarded unchanged to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for the given
        arguments.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, **model_wargs)
    return tokenizer


def detectEncoding(b: bytes):
    """Guess the text encoding of ``b`` using ``chardet``.

    The full ``chardet.detect`` result is logged through ``logging_info``
    before the encoding is returned.

    Args:
        b: Raw bytes to inspect.

    Returns:
        The value stored under the ``"encoding"`` key of the
        ``chardet.detect`` result.
    """
    # Imported lazily so chardet is only required when detection is used.
    import chardet

    # Run detection once and reuse the result for both the log line and the
    # return value; the original scanned the input twice.
    detection = chardet.detect(b)
    logging_info(f"chardet.detect(b): {detection}")

    return detection["encoding"]


def convertToUTF8(b: bytes):
    """Decode ``b`` to ``str`` using its detected encoding.

    The encoding is obtained from ``detectEncoding``. When that returns a
    falsy value, the bytes are decoded as UTF-8 instead.

    Args:
        b: Raw bytes to decode.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    # Detect once: the original called detectEncoding twice, which doubled
    # the detection work and emitted its log line twice per conversion.
    encoding = detectEncoding(b)

    return b.decode(encoding or "utf-8")