|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
""" |
|
|
|
from nemo.collections.nlp.modules.common.megatron.retrieval_services.bert_service import start_sentence_bert_server |
|
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer |
|
from nemo.core.config import hydra_runner |
|
|
|
|
|
def get_tokenizer(args):
    """Build the tokenizer described by ``args`` and ensure it has a pad token.

    Args:
        args: Tokenizer configuration exposing the attributes ``library``,
            ``type``, ``model``, ``vocab_file``, ``merge_file`` and
            ``delimiter``; each one is forwarded to ``get_nmt_tokenizer``.

    Returns:
        The object returned by ``get_nmt_tokenizer``. When that object has no
        ``pad_id`` attribute, or its ``pad_id`` is ``None`` or negative, a
        ``'<pad>'`` special token is registered on it before it is returned.
    """
    tokenizer = get_nmt_tokenizer(
        library=args.library,
        model_name=args.type,
        tokenizer_model=args.model,
        vocab_file=args.vocab_file,
        merges_file=args.merge_file,
        delimiter=args.delimiter,
    )
    # A missing attribute, ``None`` and a negative id are all handled the same
    # way, so one lookup with a default replaces the two duplicated branches.
    pad_id = getattr(tokenizer, "pad_id", None)
    if pad_id is None or pad_id < 0:
        tokenizer.add_special_tokens({'pad_token': '<pad>'})
    return tokenizer
|
|
|
|
|
@hydra_runner(config_path="conf", config_name="bert_service")
def main(cfg) -> None:
    """Create the tokenizer and start the sentence-BERT server.

    Args:
        cfg: Configuration object supplied through ``hydra_runner``. This
            function reads ``cfg.tokenizer``, ``cfg.name`` and the
            ``devices``, ``sentence_bert``, ``sentence_bert_batch`` and
            ``port`` entries under ``cfg.sentence_bert``.
    """
    # Build the tokenizer first, exactly as before any server setting is read.
    bert_tokenizer = get_tokenizer(cfg.tokenizer)

    service_name = cfg.name
    bert_cfg = cfg.sentence_bert
    start_sentence_bert_server(
        service_name,
        bert_cfg.devices,
        bert_tokenizer,
        bert_cfg.sentence_bert,
        bert_cfg.sentence_bert_batch,
        port=bert_cfg.port,
    )
|
|
|
|
|
# Run the launcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|