Running the inference code from README.md raises an error

#15
by sigridjineth - opened

When running the inference code provided in README.md:

from sentence_transformers import CrossEncoder

model = CrossEncoder(
    "jinaai/jina-reranker-v2-base-multilingual",
    automodel_args={"torch_dtype": "auto"},
    trust_remote_code=True,
)

# Example query and documents
query = "Organic skincare products for sensitive skin"
documents = [
    "Organic skincare for sensitive skin with aloe vera and chamomile.",
    "New makeup trends focus on bold colors and innovative techniques",
    "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille",
    "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken",
    "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla",
    "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras",
    "针对敏感肌专门设计的天然有机护肤产品",
    "新的化妆趋势注重鲜艳的颜色和创新的技巧",
    "敏感肌のために特別に設計された天然有機スキンケア製品",
    "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています",
]

# construct sentence pairs
sentence_pairs = [[query, doc] for doc in documents]

scores = model.predict(sentence_pairs, convert_to_tensor=True).tolist()
"""
[0.828125, 0.0927734375, 0.6328125, 0.08251953125, 0.76171875, 0.099609375, 0.92578125, 0.058349609375, 0.84375, 0.111328125]
"""

rankings = model.rank(query, documents, return_documents=True, convert_to_tensor=True)
print(f"Query: {query}")
for ranking in rankings:
    print(f"ID: {ranking['corpus_id']}, Score: {ranking['score']:.4f}, Text: {ranking['text']}")
"""
Query: Organic skincare products for sensitive skin
ID: 6, Score: 0.9258, Text: 针对敏感肌专门设计的天然有机护肤产品
ID: 8, Score: 0.8438, Text: 敏感肌のために特別に設計された天然有機スキンケア製品
ID: 0, Score: 0.8281, Text: Organic skincare for sensitive skin with aloe vera and chamomile.
ID: 4, Score: 0.7617, Text: Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla
ID: 2, Score: 0.6328, Text: Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille
ID: 9, Score: 0.1113, Text: 新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています
ID: 5, Score: 0.0996, Text: Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras
ID: 1, Score: 0.0928, Text: New makeup trends focus on bold colors and innovative techniques
ID: 3, Score: 0.0825, Text: Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken
ID: 7, Score: 0.0583, Text: 新的化妆趋势注重鲜艳的颜色和创新的技巧
"""

It fails with the following error:

(venv) root@99074ab04cc2:~/FlagEmbedding/experiments/240710/jina# python inference.py 
/root/venv/lib/python3.11/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
  _torch_pytree._register_pytree_node(
/root/venv/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
  _torch_pytree._register_pytree_node(
/root/venv/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
tokenizer_config >>>>>>>>>>>>>>>>>>>>>>>>>>>  {'added_tokens_decoder': {'0': {'content': '<s>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}, '1': {'content': '<pad>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}, '2': {'content': '</s>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}, '3': {'content': '<unk>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}, '250001': {'content': '<mask>', 'lstrip': True, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}}, 'bos_token': '<s>', 'clean_up_tokenization_spaces': True, 'cls_token': '<s>', 'eos_token': '</s>', 'mask_token': '<mask>', 'model_max_length': 1026, 'pad_token': '<pad>', 'sep_token': '</s>', 'tokenizer_class': 'XLMRobertaTokenizer', 'unk_token': '<unk>', '_commit_hash': '2f34eeb3099209d80b9ebf42f8a62713bf44ec6a'}
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>. instantiate the tokenizer () {'added_tokens_decoder': {0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 250001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True)}, 'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'clean_up_tokenization_spaces': True, 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True), 'model_max_length': 1026, 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'vocab_file': None, 'tokenizer_file': '/root/.cache/huggingface/hub/models--jinaai--jina-reranker-v2-base-multilingual/snapshots/2f34eeb3099209d80b9ebf42f8a62713bf44ec6a/tokenizer.json', 'name_or_path': 'jinaai/jina-reranker-v2-base-multilingual'}
................... fast_tokenizer_file ............ /root/.cache/huggingface/hub/models--jinaai--jina-reranker-v2-base-multilingual/snapshots/2f34eeb3099209d80b9ebf42f8a62713bf44ec6a/tokenizer.json
Traceback (most recent call last):
  File "/root/FlagEmbedding/experiments/240710/jina/inference.py", line 3, in <module>
    model = CrossEncoder(
            ^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py", line 99, in __init__
    self.tokenizer = AutoTokenizer.from_pretrained(
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 789, in from_pretrained
    return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2028, in from_pretrained
    return cls._from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2261, in _from_pretrained
    tokenizer = cls(*init_inputs, **init_kwargs)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py", line 155, in __init__
    super().__init__(
  File "/root/venv/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 112, in __init__
    fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 69 column 3
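From what I understand, this `data did not match any variant of untagged enum PyPreTokenizerTypeWrapper` exception usually appears when the installed tokenizers package is too old to parse the model's tokenizer.json, so it may just be a version mismatch in my environment. For reference, a small version-report snippet (only standard version attributes, nothing model-specific) that could be used to capture the relevant environment:

import tokenizers
import transformers
import sentence_transformers

# Report the library versions involved in loading the tokenizer;
# an outdated tokenizers release is the usual suspect for this exception.
print("transformers:", transformers.__version__)
print("tokenizers:", tokenizers.__version__)
print("sentence-transformers:", sentence_transformers.__version__)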

The same thing happens with the following code:

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    'jinaai/jina-reranker-v2-base-multilingual',
    torch_dtype="auto",
    trust_remote_code=True,
)

model.to('cuda') # or 'cpu' if no GPU is available
model.eval()

# Example query and documents
query = "Organic skincare products for sensitive skin"
documents = [
    "Organic skincare for sensitive skin with aloe vera and chamomile.",
    "New makeup trends focus on bold colors and innovative techniques",
    "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille",
    "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken",
    "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla",
    "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras",
    "针对敏感肌专门设计的天然有机护肤产品",
    "新的化妆趋势注重鲜艳的颜色和创新的技巧",
    "敏感肌のために特別に設計された天然有機スキンケア製品",
    "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています",
]

# construct sentence pairs
sentence_pairs = [[query, doc] for doc in documents]

scores = model.compute_score(sentence_pairs, max_length=1024)

(venv) root@99074ab04cc2:~/FlagEmbedding/experiments/240710/jina# python inference.py 
/root/venv/lib/python3.11/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
  _torch_pytree._register_pytree_node(
/root/venv/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
/root/venv/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
  _torch_pytree._register_pytree_node(
/root/venv/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
tokenizer_config >>>>>>>>>>>>>>>>>>>>>>>>>>>  {'added_tokens_decoder': {'0': {'content': '<s>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}, '1': {'content': '<pad>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}, '2': {'content': '</s>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}, '3': {'content': '<unk>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}, '250001': {'content': '<mask>', 'lstrip': True, 'normalized': False, 'rstrip': False, 'single_word': False, 'special': True}}, 'bos_token': '<s>', 'clean_up_tokenization_spaces': True, 'cls_token': '<s>', 'eos_token': '</s>', 'mask_token': '<mask>', 'model_max_length': 1026, 'pad_token': '<pad>', 'sep_token': '</s>', 'tokenizer_class': 'XLMRobertaTokenizer', 'unk_token': '<unk>', '_commit_hash': '2f34eeb3099209d80b9ebf42f8a62713bf44ec6a'}
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>. instantiate the tokenizer () {'added_tokens_decoder': {0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 250001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True)}, 'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'clean_up_tokenization_spaces': True, 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True), 'model_max_length': 1026, 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), 'vocab_file': None, 'tokenizer_file': '/root/.cache/huggingface/hub/models--jinaai--jina-reranker-v2-base-multilingual/snapshots/2f34eeb3099209d80b9ebf42f8a62713bf44ec6a/tokenizer.json', 'name_or_path': 'jinaai/jina-reranker-v2-base-multilingual'}
................... fast_tokenizer_file ............ /root/.cache/huggingface/hub/models--jinaai--jina-reranker-v2-base-multilingual/snapshots/2f34eeb3099209d80b9ebf42f8a62713bf44ec6a/tokenizer.json
Traceback (most recent call last):
  File "/root/FlagEmbedding/experiments/240710/jina/inference.py", line 30, in <module>
    scores = model.compute_score(sentence_pairs, max_length=1024)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/huggingface/modules/transformers_modules/jinaai/jina-reranker-v2-base-multilingual/2f34eeb3099209d80b9ebf42f8a62713bf44ec6a/modeling_xlm_roberta.py", line 918, in compute_score
    self._tokenizer = AutoTokenizer.from_pretrained(
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 789, in from_pretrained
    return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2028, in from_pretrained
    return cls._from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2261, in _from_pretrained
    tokenizer = cls(*init_inputs, **init_kwargs)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/venv/lib/python3.11/site-packages/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py", line 155, in __init__
    super().__init__(
  File "/root/venv/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 112, in __init__
    fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 69 column 3
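Both snippets fail at the same place, inside AutoTokenizer.from_pretrained, so the problem looks like it is in loading the tokenizer itself rather than in CrossEncoder or compute_score. Assuming that is the case, a minimal reproduction that I would expect to raise the same exception is just:

from transformers import AutoTokenizer

# Load only the tokenizer; if the installed tokenizers version cannot parse
# the model's tokenizer.json, this alone should already raise the same
# PyPreTokenizerTypeWrapper exception.
tokenizer = AutoTokenizer.from_pretrained(
    "jinaai/jina-reranker-v2-base-multilingual",
    trust_remote_code=True,
)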
