# tests/test_tokenizer.py
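"""Tokenizer tests: an opt-in token-count/speed comparison across tokenization methods (MEASURE=1),
FakeTokenizer behavior, and use of a tokenizer_base_model separate from the base model (including vLLM)."""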
import os
import re
import time

import nltk
import pytest

from tests.utils import wrap_test_forked

# pre-compiled regexp matching runs of word characters
WORD = re.compile(r'\w+')


def nltkTokenize(text):
    # word-level tokenization via NLTK
    words = nltk.word_tokenize(text)
    return words


def regTokenize(text):
    # word-level tokenization via the regexp above
    words = WORD.findall(text)
    return words
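# Opt-in measurement test: compares token counts and timings across tokenization methods.
# It downloads several tokenizers/models from the Hugging Face Hub, so it only runs when MEASURE=1 is set.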
@pytest.mark.skipif(not os.getenv('MEASURE'),
                    reason="For checking token length for various methods: MEASURE=1 pytest -s -v tests/test_tokenizer.py")
@wrap_test_forked
def test_tokenizer1():
    prompt = """Here is an example of how to write a Python program to generate the Fibonacci sequence:
def fib(n):
    a, b = 0, 1
    if n == 0 or n == 1:
        return a
    for i in range(n-2):
        a, b = b, a+b
    return b
for i in range(10):
    print(fib(i))
This program defines a function called fib that takes an integer n as input and returns the nth Fibonacci number. The function uses two variables a and b to keep track of the current and previous Fibonacci numbers.
The first two lines of the function check if n is either 0 or 1, in which case the function returns 0 or 1 respectively. If n is greater than 1, the function iterates over the range of integers from 2 to n-1, adding the previous two Fibonacci numbers to get the current Fibonacci number. Finally, the function returns the last Fibonacci number calculated.
In the main part of the program, we use a for loop to call the fib function with different"""
    prompt = os.getenv('PROMPT', prompt)
    run_tokenizer1(prompt)
def run_tokenizer1(prompt):
    from transformers import AutoTokenizer
    # generic HF tokenizer and the LLM's own tokenizer, for comparison
    t = AutoTokenizer.from_pretrained("distilgpt2")
    llm_tokenizer = AutoTokenizer.from_pretrained('h2oai/h2ogpt-oig-oasst1-512-6_9b')
    from InstructorEmbedding import INSTRUCTOR
    emb = INSTRUCTOR('hkunlp/instructor-large')

    # time each method and report the token count it produces
    t0 = time.time()
    a = len(regTokenize(prompt))
    print("Regexp Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(nltkTokenize(prompt))
    print("NLTK Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(t(prompt)['input_ids'])
    print("Slow Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(llm_tokenizer(prompt)['input_ids'])
    print("Fast Tokenizer LLM", a, time.time() - t0)

    t0 = time.time()
    a = emb.tokenize([prompt])['input_ids'].shape[1]
    print("Instruct Embedding", a, time.time() - t0)
@wrap_test_forked
def test_fake_tokenizer():
    from src.utils import FakeTokenizer
    t = FakeTokenizer()
    assert t.num_tokens_from_string('How are you?') == 4
    assert t.num_tokens_from_string('<|endoftext|>') == 7
    # encoding a special token directly should be rejected by the underlying encoding
    try:
        t.encoding.encode('<|endoftext|>')
        raise RuntimeError("Shouldn't reach here")
    except ValueError as e:
        assert "disallowed special token" in str(e)
@wrap_test_forked
def test_tokenizer_base_model1():
    # test using a tokenizer separate from the base model
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-beta',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot')
    assert 'MistralForCausalLM' in str(model)
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")
@wrap_test_forked
def test_tokenizer_base_model2():
    # separate tokenizer for a vLLM endpoint, so the full model need not be shared, only a proxy tokenizer;
    # with a vLLM inference server, an invalid base model name should not cause a failure
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-omega',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot',
                                                               inference_server="vllm:localhost:8080",
                                                               max_seq_len=4096)
    assert model['base_url'] == 'http://localhost:8080/v1'
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")
if __name__ == '__main__':
    test_tokenizer1()