# Tokenizer tests: compare token counts and timings across tokenization methods and
# check that a tokenizer can be loaded separately from the base model (including for vLLM).
import os
import re
import time

import nltk
import pytest

from tests.utils import wrap_test_forked

# regex matching runs of word characters, used by the regex-based tokenizer below
WORD = re.compile(r'\w+')


def nltkTokenize(text):
    # tokenize with NLTK's word tokenizer (requires NLTK's punkt data to be downloaded)
    words = nltk.word_tokenize(text)
    return words


def regTokenize(text):
    # fast, approximate tokenization: split on runs of word characters only
    words = WORD.findall(text)
    return words


@pytest.mark.skipif(not os.getenv('MEASURE'),
                    reason="For checking token length for various methods: MEASURE=1 pytest -s -v tests/test_tokenizer.py")
@wrap_test_forked
def test_tokenizer1():
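    # Compare token counts and timings across regex, NLTK, HF, and embedding-model tokenizers on a sample LLM-style prompt.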
    prompt = """Here is an example of how to write a Python program to generate the Fibonacci sequence:
    
    
    
    
    def fib(n):
        a, b = 0, 1
        if n == 0 or n == 1:
            return a
        for i in range(n-2):
            a, b = b, a+b
        return b
    
    for i in range(10):
        print(fib(i))
    This program defines a function called fib that takes an integer n as input and returns the nth Fibonacci number. The function uses two variables a and b to keep track of the current and previous Fibonacci numbers.
    
    The first two lines of the function check if n is either 0 or 1, in which case the function returns 0 or 1 respectively. If n is greater than 1, the function iterates over the range of integers from 2 to n-1, adding the previous two Fibonacci numbers to get the current Fibonacci number. Finally, the function returns the last Fibonacci number calculated.
    
    In the main part of the program, we use a for loop to call the fib function with different"""

    prompt = os.getenv('PROMPT', prompt)
    run_tokenizer1(prompt)


def run_tokenizer1(prompt):
    from transformers import AutoTokenizer

    # HF tokenizers: distilgpt2 (GPT-2 BPE) as a small reference and the h2oGPT model's own tokenizer
    t = AutoTokenizer.from_pretrained("distilgpt2")
    llm_tokenizer = AutoTokenizer.from_pretrained('h2oai/h2ogpt-oig-oasst1-512-6_9b')

    # tokenizer of the Instructor embedding model, for comparison against the LLM tokenizers
    from InstructorEmbedding import INSTRUCTOR
    emb = INSTRUCTOR('hkunlp/instructor-large')

    # time each tokenizer and report the token count it produces for the prompt
    t0 = time.time()
    a = len(regTokenize(prompt))
    print("Regexp Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(nltkTokenize(prompt))
    print("NLTK Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(t(prompt)['input_ids'])
    print("Slow Tokenizer", a, time.time() - t0)

    t0 = time.time()
    a = len(llm_tokenizer(prompt)['input_ids'])
    print("Fast Tokenizer LLM", a, time.time() - t0)

    t0 = time.time()
    a = emb.tokenize([prompt])['input_ids'].shape[1]
    print("Instruct Embedding", a, time.time() - t0)


@wrap_test_forked
def test_fake_tokenizer():
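    # num_tokens_from_string treats special-token text as plain text, while the underlying
    # encoding rejects '<|endoftext|>' as a disallowed special token.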
    from src.utils import FakeTokenizer
    t = FakeTokenizer()
    assert t.num_tokens_from_string('How are you?') == 4
    assert t.num_tokens_from_string('<|endoftext|>') == 7
    with pytest.raises(ValueError, match="disallowed special token"):
        t.encoding.encode('<|endoftext|>')


@wrap_test_forked
def test_tokenizer_base_model1():
    # test using a tokenizer_base_model separate from the model weights
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-beta',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot')
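    # weights come from base_model (a Mistral-architecture Zephyr model), tokenizer from tokenizer_base_model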
    assert 'MistralForCausalLM' in str(model)
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")


@wrap_test_forked
def test_tokenizer_base_model2():
    # Use a separate tokenizer with a vLLM endpoint so the full model need not be shared; a proxy tokenizer suffices.
    # With a vLLM inference server, an invalid base model name should not cause any failure.
    from tests.test_langchain_units import get_test_model
    model, tokenizer, base_model, prompt_type = get_test_model(base_model='HuggingFaceH4/zephyr-7b-omega',
                                                               tokenizer_base_model='amazon/MistralLite',
                                                               prompt_type='human_bot',
                                                               inference_server="vllm:localhost:8080",
                                                               max_seq_len=4096)
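    # with a vLLM endpoint the returned model is just endpoint/client info (base_url); the tokenizer is the local proxy tokenizer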
    assert model['base_url'] == 'http://localhost:8080/v1'
    assert 'amazon/MistralLite' in str(tokenizer)
    assert prompt_type == 'human_bot'
    print("here")


if __name__ == '__main__':
    test_tokenizer1()