File size: 4,034 Bytes
7db0ae4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#### What this tests ####
#    This adds perf testing to the router, to ensure it's never > 50ms slower than the azure-openai sdk.
import sys, os, time, inspect, asyncio, traceback
from datetime import datetime
import pytest

sys.path.insert(0, os.path.abspath("../.."))
import openai, litellm, uuid
from openai import AsyncAzureOpenAI

# Raw Azure OpenAI client — serves as the latency baseline the router is
# compared against. Credentials come from the environment; if they are
# unset the calls will fail and be counted as unsuccessful completions.
client = AsyncAzureOpenAI(
    api_key=os.getenv("AZURE_API_KEY"),
    azure_endpoint=os.getenv("AZURE_API_BASE"),  # type: ignore
    api_version=os.getenv("AZURE_API_VERSION"),
)

# Single-deployment router config pointing at the same Azure deployment
# (`chatgpt-v-2`) and the same credentials as `client`, so router overhead
# is the only difference between the two code paths being timed.
model_list = [
    {
        "model_name": "azure-test",
        "litellm_params": {
            "model": "azure/chatgpt-v-2",
            "api_key": os.getenv("AZURE_API_KEY"),
            "api_base": os.getenv("AZURE_API_BASE"),
            "api_version": os.getenv("AZURE_API_VERSION"),
        },
    }
]

# Router under test, built once at import time.
router = litellm.Router(model_list=model_list)


async def _openai_completion():
    """Stream one chat completion via the raw AsyncAzureOpenAI client.

    Returns the measured time-to-first (non-null) token in seconds, or
    None if the call failed for any reason (best-effort: errors are
    printed and the sample is simply dropped by the caller).
    """
    try:
        t0 = time.time()
        stream = await client.chat.completions.create(
            model="chatgpt-v-2",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        ttft = None
        first_ts = None
        first_chunk = None
        async for part in stream:
            # Only the first chunk carrying actual content counts as the
            # "first token"; later chunks are drained but ignored.
            if ttft is None and part.choices and part.choices[0].delta.content is not None:
                first_ts = time.time()
                ttft = first_ts - t0
                first_chunk = part
        t_end = time.time()
        print("OpenAI Call: ", first_chunk, t0, first_ts, ttft, t_end)
        return ttft
    except Exception as e:
        # Swallow and report: a failed call contributes no timing sample.
        print(e)
        return None


async def _router_completion():
    """Stream one chat completion through the litellm Router.

    Returns the measured time-to-first (non-null) token in seconds, or
    None if the call failed. Mirrors `_openai_completion` exactly so the
    two timings are directly comparable.
    """
    try:
        start_time = time.time()
        response = await router.acompletion(
            model="azure-test",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
            stream=True,
        )
        time_to_first_token = None
        first_token_ts = None
        init_chunk = None
        async for chunk in response:
            # Record the first chunk that carries actual content.
            if (
                time_to_first_token is None
                and len(chunk.choices) > 0
                and chunk.choices[0].delta.content is not None
            ):
                first_token_ts = time.time()
                time_to_first_token = first_token_ts - start_time
                init_chunk = chunk
        end_time = time.time()
        # BUGFIX: previously printed `end_time - first_token_ts`, which raises
        # TypeError when the stream yields no content token (first_token_ts is
        # None) and was silently swallowed by the broad except below. Print the
        # raw end_time instead, consistent with `_openai_completion`.
        print(
            "Router Call: ",
            init_chunk,
            start_time,
            first_token_ts,
            time_to_first_token,
            end_time,
        )
        return time_to_first_token
    except Exception as e:
        # Best-effort: a failed call simply contributes no timing sample.
        print(e)
        return None


@pytest.mark.asyncio
async def test_azure_completion_streaming():
    """
    Test azure streaming call - measure on time to first (non-null) token.

    Fires `n` concurrent calls against the raw SDK and against the router,
    averages the successful time-to-first-token samples of each, and asserts
    the router is no more than 0.5s slower on average.
    """
    n = 3  # Number of concurrent tasks
    ## OPENAI AVG. TIME
    tasks = [_openai_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    # BUGFIX: average over the calls that actually succeeded, not a
    # hard-coded 3 — failed calls used to silently drag the average toward 0,
    # and an all-failure run produced 0.0, passing the final assert vacuously.
    assert successful_completions, "all OpenAI SDK calls failed"
    avg_openai_time = sum(successful_completions) / len(successful_completions)
    ## ROUTER AVG. TIME
    tasks = [_router_completion() for _ in range(n)]
    chat_completions = await asyncio.gather(*tasks)
    successful_completions = [c for c in chat_completions if c is not None]
    assert successful_completions, "all router calls failed"
    avg_router_time = sum(successful_completions) / len(successful_completions)
    ## COMPARE
    print(f"avg_router_time: {avg_router_time}; avg_openai_time: {avg_openai_time}")
    assert avg_router_time < avg_openai_time + 0.5


# asyncio.run(test_azure_completion_streaming())